--- old/usr/src/uts/common/rpc/clnt_cots.c Wed Jun 17 00:15:45 2009
+++ new/usr/src/uts/common/rpc/clnt_cots.c Wed Jun 17 00:15:44 2009
@@ -190,6 +190,7 @@
 #include
 #include
 #include
+#include

 #define	COTS_DEFAULT_ALLOCSIZE	2048

@@ -381,6 +382,10 @@
 				int, calllist_t *, int *, bool_t reconnect,
 				const struct timeval *, bool_t, cred_t *);
+static bool_t	connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
+				calllist_t *e, cred_t *cr);
+static bool_t	connmgr_setopt_int(queue_t *, int, int, int,
+				calllist_t *, cred_t *cr);
 static bool_t	connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
 static void	connmgr_sndrel(struct cm_xprt *);
 static void	connmgr_snddis(struct cm_xprt *);
@@ -502,7 +507,23 @@

 static zone_key_t zone_cots_key;

+#define	TWO_GIGB	0x80000000
+int nfsd_port = NFS_PORT;
 /*
+ * Default TCP send and receive buffer sizes for NFS connections.
+ * These values can be tuned via /etc/system.
+ */
+int nfs_send_bufsz = 1024*1024;
+int nfs_recv_bufsz = 1024*1024;
+/*
+ * To use system-wide default for TCP send and receive buffer size,
+ * use /etc/system to set nfs_default_bufsz to 1:
+ *
+ * set rpcmod:nfs_default_bufsz=1
+ */
+int nfs_default_bufsz = 0;
+
+/*
  * We need to do this after all kernel threads in the zone have exited.
  */
 /* ARGSUSED */
@@ -2558,6 +2579,68 @@
 }

 /*
+ * Raise the TCP receive and send buffer sizes for NFS connections.
+ *
+ * Nothing is done when nfs_default_bufsz is set, when the address family
+ * is neither AF_INET nor AF_INET6, or when the destination port is not
+ * nfsd_port; FALSE is returned in those cases.  Otherwise SO_RCVBUF is
+ * raised to the size derived from nfs_recv_bufsz and SO_SNDBUF to the size
+ * derived from nfs_send_bufsz, each only when the current value is smaller,
+ * and TRUE is returned whether or not the options could be read or set.
+ * The caller ignores the result, so this is strictly best effort.
+ */
+static bool_t
+connmgr_nfs_setbufsz(calllist_t *e, int addrfmly, struct netbuf *addr,
+    queue_t *wq, cred_t *cr)
+{
+	struct sockaddr_in *sa;
+	int ok = FALSE;
+	int val;
+	uint32_t sbufsz, rbufsz;
+
+	if (nfs_default_bufsz ||
+	    (addrfmly != AF_INET && addrfmly != AF_INET6))
+		return (FALSE);
+
+	sa = (struct sockaddr_in *)addr->buf;
+	if (ntohs(sa->sin_port) != nfsd_port)
+		return (FALSE);
+	/*
+	 * For system with 2GB, or less, of physical memory set send
+	 * and receive buffer size to half of nfs_send_bufsz and
+	 * nfs_recv_bufsz respectively.
+	 */
+	if (ptob(physmem) <= TWO_GIGB) {
+		sbufsz = nfs_send_bufsz >> 1;
+		rbufsz = nfs_recv_bufsz >> 1;
+	} else {
+		sbufsz = nfs_send_bufsz;
+		rbufsz = nfs_recv_bufsz;
+	}
+	/*
+	 * Only set new buffer size if it's larger than the system
+	 * default buffer size. If smaller buffer size is needed
+	 * then use /etc/system to set nfs_default_bufsz to 1.
+	 */
+	ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr);
+	if ((ok == TRUE) && (val < rbufsz)) {
+		ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF,
+		    rbufsz, e, cr);
+		DTRACE_PROBE2(connmgr_nfs_rcvbufsz__setopt,
+		    int, ok, calllist_t *, e);
+	}
+
+	ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr);
+	if ((ok == TRUE) && (val < sbufsz)) {
+		ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF,
+		    sbufsz, e, cr);
+		DTRACE_PROBE2(connmgr_nfs_sndbufsz__setopt,
+		    int, ok, calllist_t *, e);
+	}
+	return (TRUE);
+}
+
+/*
  * Given an open stream, connect to the remote. Returns true if connected,
  * false otherwise.
  */
@@ -2609,6 +2692,9 @@
 		return (FALSE);
 	}

+	/* Set TCP buffer size for NFS connections if needed */
+	(void) connmgr_nfs_setbufsz(e, addrfmly, addr, wq, cr);
+
 	mp->b_datap->db_type = M_PROTO;
 	tcr = (struct T_conn_req *)mp->b_rptr;
 	bzero(tcr, sizeof (*tcr));
@@ -2764,10 +2850,133 @@
 }

 /*
+ * Verify that the specified offset falls within the mblk and
+ * that the resulting pointer is aligned.
+ * Returns NULL if not.
+ *
+ * Code borrowed from fs/sockfs/socksubr.c.
+ */
+static void *
+connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
+    t_uscalar_t length, uint_t align_size)
+{
+	uintptr_t ptr1, ptr2;
+
+	ASSERT(mp && mp->b_wptr >= mp->b_rptr);
+	ptr1 = (uintptr_t)mp->b_rptr + offset;
+	ptr2 = (uintptr_t)ptr1 + length;
+	if (ptr1 < (uintptr_t)mp->b_rptr || ptr2 > (uintptr_t)mp->b_wptr) {
+		return (NULL);
+	}
+	if ((ptr1 & (align_size - 1)) != 0) {
+		return (NULL);
+	}
+	return ((void *)ptr1);
+}
+
+/*
+ * Read the current value of an integer option (level/name) from the
+ * stream by sending a T_SVR4_OPTMGMT_REQ with T_CURRENT and waiting up to
+ * clnt_cots_min_conntout seconds for the T_OPTMGMT_ACK.  On success the
+ * value is stored in *val and TRUE is returned; otherwise FALSE is
+ * returned and *val is left untouched.
+ */
+static bool_t
+connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
+    calllist_t *e, cred_t *cr)
+{
+	mblk_t *mp;
+	struct opthdr *opt, *opt_res;
+	struct T_optmgmt_req *tor;
+	struct T_optmgmt_ack *opt_ack;
+	struct timeval waitp;
+	int error;
+
+	mp = allocb_cred(sizeof (struct T_optmgmt_req) +
+	    sizeof (struct opthdr) + sizeof (int), cr, NOPID);
+	if (mp == NULL) {
+		RPCLOG0(1, "connmgr_getopt: cannot alloc mp for option "
+		    "request\n");
+		return (FALSE);
+	}
+
+	mp->b_datap->db_type = M_PROTO;
+	tor = (struct T_optmgmt_req *)(mp->b_rptr);
+	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
+	tor->MGMT_flags = T_CURRENT;
+	tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
+	tor->OPT_offset = sizeof (struct T_optmgmt_req);
+
+	opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
+	opt->level = level;
+	opt->name = name;
+	opt->len = sizeof (int);
+	mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
+	    sizeof (int);
+
+	/*
+	 * We will use this connection regardless
+	 * of whether or not the option is readable.
+	 */
+	if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
+		DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend);
+		freemsg(mp);
+		return (FALSE);
+	}
+
+	mutex_enter(&clnt_pending_lock);
+
+	waitp.tv_sec = clnt_cots_min_conntout;
+	waitp.tv_usec = 0;
+	error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);
+
+	if (e->call_prev)
+		e->call_prev->call_next = e->call_next;
+	else
+		clnt_pending = e->call_next;
+	if (e->call_next)
+		e->call_next->call_prev = e->call_prev;
+	mutex_exit(&clnt_pending_lock);
+
+	/* get reply message */
+	mp = e->call_reply;
+	e->call_reply = NULL;
+
+	if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) {
+
+		DTRACE_PROBE4(connmgr_getopt__failed, int, name,
+		    int, e->call_status, int, error, mblk_t *, mp);
+
+		if (mp)
+			freemsg(mp);
+		return (FALSE);
+	}
+
+	opt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
+	opt_res = (struct opthdr *)connmgr_opt_getoff(mp, opt_ack->OPT_offset,
+	    opt_ack->OPT_length, __TPI_ALIGN_SIZE);
+
+	if (!opt_res || opt_ack->OPT_length <
+	    (t_scalar_t)(sizeof (struct opthdr) + sizeof (int))) {
+		DTRACE_PROBE4(connmgr_getopt__optres, mblk_t *, mp, int, name,
+		    int, opt_ack->OPT_offset, int, opt_ack->OPT_length);
+		freemsg(mp);
+		return (FALSE);
+	}
+	*val = *(int *)&opt_res[1];
+
+	DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val);
+
+	freemsg(mp);
+	return (TRUE);
+}
+
+/*
  * Called by connmgr_connect to set an option on the new stream.
  */
 static bool_t
-connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
+connmgr_setopt_int(queue_t *wq, int level, int name, int val,
+    calllist_t *e, cred_t *cr)
 {
 	mblk_t *mp;
 	struct opthdr *opt;
@@ -2794,7 +3003,7 @@
 	opt->level = level;
 	opt->name = name;
 	opt->len = sizeof (int);
-	*(int *)((char *)opt + sizeof (*opt)) = 1;
+	*(int *)((char *)opt + sizeof (*opt)) = val;
 	mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
 	    sizeof (int);

@@ -2835,6 +3044,15 @@
 	return (TRUE);
 }

+/*
+ * Set the integer option (level/name) to 1 on the stream.
+ */
+static bool_t
+connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
+{
+	return (connmgr_setopt_int(wq, level, name, 1, e, cr));
+}
+
 #ifdef DEBUG

 /*