Print this page
4953763 Need way to configure NFS window sizes without changing system wide defaults
6216670 NFS server needs a bigger transmit buffer

@@ -188,10 +188,11 @@
 #include <rpc/types.h>
 #include <rpc/xdr.h>
 #include <rpc/auth.h>
 #include <rpc/clnt.h>
 #include <rpc/rpc_msg.h>
+#include <nfs/nfs.h>
 
 #define COTS_DEFAULT_ALLOCSIZE  2048
 
 #define WIRE_HDR_SIZE   20      /* serialized call header, sans proc number */
 #define MSG_OFFSET      128     /* offset of call into the mblk */

@@ -379,10 +380,14 @@
 
 static bool_t   connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
                                 int, calllist_t *, int *, bool_t reconnect,
                                 const struct timeval *, bool_t, cred_t *);
 
+static bool_t  connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
+                                calllist_t *e, cred_t *cr);
+static bool_t   connmgr_setopt_int(queue_t *, int, int, int,
+                                calllist_t *, cred_t *cr);
 static bool_t   connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
 static void     connmgr_sndrel(struct cm_xprt *);
 static void     connmgr_snddis(struct cm_xprt *);
 static void     connmgr_close(struct cm_xprt *);
 static void     connmgr_release(struct cm_xprt *);

@@ -500,11 +505,27 @@
  */
 int clnt_cots_do_bindresvport = 1;
 
 static zone_key_t zone_cots_key;
 
+#define TWO_GIGB        0x80000000      /* compared with ptob(physmem) */
+int nfsd_port = NFS_PORT;       /* remote port that marks an NFS connection */
 /*
+ * Default TCP send and receive buffer sizes for NFS connections.
+ * These values can be tuned by /etc/default.
+ *
+ * connmgr_nfs_setbufsz() halves both values when ptob(physmem) is at
+ * most TWO_GIGB, and only applies a value that is larger than the size
+ * the connection already has.
+ */
+int nfs_send_bufsz = 1024*1024;
+int nfs_recv_bufsz = 1024*1024;
+/*
+ * To use system-wide default for TCP send and receive buffer size,
+ * use /etc/system to set nfs_default_bufsz to 1:
+ *
+ * set rpcmod:nfs_default_bufsz=1
+ *
+ * Any non-zero value makes connmgr_nfs_setbufsz() return without
+ * touching the buffer sizes.
+ */
+int nfs_default_bufsz = 0;
+
+/*
  * We need to do this after all kernel threads in the zone have exited.
  */
 /* ARGSUSED */
 static void
 clnt_zone_destroy(zoneid_t zoneid, void *unused)

@@ -2556,10 +2577,64 @@
                 cv_signal(&cm_entry->x_cv);
         mutex_exit(&cm_entry->x_lock);
 }
 
 /*
+ * Set TCP receive and xmit buffer size for NFS connections.
+ *
+ * Nothing is done, and FALSE is returned, when nfs_default_bufsz is set,
+ * when addrfmly is neither AF_INET nor AF_INET6, or when the destination
+ * port in addr is not nfsd_port.
+ *
+ * Otherwise SO_RCVBUF is raised to the receive size (from nfs_recv_bufsz)
+ * and SO_SNDBUF to the send size (from nfs_send_bufsz), each only when the
+ * value currently in effect is smaller.  TRUE is then returned even if a
+ * get or set failed; the outcome of each set is reported only through the
+ * DTrace probes.
+ */
+static bool_t
+connmgr_nfs_setbufsz(calllist_t *e, int addrfmly, struct netbuf *addr,
+    queue_t *wq, cred_t *cr)
+{
+        struct sockaddr_in *sa;
+        int ok = FALSE;
+        int val;
+        /*
+         * Kept as int, like the tunables and the option value, so that
+         * the comparisons below are not silently done as unsigned.
+         */
+        int sbufsz, rbufsz;
+
+        if (nfs_default_bufsz ||
+            (addrfmly != AF_INET && addrfmly != AF_INET6))
+                return (FALSE);
+
+        /*
+         * NOTE(review): the sockaddr_in view is also used for AF_INET6
+         * addresses; this presumes sin_port and sin6_port are at the
+         * same offset -- confirm.
+         */
+        sa = (struct sockaddr_in *)addr->buf;
+        if (ntohs(sa->sin_port) != nfsd_port)
+                return (FALSE);
+        /*
+         * On systems with 2GB or less of physical memory, set the send
+         * and receive buffer sizes to half of nfs_send_bufsz and
+         * nfs_recv_bufsz respectively.
+         */
+        if (ptob(physmem) <= TWO_GIGB) {
+                sbufsz = nfs_send_bufsz >> 1;
+                rbufsz = nfs_recv_bufsz >> 1;
+        } else {
+                sbufsz = nfs_send_bufsz;
+                rbufsz = nfs_recv_bufsz;
+        }
+        /*
+         * Only set a new buffer size if it is larger than the size
+         * currently in effect (the system default).  If a smaller
+         * buffer size is needed then use /etc/system to set
+         * nfs_default_bufsz to 1.
+         */
+        ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_RCVBUF, &val, e, cr);
+        if ((ok == TRUE) && (val < rbufsz)) {
+                ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_RCVBUF,
+                    rbufsz, e, cr);
+                DTRACE_PROBE2(connmgr_nfs_rcvbufsz__setopt,
+                    int, ok, calllist_t *, e);
+        }
+
+        ok = connmgr_getopt_int(wq, SOL_SOCKET, SO_SNDBUF, &val, e, cr);
+        if ((ok == TRUE) && (val < sbufsz)) {
+                ok = connmgr_setopt_int(wq, SOL_SOCKET, SO_SNDBUF,
+                    sbufsz, e, cr);
+                DTRACE_PROBE2(connmgr_nfs_sndbufsz__setopt,
+                    int, ok, calllist_t *, e);
+        }
+        return (TRUE);
+}
+
+/*
  * Given an open stream, connect to the remote.  Returns true if connected,
  * false otherwise.
  */
 static bool_t
 connmgr_connect(

@@ -2607,10 +2682,13 @@
                 e->call_status = RPC_SYSTEMERROR;
                 e->call_reason = ENOSR;
                 return (FALSE);
         }
 
+        /* Set TCP buffer size for NFS connections if needed */
+        (void) connmgr_nfs_setbufsz(e, addrfmly, addr, wq, cr);
+
         mp->b_datap->db_type = M_PROTO;
         tcr = (struct T_conn_req *)mp->b_rptr;
         bzero(tcr, sizeof (*tcr));
         tcr->PRIM_type = T_CONN_REQ;
         tcr->DEST_length = addr->len;

@@ -2762,14 +2840,129 @@
         kstat_install(cm_entry->x_ksp);
         return (TRUE);
 }
 
 /*
+ * Verify that the specified offset falls within the mblk and
+ * that the resulting pointer is aligned.
+ * Returns NULL if not.
+ *
+ * mp          message block whose b_rptr..b_wptr range is checked
+ * offset      byte offset from b_rptr of the object wanted
+ * length      byte length of the object wanted
+ * align_size  required alignment; used as a mask, so it is expected to
+ *             be a power of two
+ *
+ * The range check compares lengths relative to b_rptr instead of
+ * computed end addresses, so a large offset or length cannot wrap the
+ * pointer arithmetic and slip past the bounds test.
+ *
+ * derived from code in fs/sockfs/socksubr.c
+ */
+static void *
+connmgr_opt_getoff(mblk_t *mp, t_uscalar_t offset,
+    t_uscalar_t length, uint_t align_size)
+{
+        uintptr_t ptr;
+        size_t avail;
+
+        ASSERT(mp && mp->b_wptr >= mp->b_rptr);
+        avail = (size_t)(mp->b_wptr - mp->b_rptr);
+        /* offset + length must fit in the data present in the mblk */
+        if (offset > avail || length > avail - offset) {
+                return (NULL);
+        }
+        ptr = (uintptr_t)mp->b_rptr + offset;
+        if ((ptr & (align_size - 1)) != 0) {
+                return (NULL);
+        }
+        return ((void *)ptr);
+}
+
+/*
+ * Read the current value of an int-sized option from the stream.
+ *
+ * Sends a T_SVR4_OPTMGMT_REQ with MGMT_flags set to T_CURRENT down wq,
+ * waits up to clnt_cots_min_conntout seconds for the T_OPTMGMT_ACK and,
+ * on success, stores the option value in *val.
+ *
+ * Returns TRUE if *val was set.  Returns FALSE, leaving *val untouched,
+ * if the request could not be allocated or sent, the wait failed, or the
+ * reply was missing, too short, or did not contain a properly placed
+ * option header followed by an int.
+ */
+static bool_t
+connmgr_getopt_int(queue_t *wq, int level, int name, int *val,
+    calllist_t *e, cred_t *cr)
+{
+        mblk_t *mp;
+        struct opthdr *opt, *opt_res;
+        struct T_optmgmt_req *tor;
+        struct T_optmgmt_ack *opt_ack;
+        struct timeval waitp;
+        int error;
+
+        mp = allocb_cred(sizeof (struct T_optmgmt_req) +
+            sizeof (struct opthdr) + sizeof (int), cr, NOPID);
+        if (mp == NULL) {
+                RPCLOG0(1, "connmgr_getopt: cannot alloc mp for option "
+                    "request\n");
+                return (FALSE);
+        }
+
+        mp->b_datap->db_type = M_PROTO;
+        tor = (struct T_optmgmt_req *)(mp->b_rptr);
+        tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
+        tor->MGMT_flags = T_CURRENT;
+        tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
+        tor->OPT_offset = sizeof (struct T_optmgmt_req);
+
+        opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
+        opt->level = level;
+        opt->name = name;
+        opt->len = sizeof (int);
+        /*
+         * The value slot is part of the message that is sent; give it a
+         * defined content since nothing else writes it.
+         */
+        *(int *)((char *)opt + sizeof (*opt)) = 0;
+        mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
+            sizeof (int);
+
+        /*
+         * We will use this connection regardless
+         * of whether or not the option is readable.
+         */
+        if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
+                DTRACE_PROBE(krpc__e__connmgr__getopt__cantsend);
+                freemsg(mp);
+                return (FALSE);
+        }
+
+        mutex_enter(&clnt_pending_lock);
+
+        waitp.tv_sec = clnt_cots_min_conntout;
+        waitp.tv_usec = 0;
+        error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);
+
+        /* Unlink e from the clnt_pending list, whatever the outcome. */
+        if (e->call_prev)
+                e->call_prev->call_next = e->call_next;
+        else
+                clnt_pending = e->call_next;
+        if (e->call_next)
+                e->call_next->call_prev = e->call_prev;
+        mutex_exit(&clnt_pending_lock);
+
+        /* get reply message */
+        mp = e->call_reply;
+        e->call_reply = NULL;
+
+        if ((!mp) || (e->call_status != RPC_SUCCESS) || (error != 0)) {
+
+                DTRACE_PROBE4(connmgr_getopt__failed, int, name,
+                    int, e->call_status, int, error, mblk_t *, mp);
+
+                if (mp)
+                        freemsg(mp);
+                return (FALSE);
+        }
+
+        /* The reply must at least hold the ack header before we read it. */
+        if ((mp->b_wptr - mp->b_rptr) < (ptrdiff_t)sizeof (*opt_ack)) {
+                DTRACE_PROBE2(connmgr_getopt__shortack, mblk_t *, mp,
+                    int, name);
+                freemsg(mp);
+                return (FALSE);
+        }
+
+        opt_ack = (struct T_optmgmt_ack *)mp->b_rptr;
+
+        /*
+         * The option area must be big enough for an option header plus
+         * the int value read below, and must lie inside the message.
+         */
+        opt_res = NULL;
+        if (opt_ack->OPT_length >= sizeof (*opt_res) + sizeof (int)) {
+                opt_res = (struct opthdr *)connmgr_opt_getoff(mp,
+                    opt_ack->OPT_offset, opt_ack->OPT_length,
+                    __TPI_ALIGN_SIZE);
+        }
+
+        if (!opt_res) {
+                DTRACE_PROBE4(connmgr_getopt__optres, mblk_t *, mp, int, name,
+                    int, opt_ack->OPT_offset, int, opt_ack->OPT_length);
+                freemsg(mp);
+                return (FALSE);
+        }
+        /* The value immediately follows the option header. */
+        *val = *(int *)&opt_res[1];
+
+        DTRACE_PROBE2(connmgr_getopt__ok, int, name, int, *val);
+
+        freemsg(mp);
+        return (TRUE);
+}
+
+/*
  * Called by connmgr_connect to set an option on the new stream.
  */
 static bool_t
-connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
+connmgr_setopt_int(queue_t *wq, int level, int name, int val,
+    calllist_t *e, cred_t *cr)
 {
         mblk_t *mp;
         struct opthdr *opt;
         struct T_optmgmt_req *tor;
         struct timeval waitp;

@@ -2792,11 +2985,11 @@
 
         opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
         opt->level = level;
         opt->name = name;
         opt->len = sizeof (int);
-        *(int *)((char *)opt + sizeof (*opt)) = 1;
+        *(int *)((char *)opt + sizeof (*opt)) = val;
         mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
             sizeof (int);
 
         /*
          * We will use this connection regardless

@@ -2833,10 +3026,16 @@
         }
         RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name);
         return (TRUE);
 }
 
+/*
+ * Set an int-valued option on the stream to 1; a convenience wrapper
+ * around connmgr_setopt_int() whose result is returned unchanged.
+ */
+static bool_t
+connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
+{
+        const int enable = 1;
+        bool_t rv;
+
+        rv = connmgr_setopt_int(wq, level, name, enable, e, cr);
+        return (rv);
+}
+
 #ifdef  DEBUG
 
 /*
  * This is a knob to let us force code coverage in allocation failure
  * case.