Move CallBack Server thread creation, initial processing and destruction to RPC
Cleanup some RPC code.
Remove extraneous fields from nfs41_cb_info and clean up the code.
Change KM_SLEEP in mir_nfs41_callback_thread to KM_NOSLEEP.
Fix lint warnings
Incorporate code review comments.
Remove unneeded variable.

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All Rights Reserved
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cred.h>
  35 #include <sys/vfs.h>
  36 #include <sys/vfs_opreg.h>
  37 #include <sys/vnode.h>
  38 #include <sys/pathname.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/kmem.h>
  41 #include <sys/mkdev.h>
  42 #include <sys/mount.h>
  43 #include <sys/statvfs.h>
  44 #include <sys/errno.h>
  45 #include <sys/debug.h>
  46 #include <sys/cmn_err.h>
  47 #include <sys/utsname.h>
  48 #include <sys/bootconf.h>
  49 #include <sys/modctl.h>
  50 #include <sys/acl.h>
  51 #include <sys/flock.h>
  52 #include <sys/time.h>
  53 #include <sys/disp.h>
  54 #include <sys/policy.h>
  55 #include <sys/socket.h>
  56 #include <sys/netconfig.h>
  57 #include <sys/dnlc.h>
  58 #include <sys/list.h>
  59 #include <sys/mntent.h>
  60 #include <sys/atomic.h>
  61 #include <sys/tsol/label.h>
  62 #include <sys/sdt.h>
  63 #include <sys/avl.h>
  64 
  65 #include <rpc/types.h>
  66 #include <rpc/auth.h>
  67 #include <rpc/rpcsec_gss.h>
  68 #include <rpc/clnt.h>
  69 
  70 #include <nfs/nfs.h>
  71 #include <nfs/nfs_clnt.h>
  72 #include <nfs/mount.h>
  73 #include <nfs/nfs_acl.h>
  74 
  75 #include <fs/fs_subr.h>
  76 
  77 #include <nfs/nfs4.h>
  78 #include <nfs/rnode4.h>
  79 #include <nfs/nfs4_clnt.h>
  80 #include <nfs/nfs4_clnt_impl.h>
  81 #include <sys/fs/autofs.h>
  82 
  83 
  84 /*
  85  * Arguments passed to thread to free data structures from forced unmount.
  86  */
  87 
  88 typedef struct {
  89         vfs_t   *fm_vfsp;
  90         int     fm_flag;
  91         cred_t  *fm_cr;
  92 } freemountargs_t;
  93 
  94 static void     async_free_mount(vfs_t *, int, cred_t *);
  95 static void     nfs4_free_mount(vfs_t *, int, cred_t *);
  96 static void     nfs4_free_mount_thread(freemountargs_t *);
  97 static int nfs4_chkdup_servinfo4(servinfo4_t *, servinfo4_t *);
  98 
  99 /*
 100  * From rpcsec module (common/rpcsec).
 101  */
 102 extern int sec_clnt_loadinfo(struct sec_data *, struct sec_data **, model_t);
 103 extern void sec_clnt_freeinfo(struct sec_data *);
 104 
 105 /*
 106  * The order and contents of this structure must be kept in sync with that of
 107  * rfsreqcnt_v4_tmpl in nfs_stats.c
 108  */
 109 static char *v40_ops[] = {
 110         "null", "compound", "reserved", "access", "close", "commit",
 111         "create", "delegpurge", "delegreturn", "getattr", "getfh", "link",
 112         "lock", "lockt", "locku", "lookup", "lookupp", "nverify", "open",
 113         "openattr", "open_confirm", "open_downgrade", "putfh", "putpubfh",
 114         "putrootfh", "read", "readdir", "readlink", "remove", "rename",
 115         "renew", "restorefh", "savefh", "secinfo", "setattr", "setclientid",
 116         "setclientid_confirm", "verify", "write", "release_lockowner"
 117 };
 118 
 119 /*
 120  * The order and contents of this structure must be kept in sync with that of
 121  * rfsreqcnt_v41_tmpl in nfs_stats.c
 122  */
 123 static char *v41_ops[] = {
 124         "null", "compound", "reserved", "access",
 125         "close", "commit", "create", "delegpurge", "delegreturn",
 126         "getattr",      "getfh", "link", "lock", "lockt", "locku",
 127         "lookup", "lookupp", "nverify", "open", "openattr",
 128         "open_confirm", "open_downgrade", "putfh", "putpubfh",
 129         "putrootfh", "read", "readdir", "readlink", "remove", "rename",
 130         "renew", "restorefh", "savefh", "secinfo", "setattr",
 131         "setclientid", "setclientid_confirm", "verify", "write",
 132         "release_lockowner", "backchannel_ctl", "bind_conn_to_session",
 133         "exchange_id", "create_session", "destroy_session",
 134         "free_stateid", "get_dir_delegation", "getdeviceinfo",
 135         "getdevicelist", "layoutcommit", "layoutget", "layoutreturn",
 136         "secinfo_no_name", "sequence", "set_ssv", "test_stateid",
 137         "want_delegation", "destroy_clientid", "reclaim_complete"};
 138 
 139 static char **rfsnames_v4[NFS4_MINORVERSMAX + 1] = {v40_ops, v41_ops};
 140 /*
 141  * nfs4_max_mount_retry is the number of times the client will redrive a
 142  * mount compound before giving up and returning failure.  The intent is
 143  * to redrive mount compounds which fail NFS4ERR_STALE so that if a
 144  * component of the server path being mounted goes stale, it can
  145  * "recover" by redriving the mount compound (LOOKUP ops).  This recovery
 146  * code is needed outside of the recovery framework because mount is a
 147  * special case.  The client doesn't create vnodes/rnodes for components
 148  * of the server path being mounted.  The recovery code recovers real
 149  * client objects, not STALE FHs which map to components of the server
 150  * path being mounted.
 151  *
 152  * We could just fail the mount on the first time, but that would
 153  * instantly trigger failover (from nfs4_mount), and the client should
 154  * try to re-lookup the STALE FH before doing failover.  The easiest way
 155  * to "re-lookup" is to simply redrive the mount compound.
 156  */
 157 static int nfs4_max_mount_retry = 2;
 158 
 159 uint32_t nfs4_max_minor_version = NFS4_MINORVERSMAX;
 160 uint32_t nfs4_min_minor_version = 0;
 161 
 162 /*
 163  * nfs4 vfs operations.
 164  */
 165 int             nfs4_mount(vfs_t *, vnode_t *, struct mounta *, cred_t *);
 166 static int      nfs4_unmount(vfs_t *, int, cred_t *);
 167 static int      nfs4_root(vfs_t *, vnode_t **);
 168 static int      nfs4_statvfs(vfs_t *, struct statvfs64 *);
 169 static int      nfs4_sync(vfs_t *, short, cred_t *);
 170 static int      nfs4_vget(vfs_t *, vnode_t **, fid_t *);
 171 static int      nfs4_mountroot(vfs_t *, whymountroot_t);
 172 static void     nfs4_freevfs(vfs_t *);
 173 
 174 static int      nfs4rootvp(vnode_t **, vfs_t *, struct servinfo4 *,
 175                     int, cred_t *, zone_t *);
 176 
 177 vfsops_t        *nfs4_vfsops;
 178 
 179 int nfs4_vfsinit(void);
 180 void nfs4_vfsfini(void);
 181 void nfs4_minorops_init(void);
 182 
 183 static void nfs4setclientid_init(void);
 184 static void nfs4setclientid_fini(void);
 185 static void     destroy_nfs4_server(nfs4_server_t *);
 186 static void     remove_mi(nfs4_server_t *, mntinfo4_t *);
 187 
 188 extern void nfs4_ephemeral_init(void);
 189 extern void nfs4_ephemeral_fini(void);
 190 
 191 /*
 192  * Initialize the vfs structure
 193  */
 194 
 195 static int nfs4fstyp;
 196 
 197 
 198 /*
 199  * Debug variable to check for rdma based
 200  * transport startup and cleanup. Controlled
 201  * through /etc/system. Off by default.
 202  */
 203 extern int rdma_debug;
 204 
 205 extern int nfs41_birpc;
 206 
 207 int
 208 nfs4init(int fstyp, char *name)
 209 {
 210         static const fs_operation_def_t nfs4_vfsops_template[] = {
 211                 VFSNAME_MOUNT,          { .vfs_mount = nfs4_mount },
 212                 VFSNAME_UNMOUNT,        { .vfs_unmount = nfs4_unmount },
 213                 VFSNAME_ROOT,           { .vfs_root = nfs4_root },
 214                 VFSNAME_STATVFS,        { .vfs_statvfs = nfs4_statvfs },
 215                 VFSNAME_SYNC,           { .vfs_sync = nfs4_sync },
 216                 VFSNAME_VGET,           { .vfs_vget = nfs4_vget },
 217                 VFSNAME_MOUNTROOT,      { .vfs_mountroot = nfs4_mountroot },
 218                 VFSNAME_FREEVFS,        { .vfs_freevfs = nfs4_freevfs },
 219                 NULL,                   NULL
 220         };
 221         int error;
 222 
 223         nfs4_vfsops = NULL;
 224         nfs4_vnodeops = NULL;
 225         nfs4_trigger_vnodeops = NULL;
 226 
 227         error = vfs_setfsops(fstyp, nfs4_vfsops_template, &nfs4_vfsops);
 228         if (error != 0) {
 229                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 230                     "nfs4init: bad vfs ops template");
 231                 goto out;
 232         }
 233 
 234         error = vn_make_ops(name, nfs4_vnodeops_template, &nfs4_vnodeops);
 235         if (error != 0) {
 236                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 237                     "nfs4init: bad vnode ops template");
 238                 goto out;
 239         }
 240 
 241         error = vn_make_ops("nfs4_trigger", nfs4_trigger_vnodeops_template,
 242             &nfs4_trigger_vnodeops);
 243         if (error != 0) {
 244                 zcmn_err(GLOBAL_ZONEID, CE_WARN,
 245                     "nfs4init: bad trigger vnode ops template");
 246                 goto out;
 247         }
 248 
 249         nfs4fstyp = fstyp;
 250         (void) nfs4_vfsinit();
 251         (void) nfs4_init_dot_entries();
 252 
 253         nfs4_minorops_init();
 254 
 255 out:
 256         if (error) {
 257                 if (nfs4_trigger_vnodeops != NULL)
 258                         vn_freevnodeops(nfs4_trigger_vnodeops);
 259 
 260                 if (nfs4_vnodeops != NULL)
 261                         vn_freevnodeops(nfs4_vnodeops);
 262 
 263                 (void) vfs_freevfsops_by_type(fstyp);
 264         }
 265 
 266         return (error);
 267 }
 268 
 269 void
 270 nfs4fini(void)
 271 {
 272         (void) nfs4_destroy_dot_entries();
 273         nfs4_vfsfini();
 274 }
 275 
 276 void
 277 nfs4_minorops_init(void)
 278 {
 279         int nmops;
 280 
 281         nmops = nfs4_max_minor_version + 1;
 282 
 283         nfs4protosw = (nfs4_minorvers_ops_t **)kmem_alloc(
 284             nmops * sizeof (nfs4_minorvers_ops_t *), KM_SLEEP);
 285         nfs4_protosw_init(nfs4protosw);
 286 }
 287 
 288 /*
 289  * Create a new sec_data structure to store AUTH_DH related data:
 290  * netname, syncaddr, knetconfig. There is no AUTH_F_RPCTIMESYNC
 291  * flag set for NFS V4 since we are avoiding to contact the rpcbind
 292  * daemon and is using the IP time service (IPPORT_TIMESERVER).
 293  *
 294  * sec_data can be freed by sec_clnt_freeinfo().
 295  */
 296 static struct sec_data *
 297 create_authdh_data(char *netname, int nlen, struct netbuf *syncaddr,
 298                 struct knetconfig *knconf) {
 299         struct sec_data *secdata;
 300         dh_k4_clntdata_t *data;
 301         char *pf, *p;
 302 
 303         if (syncaddr == NULL || syncaddr->buf == NULL || nlen == 0)
 304                 return (NULL);
 305 
 306         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
 307         secdata->flags = 0;
 308 
 309         data = kmem_alloc(sizeof (*data), KM_SLEEP);
 310 
 311         data->syncaddr.maxlen = syncaddr->maxlen;
 312         data->syncaddr.len = syncaddr->len;
 313         data->syncaddr.buf = (char *)kmem_alloc(syncaddr->len, KM_SLEEP);
 314         bcopy(syncaddr->buf, data->syncaddr.buf, syncaddr->len);
 315 
 316         /*
 317          * duplicate the knconf information for the
 318          * new opaque data.
 319          */
 320         data->knconf = kmem_alloc(sizeof (*knconf), KM_SLEEP);
 321         *data->knconf = *knconf;
 322         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 323         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 324         bcopy(knconf->knc_protofmly, pf, KNC_STRSIZE);
 325         bcopy(knconf->knc_proto, p, KNC_STRSIZE);
 326         data->knconf->knc_protofmly = pf;
 327         data->knconf->knc_proto = p;
 328 
 329         /* move server netname to the sec_data structure */
 330         data->netname = kmem_alloc(nlen, KM_SLEEP);
 331         bcopy(netname, data->netname, nlen);
 332         data->netnamelen = (int)nlen;
 333 
 334         secdata->secmod = AUTH_DH;
 335         secdata->rpcflavor = AUTH_DH;
 336         secdata->data = (caddr_t)data;
 337 
 338         return (secdata);
 339 }
 340 
 341 /*
 342  * Returns (deep) copy of sec_data_t. Allocates all memory required; caller
 343  * is responsible for freeing.
 344  */
 345 sec_data_t *
 346 copy_sec_data(sec_data_t *fsecdata) {
 347         sec_data_t *tsecdata;
 348 
 349         if (fsecdata == NULL)
 350                 return (NULL);
 351 
 352         if (fsecdata->rpcflavor == AUTH_DH) {
 353                 dh_k4_clntdata_t *fdata = (dh_k4_clntdata_t *)fsecdata->data;
 354 
 355                 if (fdata == NULL)
 356                         return (NULL);
 357 
 358                 tsecdata = (sec_data_t *)create_authdh_data(fdata->netname,
 359                     fdata->netnamelen, &fdata->syncaddr, fdata->knconf);
 360 
 361                 return (tsecdata);
 362         }
 363 
 364         tsecdata = kmem_zalloc(sizeof (sec_data_t), KM_SLEEP);
 365 
 366         tsecdata->secmod = fsecdata->secmod;
 367         tsecdata->rpcflavor = fsecdata->rpcflavor;
 368         tsecdata->flags = fsecdata->flags;
 369         tsecdata->uid = fsecdata->uid;
 370 
 371         if (fsecdata->rpcflavor == RPCSEC_GSS) {
 372                 gss_clntdata_t *gcd = (gss_clntdata_t *)fsecdata->data;
 373 
 374                 tsecdata->data = (caddr_t)copy_sec_data_gss(gcd);
 375         } else {
 376                 tsecdata->data = NULL;
 377         }
 378 
 379         return (tsecdata);
 380 }
 381 
 382 gss_clntdata_t *
 383 copy_sec_data_gss(gss_clntdata_t *fdata)
 384 {
 385         gss_clntdata_t *tdata;
 386 
 387         if (fdata == NULL)
 388                 return (NULL);
 389 
 390         tdata = kmem_zalloc(sizeof (gss_clntdata_t), KM_SLEEP);
 391 
 392         tdata->mechanism.length = fdata->mechanism.length;
 393         tdata->mechanism.elements = kmem_zalloc(fdata->mechanism.length,
 394             KM_SLEEP);
 395         bcopy(fdata->mechanism.elements, tdata->mechanism.elements,
 396             fdata->mechanism.length);
 397 
 398         tdata->service = fdata->service;
 399 
 400         (void) strcpy(tdata->uname, fdata->uname);
 401         (void) strcpy(tdata->inst, fdata->inst);
 402         (void) strcpy(tdata->realm, fdata->realm);
 403 
 404         tdata->qop = fdata->qop;
 405 
 406         return (tdata);
 407 }
 408 
 409 static int
 410 nfs4_chkdup_servinfo4(servinfo4_t *svp_head, servinfo4_t *svp)
 411 {
 412         servinfo4_t *si;
 413 
 414         /*
 415          * Iterate over the servinfo4 list to make sure
 416          * we do not have a duplicate. Skip any servinfo4
 417          * that has been marked "NOT IN USE"
 418          */
 419         for (si = svp_head; si; si = si->sv_next) {
 420                 (void) nfs_rw_enter_sig(&si->sv_lock, RW_READER, 0);
 421                 if (si->sv_flags & SV4_NOTINUSE) {
 422                         nfs_rw_exit(&si->sv_lock);
 423                         continue;
 424                 }
 425                 nfs_rw_exit(&si->sv_lock);
 426                 if (si == svp)
 427                         continue;
 428                 if (si->sv_addr.len == svp->sv_addr.len &&
 429                     strcmp(si->sv_knconf->knc_protofmly,
 430                     svp->sv_knconf->knc_protofmly) == 0 &&
 431                     bcmp(si->sv_addr.buf, svp->sv_addr.buf,
 432                     si->sv_addr.len) == 0) {
 433                         /* it's a duplicate */
 434                         return (1);
 435                 }
 436         }
 437         /* it's not a duplicate */
 438         return (0);
 439 }
 440 
 441 void
 442 nfs4_free_args(struct nfs_args *nargs)
 443 {
 444         if (nargs->knconf) {
 445                 if (nargs->knconf->knc_protofmly)
 446                         kmem_free(nargs->knconf->knc_protofmly,
 447                             KNC_STRSIZE);
 448                 if (nargs->knconf->knc_proto)
 449                         kmem_free(nargs->knconf->knc_proto, KNC_STRSIZE);
 450                 kmem_free(nargs->knconf, sizeof (*nargs->knconf));
 451                 nargs->knconf = NULL;
 452         }
 453 
 454         if (nargs->fh) {
 455                 kmem_free(nargs->fh, strlen(nargs->fh) + 1);
 456                 nargs->fh = NULL;
 457         }
 458 
 459         if (nargs->hostname) {
 460                 kmem_free(nargs->hostname, strlen(nargs->hostname) + 1);
 461                 nargs->hostname = NULL;
 462         }
 463 
 464         if (nargs->addr) {
 465                 if (nargs->addr->buf) {
 466                         ASSERT(nargs->addr->len);
 467                         kmem_free(nargs->addr->buf, nargs->addr->len);
 468                 }
 469                 kmem_free(nargs->addr, sizeof (struct netbuf));
 470                 nargs->addr = NULL;
 471         }
 472 
 473         if (nargs->syncaddr) {
 474                 ASSERT(nargs->syncaddr->len);
 475                 if (nargs->syncaddr->buf) {
 476                         ASSERT(nargs->syncaddr->len);
 477                         kmem_free(nargs->syncaddr->buf, nargs->syncaddr->len);
 478                 }
 479                 kmem_free(nargs->syncaddr, sizeof (struct netbuf));
 480                 nargs->syncaddr = NULL;
 481         }
 482 
 483         if (nargs->netname) {
 484                 kmem_free(nargs->netname, strlen(nargs->netname) + 1);
 485                 nargs->netname = NULL;
 486         }
 487 
 488         if (nargs->nfs_ext_u.nfs_extA.secdata) {
 489                 sec_clnt_freeinfo(
 490                     nargs->nfs_ext_u.nfs_extA.secdata);
 491                 nargs->nfs_ext_u.nfs_extA.secdata = NULL;
 492         }
 493 }
 494 
 495 
 496 int
 497 nfs4_copyin(char *data, int datalen, struct nfs_args *nargs)
 498 {
 499 
 500         int error;
 501         size_t hlen;                    /* length of hostname */
 502         size_t nlen;                    /* length of netname */
 503         char netname[MAXNETNAMELEN+1];  /* server's netname */
 504         struct netbuf addr;             /* server's address */
 505         struct netbuf syncaddr;         /* AUTH_DES time sync addr */
 506         struct knetconfig *knconf;              /* transport structure */
 507         struct sec_data *secdata = NULL;        /* security data */
 508         STRUCT_DECL(nfs_args, args);            /* nfs mount arguments */
 509         STRUCT_DECL(knetconfig, knconf_tmp);
 510         STRUCT_DECL(netbuf, addr_tmp);
 511         int flags;
 512         char *p, *pf;
 513         struct pathname pn;
 514         char *userbufptr;
 515 
 516 
 517         bzero(nargs, sizeof (*nargs));
 518 
 519         STRUCT_INIT(args, get_udatamodel());
 520         bzero(STRUCT_BUF(args), SIZEOF_STRUCT(nfs_args, DATAMODEL_NATIVE));
 521         if (copyin(data, STRUCT_BUF(args), MIN(datalen,
 522             STRUCT_SIZE(args))))
 523                 return (EFAULT);
 524 
 525         nargs->wsize = STRUCT_FGET(args, wsize);
 526         nargs->rsize = STRUCT_FGET(args, rsize);
 527         nargs->timeo = STRUCT_FGET(args, timeo);
 528         nargs->retrans = STRUCT_FGET(args, retrans);
 529         nargs->acregmin = STRUCT_FGET(args, acregmin);
 530         nargs->acregmax = STRUCT_FGET(args, acregmax);
 531         nargs->acdirmin = STRUCT_FGET(args, acdirmin);
 532         nargs->acdirmax = STRUCT_FGET(args, acdirmax);
 533 
 534         flags = STRUCT_FGET(args, flags);
 535         nargs->flags = flags;
 536 
 537         addr.buf = NULL;
 538         syncaddr.buf = NULL;
 539 
 540 
 541         /*
 542          * Allocate space for a knetconfig structure and
 543          * its strings and copy in from user-land.
 544          */
 545         knconf = kmem_zalloc(sizeof (*knconf), KM_SLEEP);
 546         STRUCT_INIT(knconf_tmp, get_udatamodel());
 547         if (copyin(STRUCT_FGETP(args, knconf), STRUCT_BUF(knconf_tmp),
 548             STRUCT_SIZE(knconf_tmp))) {
 549                 kmem_free(knconf, sizeof (*knconf));
 550                 return (EFAULT);
 551         }
 552 
 553         knconf->knc_semantics = STRUCT_FGET(knconf_tmp, knc_semantics);
 554         knconf->knc_protofmly = STRUCT_FGETP(knconf_tmp, knc_protofmly);
 555         knconf->knc_proto = STRUCT_FGETP(knconf_tmp, knc_proto);
 556         if (get_udatamodel() != DATAMODEL_LP64) {
 557                 knconf->knc_rdev = expldev(STRUCT_FGET(knconf_tmp, knc_rdev));
 558         } else {
 559                 knconf->knc_rdev = STRUCT_FGET(knconf_tmp, knc_rdev);
 560         }
 561 
 562         pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 563         p = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
 564         error = copyinstr(knconf->knc_protofmly, pf, KNC_STRSIZE, NULL);
 565         if (error) {
 566                 kmem_free(pf, KNC_STRSIZE);
 567                 kmem_free(p, KNC_STRSIZE);
 568                 kmem_free(knconf, sizeof (*knconf));
 569                 return (error);
 570         }
 571 
 572         error = copyinstr(knconf->knc_proto, p, KNC_STRSIZE, NULL);
 573         if (error) {
 574                 kmem_free(pf, KNC_STRSIZE);
 575                 kmem_free(p, KNC_STRSIZE);
 576                 kmem_free(knconf, sizeof (*knconf));
 577                 return (error);
 578         }
 579 
 580 
 581         knconf->knc_protofmly = pf;
 582         knconf->knc_proto = p;
 583 
 584         nargs->knconf = knconf;
 585 
 586         /*
 587          * Get server address
 588          */
 589         STRUCT_INIT(addr_tmp, get_udatamodel());
 590         if (copyin(STRUCT_FGETP(args, addr), STRUCT_BUF(addr_tmp),
 591             STRUCT_SIZE(addr_tmp))) {
 592                 error = EFAULT;
 593                 goto errout;
 594         }
 595 
 596         nargs->addr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 597         userbufptr = STRUCT_FGETP(addr_tmp, buf);
 598         addr.len = STRUCT_FGET(addr_tmp, len);
 599         addr.buf = kmem_alloc(addr.len, KM_SLEEP);
 600         addr.maxlen = addr.len;
 601         if (copyin(userbufptr, addr.buf, addr.len)) {
 602                 kmem_free(addr.buf, addr.len);
 603                 error = EFAULT;
 604                 goto errout;
 605         }
 606         bcopy(&addr, nargs->addr, sizeof (struct netbuf));
 607 
 608         /*
 609          * Get the root fhandle
 610          */
 611         error = pn_get(STRUCT_FGETP(args, fh), UIO_USERSPACE, &pn);
 612         if (error)
 613                 goto errout;
 614 
 615         /* Volatile fh: keep server paths, so use actual-size strings */
 616         nargs->fh = kmem_alloc(pn.pn_pathlen + 1, KM_SLEEP);
 617         bcopy(pn.pn_path, nargs->fh, pn.pn_pathlen);
 618         nargs->fh[pn.pn_pathlen] = '\0';
 619         pn_free(&pn);
 620 
 621 
 622         /*
 623          * Get server's hostname
 624          */
 625         if (flags & NFSMNT_HOSTNAME) {
 626                 error = copyinstr(STRUCT_FGETP(args, hostname),
 627                     netname, sizeof (netname), &hlen);
 628                 if (error)
 629                         goto errout;
 630                 nargs->hostname = kmem_zalloc(hlen, KM_SLEEP);
 631                 (void) strcpy(nargs->hostname, netname);
 632 
 633         } else {
 634                 nargs->hostname = NULL;
 635         }
 636 
 637 
 638         /*
 639          * If there are syncaddr and netname data, load them in. This is
 640          * to support data needed for NFSV4 when AUTH_DH is the negotiated
 641          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
 642          */
 643         netname[0] = '\0';
 644         if (flags & NFSMNT_SECURE) {
 645 
 646                 /* get syncaddr */
 647                 STRUCT_INIT(addr_tmp, get_udatamodel());
 648                 if (copyin(STRUCT_FGETP(args, syncaddr), STRUCT_BUF(addr_tmp),
 649                     STRUCT_SIZE(addr_tmp))) {
 650                         error = EINVAL;
 651                         goto errout;
 652                 }
 653                 userbufptr = STRUCT_FGETP(addr_tmp, buf);
 654                 syncaddr.len = STRUCT_FGET(addr_tmp, len);
 655                 syncaddr.buf = kmem_alloc(syncaddr.len, KM_SLEEP);
 656                 syncaddr.maxlen = syncaddr.len;
 657                 if (copyin(userbufptr, syncaddr.buf, syncaddr.len)) {
 658                         kmem_free(syncaddr.buf, syncaddr.len);
 659                         error = EFAULT;
 660                         goto errout;
 661                 }
 662 
 663                 nargs->syncaddr = kmem_alloc(sizeof (struct netbuf), KM_SLEEP);
 664                 bcopy(&syncaddr, nargs->syncaddr, sizeof (struct netbuf));
 665 
 666                 /* get server's netname */
 667                 if (copyinstr(STRUCT_FGETP(args, netname), netname,
 668                     sizeof (netname), &nlen)) {
 669                         error = EFAULT;
 670                         goto errout;
 671                 }
 672 
 673                 netname[nlen] = '\0';
 674                 nargs->netname = kmem_zalloc(nlen, KM_SLEEP);
 675                 (void) strcpy(nargs->netname, netname);
 676         }
 677 
 678         /*
 679          * Get the extention data which has the security data structure.
 680          * This includes data for AUTH_SYS as well.
 681          */
 682         if (flags & NFSMNT_NEWARGS) {
 683                 nargs->nfs_args_ext = STRUCT_FGET(args, nfs_args_ext);
 684                 if (nargs->nfs_args_ext == NFS_ARGS_EXTA ||
 685                     nargs->nfs_args_ext == NFS_ARGS_EXTB) {
 686                         /*
 687                          * Indicating the application is using the new
 688                          * sec_data structure to pass in the security
 689                          * data.
 690                          */
 691                         if (STRUCT_FGETP(args,
 692                             nfs_ext_u.nfs_extA.secdata) != NULL) {
 693                                 error = sec_clnt_loadinfo(
 694                                     (struct sec_data *)STRUCT_FGETP(args,
 695                                     nfs_ext_u.nfs_extA.secdata),
 696                                     &secdata, get_udatamodel());
 697                         }
 698                         nargs->nfs_ext_u.nfs_extA.secdata = secdata;
 699                 }
 700         }
 701 
 702         if (error)
 703                 goto errout;
 704 
 705         /*
 706          * Failover support:
 707          *
 708          * We may have a linked list of nfs_args structures,
 709          * which means the user is looking for failover.  If
 710          * the mount is either not "read-only" or "soft",
 711          * we want to bail out with EINVAL.
 712          */
 713         if (nargs->nfs_args_ext == NFS_ARGS_EXTB)
 714                 nargs->nfs_ext_u.nfs_extB.next =
 715                     STRUCT_FGETP(args, nfs_ext_u.nfs_extB.next);
 716 
 717 errout:
 718         if (error)
 719                 nfs4_free_args(nargs);
 720 
 721         return (error);
 722 }
 723 
 724 
 725 /*
 726  * nfs mount vfsop
 727  * Set up mount info record and attach it to vfs struct.
 728  */
 729 int
 730 nfs4_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 731 {
 732         char *data = uap->dataptr;
 733         int error;
 734         vnode_t *rtvp;                  /* the server's root */
 735         mntinfo4_t *mi;                 /* mount info, pointed at by vfs */
 736         struct knetconfig *rdma_knconf; /* rdma transport structure */
 737         rnode4_t *rp;
 738         struct servinfo4 *svp;          /* nfs server info */
 739         struct servinfo4 *svp_tail = NULL; /* previous nfs server info */
 740         struct servinfo4 *svp_head;     /* first nfs server info */
 741         struct servinfo4 *svp_2ndlast;  /* 2nd last in server info list */
 742         struct sec_data *secdata;       /* security data */
 743         struct nfs_args *args = NULL;
 744         int flags, addr_type, removed;
 745         zone_t *zone = nfs_zone();
 746         zone_t *mntzone = NULL;
 747 
 748         if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
 749                 return (EPERM);
 750         if (mvp->v_type != VDIR)
 751                 return (ENOTDIR);
 752 
 753         /*
 754          * get arguments
 755          *
 756          * nfs_args is now versioned and is extensible, so
 757          * uap->datalen might be different from sizeof (args)
 758          * in a compatible situation.
 759          */
 760 more:
 761         if (!(uap->flags & MS_SYSSPACE)) {
 762                 if (args == NULL)
 763                         args = kmem_zalloc(sizeof (struct nfs_args), KM_SLEEP);
 764                 else
 765                         nfs4_free_args(args);
 766                 error = nfs4_copyin(data, uap->datalen, args);
 767                 if (error) {
 768                         if (args) {
 769                                 kmem_free(args, sizeof (*args));
 770                         }
 771                         return (error);
 772                 }
 773         } else {
 774                 args = (struct nfs_args *)data;
 775         }
 776 
 777         flags = args->flags;
 778 
 779         /*
 780          * If the request changes the locking type, disallow the remount,
 781          * because it's questionable whether we can transfer the
 782          * locking state correctly.
 783          */
 784         if (uap->flags & MS_REMOUNT) {
 785                 if (!(uap->flags & MS_SYSSPACE)) {
 786                         nfs4_free_args(args);
 787                         kmem_free(args, sizeof (*args));
 788                 }
 789                 if ((mi = VFTOMI4(vfsp)) != NULL) {
 790                         uint_t new_mi_llock;
 791                         uint_t old_mi_llock;
 792                         new_mi_llock = (flags & NFSMNT_LLOCK) ? 1 : 0;
 793                         old_mi_llock = (mi->mi_flags & MI4_LLOCK) ? 1 : 0;
 794                         if (old_mi_llock != new_mi_llock)
 795                                 return (EBUSY);
 796                 }
 797                 return (0);
 798         }
 799 
 800         /*
 801          * For ephemeral mount trigger stub vnodes, we have two problems
 802          * to solve: racing threads will likely fail the v_count check, and
 803          * we want only one to proceed with the mount.
 804          *
 805          * For stubs, if the mount has already occurred (via a racing thread),
 806          * just return success. If not, skip the v_count check and proceed.
 807          * Note that we are already serialised at this point.
 808          */
 809         mutex_enter(&mvp->v_lock);
 810         if (vn_matchops(mvp, nfs4_trigger_vnodeops)) {
 811                 /* mntpt is a v4 stub vnode */
 812                 ASSERT(RP_ISSTUB(VTOR4(mvp)));
 813                 ASSERT(!(uap->flags & MS_OVERLAY));
 814                 ASSERT(!(mvp->v_flag & VROOT));
 815                 if (vn_mountedvfs(mvp) != NULL) {
 816                         /* ephemeral mount has already occurred */
 817                         ASSERT(uap->flags & MS_SYSSPACE);
 818                         mutex_exit(&mvp->v_lock);
 819                         return (0);
 820                 }
 821         } else {
 822                 /* mntpt is a non-v4 or v4 non-stub vnode */
 823                 if (!(uap->flags & MS_OVERLAY) &&
 824                     (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 825                         mutex_exit(&mvp->v_lock);
 826                         if (!(uap->flags & MS_SYSSPACE)) {
 827                                 nfs4_free_args(args);
 828                                 kmem_free(args, sizeof (*args));
 829                         }
 830                         return (EBUSY);
 831                 }
 832         }
 833         mutex_exit(&mvp->v_lock);
 834 
 835         /* make sure things are zeroed for errout: */
 836         rtvp = NULL;
 837         mi = NULL;
 838         secdata = NULL;
 839 
 840         /*
 841          * A valid knetconfig structure is required.
 842          */
 843         if (!(flags & NFSMNT_KNCONF) ||
 844             args->knconf == NULL || args->knconf->knc_protofmly == NULL ||
 845             args->knconf->knc_proto == NULL ||
 846             (strcmp(args->knconf->knc_proto, NC_UDP) == 0)) {
 847                 if (!(uap->flags & MS_SYSSPACE)) {
 848                         nfs4_free_args(args);
 849                         kmem_free(args, sizeof (*args));
 850                 }
 851                 return (EINVAL);
 852         }
 853 
 854         if ((strlen(args->knconf->knc_protofmly) >= KNC_STRSIZE) ||
 855             (strlen(args->knconf->knc_proto) >= KNC_STRSIZE)) {
 856                 if (!(uap->flags & MS_SYSSPACE)) {
 857                         nfs4_free_args(args);
 858                         kmem_free(args, sizeof (*args));
 859                 }
 860                 return (EINVAL);
 861         }
 862 
 863         /*
 864          * Allocate a servinfo4 struct.
 865          */
 866         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
 867         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
 868         if (svp_tail) {
 869                 svp_2ndlast = svp_tail;
 870                 svp_tail->sv_next = svp;
 871         } else {
 872                 svp_head = svp;
 873                 svp_2ndlast = svp;
 874         }
 875 
 876         svp_tail = svp;
 877         svp->sv_knconf = args->knconf;
 878         args->knconf = NULL;
 879 
 880         /*
 881          * Get server address
 882          */
 883         if (args->addr == NULL || args->addr->buf == NULL) {
 884                 error = EINVAL;
 885                 goto errout;
 886         }
 887 
 888         svp->sv_addr.maxlen = args->addr->maxlen;
 889         svp->sv_addr.len = args->addr->len;
 890         svp->sv_addr.buf = args->addr->buf;
 891         args->addr->buf = NULL;
 892 
 893         /*
 894          * Get the root fhandle
 895          */
 896         if (args->fh == NULL || (strlen(args->fh) >= MAXPATHLEN)) {
 897                 error = EINVAL;
 898                 goto errout;
 899         }
 900 
 901         svp->sv_path = args->fh;
 902         svp->sv_pathlen = strlen(args->fh) + 1;
 903         args->fh = NULL;
 904 
 905         /*
 906          * Get server's hostname
 907          */
 908         if (flags & NFSMNT_HOSTNAME) {
 909                 if (args->hostname == NULL || (strlen(args->hostname) >
 910                     MAXNETNAMELEN)) {
 911                         error = EINVAL;
 912                         goto errout;
 913                 }
 914                 svp->sv_hostnamelen = strlen(args->hostname) + 1;
 915                 svp->sv_hostname = args->hostname;
 916                 args->hostname = NULL;
 917         } else {
 918                 char *p = "unknown-host";
 919                 svp->sv_hostnamelen = strlen(p) + 1;
 920                 svp->sv_hostname = kmem_zalloc(svp->sv_hostnamelen, KM_SLEEP);
 921                 (void) strcpy(svp->sv_hostname, p);
 922         }
 923 
 924         /*
 925          * RDMA MOUNT SUPPORT FOR NFS v4.
 926          * Establish, is it possible to use RDMA, if so overload the
 927          * knconf with rdma specific knconf and free the original knconf.
 928          */
 929         if ((flags & NFSMNT_TRYRDMA) || (flags & NFSMNT_DORDMA)) {
 930                 /*
 931                  * Determine the addr type for RDMA, IPv4 or v6.
 932                  */
 933                 if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET) == 0)
 934                         addr_type = AF_INET;
 935                 else if (strcmp(svp->sv_knconf->knc_protofmly, NC_INET6) == 0)
 936                         addr_type = AF_INET6;
 937 
 938                 if (rdma_reachable(addr_type, &svp->sv_addr,
 939                     &rdma_knconf) == 0) {
 940                         /*
 941                          * If successful, hijack the original knconf and
 942                          * replace with the new one, depending on the flags.
 943                          */
 944                         svp->sv_origknconf = svp->sv_knconf;
 945                         svp->sv_knconf = rdma_knconf;
 946                 } else {
 947                         if (flags & NFSMNT_TRYRDMA) {
 948 #ifdef  DEBUG
 949                                 if (rdma_debug)
 950                                         zcmn_err(getzoneid(), CE_WARN,
 951                                             "no RDMA onboard, revert\n");
 952 #endif
 953                         }
 954 
 955                         if (flags & NFSMNT_DORDMA) {
 956                                 /*
 957                                  * If proto=rdma is specified and no RDMA
 958                                  * path to this server is available then
 959                                  * ditch this server.
 960                                  * This is not included in the mountable
 961                                  * server list or the replica list.
 962                                  * Check if more servers are specified;
 963                                  * Failover case, otherwise bail out of mount.
 964                                  */
 965                                 if (args->nfs_args_ext == NFS_ARGS_EXTB &&
 966                                     args->nfs_ext_u.nfs_extB.next != NULL) {
 967                                         data = (char *)
 968                                             args->nfs_ext_u.nfs_extB.next;
 969                                         if (uap->flags & MS_RDONLY &&
 970                                             !(flags & NFSMNT_SOFT)) {
 971                                                 if (svp_head->sv_next == NULL) {
 972                                                         svp_tail = NULL;
 973                                                         svp_2ndlast = NULL;
 974                                                         sv4_free(svp_head);
 975                                                         goto more;
 976                                                 } else {
 977                                                         svp_tail = svp_2ndlast;
 978                                                         svp_2ndlast->sv_next =
 979                                                             NULL;
 980                                                         sv4_free(svp);
 981                                                         goto more;
 982                                                 }
 983                                         }
 984                                 } else {
 985                                         /*
 986                                          * This is the last server specified
 987                                          * in the nfs_args list passed down
 988                                          * and its not rdma capable.
 989                                          */
 990                                         if (svp_head->sv_next == NULL) {
 991                                                 /*
 992                                                  * Is this the only one
 993                                                  */
 994                                                 error = EINVAL;
 995 #ifdef  DEBUG
 996                                                 if (rdma_debug)
 997                                                         zcmn_err(getzoneid(),
 998                                                             CE_WARN,
 999                                                             "No RDMA srv");
1000 #endif
1001                                                 goto errout;
1002                                         } else {
1003                                                 /*
1004                                                  * There is list, since some
1005                                                  * servers specified before
1006                                                  * this passed all requirements
1007                                                  */
1008                                                 svp_tail = svp_2ndlast;
1009                                                 svp_2ndlast->sv_next = NULL;
1010                                                 sv4_free(svp);
1011                                                 goto proceed;
1012                                         }
1013                                 }
1014                         }
1015                 }
1016         }
1017 
1018         /*
1019          * If there are syncaddr and netname data, load them in. This is
1020          * to support data needed for NFSV4 when AUTH_DH is the negotiated
1021          * flavor via SECINFO. (instead of using MOUNT protocol in V3).
1022          */
1023         if (args->flags & NFSMNT_SECURE) {
1024                 svp->sv_dhsec = create_authdh_data(args->netname,
1025                     strlen(args->netname),
1026                     args->syncaddr, svp->sv_knconf);
1027         }
1028 
1029         /*
1030          * Get the extension data which has the security data structure.
1031          * This includes data for AUTH_SYS as well.
1032          */
1033         if (flags & NFSMNT_NEWARGS) {
1034                 switch (args->nfs_args_ext) {
1035                 case NFS_ARGS_EXTA:
1036                 case NFS_ARGS_EXTB:
1037                         /*
1038                          * Indicating the application is using the new
1039                          * sec_data structure to pass in the security
1040                          * data.
1041                          */
1042                         secdata = args->nfs_ext_u.nfs_extA.secdata;
1043                         if (secdata == NULL) {
1044                                 error = EINVAL;
1045                         } else if (uap->flags & MS_SYSSPACE) {
1046                                 /*
1047                                  * Need to validate the flavor here if
1048                                  * sysspace, userspace was already
1049                                  * validated by the nfs_copyin function.
1050                                  */
1051                                 switch (secdata->rpcflavor) {
1052                                 case AUTH_NONE:
1053                                 case AUTH_UNIX:
1054                                 case AUTH_LOOPBACK:
1055                                 case AUTH_DES:
1056                                 case RPCSEC_GSS:
1057                                         break;
1058                                 default:
1059                                         error = EINVAL;
1060                                         goto errout;
1061                                 }
1062                         }
1063                         args->nfs_ext_u.nfs_extA.secdata = NULL;
1064                         break;
1065 
1066                 default:
1067                         error = EINVAL;
1068                         break;
1069                 }
1070 
1071         } else if (flags & NFSMNT_SECURE) {
1072                 /*
1073                  * NFSMNT_SECURE is deprecated but we keep it
1074                  * to support the rogue user-generated application
1075                  * that may use this undocumented interface to do
1076                  * AUTH_DH security, e.g. our own rexd.
1077                  *
1078                  * Also note that NFSMNT_SECURE is used for passing
1079                  * AUTH_DH info to be used in negotiation.
1080                  */
1081                 secdata = create_authdh_data(args->netname,
1082                     strlen(args->netname), args->syncaddr, svp->sv_knconf);
1083 
1084         } else {
1085                 secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
1086                 secdata->secmod = secdata->rpcflavor = AUTH_SYS;
1087                 secdata->data = NULL;
1088         }
1089 
1090         svp->sv_secdata = secdata;
1091 
1092         /*
1093          * User does not explicitly specify a flavor, and a user
1094          * defined default flavor is passed down.
1095          */
1096         if (flags & NFSMNT_SECDEFAULT) {
1097                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1098                 svp->sv_flags |= SV4_TRYSECDEFAULT;
1099                 nfs_rw_exit(&svp->sv_lock);
1100         }
1101 
1102         /*
1103          * Failover support:
1104          *
1105          * We may have a linked list of nfs_args structures,
1106          * which means the user is looking for failover.  If
1107          * the mount is either not "read-only" or "soft",
1108          * we want to bail out with EINVAL.
1109          */
1110         if (args->nfs_args_ext == NFS_ARGS_EXTB &&
1111             args->nfs_ext_u.nfs_extB.next != NULL) {
1112                 if (uap->flags & MS_RDONLY && !(flags & NFSMNT_SOFT)) {
1113                         data = (char *)args->nfs_ext_u.nfs_extB.next;
1114                         goto more;
1115                 }
1116                 error = EINVAL;
1117                 goto errout;
1118         }
1119 
1120         /*
1121          * Determine the zone we're being mounted into.
1122          */
1123         zone_hold(mntzone = zone);              /* start with this assumption */
1124         if (getzoneid() == GLOBAL_ZONEID) {
1125                 zone_rele(mntzone);
1126                 mntzone = zone_find_by_path(refstr_value(vfsp->vfs_mntpt));
1127                 ASSERT(mntzone != NULL);
1128                 if (mntzone != zone) {
1129                         error = EBUSY;
1130                         goto errout;
1131                 }
1132         }
1133 
1134         if (is_system_labeled()) {
1135                 error = nfs_mount_label_policy(vfsp, &svp->sv_addr,
1136                     svp->sv_knconf, cr);
1137 
1138                 if (error > 0)
1139                         goto errout;
1140 
1141                 if (error == -1) {
1142                         /* change mount to read-only to prevent write-down */
1143                         vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
1144                 }
1145         }
1146 
1147         /*
1148          * Stop the mount from going any further if the zone is going away.
1149          */
1150         if (zone_status_get(mntzone) >= ZONE_IS_SHUTTING_DOWN) {
1151                 error = EBUSY;
1152                 goto errout;
1153         }
1154 
1155         /*
1156          * Get root vnode.
1157          */
1158 proceed:
1159         error = nfs4rootvp(&rtvp, vfsp, svp_head, flags, cr, mntzone);
1160         if (error) {
1161                 /* if nfs4rootvp failed, it will free svp_head */
1162                 svp_head = NULL;
1163                 goto errout;
1164         }
1165 
1166         mi = VTOMI4(rtvp);
1167 
1168         /*
1169          * Set option fields in the mount info record
1170          */
1171 
1172         if (svp_head->sv_next) {
1173                 mutex_enter(&mi->mi_lock);
1174                 mi->mi_flags |= MI4_LLOCK;
1175                 mutex_exit(&mi->mi_lock);
1176         }
1177         error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, args);
1178         if (error)
1179                 goto errout;
1180 
1181         /*
1182          * Time to tie in the mirror mount info at last!
1183          */
1184         if (flags & NFSMNT_EPHEMERAL)
1185                 error = nfs4_record_ephemeral_mount(mi, mvp);
1186 
1187 errout:
1188         if (error) {
1189                 if (rtvp != NULL) {
1190                         rp = VTOR4(rtvp);
1191                         if (rp->r_flags & R4HASHED)
1192                                 rp4_rmhash(rp);
1193                 }
1194                 if (mi != NULL) {
1195                         nfs4_async_stop(vfsp);
1196                         nfs4_async_manager_stop(vfsp);
1197                         nfs4_remove_mi_from_server(mi, NULL);
1198                         if (rtvp != NULL)
1199                                 VN_RELE(rtvp);
1200                         if (mntzone != NULL)
1201                                 zone_rele(mntzone);
1202                         /* need to remove it from the zone */
1203                         removed = nfs4_mi_zonelist_remove(mi);
1204                         if (removed)
1205                                 zone_rele(mi->mi_zone);
1206                         MI4_RELE(mi);
1207                         if (!(uap->flags & MS_SYSSPACE) && args) {
1208                                 nfs4_free_args(args);
1209                                 kmem_free(args, sizeof (*args));
1210                         }
1211                         return (error);
1212                 }
1213                 if (svp_head)
1214                         sv4_free(svp_head);
1215         }
1216 
1217         if (!(uap->flags & MS_SYSSPACE) && args) {
1218                 nfs4_free_args(args);
1219                 kmem_free(args, sizeof (*args));
1220         }
1221         if (rtvp != NULL)
1222                 VN_RELE(rtvp);
1223 
1224         if (mntzone != NULL)
1225                 zone_rele(mntzone);
1226 
1227         return (error);
1228 }
1229 
/*
 * printf-style message templates used when a server reports a zero
 * read size, write size or maximum file size.  Each template takes one
 * "%s" argument (presumably the server's name -- confirm at the call
 * sites).  DEBUG builds use the "NFS4 server " prefix, all other builds
 * the plain "NFS server " prefix.
 */
#ifndef DEBUG
#define VERS_MSG        "NFS server "
#else
#define VERS_MSG        "NFS4 server "
#endif

#define READ_MSG        VERS_MSG "%s returned 0 for read transfer size"
#define WRITE_MSG       VERS_MSG "%s returned 0 for write transfer size"
#define SIZE_MSG        VERS_MSG "%s returned 0 for maximum file size"
1242 
1243 /*
1244  * Get the symbolic link text from the server for a given filehandle
1245  * of that symlink.
1246  *
1247  *      (get symlink text) PUTFH READLINK
1248  */
1249 static int
1250 getlinktext_otw(mntinfo4_t *mi, nfs_fh4 *fh, char **linktextp, cred_t *cr,
1251     int flags)
1252 {
1253         COMPOUND4args_clnt args;
1254         COMPOUND4res_clnt res;
1255         int doqueue;
1256         nfs_argop4 argop[2];
1257         nfs_resop4 *resop;
1258         READLINK4res *lr_res;
1259         uint_t len;
1260         bool_t needrecov = FALSE;
1261         nfs4_recov_state_t recov_state;
1262         nfs4_sharedfh_t *sfh;
1263         nfs4_error_t e;
1264         int num_retry = nfs4_max_mount_retry;
1265         int recovery = !(flags & NFS4_GETFH_NEEDSOP);
1266 
1267         sfh = sfh4_get(fh, mi);
1268         recov_state.rs_flags = 0;
1269         recov_state.rs_num_retry_despite_err = 0;
1270 
1271 recov_retry:
1272         nfs4_error_zinit(&e);
1273 
1274         args.array_len = 2;
1275         args.array = argop;
1276         args.ctag = TAG_GET_SYMLINK;
1277 
1278         if (! recovery) {
1279                 e.error = nfs4_start_op(mi, NULL, NULL, &recov_state);
1280                 if (e.error) {
1281                         sfh4_rele(&sfh);
1282                         return (e.error);
1283                 }
1284         }
1285 
1286         /* 0. putfh symlink fh */
1287         argop[0].argop = OP_CPUTFH;
1288         argop[0].nfs_argop4_u.opcputfh.sfh = sfh;
1289 
1290         /* 1. readlink */
1291         argop[1].argop = OP_READLINK;
1292 
1293         doqueue = 1;
1294 
1295         rfs4call(mi, NULL, &args, &res, cr, &doqueue, 0, &e);
1296 
1297         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
1298 
1299         if (needrecov && !recovery && num_retry-- > 0) {
1300 
1301                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1302                     "getlinktext_otw: initiating recovery\n"));
1303 
1304                 if (nfs4_start_recovery(&e, mi, NULL, NULL, NULL, NULL,
1305                     OP_READLINK, NULL) == FALSE) {
1306                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1307                         if (!e.error)
1308                                 (void) xdr_free(xdr_COMPOUND4res_clnt,
1309                                     (caddr_t)&res);
1310                         goto recov_retry;
1311                 }
1312         }
1313 
1314         /*
1315          * If non-NFS4 pcol error and/or we weren't able to recover.
1316          */
1317         if (e.error != 0) {
1318                 if (! recovery)
1319                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1320                 sfh4_rele(&sfh);
1321                 return (e.error);
1322         }
1323 
1324         if (res.status) {
1325                 e.error = geterrno4(res.status);
1326                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1327                 if (! recovery)
1328                         nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1329                 sfh4_rele(&sfh);
1330                 return (e.error);
1331         }
1332 
1333         /* res.status == NFS4_OK */
1334         ASSERT(res.status == NFS4_OK);
1335 
1336         resop = &res.array[1];  /* readlink res */
1337         lr_res = &resop->nfs_resop4_u.opreadlink;
1338 
1339         /* treat symlink name as data */
1340         *linktextp = utf8_to_str(&lr_res->link, &len, NULL);
1341 
1342         if (! recovery)
1343                 nfs4_end_op(mi, NULL, NULL, &recov_state, needrecov);
1344         sfh4_rele(&sfh);
1345         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1346         return (0);
1347 }
1348 
1349 /*
1350  * Skip over consecutive slashes and "/./" in a pathname.
1351  */
1352 void
1353 pathname_skipslashdot(struct pathname *pnp)
1354 {
1355         char *c1, *c2;
1356 
1357         while (pnp->pn_pathlen > 0 && *pnp->pn_path == '/') {
1358 
1359                 c1 = pnp->pn_path + 1;
1360                 c2 = pnp->pn_path + 2;
1361 
1362                 if (*c1 == '.' && (*c2 == '/' || *c2 == '\0')) {
1363                         pnp->pn_path = pnp->pn_path + 2; /* skip "/." */
1364                         pnp->pn_pathlen = pnp->pn_pathlen - 2;
1365                 } else {
1366                         pnp->pn_path++;
1367                         pnp->pn_pathlen--;
1368                 }
1369         }
1370 }
1371 
1372 /*
1373  * Resolve a symbolic link path. The symlink is in the nth component of
1374  * svp->sv_path and has an nfs4 file handle "fh".
1375  * Upon return, the sv_path will point to the new path that has the nth
1376  * component resolved to its symlink text.
1377  */
1378 int
1379 resolve_sympath(mntinfo4_t *mi, servinfo4_t *svp, int nth, nfs_fh4 *fh,
1380     cred_t *cr, int flags)
1381 {
1382         char *oldpath;
1383         char *symlink, *newpath;
1384         struct pathname oldpn, newpn;
1385         char component[MAXNAMELEN];
1386         int i, addlen, error = 0;
1387         int oldpathlen;
1388 
1389         /* Get the symbolic link text over the wire. */
1390         error = getlinktext_otw(mi, fh, &symlink, cr, flags);
1391 
1392         if (error || symlink == NULL || strlen(symlink) == 0)
1393                 return (error);
1394 
1395         /*
1396          * Compose the new pathname.
1397          * Note:
1398          *    - only the nth component is resolved for the pathname.
1399          *    - pathname.pn_pathlen does not count the ending null byte.
1400          */
1401         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1402         oldpath = svp->sv_path;
1403         oldpathlen = svp->sv_pathlen;
1404         if (error = pn_get(oldpath, UIO_SYSSPACE, &oldpn)) {
1405                 nfs_rw_exit(&svp->sv_lock);
1406                 kmem_free(symlink, strlen(symlink) + 1);
1407                 return (error);
1408         }
1409         nfs_rw_exit(&svp->sv_lock);
1410         pn_alloc(&newpn);
1411 
1412         /*
1413          * Skip over previous components from the oldpath so that the
1414          * oldpn.pn_path will point to the symlink component. Skip
1415          * leading slashes and "/./" (no OP_LOOKUP on ".") so that
1416          * pn_getcompnent can get the component.
1417          */
1418         for (i = 1; i < nth; i++) {
1419                 pathname_skipslashdot(&oldpn);
1420                 error = pn_getcomponent(&oldpn, component);
1421                 if (error)
1422                         goto out;
1423         }
1424 
1425         /*
1426          * Copy the old path upto the component right before the symlink
1427          * if the symlink is not an absolute path.
1428          */
1429         if (symlink[0] != '/') {
1430                 addlen = oldpn.pn_path - oldpn.pn_buf;
1431                 bcopy(oldpn.pn_buf, newpn.pn_path, addlen);
1432                 newpn.pn_pathlen += addlen;
1433                 newpn.pn_path += addlen;
1434                 newpn.pn_buf[newpn.pn_pathlen] = '/';
1435                 newpn.pn_pathlen++;
1436                 newpn.pn_path++;
1437         }
1438 
1439         /* copy the resolved symbolic link text */
1440         addlen = strlen(symlink);
1441         if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1442                 error = ENAMETOOLONG;
1443                 goto out;
1444         }
1445         bcopy(symlink, newpn.pn_path, addlen);
1446         newpn.pn_pathlen += addlen;
1447         newpn.pn_path += addlen;
1448 
1449         /*
1450          * Check if there is any remaining path after the symlink component.
1451          * First, skip the symlink component.
1452          */
1453         pathname_skipslashdot(&oldpn);
1454         if (error = pn_getcomponent(&oldpn, component))
1455                 goto out;
1456 
1457         addlen = pn_pathleft(&oldpn); /* includes counting the slash */
1458 
1459         /*
1460          * Copy the remaining path to the new pathname if there is any.
1461          */
1462         if (addlen > 0) {
1463                 if (newpn.pn_pathlen + addlen >= newpn.pn_bufsize) {
1464                         error = ENAMETOOLONG;
1465                         goto out;
1466                 }
1467                 bcopy(oldpn.pn_path, newpn.pn_path, addlen);
1468                 newpn.pn_pathlen += addlen;
1469         }
1470         newpn.pn_buf[newpn.pn_pathlen] = '\0';
1471 
1472         /* get the newpath and store it in the servinfo4_t */
1473         newpath = kmem_alloc(newpn.pn_pathlen + 1, KM_SLEEP);
1474         bcopy(newpn.pn_buf, newpath, newpn.pn_pathlen);
1475         newpath[newpn.pn_pathlen] = '\0';
1476 
1477         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1478         svp->sv_path = newpath;
1479         svp->sv_pathlen = strlen(newpath) + 1;
1480         nfs_rw_exit(&svp->sv_lock);
1481 
1482         kmem_free(oldpath, oldpathlen);
1483 out:
1484         kmem_free(symlink, strlen(symlink) + 1);
1485         pn_free(&newpn);
1486         pn_free(&oldpn);
1487 
1488         return (error);
1489 }
1490 
1491 /*
1492  * Checks for minorversion mismatch and if we can retry.
1493  * returns 1 with mi_minorversion downgraded if true
1494  * or 0 otherwise
1495  */
1496 
1497 int
1498 nfs4check_minorvers_mismatch(mntinfo4_t *mi, nfs4_error_t *ep)
1499 {
1500         struct nfs_stats *nfsstatsp;
1501 
1502         if (ep->stat == NFS4ERR_MINOR_VERS_MISMATCH ||
1503             ep->rpc_status == RPC_CANTDECODEARGS) {
1504                 mutex_enter(&mi->mi_lock);
1505                 if (NFS4_MINORVERSION(mi) > nfs4_min_minor_version) {
1506                         mi->mi_minorversion -= 1;
1507                         mi->mi_attrvers = mi->mi_minorversion;
1508                         nfsstatsp = zone_getspecific(nfsstat_zone_key,
1509                             nfs_zone());
1510                         ASSERT(nfsstatsp != NULL);
1511                         /*
1512                          * Update the mi fields to that of the correct
1513                          * minor version.  Note that we are not adjusting
1514                          * the kstat count for the previous MISMATCHED
1515                          * compound since we want the mismatched compound
1516                          * to be accounted against the mismatched
1517                          * version.
1518                          */
1519                         mi->mi_reqs = nfsstatsp->
1520                             nfs_stats_v4[mi->mi_minorversion].rfsreqcnt_ptr;
1521                         mi->mi_rfsnames = rfsnames_v4[mi->mi_minorversion];
1522                         mutex_exit(&mi->mi_lock);
1523                         return (1);
1524                 }
1525                 mutex_exit(&mi->mi_lock);
1526         }
1527         return (0);
1528 }
1529 
1530 void
1531 nfs4_set_minorversion(mntinfo4_t *mi, int minorversion)
1532 {
1533         struct nfs_stats *nfsstatsp;
1534 
1535         mutex_enter(&mi->mi_lock);
1536         mi->mi_minorversion = minorversion;
1537         mi->mi_attrvers = minorversion;
1538 
1539         nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
1540         ASSERT(nfsstatsp != NULL);
1541         mi->mi_reqs = nfsstatsp->
1542             nfs_stats_v4[mi->mi_minorversion].rfsreqcnt_ptr;
1543         mi->mi_rfsnames = rfsnames_v4[mi->mi_minorversion];
1544 
1545         mutex_exit(&mi->mi_lock);
1546 }
1547 
1548 /*
1549  * Get the root filehandle for the given filesystem and server, and update
1550  * svp (sv_fhandle, sv_pfhandle, sv_fsid, sv_supp_attrs, sv_supp_exclcreat).
1551  * On success *vtp holds the mounted object's vnode type; symlinks found in
1552  * sv_path are resolved with resolve_sympath() and the compound is retried.
1553  * If NFS4_GETFH_NEEDSOP is set, then use nfs4_start_fop and nfs4_end_fop
1554  * to coordinate with recovery.  Otherwise, the caller is assumed to be
1555  * the recovery thread or have already done a start_fop.
1556  * NFS4_GETFH_PUBLIC selects OP_PUTPUBFH.  Errors are returned through ep.
1557  */
1558 
1559 static void
1560 nfs4getfh_otw(struct mntinfo4 *mi, servinfo4_t *svp, vtype_t *vtp,
1561     int flags, cred_t *cr, nfs4_error_t *ep)
1562 {
1563         COMPOUND4args_clnt args;
1564         COMPOUND4res_clnt res;
1565         int doqueue = 1;
1566         nfs_argop4 *argop;
1567         nfs_resop4 *resop;
1568         nfs4_ga_res_t *garp;
1569         int num_argops;
1570         lookup4_param_t lookuparg;
1571         nfs_fh4 *tmpfhp;
1572         nfs_fh4 *resfhp;
1573         bool_t needrecov = FALSE;
1574         nfs4_recov_state_t recov_state;
1575         int llndx;              /* last lookup index from nfs4lookup_setup */
1576         int nthcomp;            /* component number for resolve_sympath */
1577         int recovery = !(flags & NFS4_GETFH_NEEDSOP);   /* skip start/end_fop */
1578         int versmismatch = 0;
1579 
1580         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1581         ASSERT(svp->sv_path != NULL);
1582         if (svp->sv_path[0] == '\0') {  /* empty server path: reject */
1583                 nfs_rw_exit(&svp->sv_lock);
1584                 nfs4_error_init(ep, EINVAL);
1585                 return;
1586         }
1587         nfs_rw_exit(&svp->sv_lock);
1588 
1589         do {
1590                 nfs4_set_clientid(mi, NULL, cr, recovery, ep);
1591 
1592                 if (ep->error == 0)
1593                         break;
1594                 /*
1595                  * Return if in recovery or if not a minorversion mismatch
1596                  * error. Else retry.
1597                  */
1598 
1599                 if (recovery ||
1600                     !(versmismatch = nfs4check_minorvers_mismatch(mi, ep)))
1601                         return;
1602 
1603         } while (versmismatch);
1604 
1605         recov_state.rs_flags = 0;
1606         recov_state.rs_num_retry_despite_err = 0;
1607 recov_retry:    /* re-entered after recovery or symlink resolution */
1608         nfs4_error_zinit(ep);
1609 
1610         if (!recovery) {
1611                 ep->error = nfs4_start_fop(mi, NULL, NULL, OH_MOUNT,
1612                     &recov_state, NULL);
1613 
1614                 /*
1615                  * If recovery has been started and this request was
1616                  * initiated by a mount, then we must wait for recovery
1617                  * to finish before proceeding, otherwise, the error
1618                  * cleanup would remove data structures needed by the
1619                  * recovery thread.
1620                  */
1621                 if (ep->error) {
1622                         mutex_enter(&mi->mi_lock);
1623                         if (mi->mi_flags & MI4_MOUNTING) {
1624                                 mi->mi_flags |= MI4_RECOV_FAIL;
1625                                 mi->mi_error = EIO;
1626 
1627                                 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1628                                     "nfs4getfh_otw: waiting 4 recovery\n"));
1629 
1630                                 while (mi->mi_flags & MI4_RECOV_ACTIV)
1631                                         cv_wait(&mi->mi_failover_cv,
1632                                             &mi->mi_lock);
1633                         }
1634                         mutex_exit(&mi->mi_lock);
1635                         return;
1636                 }
1637 
1638                 /*
1639                  * If the client does not specify a specific flavor to use
1640                  * and has not gotten a secinfo list from the server yet,
1641                  * retrieve the secinfo list from the server and use a
1642                  * flavor from the list to mount.
1643                  *
1644                  * If fail to get the secinfo list from the server, then
1645                  * try the default flavor.
1646                  */
1647                 if ((svp->sv_flags & SV4_TRYSECDEFAULT) &&
1648                     svp->sv_secinfo == NULL) {
1649                         (void) nfs4_secinfo_path(mi, cr, FALSE);
1650                 }
1651         }
1652 
1653         if (recovery)
1654                 args.ctag = TAG_REMAP_MOUNT;
1655         else
1656                 args.ctag = TAG_MOUNT;
1657 
1658         lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1659         lookuparg.argsp = &args;
1660         lookuparg.resp = &res;
1661         lookuparg.header_len = 2;       /* Putrootfh, getfh */
1662         lookuparg.trailer_len = 0;
1663         lookuparg.ga_bits = MI4_FSINFO_ATTRMAP(mi);
1664         lookuparg.mi = mi;
1665 
1666         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1667         ASSERT(svp->sv_path != NULL);
1668         llndx = nfs4lookup_setup(svp->sv_path, &lookuparg, 0);
1669         nfs_rw_exit(&svp->sv_lock);
1670 
1671         argop = args.array;     /* freed on every exit path below */
1672         num_argops = args.array_len;
1673 
1674         /* choose public or root filehandle */
1675         if (flags & NFS4_GETFH_PUBLIC)
1676                 argop[0].argop = OP_PUTPUBFH;
1677         else
1678                 argop[0].argop = OP_PUTROOTFH;
1679 
1680         /* get fh */
1681         argop[1].argop = OP_GETFH;
1682 
1683         NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1684             "nfs4getfh_otw: %s call, mi 0x%p",
1685             needrecov ? "recov" : "first", (void *)mi));
1686 
1687         rfs4call(mi, NULL, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
1688 
1689         needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
1690 
1691         if (needrecov) {
1692                 bool_t abort;
1693 
1694                 if (recovery) { /* no start_fop was done: just clean up */
1695                         nfs4args_lookup_free(argop, num_argops);
1696                         kmem_free(argop,
1697                             lookuparg.arglen * sizeof (nfs_argop4));
1698                         if (!ep->error)
1699                                 (void) xdr_free(xdr_COMPOUND4res_clnt,
1700                                     (caddr_t)&res);
1701                         return;
1702                 }
1703 
1704                 NFS4_DEBUG(nfs4_client_recov_debug,
1705                     (CE_NOTE, "nfs4getfh_otw: initiating recovery\n"));
1706 
1707                 abort = nfs4_start_recovery(ep, mi, NULL,
1708                     NULL, NULL, NULL, OP_GETFH, NULL);
1709                 if (!ep->error) {
1710                         ep->error = geterrno4(res.status);
1711                         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1712                 }
1713                 nfs4args_lookup_free(argop, num_argops);
1714                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1715                 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1716                 /* have another go? */
1717                 if (abort == FALSE)
1718                         goto recov_retry;
1719                 return;
1720         }
1721 
1722         /*
1723          * No recovery, but check if error is set.
1724          */
1725         if (ep->error)  {
1726                 nfs4args_lookup_free(argop, num_argops);
1727                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1728                 if (!recovery)
1729                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1730                             needrecov);
1731                 return;
1732         }
1733 
1734 is_link_err:    /* NOTE(review): no goto in this function targets this label */
1735 
1736         /* for non-recovery errors */
1737         if (res.status && res.status != NFS4ERR_SYMLINK) {
1738                 if (!recovery) {
1739                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1740                             needrecov);
1741                 }
1742                 nfs4args_lookup_free(argop, num_argops);
1743                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1744                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1745                 return;
1746         }
1747 
1748         /*
1749          * If any intermediate component in the path is a symbolic link,
1750          * resolve the symlink, then try mount again using the new path.
1751          */
1752         if (res.status == NFS4ERR_SYMLINK) {
1753                 int where;
1754 
1755                 /*
1756                  * This must be from OP_LOOKUP failure. The (cfh) for this
1757                  * OP_LOOKUP is a symlink node. Find out where the
1758                  * OP_GETFH is for the (cfh) that is a symlink node.
1759                  *
1760                  * Example:
1761                  * (mount) PUTROOTFH, GETFH, LOOKUP comp1, GETFH, GETATTR,
1762                  * LOOKUP comp2, GETFH, GETATTR, LOOKUP comp3, GETFH, GETATTR
1763                  *
1764                  * LOOKUP comp3 fails with SYMLINK because comp2 is a symlink.
1765                  * In this case, where = 7, nthcomp = 2.
1766                  */
1767                 where = res.array_len - 2;
1768                 ASSERT(where > 0);
1769 
1770                 resop = &res.array[where - 1];
1771                 ASSERT(resop->resop == OP_GETFH);
1772                 tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1773                 nthcomp = res.array_len/3 - 1;
1774 
1775                 /*
1776                  * Need to call nfs4_end_fop before resolve_sympath to avoid
1777                  * potential nfs4_start_op deadlock.
1778                  */
1779                 if (!recovery)
1780                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1781                             needrecov);
1782 
1783                 ep->error = resolve_sympath(mi, svp, nthcomp, tmpfhp, cr,
1784                     flags);
1785 
1786                 nfs4args_lookup_free(argop, num_argops);
1787                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1788                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1789 
1790                 if (ep->error)
1791                         return;
1792 
1793                 goto recov_retry;
1794         }
1795 
1796         /* getfh */
1797         resop = &res.array[res.array_len - 2];
1798         ASSERT(resop->resop == OP_GETFH);
1799         resfhp = &resop->nfs_resop4_u.opgetfh.object;
1800 
1801         /* getattr fsinfo res */
1802         resop++;
1803         garp = &resop->nfs_resop4_u.opgetattr.ga_res;
1804 
1805         /*
1806          * verify attrs successfully decoded before
1807          * referencing anything in n4g_ext_res.
1808          */
1809         if (garp->n4g_attrerr != NFS4_GETATTR_OP_OK) {
1810                 if (!recovery)
1811                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1812                             needrecov);
1813                 ep->error = garp->n4g_attrerr;
1814                 nfs4args_lookup_free(argop, num_argops);
1815                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1816                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1817                 return;
1818         }
1819 
1820         *vtp = garp->n4g_va.va_type;
1821 
1822         mi->mi_fh_expire_type = garp->n4g_ext_res->n4g_fet;
1823 
1824         mutex_enter(&mi->mi_lock);
1825         if (garp->n4g_ext_res->n4g_pc4.pc4_link_support)
1826                 mi->mi_flags |= MI4_LINK;
1827         if (garp->n4g_ext_res->n4g_pc4.pc4_symlink_support)
1828                 mi->mi_flags |= MI4_SYMLINK;
1829 
1830         /*
1831          * XXX Currently does not handle change in server personas
1832          */
1833         if (ATTR_ISSET(garp->n4g_ext_res->n4g_suppattrs, LAYOUT_TYPE) &&
1834             !(mi->mi_flags & MI4_PNFS)) {
1835                 DTRACE_PROBE4(nfsc__i_getfhotw, char *,
1836                     "non pNFS server:", char *, svp->sv_hostname,
1837                     char *, "supports FATTR4_FS_LAYOUTTYPE_MASK for ",
1838                     char *, svp->sv_path);
1839         }
1840 
1841         /* XXX conditionalize lines above */
1842         if (ATTR_ISSET(garp->n4g_ext_res->n4g_suppattrs, ACL))
1843                 mi->mi_flags |= MI4_ACL;
1844         mutex_exit(&mi->mi_lock);
1845 
1846         if (garp->n4g_ext_res->n4g_maxread == 0)        /* clamp read size */
1847                 mi->mi_tsize =
1848                     MIN(MAXBSIZE, mi->mi_tsize);
1849         else
1850                 mi->mi_tsize =
1851                     MIN(garp->n4g_ext_res->n4g_maxread,
1852                     mi->mi_tsize);
1853 
1854         if (garp->n4g_ext_res->n4g_maxwrite == 0)       /* clamp write size */
1855                 mi->mi_stsize =
1856                     MIN(MAXBSIZE, mi->mi_stsize);
1857         else
1858                 mi->mi_stsize =
1859                     MIN(garp->n4g_ext_res->n4g_maxwrite,
1860                     mi->mi_stsize);
1861 
1862         if (garp->n4g_ext_res->n4g_maxfilesize != 0)
1863                 mi->mi_maxfilesize =
1864                     MIN(garp->n4g_ext_res->n4g_maxfilesize,
1865                     mi->mi_maxfilesize);
1866 
1867         /*
1868          * If the final component is a symbolic link, resolve the symlink,
1869          * then try mount again using the new path.
1870          *
1871          * Assume no symbolic link for root filesystem "/".
1872          */
1873         if (*vtp == VLNK) {
1874                 /*
1875                  * nthcomp is the total result length minus
1876                  * the 1st 2 OPs (PUTROOTFH, GETFH),
1877                  * then divided by 3 (LOOKUP,GETFH,GETATTR)
1878                  *
1879                  * e.g. PUTROOTFH GETFH LOOKUP 1st-comp GETFH GETATTR
1880                  *      LOOKUP 2nd-comp GETFH GETATTR
1881                  *
1882                  *      (8 - 2)/3 = 2
1883                  */
1884                 nthcomp = (res.array_len - 2)/3;
1885 
1886                 /*
1887                  * Need to call nfs4_end_fop before resolve_sympath to avoid
1888                  * potential nfs4_start_op deadlock. See RFE 4777612.
1889                  */
1890                 if (!recovery)
1891                         nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state,
1892                             needrecov);
1893 
1894                 ep->error = resolve_sympath(mi, svp, nthcomp, resfhp, cr,
1895                     flags);
1896 
1897                 nfs4args_lookup_free(argop, num_argops);
1898                 kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1899                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1900 
1901                 if (ep->error)
1902                         return;
1903 
1904                 goto recov_retry;
1905         }
1906 
1907         /*
1908          * We need to figure out where in the compound the getfh
1909          * for the parent directory is. If the object to be mounted is
1910          * the root, then there is no lookup at all:
1911          * PUTROOTFH, GETFH.
1912          * If the object to be mounted is in the root, then the compound is:
1913          * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR.
1914          * In either of these cases, the index of the GETFH is 1.
1915          * If it is not at the root, then it's something like:
1916          * PUTROOTFH, GETFH, LOOKUP, GETFH, GETATTR,
1917          * LOOKUP, GETFH, GETATTR
1918          * In this case, the index is llndx (last lookup index) - 2.
1919          */
1920         if (llndx == -1 || llndx == 2)
1921                 resop = &res.array[1];
1922         else {
1923                 ASSERT(llndx > 2);
1924                 resop = &res.array[llndx-2];
1925         }
1926 
1927         ASSERT(resop->resop == OP_GETFH);
1928         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
1929 
1930         /* save the filehandles for the replica */
1931         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
1932         ASSERT(tmpfhp->nfs_fh4_len <= NFS4_FHSIZE);
1933         svp->sv_pfhandle.fh_len = tmpfhp->nfs_fh4_len;
1934         bcopy(tmpfhp->nfs_fh4_val, svp->sv_pfhandle.fh_buf,
1935             tmpfhp->nfs_fh4_len);
1936         ASSERT(resfhp->nfs_fh4_len <= NFS4_FHSIZE);
1937         svp->sv_fhandle.fh_len = resfhp->nfs_fh4_len;
1938         bcopy(resfhp->nfs_fh4_val, svp->sv_fhandle.fh_buf, resfhp->nfs_fh4_len);
1939 
1940         /* initialize fsid and supp_attrs for server fs */
1941         svp->sv_fsid = garp->n4g_fsid;
1942         svp->sv_supp_attrs = garp->n4g_ext_res->n4g_suppattrs;
1943         ATTRMAP_SET(svp->sv_supp_attrs, MI4_MAND_ATTRMAP(mi));
1944         svp->sv_supp_exclcreat = garp->n4g_ext_res->n4g_supp_exclcreat;
1945 
1946         nfs_rw_exit(&svp->sv_lock);
1947 
1948         nfs4args_lookup_free(argop, num_argops);
1949         kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
1950         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1951         if (!recovery)
1952                 nfs4_end_fop(mi, NULL, NULL, OH_MOUNT, &recov_state, needrecov);
1953 }
1954 
1955 static ushort_t nfs4_max_threads = 8;   /* max number of active async threads */
1956 static uint_t nfs4_bsize = 32 * 1024;   /* client `block' size */
1957 static uint_t nfs4_async_clusters = 1;  /* # of reqs from each async queue */
1958 static uint_t nfs4_cots_timeo = NFS_COTS_TIMEO; /* mi_timeo for COTS mounts */
1959 
1960 /*
1961  * Remap the root filehandle for the given filesystem: refetch it over the
1962  * wire from mi_curr_serv and install it in mi_rootfh.  NFS4_REMAP_NEEDSOP
1963  * in flags maps to NFS4_GETFH_NEEDSOP; results are returned via ep.
1964  */
1965 void
1966 nfs4_remap_root(mntinfo4_t *mi, nfs4_error_t *ep, int flags)
1967 {
1968         struct servinfo4 *svp;
1969         vtype_t vtype;
1970         nfs_fh4 rootfh;
1971         int getfh_flags;
1972         char *orig_sv_path;     /* private copy of sv_path, freed below */
1973         int orig_sv_pathlen, num_retry;
1974 
1975         mutex_enter(&mi->mi_lock);
1976 
1977 remap_retry:    /* always entered with mi_lock held */
1978         svp = mi->mi_curr_serv;
1979         getfh_flags =
1980             (flags & NFS4_REMAP_NEEDSOP) ? NFS4_GETFH_NEEDSOP : 0;
1981         getfh_flags |=
1982             (mi->mi_flags & MI4_PUBLIC) ? NFS4_GETFH_PUBLIC : 0;
1983         mutex_exit(&mi->mi_lock);
1984 
1985         /*
1986          * Just in case server path being mounted contains
1987          * symlinks and fails w/STALE, save the initial sv_path
1988          * so we can redrive the initial mount compound with the
1989          * initial sv_path -- not a symlink-expanded version.
1990          *
1991          * This could only happen if a symlink was expanded
1992          * and the expanded mount compound failed stale.  Because
1993          * it could be the case that the symlink was removed at
1994          * the server (and replaced with another symlink/dir),
1995          * we need to use the initial sv_path when attempting
1996          * to re-lookup everything and recover.
1997          */
1998         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1999         orig_sv_pathlen = svp->sv_pathlen;
2000         orig_sv_path = kmem_alloc(orig_sv_pathlen, KM_SLEEP);
2001         bcopy(svp->sv_path, orig_sv_path, orig_sv_pathlen);
2002         nfs_rw_exit(&svp->sv_lock);
2003 
2004         num_retry = nfs4_max_mount_retry;
2005 
2006         do {
2007                 /*
2008                  * Get the root fh from the server.  Retry nfs4_max_mount_retry
2009                  * (2) times if it fails with STALE since the recovery
2010                  * infrastructure doesn't do STALE recovery for components
2011                  * of the server path to the object being mounted.
2012                  */
2013                 nfs4getfh_otw(mi, svp, &vtype, getfh_flags, CRED(), ep);
2014 
2015                 if (ep->error == 0 && ep->stat == NFS4_OK)
2016                         break;
2017 
2018                 /*
2019                  * For some reason, the mount compound failed.  Before
2020                  * retrying, we need to restore the original sv_path
2021                  * because it might have contained symlinks that were
2022                  * expanded by nfs4getfh_otw before the failure occurred.
2023                  * sv_path may be freed and reallocated here, so sv_lock
2024                  * must be held as RW_WRITER rather than RW_READER.
2025                  */
2026                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2027                 if (orig_sv_pathlen != svp->sv_pathlen) {
2028                         kmem_free(svp->sv_path, svp->sv_pathlen);
2029                         svp->sv_path = kmem_alloc(orig_sv_pathlen, KM_SLEEP);
2030                         svp->sv_pathlen = orig_sv_pathlen;
2031                 }
2032                 bcopy(orig_sv_path, svp->sv_path, orig_sv_pathlen);
2033                 nfs_rw_exit(&svp->sv_lock);
2034 
2035         } while (num_retry-- > 0);
2036 
2037         kmem_free(orig_sv_path, orig_sv_pathlen);
2038 
2039         if (ep->error != 0 || ep->stat != 0) {  /* all attempts failed */
2040                 return;
2041         }
2042 
2043         if (vtype != VNON && vtype != mi->mi_type) {
2044                 /* shouldn't happen */
2045                 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2046                     "nfs4_remap_root: server root vnode type (%d) doesn't "
2047                     "match mount info (%d)", vtype, mi->mi_type);
2048         }
2049 
2050         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2051         rootfh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2052         rootfh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2053         nfs_rw_exit(&svp->sv_lock);
2054         sfh4_update(mi->mi_rootfh, &rootfh);
2055 
2056         /*
2057          * It's possible that recovery took place on the filesystem
2058          * and the server has been updated between the time we did
2059          * the nfs4getfh_otw and now. Re-drive the otw operation
2060          * to make sure we have a good fh.
2061          */
2062         mutex_enter(&mi->mi_lock);
2063         if (mi->mi_curr_serv != svp)
2064                 goto remap_retry;
2065 
2066         mutex_exit(&mi->mi_lock);
2067 }
2068 
2069 static int
2070 nfs4rootvp(vnode_t **rtvpp, vfs_t *vfsp, struct servinfo4 *svp_head,
2071     int flags, cred_t *cr, zone_t *zone)
2072 {
2073         vnode_t *rtvp = NULL;
2074         mntinfo4_t *mi;
2075         dev_t nfs_dev;
2076         int error = 0;
2077         rnode4_t *rp;
2078         int i;
2079         struct vattr va;
2080         vtype_t vtype = VNON;
2081         vtype_t tmp_vtype = VNON;
2082         struct servinfo4 *firstsvp = NULL, *svp = svp_head;
2083         nfs4_server_t *np;
2084         nfs4_oo_hash_bucket_t *bucketp;
2085         nfs_fh4 fh;
2086         char *droptext = "";
2087         nfs4_fname_t *mfname;
2088         nfs4_error_t e;
2089         char *orig_sv_path;
2090         int orig_sv_pathlen, num_retry, removed;
2091         cred_t *lcr = NULL, *tcr = cr;
2092         struct nfs_stats *nfsstatsp;
2093 
2094         nfsstatsp = zone_getspecific(nfsstat_zone_key, nfs_zone());
2095         ASSERT(nfsstatsp != NULL);
2096 
2097         ASSERT(nfs_zone() == zone);
2098         ASSERT(crgetref(cr));
2099 
2100         /*
2101          * Create a mount record and link it to the vfs struct.
2102          */
2103         mi = kmem_zalloc(sizeof (*mi), KM_SLEEP);
2104         mutex_init(&mi->mi_lock, NULL, MUTEX_DEFAULT, NULL);
2105         nfs_rw_init(&mi->mi_recovlock, NULL, RW_DEFAULT, NULL);
2106         nfs_rw_init(&mi->mi_rename_lock, NULL, RW_DEFAULT, NULL);
2107         nfs_rw_init(&mi->mi_fh_lock, NULL, RW_DEFAULT, NULL);
2108 
2109         if (!(flags & NFSMNT_SOFT))
2110                 mi->mi_flags |= MI4_HARD;
2111         if ((flags & NFSMNT_NOPRINT))
2112                 mi->mi_flags |= MI4_NOPRINT;
2113         if (flags & NFSMNT_INT)
2114                 mi->mi_flags |= MI4_INT;
2115         if (flags & NFSMNT_PUBLIC)
2116                 mi->mi_flags |= MI4_PUBLIC;
2117         if (flags & NFSMNT_MIRRORMOUNT)
2118                 mi->mi_flags |= MI4_MIRRORMOUNT;
2119         mi->mi_retrans = NFS_RETRIES;
2120         if (svp->sv_knconf->knc_semantics == NC_TPI_COTS_ORD ||
2121             svp->sv_knconf->knc_semantics == NC_TPI_COTS)
2122                 mi->mi_timeo = nfs4_cots_timeo;
2123         else
2124                 mi->mi_timeo = NFS_TIMEO;
2125         mi->mi_prog = NFS_PROGRAM;
2126         mi->mi_vers = NFS_V4;
2127         cv_init(&mi->mi_failover_cv, NULL, CV_DEFAULT, NULL);
2128         mi->mi_servers = svp;
2129         mi->mi_curr_serv = svp;
2130         mi->mi_acregmin = SEC2HR(ACREGMIN);
2131         mi->mi_acregmax = SEC2HR(ACREGMAX);
2132         mi->mi_acdirmin = SEC2HR(ACDIRMIN);
2133         mi->mi_acdirmax = SEC2HR(ACDIRMAX);
2134         mi->mi_fh_expire_type = FH4_PERSISTENT;
2135         mi->mi_clientid_next = NULL;
2136         mi->mi_clientid_prev = NULL;
2137         mi->mi_grace_wait = 0;
2138         mi->mi_error = 0;
2139         mi->mi_srvsettime = 0;
2140 
2141         mi->mi_count = 1;
2142 
2143         mi->mi_tsize = nfs4_tsize(svp->sv_knconf);
2144         mi->mi_stsize = mi->mi_tsize;
2145 
2146         if (flags & NFSMNT_DIRECTIO)
2147                 mi->mi_flags |= MI4_DIRECTIO;
2148 
2149         mi->mi_flags |= MI4_MOUNTING;
2150 
2151         /*
2152          * Until a time when the user can set minorversion, do auto
2153          * negotiation.
2154          */
2155         nfs4_set_minorversion(mi, nfs4_max_minor_version);
2156 
2157         /*
2158          * Make a vfs struct for nfs.  We do this here instead of below
2159          * because rtvp needs a vfs before we can do a getattr on it.
2160          *
2161          * Assign a unique device id to the mount
2162          */
2163         mutex_enter(&nfs_minor_lock);
2164         do {
2165                 nfs_minor = (nfs_minor + 1) & MAXMIN32;
2166                 nfs_dev = makedevice(nfs_major, nfs_minor);
2167         } while (vfs_devismounted(nfs_dev));
2168         mutex_exit(&nfs_minor_lock);
2169 
2170         vfsp->vfs_dev = nfs_dev;
2171         vfs_make_fsid(&vfsp->vfs_fsid, nfs_dev, nfs4fstyp);
2172         vfsp->vfs_data = (caddr_t)mi;
2173         vfsp->vfs_fstype = nfsfstyp;
2174         vfsp->vfs_bsize = nfs4_bsize;
2175 
2176         /*
2177          * Initialize fields used to support async putpage operations.
2178          */
2179         for (i = 0; i < NFS4_ASYNC_TYPES; i++)
2180                 mi->mi_async_clusters[i] = nfs4_async_clusters;
2181         mi->mi_async_init_clusters = nfs4_async_clusters;
2182         mi->mi_async_curr = &mi->mi_async_reqs[0];
2183         mi->mi_max_threads = nfs4_max_threads;
2184         mutex_init(&mi->mi_async_lock, NULL, MUTEX_DEFAULT, NULL);
2185         cv_init(&mi->mi_async_reqs_cv, NULL, CV_DEFAULT, NULL);
2186         cv_init(&mi->mi_async_work_cv, NULL, CV_DEFAULT, NULL);
2187         cv_init(&mi->mi_async_cv, NULL, CV_DEFAULT, NULL);
2188         cv_init(&mi->mi_inact_req_cv, NULL, CV_DEFAULT, NULL);
2189 
2190         mi->mi_vfsp = vfsp;
2191         zone_hold(mi->mi_zone = zone);
2192         nfs4_mi_zonelist_add(mi);
2193 
2194         /*
2195          * Initialize the <open owner/cred> hash table.
2196          */
2197         for (i = 0; i < NFS4_NUM_OO_BUCKETS; i++) {
2198                 bucketp = &(mi->mi_oo_list[i]);
2199                 mutex_init(&bucketp->b_lock, NULL, MUTEX_DEFAULT, NULL);
2200                 list_create(&bucketp->b_oo_hash_list,
2201                     sizeof (nfs4_open_owner_t),
2202                     offsetof(nfs4_open_owner_t, oo_hash_node));
2203         }
2204 
2205         /*
2206          * Initialize the freed open owner list.
2207          */
2208         mi->mi_foo_num = 0;
2209         mi->mi_foo_max = NFS4_NUM_FREED_OPEN_OWNERS;
2210         list_create(&mi->mi_foo_list, sizeof (nfs4_open_owner_t),
2211             offsetof(nfs4_open_owner_t, oo_foo_node));
2212 
2213         list_create(&mi->mi_lost_state, sizeof (nfs4_lost_rqst_t),
2214             offsetof(nfs4_lost_rqst_t, lr_node));
2215 
2216         list_create(&mi->mi_bseqid_list, sizeof (nfs4_bseqid_entry_t),
2217             offsetof(nfs4_bseqid_entry_t, bs_node));
2218 
2219         /*
2220          * Initialize the msg buffer.
2221          */
2222         list_create(&mi->mi_msg_list, sizeof (nfs4_debug_msg_t),
2223             offsetof(nfs4_debug_msg_t, msg_node));
2224         mi->mi_msg_count = 0;
2225         mutex_init(&mi->mi_msg_list_lock, NULL, MUTEX_DEFAULT, NULL);
2226 
2227         /*
2228          * Initialize kstats
2229          */
2230         nfs4_mnt_kstat_init(vfsp);
2231 
2232         /*
2233          * Initialize the shared filehandle pool.
2234          */
2235         sfh4_createtab(&mi->mi_filehandles);
2236 
2237         /*
2238          * Save server path we're attempting to mount.
2239          */
2240         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2241         orig_sv_pathlen = svp_head->sv_pathlen;
2242         orig_sv_path = kmem_alloc(svp_head->sv_pathlen, KM_SLEEP);
2243         bcopy(svp_head->sv_path, orig_sv_path, svp_head->sv_pathlen);
2244         nfs_rw_exit(&svp->sv_lock);
2245 
2246         /*
2247          * Make the GETFH call to get root fh for each replica.
2248          */
2249         if (svp_head->sv_next)
2250                 droptext = ", dropping replica";
2251 
2252         /*
2253          * If the uid is set then set the creds for secure mounts
2254          * by proxy processes such as automountd.
2255          */
2256         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2257         if (svp->sv_secdata->uid != 0 &&
2258             svp->sv_secdata->rpcflavor == RPCSEC_GSS) {
2259                 lcr = crdup(cr);
2260                 (void) crsetugid(lcr, svp->sv_secdata->uid, crgetgid(cr));
2261                 tcr = lcr;
2262         }
2263         nfs_rw_exit(&svp->sv_lock);
2264         for (svp = svp_head; svp; svp = svp->sv_next) {
2265                 if (nfs4_chkdup_servinfo4(svp_head, svp)) {
2266                         nfs_cmn_err(error, CE_WARN,
2267                             VERS_MSG "Host %s is a duplicate%s",
2268                             svp->sv_hostname, droptext);
2269                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2270                         svp->sv_flags |= SV4_NOTINUSE;
2271                         nfs_rw_exit(&svp->sv_lock);
2272                         continue;
2273                 }
2274                 mi->mi_curr_serv = svp;
2275 
2276                 /*
2277                  * Just in case server path being mounted contains
2278                  * symlinks and fails w/STALE, save the initial sv_path
2279                  * so we can redrive the initial mount compound with the
2280                  * initial sv_path -- not a symlink-expanded version.
2281                  *
2282                  * This could only happen if a symlink was expanded
2283                  * and the expanded mount compound failed stale.  Because
2284                  * it could be the case that the symlink was removed at
2285                  * the server (and replaced with another symlink/dir,
2286                  * we need to use the initial sv_path when attempting
2287                  * to re-lookup everything and recover.
2288                  *
2289                  * Other mount errors should evenutally be handled here also
2290                  * (NFS4ERR_DELAY, NFS4ERR_RESOURCE).  For now, all mount
2291                  * failures will result in mount being redriven a few times.
2292                  */
2293                 num_retry = nfs4_max_mount_retry;
2294                 do {
2295                         nfs4getfh_otw(mi, svp, &tmp_vtype,
2296                             ((flags & NFSMNT_PUBLIC) ? NFS4_GETFH_PUBLIC : 0) |
2297                             NFS4_GETFH_NEEDSOP, tcr, &e);
2298 
2299                         if (e.error == 0 && e.stat == NFS4_OK)
2300                                 break;
2301 
2302                         /*
2303                          * replace current sv_path with orig sv_path -- just in
2304                          * case it changed due to embedded symlinks.
2305                          */
2306                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2307                         if (orig_sv_pathlen != svp->sv_pathlen) {
2308                                 kmem_free(svp->sv_path, svp->sv_pathlen);
2309                                 svp->sv_path = kmem_alloc(orig_sv_pathlen,
2310                                     KM_SLEEP);
2311                                 svp->sv_pathlen = orig_sv_pathlen;
2312                         }
2313                         bcopy(orig_sv_path, svp->sv_path, orig_sv_pathlen);
2314                         nfs_rw_exit(&svp->sv_lock);
2315 
2316                 } while (num_retry-- > 0);
2317 
2318                 error = e.error ? e.error : geterrno4(e.stat);
2319                 if (error) {
2320                         nfs_cmn_err(error, CE_WARN,
2321                             VERS_MSG "initial call to %s failed%s: %m",
2322                             svp->sv_hostname, droptext);
2323                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2324                         svp->sv_flags |= SV4_NOTINUSE;
2325                         nfs_rw_exit(&svp->sv_lock);
2326                         mi->mi_flags &= ~MI4_RECOV_FAIL;
2327                         mi->mi_error = 0;
2328                         nfs4_remove_mi_from_server(mi, NULL);
2329                         continue;
2330                 }
2331 
2332                 if (tmp_vtype == VBAD) {
2333                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2334                             VERS_MSG "%s returned a bad file type for "
2335                             "root%s", svp->sv_hostname, droptext);
2336                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2337                         svp->sv_flags |= SV4_NOTINUSE;
2338                         nfs_rw_exit(&svp->sv_lock);
2339                         continue;
2340                 }
2341 
2342                 if (vtype == VNON) {
2343                         vtype = tmp_vtype;
2344                 } else if (vtype != tmp_vtype) {
2345                         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
2346                             VERS_MSG "%s returned a different file type "
2347                             "for root%s", svp->sv_hostname, droptext);
2348                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2349                         svp->sv_flags |= SV4_NOTINUSE;
2350                         nfs_rw_exit(&svp->sv_lock);
2351                         continue;
2352                 }
2353                 if (firstsvp == NULL)
2354                         firstsvp = svp;
2355         }
2356 
2357         kmem_free(orig_sv_path, orig_sv_pathlen);
2358 
2359         if (firstsvp == NULL) {
2360                 if (error == 0)
2361                         error = ENOENT;
2362                 goto bad;
2363         }
2364 
2365         mi->mi_curr_serv = svp = firstsvp;
2366 
2367         /*
2368          * Revert back the clientid to mi_curr_serv
2369          */
2370         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
2371         mutex_enter(&nfs4_server_lst_lock);
2372         np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
2373         mutex_exit(&nfs4_server_lst_lock);
2374         mi->mi_clientid = np->clientid;
2375         mutex_exit(&np->s_lock);
2376         nfs4_server_rele(np);
2377         nfs_rw_exit(&mi->mi_recovlock);
2378 
2379         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2380         ASSERT((mi->mi_curr_serv->sv_flags & SV4_NOTINUSE) == 0);
2381         fh.nfs_fh4_len = svp->sv_fhandle.fh_len;
2382         fh.nfs_fh4_val = svp->sv_fhandle.fh_buf;
2383         mi->mi_rootfh = sfh4_get(&fh, mi);
2384         fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
2385         fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
2386         mi->mi_srvparentfh = sfh4_get(&fh, mi);
2387         nfs_rw_exit(&svp->sv_lock);
2388 
2389         /*
2390          * Get the fname for filesystem root.
2391          */
2392         mi->mi_fname = fn_get(NULL, ".", mi->mi_rootfh);
2393         mfname = mi->mi_fname;
2394         fn_hold(mfname);
2395 
2396         /*
2397          * Make the root vnode without attributes.
2398          */
2399         rtvp = makenfs4node_by_fh(mi->mi_rootfh, NULL,
2400             &mfname, NULL, mi, cr, gethrtime());
2401         rtvp->v_type = vtype;
2402 
2403         mi->mi_curread = mi->mi_tsize;
2404         mi->mi_curwrite = mi->mi_stsize;
2405 
2406         /*
2407          * Start the manager thread responsible for handling async worker
2408          * threads.
2409          */
2410         MI4_HOLD(mi);
2411         VFS_HOLD(vfsp); /* add reference for thread */
2412         mi->mi_manager_thread = zthread_create(NULL, 0, nfs4_async_manager,
2413             vfsp, 0, minclsyspri);
2414         ASSERT(mi->mi_manager_thread != NULL);
2415 
2416         /*
2417          * Create the thread that handles over-the-wire calls for
2418          * VOP_INACTIVE.
2419          * This needs to happen after the manager thread is created.
2420          */
2421         MI4_HOLD(mi);
2422         mi->mi_inactive_thread = zthread_create(NULL, 0, nfs4_inactive_thread,
2423             mi, 0, minclsyspri);
2424         ASSERT(mi->mi_inactive_thread != NULL);
2425 
2426         /* If we didn't get a type, get one now */
2427         if (rtvp->v_type == VNON) {
2428                 va.va_mask = AT_TYPE;
2429                 error = nfs4getattr(rtvp, &va, tcr);
2430                 if (error)
2431                         goto bad;
2432                 rtvp->v_type = va.va_type;
2433         }
2434 
2435         mi->mi_type = rtvp->v_type;
2436 
2437         mutex_enter(&mi->mi_lock);
2438         mi->mi_flags &= ~MI4_MOUNTING;
2439         mutex_exit(&mi->mi_lock);
2440 
2441         *rtvpp = rtvp;
2442         if (lcr != NULL)
2443                 crfree(lcr);
2444 
2445         return (0);
2446 bad:
2447         /*
2448          * An error occurred somewhere, need to clean up...
2449          */
2450         if (lcr != NULL)
2451                 crfree(lcr);
2452 
2453         if (rtvp != NULL) {
2454                 /*
2455                  * We need to release our reference to the root vnode and
2456                  * destroy the mntinfo4 struct that we just created.
2457                  */
2458                 rp = VTOR4(rtvp);
2459                 if (rp->r_flags & R4HASHED)
2460                         rp4_rmhash(rp);
2461                 VN_RELE(rtvp);
2462         }
2463         nfs4_async_stop(vfsp);
2464         nfs4_async_manager_stop(vfsp);
2465         removed = nfs4_mi_zonelist_remove(mi);
2466         if (removed)
2467                 zone_rele(mi->mi_zone);
2468 
2469         /*
2470          * This releases the initial "hold" of the mi since it will never
2471          * be referenced by the vfsp.  Also, when mount returns to vfs.c
2472          * with an error, the vfsp will be destroyed, not rele'd.
2473          */
2474         MI4_RELE(mi);
2475 
2476         *rtvpp = NULL;
2477         return (error);
2478 }
2479 
2480 /*
2481  * vfs operations
2482  */
2483 static int
2484 nfs4_unmount(vfs_t *vfsp, int flag, cred_t *cr)
2485 {
2486         mntinfo4_t              *mi;
2487         ushort_t                omax;
2488         int                     removed;
2489 
2490         bool_t                  must_unlock;
2491         bool_t                  must_rele;
2492 
2493         nfs4_ephemeral_tree_t   *eph_tree;
2494 
2495         if (secpolicy_fs_unmount(cr, vfsp) != 0)
2496                 return (EPERM);
2497 
2498         mi = VFTOMI4(vfsp);
2499 
2500         if (flag & MS_FORCE) {
2501                 vfsp->vfs_flag |= VFS_UNMOUNTED;
2502                 if (nfs_zone() != mi->mi_zone) {
2503                         /*
2504                          * If the request is coming from the wrong zone,
2505                          * we don't want to create any new threads, and
2506                          * performance is not a concern.  Do everything
2507                          * inline.
2508                          */
2509                         NFS4_DEBUG(nfs4_client_zone_debug, (CE_NOTE,
2510                             "nfs4_unmount x-zone forced unmount of vfs %p\n",
2511                             (void *)vfsp));
2512                         nfs4_free_mount(vfsp, flag, cr);
2513                 } else {
2514                         /*
2515                          * Free data structures asynchronously, to avoid
2516                          * blocking the current thread (for performance
2517                          * reasons only).
2518                          */
2519                         async_free_mount(vfsp, flag, cr);
2520                 }
2521 
2522                 return (0);
2523         }
2524 
2525         /*
2526          * return all layouts before nfs4_async_stop_sig() is called
2527          */
2528         layoutreturn_all(vfsp, cr);
2529         /*
2530          * Wait until all asynchronous putpage operations on
2531          * this file system are complete before flushing rnodes
2532          * from the cache.
2533          */
2534         omax = mi->mi_max_threads;
2535         if (nfs4_async_stop_sig(vfsp))
2536                 return (EINTR);
2537 
2538         r4flush(vfsp, cr);
2539 
2540         /*
2541          * About the only reason that this would fail would be
2542          * that the harvester is already busy tearing down this
2543          * node. So we fail back to the caller and let them try
2544          * again when needed.
2545          */
2546         if (nfs4_ephemeral_umount(mi, flag, cr,
2547             &must_unlock, &must_rele, &eph_tree)) {
2548                 ASSERT(must_unlock == FALSE);
2549                 mutex_enter(&mi->mi_async_lock);
2550                 mi->mi_max_threads = omax;
2551                 mutex_exit(&mi->mi_async_lock);
2552 
2553                 return (EBUSY);
2554         }
2555 
2556         /*
2557          * If there are any active vnodes on this file system,
2558          * then the file system is busy and can't be unmounted.
2559          */
2560         if (check_rtable4(vfsp)) {
2561                 nfs4_ephemeral_umount_unlock(&must_unlock, &must_rele,
2562                     &eph_tree);
2563 
2564                 mutex_enter(&mi->mi_async_lock);
2565                 mi->mi_max_threads = omax;
2566                 mutex_exit(&mi->mi_async_lock);
2567 
2568                 return (EBUSY);
2569         }
2570 
2571         /*
2572          * The unmount can't fail from now on, so record any
2573          * ephemeral changes.
2574          */
2575         nfs4_ephemeral_umount_activate(mi, &must_unlock,
2576             &must_rele, &eph_tree);
2577 
2578         /*
2579          * There are no active files that could require over-the-wire
2580          * calls to the server, so stop the async manager and the
2581          * inactive thread.
2582          */
2583         nfs4_async_manager_stop(vfsp);
2584 
2585         /*
2586          * Destroy all rnodes belonging to this file system from the
2587          * rnode hash queues and purge any resources allocated to
2588          * them.
2589          */
2590         destroy_rtable4(vfsp, cr);
2591         vfsp->vfs_flag |= VFS_UNMOUNTED;
2592 
2593         nfs4_remove_mi_from_server(mi, NULL);
2594         removed = nfs4_mi_zonelist_remove(mi);
2595         if (removed)
2596                 zone_rele(mi->mi_zone);
2597 
2598         return (0);
2599 }
2600 
2601 /*
2602  * find root of nfs
2603  */
2604 static int
2605 nfs4_root(vfs_t *vfsp, vnode_t **vpp)
2606 {
2607         mntinfo4_t *mi;
2608         vnode_t *vp;
2609         nfs4_fname_t *mfname;
2610         servinfo4_t *svp;
2611 
2612         mi = VFTOMI4(vfsp);
2613 
2614         if (nfs_zone() != mi->mi_zone)
2615                 return (EPERM);
2616 
2617         svp = mi->mi_curr_serv;
2618         if (svp) {
2619                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2620                 if (svp->sv_flags & SV4_ROOT_STALE) {
2621                         nfs_rw_exit(&svp->sv_lock);
2622 
2623                         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
2624                         if (svp->sv_flags & SV4_ROOT_STALE) {
2625                                 svp->sv_flags &= ~SV4_ROOT_STALE;
2626                                 nfs_rw_exit(&svp->sv_lock);
2627                                 return (ENOENT);
2628                         }
2629                         nfs_rw_exit(&svp->sv_lock);
2630                 } else
2631                         nfs_rw_exit(&svp->sv_lock);
2632         }
2633 
2634         mfname = mi->mi_fname;
2635         fn_hold(mfname);
2636         vp = makenfs4node_by_fh(mi->mi_rootfh, NULL, &mfname, NULL,
2637             VFTOMI4(vfsp), CRED(), gethrtime());
2638 
2639         if (VTOR4(vp)->r_flags & R4STALE) {
2640                 VN_RELE(vp);
2641                 return (ENOENT);
2642         }
2643 
2644         ASSERT(vp->v_type == VNON || vp->v_type == mi->mi_type);
2645 
2646         vp->v_type = mi->mi_type;
2647 
2648         *vpp = vp;
2649 
2650         return (0);
2651 }
2652 
2653 static int
2654 nfs4_statfs_otw(vnode_t *vp, struct statvfs64 *sbp, cred_t *cr)
2655 {
2656         int error;
2657         nfs4_ga_res_t gar;
2658         nfs4_ga_ext_res_t ger;
2659 
2660         gar.n4g_ext_res = &ger;
2661 
2662         if (error = nfs4_attr_otw(vp, TAG_FSINFO, &gar,
2663             &MI4_STATFS_ATTRMAP(VTOMI4(vp)), cr))
2664                 return (error);
2665 
2666         *sbp = gar.n4g_ext_res->n4g_sb;
2667 
2668         return (0);
2669 }
2670 
2671 /*
2672  * Get file system statistics.
2673  */
2674 static int
2675 nfs4_statvfs(vfs_t *vfsp, struct statvfs64 *sbp)
2676 {
2677         int error;
2678         vnode_t *vp;
2679         cred_t *cr;
2680 
2681         error = nfs4_root(vfsp, &vp);
2682         if (error)
2683                 return (error);
2684 
2685         cr = CRED();
2686 
2687         error = nfs4_statfs_otw(vp, sbp, cr);
2688         if (!error) {
2689                 (void) strncpy(sbp->f_basetype,
2690                     vfssw[vfsp->vfs_fstype].vsw_name, FSTYPSZ);
2691                 sbp->f_flag = vf_to_stf(vfsp->vfs_flag);
2692         } else {
2693                 nfs4_purge_stale_fh(error, vp, cr);
2694         }
2695 
2696         VN_RELE(vp);
2697 
2698         return (error);
2699 }
2700 
/*
 * Taken with mutex_tryenter() around the r4flush() in nfs4_sync(), so a
 * sync that finds the lock already held skips the flush instead of
 * waiting.  Initialized in nfs4_vfsinit(), destroyed in nfs4_vfsfini().
 */
static kmutex_t nfs4_syncbusy;
2702 
2703 /*
2704  * Flush dirty nfs files for file system vfsp.
2705  * If vfsp == NULL, all nfs files are flushed.
2706  *
2707  * SYNC_CLOSE in flag is passed to us to
2708  * indicate that we are shutting down and or
2709  * rebooting.
2710  */
2711 static int
2712 nfs4_sync(vfs_t *vfsp, short flag, cred_t *cr)
2713 {
2714         /*
2715          * Cross-zone calls are OK here, since this translates to a
2716          * VOP_PUTPAGE(B_ASYNC), which gets picked up by the right zone.
2717          */
2718         if (!(flag & SYNC_ATTR) && mutex_tryenter(&nfs4_syncbusy) != 0) {
2719                 r4flush(vfsp, cr);
2720                 mutex_exit(&nfs4_syncbusy);
2721         }
2722 
2723         /*
2724          * if SYNC_CLOSE is set then we know that
2725          * the system is rebooting, mark the mntinfo
2726          * for later examination.
2727          */
2728         if (vfsp && (flag & SYNC_CLOSE)) {
2729                 mntinfo4_t *mi;
2730 
2731                 mi = VFTOMI4(vfsp);
2732                 if (!(mi->mi_flags & MI4_SHUTDOWN)) {
2733                         mutex_enter(&mi->mi_lock);
2734                         mi->mi_flags |= MI4_SHUTDOWN;
2735                         mutex_exit(&mi->mi_lock);
2736                 }
2737         }
2738         return (0);
2739 }
2740 
/*
 * vget is difficult, if not impossible, to support in v4 because we don't
 * know the parent directory or name, which makes it impossible to create a
 * useful shadow vnode.  And we need the shadow vnode for things like
 * OPEN.
 *
 * None of the arguments are examined; the call always fails with EREMOTE.
 *
 * XXX Check nfs4_vget_pseudo() for dependency.
 */

/* ARGSUSED */
static int
nfs4_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
{
        return (EREMOTE);
}
2757 
2758 /*
2759  * nfs4_mountroot get called in the case where we are diskless booting.  All
2760  * we need from here is the ability to get the server info and from there we
2761  * can simply call nfs4_rootvp.
2762  */
2763 /* ARGSUSED */
2764 static int
2765 nfs4_mountroot(vfs_t *vfsp, whymountroot_t why)
2766 {
2767         vnode_t *rtvp;
2768         char root_hostname[SYS_NMLN+1];
2769         struct servinfo4 *svp;
2770         int error;
2771         int vfsflags;
2772         size_t size;
2773         char *root_path;
2774         struct pathname pn;
2775         char *name;
2776         cred_t *cr;
2777         struct nfs_args args;           /* nfs mount arguments */
2778         static char token[10];
2779 
2780         bzero(&args, sizeof (args));
2781 
2782         /* do this BEFORE getfile which causes xid stamps to be initialized */
2783         clkset(-1L);            /* hack for now - until we get time svc? */
2784 
2785         if (why == ROOT_REMOUNT) {
2786                 /*
2787                  * Shouldn't happen.
2788                  */
2789                 panic("nfs4_mountroot: why == ROOT_REMOUNT");
2790         }
2791 
2792         if (why == ROOT_UNMOUNT) {
2793                 /*
2794                  * Nothing to do for NFS.
2795                  */
2796                 return (0);
2797         }
2798 
2799         /*
2800          * why == ROOT_INIT
2801          */
2802 
2803         name = token;
2804         *name = 0;
2805         (void) getfsname("root", name, sizeof (token));
2806 
2807         pn_alloc(&pn);
2808         root_path = pn.pn_path;
2809 
2810         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
2811         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
2812         svp->sv_knconf = kmem_zalloc(sizeof (*svp->sv_knconf), KM_SLEEP);
2813         svp->sv_knconf->knc_protofmly = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2814         svp->sv_knconf->knc_proto = kmem_alloc(KNC_STRSIZE, KM_SLEEP);
2815 
2816         /*
2817          * Get server address
2818          * Get the root path
2819          * Get server's transport
2820          * Get server's hostname
2821          * Get options
2822          */
2823         args.addr = &svp->sv_addr;
2824         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
2825         args.fh = (char *)&svp->sv_fhandle;
2826         args.knconf = svp->sv_knconf;
2827         args.hostname = root_hostname;
2828         vfsflags = 0;
2829         if (error = mount_root(*name ? name : "root", root_path, NFS_V4,
2830             &args, &vfsflags)) {
2831                 if (error == EPROTONOSUPPORT)
2832                         nfs_cmn_err(error, CE_WARN, "nfs4_mountroot: "
2833                             "mount_root failed: server doesn't support NFS V4");
2834                 else
2835                         nfs_cmn_err(error, CE_WARN,
2836                             "nfs4_mountroot: mount_root failed: %m");
2837                 nfs_rw_exit(&svp->sv_lock);
2838                 sv4_free(svp);
2839                 pn_free(&pn);
2840                 return (error);
2841         }
2842         nfs_rw_exit(&svp->sv_lock);
2843         svp->sv_hostnamelen = (int)(strlen(root_hostname) + 1);
2844         svp->sv_hostname = kmem_alloc(svp->sv_hostnamelen, KM_SLEEP);
2845         (void) strcpy(svp->sv_hostname, root_hostname);
2846 
2847         svp->sv_pathlen = (int)(strlen(root_path) + 1);
2848         svp->sv_path = kmem_alloc(svp->sv_pathlen, KM_SLEEP);
2849         (void) strcpy(svp->sv_path, root_path);
2850 
2851         /*
2852          * Force root partition to always be mounted with AUTH_UNIX for now
2853          */
2854         svp->sv_secdata = kmem_alloc(sizeof (*svp->sv_secdata), KM_SLEEP);
2855         svp->sv_secdata->secmod = AUTH_UNIX;
2856         svp->sv_secdata->rpcflavor = AUTH_UNIX;
2857         svp->sv_secdata->data = NULL;
2858 
2859         cr = crgetcred();
2860         rtvp = NULL;
2861 
2862         error = nfs4rootvp(&rtvp, vfsp, svp, args.flags, cr, global_zone);
2863 
2864         if (error) {
2865                 crfree(cr);
2866                 pn_free(&pn);
2867                 sv4_free(svp);
2868                 return (error);
2869         }
2870 
2871         crfree(cr);
2872 
2873         error = nfs4_setopts(rtvp, DATAMODEL_NATIVE, &args);
2874         if (error) {
2875                 nfs_cmn_err(error, CE_WARN,
2876                     "nfs4_mountroot: invalid root mount options");
2877                 pn_free(&pn);
2878                 goto errout;
2879         }
2880 
2881         (void) vfs_lock_wait(vfsp);
2882         vfs_add(NULL, vfsp, vfsflags);
2883         vfs_unlock(vfsp);
2884 
2885         size = strlen(svp->sv_hostname);
2886         (void) strcpy(rootfs.bo_name, svp->sv_hostname);
2887         rootfs.bo_name[size] = ':';
2888         (void) strcpy(&rootfs.bo_name[size + 1], root_path);
2889 
2890         pn_free(&pn);
2891 
2892 errout:
2893         if (error) {
2894                 sv4_free(svp);
2895                 nfs4_async_stop(vfsp);
2896                 nfs4_async_manager_stop(vfsp);
2897         }
2898 
2899         if (rtvp != NULL)
2900                 VN_RELE(rtvp);
2901 
2902         return (error);
2903 }
2904 
/*
 * Initialization routine for VFS routines.  Should only be called once.
 *
 * Sets up the nfs4_syncbusy mutex, then runs the SETCLIENTID, ephemeral
 * mount and session initializers in that order.  Always returns 0.
 */
int
nfs4_vfsinit(void)
{
        mutex_init(&nfs4_syncbusy, NULL, MUTEX_DEFAULT, NULL);
        nfs4setclientid_init();
        nfs4_ephemeral_init();
        nfs4session_init();
        return (0);
}
2917 
/*
 * Tear down what nfs4_vfsinit() set up, in reverse order: the ephemeral
 * mount state, the SETCLIENTID state, and finally the nfs4_syncbusy mutex.
 *
 * NOTE(review): nfs4_vfsinit() also calls nfs4session_init(), but no
 * matching session teardown is performed here -- confirm whether one is
 * needed.
 */
void
nfs4_vfsfini(void)
{
        nfs4_ephemeral_fini();
        nfs4setclientid_fini();
        mutex_destroy(&nfs4_syncbusy);
}
2925 
2926 void
2927 nfs4_freevfs(vfs_t *vfsp)
2928 {
2929         mntinfo4_t *mi;
2930 
2931         /* need to release the initial hold */
2932         mi = VFTOMI4(vfsp);
2933         MI4_RELE(mi);
2934 }
2935 
/*
 * Client side SETCLIENTID and SETCLIENTID_CONFIRM
 */

/*
 * Head of the list of nfs4_server structures.  Its first two members are
 * initialized to point back at the head itself, i.e. the list starts out
 * empty; new entries are linked in with insque().
 */
struct nfs4_server nfs4_server_lst =
        { &nfs4_server_lst, &nfs4_server_lst };

/*
 * Held around servinfo4_to_nfs4_server() lookups and insertions into
 * nfs4_server_lst.
 */
kmutex_t nfs4_server_lst_lock;
2943 
/*
 * Initialize the lock protecting the nfs4_server list.  Called from
 * nfs4_vfsinit().
 */
static void
nfs4setclientid_init(void)
{
        mutex_init(&nfs4_server_lst_lock, NULL, MUTEX_DEFAULT, NULL);
}
2949 
/*
 * Destroy the lock protecting the nfs4_server list.  Called from
 * nfs4_vfsfini().
 */
static void
nfs4setclientid_fini(void)
{
        mutex_destroy(&nfs4_server_lst_lock);
}
2955 
/*
 * Global, non-static copies of the NFS4_RETRY_SCLID_DELAY and
 * NFS4_NUM_SCLID_RETRIES defaults.
 * NOTE(review): presumably the delay between, and the number of,
 * SETCLIENTID retries -- the consumers and the delay's units are not
 * visible here; confirm.
 */
int nfs4_retry_sclid_delay = NFS4_RETRY_SCLID_DELAY;
int nfs4_num_sclid_retries = NFS4_NUM_SCLID_RETRIES;
2958 
2959 
2960 /*
2961  * np->s_lock held before entry and return
2962  */
2963 
2964 int
2965 nfs4bind_conn_to_session(nfs4_server_t *np, servinfo4_t *svp, mntinfo4_t *mi,
2966     cred_t *cr, channel_dir_from_client4 dir)
2967 {
2968         COMPOUND4args_clnt              args;
2969         COMPOUND4res_clnt               res;
2970         nfs_argop4                      argop[1];
2971         BIND_CONN_TO_SESSION4args       *argp;
2972         nfs4_error_t                    e;
2973         int                             doqueue = 1;
2974         int                             setcb;
2975         int                             needrecov = 0;
2976 
2977         res.argsp = &args;
2978 
2979         args.ctag = TAG_BIND_CONN_TO_SESSION;
2980         args.array = argop;
2981         args.array_len = 1;
2982 
2983         args.minor_vers = mi->mi_minorversion;
2984 
2985         argop[0].argop = OP_BIND_CONN_TO_SESSION;
2986         argp = &argop[0].nfs_argop4_u.opbind_conn_to_session;
2987         bcopy(&np->ssx.sessionid, &argp->bctsa_sessid,
2988             sizeof (np->ssx.sessionid));
2989 
2990         mutex_exit(&np->s_lock);
2991 
2992         argp->bctsa_dir = dir;
2993         argp->bctsa_use_conn_in_rdma_mode = FALSE;
2994 
2995         /*
2996          * Avoid callback server setup, if this is a non
2997          * bi-directional rpc connection that is for fore channel only.
2998          */
2999 
3000         if (dir == CDFC4_FORE)
3001                 setcb = 0;
3002         else
3003                 setcb = RFS4CALL_SETCB;
3004 
3005 
3006         rfs4call(mi, svp, &args, &res, cr, &doqueue, setcb, &e);
3007 
3008         /*
3009          * The errors we need to worry about involve a bad/dead
3010          * session. That is handled by the recovery action.
3011          */
3012 
3013         needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3014 
3015         if (e.error && !needrecov) {
3016                 mutex_enter(&np->s_lock);
3017                 return (e.error);
3018         }
3019 
3020         if (!e.error)
3021                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3022 
3023         if (needrecov) {
3024                 (void) nfs4_start_recovery(&e, mi, NULL,
3025                     NULL, NULL, NULL, OP_BIND_CONN_TO_SESSION, NULL);
3026         }
3027         mutex_enter(&np->s_lock);
3028         return (e.error);
3029 }
3030 
3031 static void
3032 nfs4_setup_pnfs_mi(nfs4_server_t *np, mntinfo4_t *mi, servinfo4_t *svp)
3033 {
3034 
3035         if (np->s_flags & N4S_USE_PNFS_MDS) {
3036                 if ((mi->mi_flags & MI4_PNFS) == 0) {
3037 
3038                         mi->mi_flags |= MI4_PNFS;
3039                         nfs4_pnfs_init_mi(mi);
3040 
3041                         /* XXX for now cmn_err is handy, will go away later */
3042                         cmn_err(CE_NOTE, "enabling pNFS on %s",
3043                             svp->sv_hostname);
3044 
3045                         DTRACE_PROBE2(nfsc__i_exchangeid, char *,
3046                             "enabling pNFS on ", char *, svp->sv_hostname);
3047                 }
3048         }
3049         /*
3050          * In the future, we'll need to consider the server turning off
3051          * the MDS bit.  This could happen after a server restart with
3052          * PNFS disabled (after having been previously enabled).  The client
3053          * might interpret this to be like LAYOUTRECALL_ALL.
3054          */
3055 
3056 }
3057 
3058 
3059 /*
3060  * Generic routine to set the clientid across
3061  * minor versions.
3062  */
3063 void
3064 nfs4_set_clientid(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr,
3065                     bool_t recovery, nfs4_error_t *n4ep)
3066 {
3067         struct nfs4_server      *np;
3068         nfs4_recov_state_t       recov_state;
3069         int                      num_retries = 0;
3070         bool_t                   retry;
3071         bool_t                   is_dataserver;
3072         cred_t                  *lcr = NULL;
3073         int                      retry_inuse = 1;       /* only retry once on */
3074                                                         /* NFS4ERR_CLID_INUSE */
3075         time_t                   lease_time = 0;
3076 
3077         /*
3078          * If svp is non-NULL, then we're setting the clientID on a pNFS
3079          * data server.  Otherwise, it's an MDS or non-pNFS server.
3080          */
3081         if (svp == NULL) {
3082                 svp = mi->mi_curr_serv;
3083                 is_dataserver = FALSE;
3084         } else {
3085                 is_dataserver = TRUE;
3086         }
3087 
3088         recov_state.rs_flags = 0;
3089         recov_state.rs_num_retry_despite_err = 0;
3090         ASSERT(n4ep != NULL);
3091 
3092 recov_retry:
3093         retry = FALSE;
3094         nfs4_error_zinit(n4ep);
3095         if (!recovery)
3096                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3097 
3098         mutex_enter(&nfs4_server_lst_lock);
3099         np = servinfo4_to_nfs4_server(svp); /* This locks np if it is found */
3100         mutex_exit(&nfs4_server_lst_lock);
3101 
3102         /* XXXrsb - Would we ever have np == NULL for the DS case? */
3103         if (!np) {
3104                 struct nfs4_server *tnp;
3105                 np = new_nfs4_server(svp, cr);
3106                 mutex_enter(&np->s_lock);
3107 
3108                 mutex_enter(&nfs4_server_lst_lock);
3109                 tnp = servinfo4_to_nfs4_server(svp);
3110                 if (tnp) {
3111                         /*
3112                          * another thread snuck in and put server on list.
3113                          * since we aren't adding it to the nfs4_server_list
3114                          * we need to set the ref count to 0 and destroy it.
3115                          */
3116                         np->s_refcnt = 0;
3117                         destroy_nfs4_server(np);
3118                         np = tnp;
3119                 } else {
3120                         /*
3121                          * do not give list a reference until everything
3122                          * succeeds
3123                          */
3124                         insque(np, &nfs4_server_lst);
3125                 }
3126                 mutex_exit(&nfs4_server_lst_lock);
3127         }
3128         ASSERT(MUTEX_HELD(&np->s_lock));
3129         /*
3130          * If we find the server already has N4S_CLIENTID_SET, then
3131          * just return, we've already done SETCLIENTID to that server
3132          */
3133         if (np->s_flags & N4S_CLIENTID_SET &&
3134             !(np->seqhb_flags & NFS4_SEQHB_EXIT)) {
3135                 /*
3136                  * XXXrsb - We need to be careful of the MDS/DS combo in
3137                  * this block.  That is, if a server is both an MDS and
3138                  * DS, we need to do the right thing.  (We should probably
3139                  * check the "use bits" on the nfs4_server_t, once we can
3140                  * trust them.)
3141                  */
3142                 if (is_dataserver == FALSE) {
3143                         /* add mi to np's mntinfo4_list */
3144                         nfs4_add_mi_to_server(np, mi);
3145                 }
3146                 if (!recovery) {
3147                         nfs4_set_minorversion(mi, np->s_minorversion);
3148                         /* See XXXrsb above */
3149                         if (is_dataserver == FALSE)
3150                                 nfs4_setup_pnfs_mi(np, mi, svp);
3151 
3152                         nfs_rw_exit(&mi->mi_recovlock);
3153                 }
3154                 mutex_exit(&np->s_lock);
3155                 nfs4_server_rele(np);
3156                 return;
3157         }
3158         mutex_exit(&np->s_lock);
3159 
3160         /*
3161          * Drop the mi_recovlock since nfs4_start_op will
3162          * acquire it again for us.
3163          *
3164          * XXXrsb - This gets called from the recovery framework (via
3165          * recov_clientid()) and from nfs4getfh_otw().  In the latter
3166          * case, this is done from an MDS/non-pNFS server and *not*
3167          * a data server.  Given that, we can use the "classic" start_op
3168          * and end_op interfaces.
3169          */
3170         if (!recovery) {
3171                 nfs_rw_exit(&mi->mi_recovlock);
3172                 n4ep->error = nfs4_start_op(mi, NULL, NULL, &recov_state);
3173                 if (n4ep->error) {
3174                         nfs4_server_rele(np);
3175                         return;
3176                 }
3177         }
3178 
3179         mutex_enter(&np->s_lock);
3180         while ((np->s_flags & N4S_CLIENTID_PEND) ||
3181             (np->seqhb_flags & NFS4_SEQHB_EXIT)) {
3182                 if (!cv_wait_sig(&np->s_clientid_pend, &np->s_lock)) {
3183                         mutex_exit(&np->s_lock);
3184                         nfs4_server_rele(np);
3185                         /* XXXrsb - See comment above about start_op/end_op */
3186                         if (!recovery)
3187                                 nfs4_end_op(mi, NULL, NULL, &recov_state,
3188                                     recovery);
3189                         n4ep->error = EINTR;
3190                         return;
3191                 }
3192         }
3193 
3194         if (np->s_flags & N4S_CLIENTID_SET &&
3195             !(np->seqhb_flags & NFS4_SEQHB_EXIT)) {
3196                 /* XXX copied/pasted from above */
3197                 /* add mi to np's mntinfo4_list */
3198                 if (is_dataserver == FALSE)
3199                         nfs4_add_mi_to_server(np, mi);
3200                 if (!recovery) {
3201                         nfs4_set_minorversion(mi, np->s_minorversion);
3202                         if (is_dataserver == FALSE)
3203                                 nfs4_setup_pnfs_mi(np, mi, svp);
3204                 }
3205                 mutex_exit(&np->s_lock);
3206                 nfs4_server_rele(np);
3207                 if (!recovery)
3208                         nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3209                 return;
3210         }
3211 
3212         /*
3213          * Reset the N4S_CB_PINGED flag. This is used to
3214          * indicate if we have received a CB_NULL from the
3215          * server. Also we reset the waiter flag.
3216          */
3217         np->s_flags &= ~(N4S_CB_PINGED | N4S_CB_WAITER);
3218         /* any failure must now clear this flag */
3219         np->s_flags |= N4S_CLIENTID_PEND;
3220         mutex_exit(&np->s_lock);
3221 
3222         NFS4_SET_CLIENTID(mi, svp, cr, np, n4ep, &retry_inuse);
3223 
3224         if (n4ep->error == EACCES) {
3225                 /*
3226                  * If the uid is set then set the creds for secure mounts
3227                  * by proxy processes such as automountd.
3228                  */
3229                 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3230                 if (svp->sv_secdata->uid != 0) {
3231                         lcr = crdup(cr);
3232                         (void) crsetugid(lcr, svp->sv_secdata->uid,
3233                             crgetgid(cr));
3234                 }
3235                 nfs_rw_exit(&svp->sv_lock);
3236 
3237                 if (lcr != NULL) {
3238                         mutex_enter(&np->s_lock);
3239                         crfree(np->s_cred);
3240                         np->s_cred = lcr;
3241                         mutex_exit(&np->s_lock);
3242                         NFS4_SET_CLIENTID(mi, svp, lcr, np, n4ep, &retry_inuse);
3243                 }
3244         }
3245         mutex_enter(&np->s_lock);
3246         lease_time = np->s_lease_time;
3247         np->s_flags &= ~N4S_CLIENTID_PEND;
3248         mutex_exit(&np->s_lock);
3249 
3250         if (n4ep->error != 0 || n4ep->stat != NFS4_OK) {
3251                 /*
3252                  * Start recovery if failover is a possibility.  If
3253                  * invoked by the recovery thread itself, then just
3254                  * return and let it handle the failover first.  NB:
3255                  * RECOVERY IS NOT ALLOWED IF THE MOUNT IS IN PRogress
3256                  * since the infrastructure is not sufficiently setup
3257                  * to allow it.  Just return the error (after suitable
3258                  * retries).
3259                  */
3260                 if (FAILOVER_MOUNT4(mi) && nfs4_try_failover(n4ep)) {
3261                         (void) nfs4_start_recovery(n4ep, mi, NULL,
3262                             NULL, NULL, NULL, OP_SETCLIENTID, NULL);
3263                         /*
3264                          * Don't retry here, just return and let
3265                          * recovery take over.
3266                          */
3267                         if (recovery)
3268                                 retry = FALSE;
3269                 } else if (nfs4_rpc_retry_error(n4ep->error) ||
3270                     n4ep->stat == NFS4ERR_RESOURCE ||
3271                     n4ep->stat == NFS4ERR_STALE_CLIENTID) {
3272 
3273                         retry = TRUE;
3274                         /*
3275                          * Always retry if in recovery or once had
3276                          * contact with the server (but now it's
3277                          * overloaded).
3278                          */
3279                         if (recovery == TRUE ||
3280                             n4ep->error == ETIMEDOUT ||
3281                             n4ep->error == ECONNRESET)
3282                                 num_retries = 0;
3283                 } else if (retry_inuse && n4ep->error == 0 &&
3284                     n4ep->stat == NFS4ERR_CLID_INUSE) {
3285                         retry = TRUE;
3286                         num_retries = 0;
3287                 }
3288         } else {
3289                 /*
3290                  * Since everything succeeded give the list a reference count if
3291                  * it hasn't been given one by add_new_nfs4_server() or if this
3292                  * is not a recovery situation in which case it is already on
3293                  * the list.
3294                  */
3295                 mutex_enter(&np->s_lock);
3296                 if ((np->s_flags & N4S_INSERTED) == 0) {
3297                         np->s_refcnt++;
3298                         np->s_flags |= N4S_INSERTED;
3299                 }
3300 
3301                 if (is_dataserver == FALSE && !recovery)
3302                         nfs4_setup_pnfs_mi(np, mi, svp);
3303 
3304                 /*
3305                  * In recovery or not, a new nfs4_server needs
3306                  * to have the minorversion set.
3307                  */
3308                 np->s_minorversion = mi->mi_minorversion;
3309                 mutex_exit(&np->s_lock);
3310         }
3311 
3312         if (!recovery)
3313                 nfs4_end_op(mi, NULL, NULL, &recov_state, recovery);
3314 
3315 
3316         if (retry && num_retries++ < nfs4_num_sclid_retries) {
3317                 if (retry_inuse) {
3318                         delay(SEC_TO_TICK(lease_time + nfs4_retry_sclid_delay));
3319                         retry_inuse = 0;
3320                 } else
3321                         delay(SEC_TO_TICK(nfs4_retry_sclid_delay));
3322 
3323                 nfs4_server_rele(np);
3324                 goto recov_retry;
3325         }
3326 
3327 
3328         if (n4ep->error == 0)
3329                 n4ep->error = geterrno4(n4ep->stat);
3330 
3331         /* broadcast before release in case no other threads are waiting */
3332         cv_broadcast(&np->s_clientid_pend);
3333         nfs4_server_rele(np);
3334 }
3335 
3336 /*
3337  * Add mi to sp's mntinfo4_list if it isn't already in the list.  Makes
3338  * mi's clientid the same as sp's.
3339  * Assumes sp is locked down.
3340  */
3341 void
3342 nfs4_add_mi_to_server(nfs4_server_t *sp, mntinfo4_t *mi)
3343 {
3344         mntinfo4_t *tmi;
3345         int in_list = 0;
3346 
3347         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3348             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3349         ASSERT(sp != &nfs4_server_lst);
3350         ASSERT(MUTEX_HELD(&sp->s_lock));
3351 
3352         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3353             "nfs4_add_mi_to_server: add mi %p to sp %p",
3354             (void*)mi, (void*)sp));
3355 
3356         for (tmi = sp->mntinfo4_list;
3357             tmi != NULL;
3358             tmi = tmi->mi_clientid_next) {
3359                 if (tmi == mi) {
3360                         NFS4_DEBUG(nfs4_client_lease_debug,
3361                             (CE_NOTE,
3362                             "nfs4_add_mi_to_server: mi in list"));
3363                         in_list = 1;
3364                 }
3365         }
3366 
3367         /*
3368          * First put a hold on the mntinfo4's vfsp so that references via
3369          * mntinfo4_list will be valid.
3370          */
3371         if (!in_list)
3372                 VFS_HOLD(mi->mi_vfsp);
3373 
3374         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE, "nfs4_add_mi_to_server: "
3375             "hold vfs %p for mi: %p", (void*)mi->mi_vfsp, (void*)mi));
3376 
3377         if (!in_list) {
3378                 if (sp->mntinfo4_list)
3379                         sp->mntinfo4_list->mi_clientid_prev = mi;
3380                 mi->mi_clientid_next = sp->mntinfo4_list;
3381                 sp->mntinfo4_list = mi;
3382                 mi->mi_srvsettime = gethrestime_sec();
3383         }
3384 
3385         /* set mi's clientid to that of sp's for later matching */
3386         mi->mi_clientid = sp->clientid;
3387 
3388         /*
3389          * Update the clientid for any other mi's belonging to sp.  This
3390          * must be done here while we hold sp->s_lock, so that
3391          * find_nfs4_server() continues to work.
3392          */
3393 
3394         for (tmi = sp->mntinfo4_list;
3395             tmi != NULL;
3396             tmi = tmi->mi_clientid_next) {
3397                 if (tmi != mi) {
3398                         tmi->mi_clientid = sp->clientid;
3399                 }
3400         }
3401 }
3402 
3403 /*
3404  * Remove the mi from sp's mntinfo4_list and release its reference.
3405  * Exception: if mi still has open files, flag it for later removal (when
3406  * all the files are closed).
3407  *
3408  * If this is the last mntinfo4 in sp's list then tell the lease renewal
3409  * thread to exit.
3410  */
3411 static void
3412 nfs4_remove_mi_from_server_nolock(mntinfo4_t *mi, nfs4_server_t *sp)
3413 {
3414         NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3415             "nfs4_remove_mi_from_server_nolock: remove mi %p from sp %p",
3416             (void*)mi, (void*)sp));
3417 
3418         ASSERT(sp != NULL);
3419         ASSERT(MUTEX_HELD(&sp->s_lock));
3420         ASSERT(mi->mi_open_files >= 0);
3421 
3422         /*
3423          * First make sure this mntinfo4 can be taken off of the list,
3424          * ie: it doesn't have any open files remaining.
3425          */
3426         if (mi->mi_open_files > 0) {
3427                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3428                     "nfs4_remove_mi_from_server_nolock: don't "
3429                     "remove mi since it still has files open"));
3430 
3431                 mutex_enter(&mi->mi_lock);
3432                 mi->mi_flags |= MI4_REMOVE_ON_LAST_CLOSE;
3433                 mutex_exit(&mi->mi_lock);
3434                 return;
3435         }
3436 
3437         VFS_HOLD(mi->mi_vfsp);
3438         remove_mi(sp, mi);
3439         VFS_RELE(mi->mi_vfsp);
3440 
3441         if (sp->mntinfo4_list == NULL) {
3442                 /* last fs unmounted, kill the thread */
3443                 NFS4_DEBUG(nfs4_client_lease_debug, (CE_NOTE,
3444                     "remove_mi_from_nfs4_server_nolock: kill the thread"));
3445                 nfs4_mark_srv_dead(sp, 0);
3446         }
3447 }
3448 
3449 /*
3450  * Remove mi from sp's mntinfo4_list and release the vfs reference.
3451  */
3452 static void
3453 remove_mi(nfs4_server_t *sp, mntinfo4_t *mi)
3454 {
3455         ASSERT(MUTEX_HELD(&sp->s_lock));
3456 
3457         /*
3458          * We release a reference, and the caller must still have a
3459          * reference.
3460          */
3461         ASSERT(mi->mi_vfsp->vfs_count >= 2);
3462 
3463         if (mi->mi_clientid_prev) {
3464                 mi->mi_clientid_prev->mi_clientid_next = mi->mi_clientid_next;
3465         } else {
3466                 /* This is the first mi in sp's mntinfo4_list */
3467                 /*
3468                  * Make sure the first mntinfo4 in the list is the actual
3469                  * mntinfo4 passed in.
3470                  */
3471                 ASSERT(sp->mntinfo4_list == mi);
3472 
3473                 sp->mntinfo4_list = mi->mi_clientid_next;
3474         }
3475         if (mi->mi_clientid_next)
3476                 mi->mi_clientid_next->mi_clientid_prev = mi->mi_clientid_prev;
3477 
3478         /* Now mark the mntinfo4's links as being removed */
3479         mi->mi_clientid_prev = mi->mi_clientid_next = NULL;
3480 
3481         VFS_RELE(mi->mi_vfsp);
3482 }
3483 
3484 /*
3485  * Free all the entries in sp's mntinfo4_list.
3486  */
3487 static void
3488 remove_all_mi(nfs4_server_t *sp)
3489 {
3490         mntinfo4_t *mi;
3491 
3492         ASSERT(MUTEX_HELD(&sp->s_lock));
3493 
3494         while (sp->mntinfo4_list != NULL) {
3495                 mi = sp->mntinfo4_list;
3496                 /*
3497                  * Grab a reference in case there is only one left (which
3498                  * remove_mi() frees).
3499                  */
3500                 VFS_HOLD(mi->mi_vfsp);
3501                 remove_mi(sp, mi);
3502                 VFS_RELE(mi->mi_vfsp);
3503         }
3504 }
3505 
3506 /*
3507  * Remove the mi from sp's mntinfo4_list as above, and rele the vfs.
3508  *
3509  * This version can be called with a null nfs4_server_t arg,
3510  * and will either find the right one and handle locking, or
3511  * do nothing because the mi wasn't added to an sp's mntinfo4_list.
3512  */
3513 void
3514 nfs4_remove_mi_from_server(mntinfo4_t *mi, nfs4_server_t *esp)
3515 {
3516         nfs4_server_t   *sp;
3517 
3518         if (esp == NULL) {
3519                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3520                 sp = find_nfs4_server_all(mi, 1);
3521         } else
3522                 sp = esp;
3523 
3524         if (sp != NULL)
3525                 nfs4_remove_mi_from_server_nolock(mi, sp);
3526 
3527         /*
3528          * If we had a valid esp as input, the calling function will be
3529          * responsible for unlocking the esp nfs4_server.
3530          */
3531         if (esp == NULL) {
3532                 if (sp != NULL)
3533                         mutex_exit(&sp->s_lock);
3534                 nfs_rw_exit(&mi->mi_recovlock);
3535                 if (sp != NULL)
3536                         nfs4_server_rele(sp);
3537         }
3538 }
3539 
3540 /*
3541  * Return TRUE if the given server has any non-unmounted filesystems.
3542  */
3543 
3544 bool_t
3545 nfs4_fs_active(nfs4_server_t *sp)
3546 {
3547         mntinfo4_t *mi;
3548 
3549         ASSERT(MUTEX_HELD(&sp->s_lock));
3550 
3551         for (mi = sp->mntinfo4_list; mi != NULL; mi = mi->mi_clientid_next) {
3552                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
3553                         return (TRUE);
3554         }
3555 
3556         return (FALSE);
3557 }
3558 
3559 /*
3560  * Mark sp as finished and notify any waiters.
3561  */
3562 
3563 void
3564 nfs4_mark_srv_dead(nfs4_server_t *sp, uint_t zone_shutdown)
3565 {
3566         ASSERT(MUTEX_HELD(&sp->s_lock));
3567 
3568         if (zone_shutdown)
3569                 sp->seqhb_flags |= NFS4_SEQHB_EXIT;
3570         else
3571                 sp->seqhb_flags |= NFS4_SEQHB_EXITING;
3572         sp->s_thread_exit = NFS4_THREAD_EXIT;
3573         cv_broadcast(&sp->cv_thread_exit);
3574 }
3575 
3576 /*
3577  * Layout rnode by fsid avl tree compare function
3578  */
3579 static int
3580 fsidcmp(const void *p1, const void *p2)
3581 {
3582         const nfs4_fsidlt_t *lt1 = p1;
3583         const nfs4_fsidlt_t *lt2 = p2;
3584         int m;
3585 
3586         m = memcmp(&lt1->lt_fsid, &lt2->lt_fsid, sizeof (&lt1->lt_fsid));
3587         return (m == 0 ? 0 : m < 0 ? -1 : 1);
3588 }
3589 
3590 /*
3591  * Layout rnode avl tree compare function
3592  */
3593 int
3594 layoutcmp(const void *p1, const void *p2)
3595 {
3596         const rnode4_t  *r1 = p1;
3597         const rnode4_t  *r2 = p2;
3598 
3599         return (nfs4cmpfh(&r1->r_fh->sfh_fh, &r2->r_fh->sfh_fh));
3600 }
3601 
3602 /*
3603  * Create a new nfs4_server_t structure.
3604  * Returns new node unlocked and not in list, but with a reference count of
3605  * 1.
3606  */
3607 struct nfs4_server *
3608 new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3609 {
3610         struct nfs4_server *np;
3611         timespec_t tt;
3612         union {
3613                 struct {
3614                         uint32_t sec;
3615                         uint32_t subsec;
3616                 } un_curtime;
3617                 verifier4       un_verifier;
3618         } nfs4clientid_verifier;
3619         char id_val[] = "Solaris: %s, NFSv4 kernel client";
3620         char tag[] = "INITSESS%p";
3621         int len;
3622 
3623         np = kmem_zalloc(sizeof (struct nfs4_server), KM_SLEEP);
3624         np->saddr.len = svp->sv_addr.len;
3625         np->saddr.maxlen = svp->sv_addr.maxlen;
3626         np->saddr.buf = kmem_alloc(svp->sv_addr.maxlen, KM_SLEEP);
3627         bcopy(svp->sv_addr.buf, np->saddr.buf, svp->sv_addr.len);
3628 
3629         /*
3630          * Initialize rnode avl tree.
3631          */
3632         mutex_init(&np->s_lt_lock, NULL, MUTEX_DEFAULT, NULL);
3633         avl_create(&np->s_fsidlt, fsidcmp, sizeof (nfs4_fsidlt_t),
3634             offsetof(nfs4_fsidlt_t, lt_node));
3635         nfs4_pnfs_init_n4s(np);
3636         np->s_refcnt = 1;
3637 
3638         /*
3639          * Build the nfs_client_id4 for this server mount.  Ensure
3640          * the verifier is useful and that the identification is
3641          * somehow based on the server's address for the case of
3642          * multi-homed servers.
3643          */
3644         nfs4clientid_verifier.un_verifier = 0;
3645         gethrestime(&tt);
3646         nfs4clientid_verifier.un_curtime.sec = (uint32_t)tt.tv_sec;
3647         nfs4clientid_verifier.un_curtime.subsec = (uint32_t)tt.tv_nsec;
3648         np->clidtosend.verifier = nfs4clientid_verifier.un_verifier;
3649 
3650         /*
3651          * calculate the length of the opaque identifier.  Subtract 2
3652          * for the "%s" and add the traditional +1 for null
3653          * termination.
3654          */
3655         len = strlen(id_val) - 2 + strlen(uts_nodename()) + 1;
3656         np->clidtosend.id_len = len + np->saddr.maxlen;
3657 
3658         np->clidtosend.id_val = kmem_alloc(np->clidtosend.id_len, KM_SLEEP);
3659         (void) sprintf(np->clidtosend.id_val, id_val, uts_nodename());
3660         bcopy(np->saddr.buf, &np->clidtosend.id_val[len], np->saddr.len);
3661 
3662         np->s_flags = 0;
3663         np->mntinfo4_list = NULL;
3664         /* save cred for issuing rfs4calls inside the renew thread */
3665         crhold(cr);
3666         np->s_cred = cr;
3667         cv_init(&np->cv_thread_exit, NULL, CV_DEFAULT, NULL);
3668         mutex_init(&np->s_lock, NULL, MUTEX_DEFAULT, NULL);
3669         nfs_rw_init(&np->s_recovlock, NULL, RW_DEFAULT, NULL);
3670         list_create(&np->s_deleg_list, sizeof (rnode4_t),
3671             offsetof(rnode4_t, r_deleg_link));
3672         np->s_thread_exit = 0;
3673         np->state_ref_count = 0;
3674         np->lease_valid = NFS4_LEASE_NOT_STARTED;
3675         cv_init(&np->s_cv_otw_count, NULL, CV_DEFAULT, NULL);
3676         cv_init(&np->s_clientid_pend, NULL, CV_DEFAULT, NULL);
3677         np->s_otw_call_count = 0;
3678         cv_init(&np->wait_cb_null, NULL, CV_DEFAULT, NULL);
3679         np->zoneid = getzoneid();
3680         np->zone_globals = nfs4_get_callback_globals();
3681         ASSERT(np->zone_globals != NULL);
3682 
3683         /*
3684          * Dummy session id untill CREATE_SESSION is completed
3685          */
3686         (void) snprintf(np->ssx.sessionid, sizeof (sessionid4), tag, curthread);
3687 
3688         /*
3689          * By default, we begin with bi-dir rpc
3690          */
3691         if (nfs41_birpc) {
3692                 np->ssx.bi_rpc = 1;
3693         }
3694 
3695         /*
3696          * Initialize Slot management fields
3697          */
3698         cv_init(&np->ssx.slot_wait, NULL, CV_DEFAULT, NULL);
3699         nfs_rw_init(&np->ssx.slot_table_rwlock, NULL, RW_DEFAULT, NULL);
3700         mutex_init(&np->ssx.slot_lock, NULL, MUTEX_DEFAULT, NULL);
3701         return (np);
3702 }
3703 
3704 /*
3705  * Create a new nfs4_server_t structure and add it to the list.
3706  * Returns new node locked; reference must eventually be freed.
3707  */
3708 struct nfs4_server *
3709 add_new_nfs4_server(struct servinfo4 *svp, cred_t *cr)
3710 {
3711         nfs4_server_t *sp;
3712 
3713         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3714         sp = new_nfs4_server(svp, cr);
3715         mutex_enter(&sp->s_lock);
3716         insque(sp, &nfs4_server_lst);
3717         sp->s_refcnt++;                      /* list gets a reference */
3718         sp->s_flags |= N4S_INSERTED;
3719         sp->clientid = 0;
3720         return (sp);
3721 }
3722 
/*
 * Debug switch, off by default.  DEBUG builds test it before calling
 * dumpnfs4slist(), and dumpnfs4slist() uses it as the NFS4_DEBUG flag
 * for its output.
 */
int nfs4_server_t_debug = 0;

#ifdef lint
/*
 * Declaration for lint only; the definition of dumpnfs4slist() is
 * compiled out when lint is defined.
 */
extern void
dumpnfs4slist(char *, mntinfo4_t *, clientid4, servinfo4_t *);
#endif
3729 
3730 #ifndef lint
3731 #ifdef DEBUG
3732 void
3733 dumpnfs4slist(char *txt, mntinfo4_t *mi, clientid4 clientid, servinfo4_t *srv_p)
3734 {
3735         int hash16(void *p, int len);
3736         nfs4_server_t *np;
3737 
3738         NFS4_DEBUG(nfs4_server_t_debug, (CE_NOTE,
3739             "dumping nfs4_server_t list in %s", txt));
3740         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3741             "mi 0x%p, want clientid %llx, addr %d/%04X",
3742             mi, (longlong_t)clientid, srv_p->sv_addr.len,
3743             hash16((void *)srv_p->sv_addr.buf, srv_p->sv_addr.len)));
3744         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst;
3745             np = np->forw) {
3746                 NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3747                     "node 0x%p,    clientid %llx, addr %d/%04X, cnt %d",
3748                     np, (longlong_t)np->clientid, np->saddr.len,
3749                     hash16((void *)np->saddr.buf, np->saddr.len),
3750                     np->state_ref_count));
3751                 if (np->saddr.len == srv_p->sv_addr.len &&
3752                     bcmp(np->saddr.buf, srv_p->sv_addr.buf,
3753                     np->saddr.len) == 0)
3754                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3755                             " - address matches"));
3756                 if (np->clientid == clientid || np->clientid == 0)
3757                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3758                             " - clientid matches"));
3759                 if (np->s_thread_exit != NFS4_THREAD_EXIT)
3760                         NFS4_DEBUG(nfs4_server_t_debug, (CE_CONT,
3761                             " - thread not exiting"));
3762         }
3763         delay(hz);
3764 }
3765 #endif
3766 #endif
3767 
3768 
/*
 * Move a mntinfo4_t from one server list to another.
 * Locking of the two nfs4_server_t nodes will be done in list order.
 *
 * mi is the mount being moved; old and new are the servinfo4's whose
 * addresses identify the source and the target nfs4_server_t.  If no
 * live nfs4_server_t exists for the new address, one is created with
 * kcred and inserted into the list.
 *
 * mi's open-file count is transferred from the old server's
 * state_ref_count to the new one's.
 *
 * Returns NULL if the current nfs4_server_t for the filesystem could not
 * be found (e.g., due to forced unmount).  Otherwise returns a reference
 * to the new nfs4_server_t, which must eventually be freed.  The returned
 * node is not locked.
 */
nfs4_server_t *
nfs4_move_mi(mntinfo4_t *mi, servinfo4_t *old, servinfo4_t *new)
{
        nfs4_server_t *p, *op = NULL, *np = NULL;
        int num_open;
        zoneid_t zoneid = nfs_zoneid();

        ASSERT(nfs_zone() == mi->mi_zone);

        mutex_enter(&nfs4_server_lst_lock);
#ifdef DEBUG
        if (nfs4_server_t_debug)
                dumpnfs4slist("nfs4_move_mi", mi, (clientid4)0, new);
#endif
        /*
         * Walk the list once, picking up the node for the old address
         * (op: locked and given a reference) and the node for the new
         * address (np: locked only; its reference is added further down).
         * Nodes in other zones or already marked NFS4_THREAD_EXIT are
         * ignored.
         *
         * NOTE(review): the walk only stops once both op and np are set,
         * so a second node matching the same address would overwrite
         * op/np while the first one's s_lock is still held; likewise, if
         * old and new carry the same address the same s_lock would be
         * entered twice.  Presumably neither can happen -- confirm.
         */
        for (p = nfs4_server_lst.forw; p != &nfs4_server_lst; p = p->forw) {
                if (p->zoneid != zoneid)
                        continue;
                if (p->saddr.len == old->sv_addr.len &&
                    bcmp(p->saddr.buf, old->sv_addr.buf, p->saddr.len) == 0 &&
                    p->s_thread_exit != NFS4_THREAD_EXIT) {
                        op = p;
                        mutex_enter(&op->s_lock);
                        op->s_refcnt++;
                }
                if (p->saddr.len == new->sv_addr.len &&
                    bcmp(p->saddr.buf, new->sv_addr.buf, p->saddr.len) == 0 &&
                    p->s_thread_exit != NFS4_THREAD_EXIT) {
                        np = p;
                        mutex_enter(&np->s_lock);
                }
                if (op != NULL && np != NULL)
                        break;
        }
        if (op == NULL) {
                /*
                 * Filesystem has been forcibly unmounted.  Bail out.
                 * np (if found) was only locked, never referenced, so
                 * unlocking it is all the cleanup needed.
                 */
                if (np != NULL)
                        mutex_exit(&np->s_lock);
                mutex_exit(&nfs4_server_lst_lock);
                return (NULL);
        }
        if (np != NULL) {
                /* existing target: take the reference we will return */
                np->s_refcnt++;
        } else {
#ifdef DEBUG
                NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
                    "nfs4_move_mi: no target nfs4_server, will create."));
#endif
                /*
                 * add_new_nfs4_server() hands back the node locked and
                 * already carrying a reference for the caller.
                 */
                np = add_new_nfs4_server(new, kcred);
        }
        mutex_exit(&nfs4_server_lst_lock);

        NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
            "nfs4_move_mi: for mi 0x%p, "
            "old servinfo4 0x%p, new servinfo4 0x%p, "
            "old nfs4_server 0x%p, new nfs4_server 0x%p, ",
            (void*)mi, (void*)old, (void*)new,
            (void*)op, (void*)np));
        ASSERT(op != NULL && np != NULL);

        /* discard any delegations */
        nfs4_deleg_discard(mi, op);

        /*
         * Shift mi's open-file count from the old server to the new one.
         * mi_open_files is zeroed around the removal because
         * nfs4_remove_mi_from_server_nolock() only unlinks an mi that has
         * no open files; it is restored right afterwards.
         */
        num_open = mi->mi_open_files;
        mi->mi_open_files = 0;
        op->state_ref_count -= num_open;
        ASSERT(op->state_ref_count >= 0);
        np->state_ref_count += num_open;
        nfs4_remove_mi_from_server_nolock(mi, op);
        mi->mi_open_files = num_open;
        NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
            "nfs4_move_mi: mi_open_files %d, op->cnt %d, np->cnt %d",
            mi->mi_open_files, op->state_ref_count, np->state_ref_count));

        nfs4_add_mi_to_server(np, mi);

        /*
         * Done with the old server: unlock it and drop the reference
         * taken in the list walk.  The new server is unlocked but its
         * reference is passed on to the caller.
         */
        mutex_exit(&op->s_lock);
        nfs4_server_rele(op);
        mutex_exit(&np->s_lock);

        return (np);
}
3860 
3861 /*
3862  * Need to have the nfs4_server_lst_lock.
3863  * Search the nfs4_server list to find a match on this servinfo4
3864  * based on its address.
3865  *
3866  * Returns NULL if no match is found.  Otherwise returns a reference (which
3867  * must eventually be freed) to a locked nfs4_server.
3868  */
3869 nfs4_server_t *
3870 servinfo4_to_nfs4_server(servinfo4_t *srv_p)
3871 {
3872         nfs4_server_t *np;
3873         zoneid_t zoneid = nfs_zoneid();
3874 
3875         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
3876         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
3877                 if (np->zoneid == zoneid &&
3878                     np->saddr.len == srv_p->sv_addr.len &&
3879                     bcmp(np->saddr.buf, srv_p->sv_addr.buf,
3880                     np->saddr.len) == 0) {
3881                         mutex_enter(&np->s_lock);
3882                         /*
3883                          * If there is an already created session
3884                          * reuse this nfs4_server_t, even if
3885                          * NFS4_THREAD_EXIT is set (which just means
3886                          * no mounts exist to the server).
3887                          */
3888                         if (np->s_thread_exit == NFS4_THREAD_EXIT &&
3889                             (!(np->s_flags & N4S_SESSION_CREATED))) {
3890                                 mutex_exit(&np->s_lock);
3891                                 continue;
3892                         }
3893                         np->s_thread_exit = 0;
3894                         np->s_refcnt++;
3895                         return (np);
3896                 }
3897         }
3898         return (NULL);
3899 }
3900 
3901 /*
3902  * Search the nfs4_server_lst to find a match based on clientid and
3903  * addr.
3904  * Locks the nfs4_server down if it is found and returns a reference that
3905  * must eventually be freed.
3906  *
3907  * Returns NULL it no match is found.  This means one of two things: either
3908  * mi is in the process of being mounted, or mi has been unmounted.
3909  *
3910  * The caller should be holding mi->mi_recovlock, and it should continue to
3911  * hold the lock until done with the returned nfs4_server_t.  Once
3912  * mi->mi_recovlock is released, there is no guarantee that the returned
3913  * mi->nfs4_server_t will continue to correspond to mi.
3914  */
3915 nfs4_server_t *
3916 find_nfs4_server(mntinfo4_t *mi)
3917 {
3918         return (find_nfs4_server_all(mi, 0));
3919 }
3920 
3921 /*
3922  * This is a special version of find_nfs4_server, which takes
3923  * the mi_recovlock, activates the current nfs4_server_t for
3924  * that mi, and drops the lock.  This function must be used
3925  * with care, since after dropping mi_recovlock, the mi will
3926  * may no longer refer to this structure.  Callers of this
3927  * service must be aware of this and can never assume that
3928  * the value returned remains the current target of the mi.
3929  */
3930 nfs4_server_t *
3931 find_nfs4_server_nolock(mntinfo4_t *mi)
3932 {
3933         nfs4_server_t *np;
3934 
3935         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, 0);
3936         np = find_nfs4_server(mi);
3937         nfs_rw_exit(&mi->mi_recovlock);
3938         /* either np is NULL OR n4sp->s_lock is held */
3939         return (np);
3940 }
3941 
3942 /*
3943  * Same as above, but takes an "all" parameter which can be
3944  * set to 1 if the caller wishes to find nfs4_server_t's which
3945  * have been marked for termination by the exit of the renew
3946  * thread.  This should only be used by operations which are
3947  * cleaning up and will not cause an OTW op.
3948  */
3949 nfs4_server_t *
3950 find_nfs4_server_all(mntinfo4_t *mi, int all)
3951 {
3952         nfs4_server_t *np;
3953         servinfo4_t *svp;
3954         zoneid_t zoneid = mi->mi_zone->zone_id;
3955 
3956         ASSERT(nfs_rw_lock_held(&mi->mi_recovlock, RW_READER) ||
3957             nfs_rw_lock_held(&mi->mi_recovlock, RW_WRITER));
3958         /*
3959          * This can be called from nfs4_unmount() which can be called from the
3960          * global zone, hence it's legal for the global zone to muck with
3961          * another zone's server list, as long as it doesn't try to contact
3962          * them.
3963          */
3964         ASSERT(zoneid == getzoneid() || getzoneid() == GLOBAL_ZONEID ||
3965             nfs_global_client_only != 0);
3966 
3967         /*
3968          * The nfs4_server_lst_lock global lock is held when we get a new
3969          * clientid (via SETCLIENTID OTW).  Holding this global lock and
3970          * mi_recovlock (READER is fine) ensures that the nfs4_server
3971          * and this mntinfo4 can't get out of sync, so the following search is
3972          * always valid.
3973          */
3974         mutex_enter(&nfs4_server_lst_lock);
3975 #ifdef DEBUG
3976         if (nfs4_server_t_debug) {
3977                 /* mi->mi_clientid is unprotected, ok for debug output */
3978                 dumpnfs4slist("find_nfs4_server", mi, mi->mi_clientid,
3979                     mi->mi_curr_serv);
3980         }
3981 #endif
3982         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
3983                 mutex_enter(&np->s_lock);
3984                 svp = mi->mi_curr_serv;
3985 
3986                 if (np->zoneid == zoneid &&
3987                     np->clientid == mi->mi_clientid &&
3988                     np->saddr.len == svp->sv_addr.len &&
3989                     bcmp(np->saddr.buf, svp->sv_addr.buf, np->saddr.len) == 0 &&
3990                     (np->s_thread_exit != NFS4_THREAD_EXIT || all != 0)) {
3991                         mutex_exit(&nfs4_server_lst_lock);
3992                         np->s_refcnt++;
3993                         return (np);
3994                 }
3995                 mutex_exit(&np->s_lock);
3996         }
3997         mutex_exit(&nfs4_server_lst_lock);
3998 
3999         return (NULL);
4000 }
4001 
4002 /* ARGSUSED */
4003 nfs4_server_t *
4004 find_nfs4_server_by_addr(struct netbuf *nb, struct knetconfig *knc)
4005 {
4006         nfs4_server_t *np;
4007 
4008         ASSERT(MUTEX_HELD(&nfs4_server_lst_lock));
4009 
4010         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4011                 mutex_enter(&np->s_lock);
4012 
4013                 if (np->saddr.len == nb->len &&
4014                     bcmp(np->saddr.buf, nb->buf, np->saddr.len) == 0 &&
4015                     (np->s_thread_exit != NFS4_THREAD_EXIT)) {
4016                         mutex_exit(&nfs4_server_lst_lock);
4017                         np->s_refcnt++;
4018                         return (np);
4019                 }
4020                 mutex_exit(&np->s_lock);
4021         }
4022         /*
4023          * NB - return holding lst_lock so caller can insert using
4024          * add_new_nfs4_server without a race.
4025          */
4026         return (NULL);
4027 }
4028 
4029 /*
4030  * Take a new reference to the nfs4_server.  Note that several
4031  * routines need to do this inline in order to keep the lock.
4032  */
4033 void
4034 nfs4_server_hold(nfs4_server_t *sp)
4035 {
4036         mutex_enter(&sp->s_lock);
4037         sp->s_refcnt++;
4038         mutex_exit(&sp->s_lock);
4039 }
4040 
4041 /*
4042  * Release the reference to sp and destroy it if that's the last one.
4043  */
4044 void
4045 nfs4_server_rele(nfs4_server_t *sp)
4046 {
4047         mutex_enter(&sp->s_lock);
4048         nfs4_server_rele_lockt(sp);
4049         /* s_lock has been released */
4050 }
4051 
4052 void
4053 nfs4_server_rele_lockt(nfs4_server_t *sp)
4054 {
4055         ASSERT(MUTEX_HELD(&sp->s_lock));
4056         ASSERT(sp->s_refcnt > 0);
4057         sp->s_refcnt--;
4058         if (sp->s_refcnt > 0) {
4059                 mutex_exit(&sp->s_lock);
4060                 return;
4061         }
4062         mutex_exit(&sp->s_lock);
4063 
4064         mutex_enter(&nfs4_server_lst_lock);
4065         mutex_enter(&sp->s_lock);
4066         if (sp->s_refcnt > 0) {
4067                 mutex_exit(&sp->s_lock);
4068                 mutex_exit(&nfs4_server_lst_lock);
4069                 return;
4070         }
4071         remque(sp);
4072         sp->forw = sp->back = NULL;
4073         mutex_exit(&nfs4_server_lst_lock);
4074         destroy_nfs4_server(sp);
4075 }
4076 
4077 /*
4078  *  Initiate and wait for destroy of a session.
4079  */
4080 
4081 void
4082 nfs4_cleanup_oldsession(nfs4_server_t *np)
4083 {
4084         mutex_enter(&np->s_lock);
4085         if (np->seqhb_flags & NFS4_SEQHB_STARTED) {
4086 
4087                 /*
4088                  * If not already signalled in start_recovery()
4089                  * signal sequence_heartbeat_thread() to exit.
4090                  */
4091 
4092                 if (!(np->seqhb_flags & NFS4_SEQHB_EXIT)) {
4093                         np->seqhb_flags |= NFS4_SEQHB_EXIT;
4094                         np->s_refcnt++;
4095                         cv_broadcast(&np->cv_thread_exit);
4096                 }
4097 
4098                 /*
4099                  * Wait for the sequence heartbeat thread to exit
4100                  * On it's way out, this will destroy the session.
4101                  */
4102 
4103                 while (np->seqhb_flags & NFS4_SEQHB_EXIT) {
4104                         cv_wait(&np->ssx_wait, &np->s_lock);
4105                 }
4106 
4107                 mutex_exit(&np->s_lock);
4108 
4109         } else if (np->seqhb_flags & NFS4_SEQHB_DESTROY) {
4110                 /*
4111                  * If (seqhb_flags & NFS4_SEQHB_DESTROY == TRUE) then the
4112                  * sequence heart beat thread raced us and has already
4113                  * destroyed the session. Nothing more to do.
4114                  */
4115                 mutex_exit(&np->s_lock);
4116         } else if (np->s_flags & N4S_SESSION_CREATED) {
4117                 /*
4118                  * No sequence heartbeat thread means this
4119                  * session is to a data server. Just destroy the
4120                  * the session.
4121                  */
4122                 np->seqhb_flags = 0;
4123                 mutex_exit(&np->s_lock);
4124                 nfs4destroy_session(np, NULL);
4125         } else {
4126                 np->seqhb_flags = 0;
4127                 mutex_exit(&np->s_lock);
4128         }
4129 }
4130 
4131 void
4132 nfs4destroy_session_otw(nfs4_session_t *sessp, CLIENT *clientp)
4133 {
4134         COMPOUND4args_clnt      args;
4135         COMPOUND4res_clnt       res;
4136         nfs_argop4              argop[2];
4137         nfs4_slot_t             *slotp;
4138         struct timeval          wait;
4139         enum    clnt_stat       status;
4140         nfs4_error_t            e;
4141         uint32_t                zilch = 0;
4142 
4143         res.argsp = &args;
4144         res.array = NULL;
4145         res.status = 0;
4146         res.array_len = 0;
4147         res.decode_len = 0;
4148 
4149         args.ctag = TAG_DESTROY_SESSION;
4150 
4151         args.array = argop;
4152         args.array_len = 2;
4153         args.minor_vers = nfs4_max_minor_version;
4154 
4155         argop[0].argop = OP_SEQUENCE;
4156 
4157         argop[1].argop = OP_DESTROY_SESSION;
4158         bcopy(sessp->sessionid,
4159             argop[1].nfs_argop4_u.opdestroy_session.dsa_sessionid,
4160             sizeof (sessp->sessionid));
4161 
4162         TICK_TO_TIMEVAL(30 * hz / 10, &wait);
4163 
4164         if (!(CLNT_CONTROL(clientp, CLSET_XID, (char *)&zilch))) {
4165                 zcmn_err(getzoneid(), CE_WARN,
4166                     "Failed to zero xid to destroy session");
4167                 goto destroy;
4168         }
4169 
4170         nfs4sequence_setup(sessp, &args, &slotp);
4171         status = CLNT_CALL(clientp, NFSPROC4_COMPOUND,
4172             xdr_COMPOUND4args_clnt, (caddr_t)&args,
4173             xdr_COMPOUND4res_clnt, (caddr_t)&res,
4174             wait);
4175 
4176         nfs4_error_set(&e, status, res.status);
4177         nfs4sequence_fin(sessp, &res, slotp, &e);
4178 
4179         if (status != RPC_SUCCESS || res.status ||
4180             res.array[1].nfs_resop4_u.opdestroy_session.dsr_status) {
4181                 DTRACE_PROBE1(nfsc__i_destroysession, char *,
4182                     "Destroy_session request failed, destroying anyways");
4183                 goto destroy;
4184         }
4185 
4186         xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4187 
4188 destroy:
4189         kmem_free(sessp->slot_table,
4190             sessp->slot_table_size * sizeof (void *));
4191         kmem_free(sessp->cb_slot_table,
4192             sessp->cb_slot_table_size * sizeof (void *));
4193         kmem_free(sessp->saddr.buf, sessp->saddr.len);
4194 }
4195 
4196 void
4197 nfs4destroy_session(nfs4_server_t *np, CLIENT *seqhandle)
4198 {
4199         struct nfs41_cb_info    *cbi;
4200         struct nfs4_callback_globals    *ncg = np->zone_globals;
4201 
4202         /* XXX currently no otw destroy for null client handles */
4203         if (seqhandle != NULL)
4204                 nfs4destroy_session_otw(&np->ssx, seqhandle);
4205 
4206 
4207         mutex_enter(&np->s_lock);
4208         cbi = ncg->nfs4prog2cbinfo[np->s_program - NFS4_CALLBACK];
4209         mutex_exit(&np->s_lock);
4210 
4211         /*
4212          * Tell callback connection thread to exit.
4213          */
4214         mutex_enter(&cbi->cb_cbconn_lock);
4215         cbi->cb_cbconn_exit = TRUE;
4216         cv_broadcast(&cbi->cb_cbconn_wait);
4217         mutex_exit(&cbi->cb_cbconn_lock);
4218 
4219         /*
4220          * Tell callback handling thread to exit.
4221          * Wait till it exits and then free the cbinfo.
4222          */






4223 
4224         mutex_enter(&cbi->cb_rpc->r_lock);
4225         cbi->cb_flags |= NFS41_CB_THREAD_EXIT;
4226         cv_broadcast(&cbi->cb_rpc->r_cbwait);
4227         mutex_exit(&cbi->cb_rpc->r_lock);
4228 
4229         mutex_enter(&cbi->cb_reflock);
4230         while (cbi->cb_refcnt != 1) {
4231                 cv_wait(&cbi->cb_destroy_wait, &cbi->cb_reflock);
4232         }
4233         mutex_exit(&cbi->cb_reflock);
4234 
4235         mutex_enter(&np->s_lock);
4236         nfs4callback_destroy(np);
4237         np->s_flags &= ~(N4S_SESSION_CREATED);
4238         mutex_exit(&np->s_lock);
4239 }
4240 
4241 static void
4242 destroy_nfs4_server(nfs4_server_t *sp)
4243 {
4244         nfs4_fsidlt_t *ltp = NULL;
4245         void *cookie = NULL;
4246 
4247         ASSERT(MUTEX_HELD(&sp->s_lock));
4248         ASSERT(sp->s_refcnt == 0);
4249         ASSERT(sp->s_otw_call_count == 0);
4250 
4251         remove_all_mi(sp);
4252 
4253         crfree(sp->s_cred);
4254         kmem_free(sp->saddr.buf, sp->saddr.maxlen);
4255         kmem_free(sp->clidtosend.id_val, sp->clidtosend.id_len);
4256         mutex_exit(&sp->s_lock);
4257 
4258         while ((ltp = avl_destroy_nodes(&sp->s_fsidlt, &cookie)) != NULL) {
4259                 avl_destroy(&ltp->lt_rlayout_tree);
4260                 kmem_free(ltp, sizeof (*ltp));
4261         }
4262         avl_destroy(&sp->s_fsidlt);
4263         pnfs_trash_devtree(sp);
4264 
4265         /* destroy the nfs4_server */
4266         nfs4callback_destroy(sp);
4267         list_destroy(&sp->s_deleg_list);
4268         mutex_destroy(&sp->s_lock);
4269         cv_destroy(&sp->cv_thread_exit);
4270         cv_destroy(&sp->s_cv_otw_count);
4271         cv_destroy(&sp->s_clientid_pend);
4272         cv_destroy(&sp->wait_cb_null);
4273         nfs_rw_destroy(&sp->s_recovlock);
4274         kmem_free(sp, sizeof (*sp));
4275 }
4276 
4277 /*
4278  * Lock sp, but only if it's still active (in the list and hasn't been
4279  * flagged as exiting) or 'all' is non-zero.
4280  * Returns TRUE if sp got locked and adds a reference to sp.
4281  */
4282 bool_t
4283 nfs4_server_vlock(nfs4_server_t *sp, int all)
4284 {
4285         nfs4_server_t *np;
4286 
4287         mutex_enter(&nfs4_server_lst_lock);
4288         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
4289                 if (sp == np && (np->s_thread_exit != NFS4_THREAD_EXIT ||
4290                     all != 0)) {
4291                         mutex_enter(&np->s_lock);
4292                         np->s_refcnt++;
4293                         mutex_exit(&nfs4_server_lst_lock);
4294                         return (TRUE);
4295                 }
4296         }
4297         mutex_exit(&nfs4_server_lst_lock);
4298         return (FALSE);
4299 }
4300 
4301 /*
4302  * Fork off a thread to free the data structures for a mount.
4303  */
4304 
4305 static void
4306 async_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4307 {
4308         freemountargs_t *args;
4309         args = kmem_alloc(sizeof (freemountargs_t), KM_SLEEP);
4310         args->fm_vfsp = vfsp;
4311         VFS_HOLD(vfsp);
4312         MI4_HOLD(VFTOMI4(vfsp));
4313         args->fm_flag = flag;
4314         args->fm_cr = cr;
4315         crhold(cr);
4316         (void) zthread_create(NULL, 0, nfs4_free_mount_thread, args, 0,
4317             minclsyspri);
4318 }
4319 
4320 static void
4321 nfs4_free_mount_thread(freemountargs_t *args)
4322 {
4323         mntinfo4_t *mi;
4324         nfs4_free_mount(args->fm_vfsp, args->fm_flag, args->fm_cr);
4325         mi = VFTOMI4(args->fm_vfsp);
4326         crfree(args->fm_cr);
4327         VFS_RELE(args->fm_vfsp);
4328         MI4_RELE(mi);
4329         kmem_free(args, sizeof (freemountargs_t));
4330         zthread_exit();
4331         /* NOTREACHED */
4332 }
4333 
4334 /*
4335  * Thread to free the data structures for a given filesystem.
4336  */
4337 static void
4338 nfs4_free_mount(vfs_t *vfsp, int flag, cred_t *cr)
4339 {
4340         mntinfo4_t              *mi = VFTOMI4(vfsp);
4341         nfs4_server_t           *sp;
4342         callb_cpr_t             cpr_info;
4343         kmutex_t                cpr_lock;
4344         boolean_t               async_thread;
4345         int                     removed;
4346 
4347         bool_t                  must_unlock;
4348         bool_t                  must_rele;
4349         nfs4_ephemeral_tree_t   *eph_tree;
4350 
4351         /*
4352          * We need to participate in the CPR framework if this is a kernel
4353          * thread.
4354          */
4355         async_thread = (curproc == nfs_zone()->zone_zsched);
4356         if (async_thread) {
4357                 mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
4358                 CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
4359                     "nfsv4AsyncUnmount");
4360         }
4361 
4362         /*
4363          * We need to wait for all outstanding OTW calls
4364          * and recovery to finish before we remove the mi
4365          * from the nfs4_server_t, as current pending
4366          * calls might still need this linkage (in order
4367          * to find a nfs4_server_t from a mntinfo4_t).
4368          */
4369         (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
4370         sp = find_nfs4_server(mi);
4371         nfs_rw_exit(&mi->mi_recovlock);
4372 
4373         if (sp) {
4374                 while (sp->s_otw_call_count != 0) {
4375                         if (async_thread) {
4376                                 mutex_enter(&cpr_lock);
4377                                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
4378                                 mutex_exit(&cpr_lock);
4379                         }
4380                         cv_wait(&sp->s_cv_otw_count, &sp->s_lock);
4381                         if (async_thread) {
4382                                 mutex_enter(&cpr_lock);
4383                                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4384                                 mutex_exit(&cpr_lock);
4385                         }
4386                 }
4387                 mutex_exit(&sp->s_lock);
4388                 nfs4_server_rele(sp);
4389                 sp = NULL;
4390         }
4391 
4392         mutex_enter(&mi->mi_lock);
4393         while (mi->mi_in_recovery != 0) {
4394                 if (async_thread) {
4395                         mutex_enter(&cpr_lock);
4396                         CALLB_CPR_SAFE_BEGIN(&cpr_info);
4397                         mutex_exit(&cpr_lock);
4398                 }
4399                 cv_wait(&mi->mi_cv_in_recov, &mi->mi_lock);
4400                 if (async_thread) {
4401                         mutex_enter(&cpr_lock);
4402                         CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
4403                         mutex_exit(&cpr_lock);
4404                 }
4405         }
4406         mutex_exit(&mi->mi_lock);
4407 
4408         /*
4409          * If we got an error, then do not nuke the
4410          * tree. Either the harvester is busy reclaiming
4411          * this node or we ran into some busy condition.
4412          *
4413          * The harvester will eventually come along and cleanup.
4414          * The only problem would be the root mount point.
4415          *
4416          * Since the busy node can occur for a variety
4417          * of reasons and can result in an entry staying
4418          * in df output but no longer accessible from the
4419          * directory tree, we are okay.
4420          */
4421         if (!nfs4_ephemeral_umount(mi, flag, cr,
4422             &must_unlock, &must_rele, &eph_tree))
4423                 nfs4_ephemeral_umount_activate(mi, &must_unlock,
4424                     &must_rele, &eph_tree);
4425 
4426         /*
4427          * The original purge of the dnlc via 'dounmount'
4428          * doesn't guarantee that another dnlc entry was not
4429          * added while we waitied for all outstanding OTW
4430          * and recovery calls to finish.  So re-purge the
4431          * dnlc now.
4432          */
4433         (void) dnlc_purge_vfsp(vfsp, 0);
4434 
4435         /*
4436          * We need to explicitly stop the manager thread; the asyc worker
4437          * threads can timeout and exit on their own.
4438          */
4439         mutex_enter(&mi->mi_async_lock);
4440         mi->mi_max_threads = 0;
4441         cv_broadcast(&mi->mi_async_work_cv);
4442         mutex_exit(&mi->mi_async_lock);
4443         if (mi->mi_manager_thread)
4444                 nfs4_async_manager_stop(vfsp);
4445 
4446         destroy_rtable4(vfsp, cr);
4447 
4448         nfs4_remove_mi_from_server(mi, NULL);
4449 
4450         if (async_thread) {
4451                 mutex_enter(&cpr_lock);
4452                 CALLB_CPR_EXIT(&cpr_info);  /* drops cpr_lock */
4453                 mutex_destroy(&cpr_lock);
4454         }
4455 
4456         removed = nfs4_mi_zonelist_remove(mi);
4457         if (removed)
4458                 zone_rele(mi->mi_zone);
4459 }
--- EOF ---