Move CallBack Server thread creation, initial processing and destruction to RPC
Clean up some RPC code.
Remove extraneous fields from nfs41_cb_info and clean up the code.
Change KM_SLEEP in mir_nfs41_callback_thread to KM_NOSLEEP.
Fix lint warnings

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /*
  27  *      Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  28  *      All Rights Reserved
  29  */
  30 
  31 #include <sys/param.h>
  32 #include <sys/types.h>
  33 #include <sys/systm.h>
  34 #include <sys/cmn_err.h>
  35 #include <sys/vtrace.h>
  36 #include <sys/session.h>
  37 #include <sys/thread.h>
  38 #include <sys/dnlc.h>
  39 #include <sys/cred.h>
  40 #include <sys/priv.h>
  41 #include <sys/list.h>
  42 #include <sys/sdt.h>
  43 #include <sys/policy.h>
  44 
  45 #include <rpc/types.h>
  46 #include <rpc/xdr.h>
  47 
  48 #include <nfs/nfs.h>
  49 
  50 #include <nfs/nfs_clnt.h>
  51 
  52 #include <nfs/nfs4.h>
  53 #include <nfs/rnode4.h>
  54 #include <nfs/nfs4_clnt.h>
  55 #include <nfs/nfs41_sessions.h>
  56 #include <nfs/nfs4_clnt_impl.h>
  57 
  58 /*
  59  * client side statistics
  60  */
  61 static const struct clstat4 clstat4_tmpl = {  /* each entry: counter name + 64-bit kstat type */
  62         { "calls",      KSTAT_DATA_UINT64 },
  63         { "badcalls",   KSTAT_DATA_UINT64 },
  64         { "clgets",     KSTAT_DATA_UINT64 },  /* NOTE(review): presumably the clgets counter bumped in clget4() */
  65         { "cltoomany",  KSTAT_DATA_UINT64 }  /* NOTE(review): presumably the cltoomany counter bumped in clget4() */
  66 };
  67 #ifdef DEBUG
  68 struct clstat4_debug clstat4_debug = {  /* DEBUG-only counters, all 64-bit kstat values */
  69         { "clalloc",    KSTAT_DATA_UINT64 },  /* clstat4_debug.clalloc is atomically bumped in clget4() */
  70         { "noresponse", KSTAT_DATA_UINT64 },
  71         { "failover",   KSTAT_DATA_UINT64 },
  72         { "remap",      KSTAT_DATA_UINT64 },
  73         { "nrnode",     KSTAT_DATA_UINT64 },
  74         { "access",     KSTAT_DATA_UINT64 },
  75         { "dirent",     KSTAT_DATA_UINT64 },
  76         { "dirents",    KSTAT_DATA_UINT64 },
  77         { "reclaim",    KSTAT_DATA_UINT64 },
  78         { "clreclaim",  KSTAT_DATA_UINT64 },
  79         { "f_reclaim",  KSTAT_DATA_UINT64 },
  80         { "a_reclaim",  KSTAT_DATA_UINT64 },
  81         { "r_reclaim",  KSTAT_DATA_UINT64 },
  82         { "r_path",     KSTAT_DATA_UINT64 }
  83 };
  84 #endif
  85 
  86 /*
  87  * We keep a global list of per-zone client data, so we can clean up all zones
  88  * if we get low on memory.
  89  */
  90 static list_t nfs4_clnt_list;  /* global list of per-zone client data */
  91 static kmutex_t nfs4_clnt_list_lock;  /* NOTE(review): presumably guards nfs4_clnt_list - confirm */
  92 
  93 static struct kmem_cache *chtab4_cache;  /* kmem cache for the struct chtab entries allocated/freed in clget4() */
  94 
  95 #ifdef DEBUG
  96 static int nfs4_rfscall_debug;
  97 static int nfs4_try_failover_any;
  98 int nfs4_utf8_debug = 0;  /* non-zero: utf8_to_fn() logs names it rejects for containing '/' */
  99 #endif
 100 
 101 /*
 102  * NFSv4 readdir cache implementation
 103  */
 104 typedef struct rddir4_cache_impl {  /* an rddir4_cache plus its reference count and AVL linkage */
 105         rddir4_cache    rc;             /* readdir cache element */
 106         kmutex_t        lock;           /* lock protects count */
 107         uint_t          count;          /* reference count */
 108         avl_node_t      tree;           /* AVL tree link */
 109 } rddir4_cache_impl;
 110 
 111 static int rddir4_cache_compar(const void *, const void *);  /* forward declarations; definitions are outside this view */
 112 static void rddir4_cache_free(rddir4_cache_impl *);
 113 static rddir4_cache *rddir4_cache_alloc(int);
 114 static void rddir4_cache_hold(rddir4_cache *);
 115 static int try_failover(enum clnt_stat);
 116 
 117 static int nfs4_readdir_cache_hits = 0;  /* NOTE(review): readdir cache counters, presumably updated outside this view */
 118 static int nfs4_readdir_cache_waits = 0;
 119 static int nfs4_readdir_cache_misses = 0;
 120 
 121 /*
 122  * Shared nfs4 functions
 123  */
 124 
 125 /*
 126  * Copy an nfs_fh4.  The destination storage (to->nfs_fh4_val) must already
 127  * be allocated.
 128  */
 129 
 130 void
 131 nfs_fh4_copy(nfs_fh4 *from, nfs_fh4 *to)
 132 {
 133         to->nfs_fh4_len = from->nfs_fh4_len;
 134         bcopy(from->nfs_fh4_val, to->nfs_fh4_val, to->nfs_fh4_len);
 135 }
 136 
 137 /*
 138  * nfs4cmpfh - compare 2 filehandles.
 139  * Returns 0 if the two nfsv4 filehandles are the same, -1 if the first is
 140  * "less" than the second, +1 if the first is "greater" than the second.
 141  */
 142 
 143 int
 144 nfs4cmpfh(const nfs_fh4 *fh4p1, const nfs_fh4 *fh4p2)
 145 {
 146         const char *c1, *c2;
 147 
 148         if (fh4p1->nfs_fh4_len < fh4p2->nfs_fh4_len)
 149                 return (-1);
 150         if (fh4p1->nfs_fh4_len > fh4p2->nfs_fh4_len)
 151                 return (1);
 152         for (c1 = fh4p1->nfs_fh4_val, c2 = fh4p2->nfs_fh4_val;
 153             c1 < fh4p1->nfs_fh4_val + fh4p1->nfs_fh4_len;
 154             c1++, c2++) {
 155                 if (*c1 < *c2)
 156                         return (-1);
 157                 if (*c1 > *c2)
 158                         return (1);
 159         }
 160 
 161         return (0);
 162 }
 163 
 164 /*
 165  * Compare two v4 filehandles.  Return zero if they're the same, non-zero
 166  * if they're not.  Like nfs4cmpfh(), but different filehandle
 167  * representation, and doesn't provide information about greater than or
 168  * less than.
 169  */
 170 
 171 int
 172 nfs4cmpfhandle(nfs4_fhandle_t *fh1, nfs4_fhandle_t *fh2)
 173 {
 174         if (fh1->fh_len == fh2->fh_len)
 175                 return (bcmp(fh1->fh_buf, fh2->fh_buf, fh1->fh_len));
 176 
 177         return (1);
 178 }
 179 
 180 int
 181 stateid4_cmp(stateid4 *s1, stateid4 *s2)
 182 {
 183         if (bcmp(s1, s2, sizeof (stateid4)) == 0)
 184                 return (1);
 185         else
 186                 return (0);
 187 }
 188 
 189 nfsstat4
 190 puterrno4(int error)
 191 {
 192         switch (error) {
 193         case 0:
 194                 return (NFS4_OK);
 195         case EPERM:
 196                 return (NFS4ERR_PERM);
 197         case ENOENT:
 198                 return (NFS4ERR_NOENT);
 199         case EINTR:
 200                 return (NFS4ERR_IO);
 201         case EIO:
 202                 return (NFS4ERR_IO);
 203         case ENXIO:
 204                 return (NFS4ERR_NXIO);
 205         case ENOMEM:
 206                 return (NFS4ERR_RESOURCE);
 207         case EACCES:
 208                 return (NFS4ERR_ACCESS);
 209         case EBUSY:
 210                 return (NFS4ERR_IO);
 211         case EEXIST:
 212                 return (NFS4ERR_EXIST);
 213         case EXDEV:
 214                 return (NFS4ERR_XDEV);
 215         case ENODEV:
 216                 return (NFS4ERR_IO);
 217         case ENOTDIR:
 218                 return (NFS4ERR_NOTDIR);
 219         case EISDIR:
 220                 return (NFS4ERR_ISDIR);
 221         case EINVAL:
 222                 return (NFS4ERR_INVAL);
 223         case EMFILE:
 224                 return (NFS4ERR_RESOURCE);
 225         case EFBIG:
 226                 return (NFS4ERR_FBIG);
 227         case ENOSPC:
 228                 return (NFS4ERR_NOSPC);
 229         case EROFS:
 230                 return (NFS4ERR_ROFS);
 231         case EMLINK:
 232                 return (NFS4ERR_MLINK);
 233         case EDEADLK:
 234                 return (NFS4ERR_DEADLOCK);
 235         case ENOLCK:
 236                 return (NFS4ERR_DENIED);
 237         case EREMOTE:
 238                 return (NFS4ERR_SERVERFAULT);
 239         case ENOTSUP:
 240                 return (NFS4ERR_NOTSUPP);
 241         case EDQUOT:
 242                 return (NFS4ERR_DQUOT);
 243         case ENAMETOOLONG:
 244                 return (NFS4ERR_NAMETOOLONG);
 245         case EOVERFLOW:
 246                 return (NFS4ERR_INVAL);
 247         case ENOSYS:
 248                 return (NFS4ERR_NOTSUPP);
 249         case ENOTEMPTY:
 250                 return (NFS4ERR_NOTEMPTY);
 251         case EOPNOTSUPP:
 252                 return (NFS4ERR_NOTSUPP);
 253         case ESTALE:
 254                 return (NFS4ERR_STALE);
 255         case EAGAIN:
 256                 if (curthread->t_flag & T_WOULDBLOCK) {
 257                         curthread->t_flag &= ~T_WOULDBLOCK;
 258                         return (NFS4ERR_DELAY);
 259                 }
 260                 return (NFS4ERR_LOCKED);
 261         default:
 262                 return ((enum nfsstat4)error);
 263         }
 264 }
 265 
 266 int
 267 geterrno4(enum nfsstat4 status)
 268 {
 269         switch (status) {
 270         case NFS4_OK:
 271                 return (0);
 272         case NFS4ERR_PERM:
 273                 return (EPERM);
 274         case NFS4ERR_NOENT:
 275                 return (ENOENT);
 276         case NFS4ERR_IO:
 277                 return (EIO);
 278         case NFS4ERR_NXIO:
 279                 return (ENXIO);
 280         case NFS4ERR_ACCESS:
 281                 return (EACCES);
 282         case NFS4ERR_EXIST:
 283                 return (EEXIST);
 284         case NFS4ERR_XDEV:
 285                 return (EXDEV);
 286         case NFS4ERR_NOTDIR:
 287                 return (ENOTDIR);
 288         case NFS4ERR_ISDIR:
 289                 return (EISDIR);
 290         case NFS4ERR_INVAL:
 291                 return (EINVAL);
 292         case NFS4ERR_FBIG:
 293                 return (EFBIG);
 294         case NFS4ERR_NOSPC:
 295                 return (ENOSPC);
 296         case NFS4ERR_ROFS:
 297                 return (EROFS);
 298         case NFS4ERR_MLINK:
 299                 return (EMLINK);
 300         case NFS4ERR_NAMETOOLONG:
 301                 return (ENAMETOOLONG);
 302         case NFS4ERR_NOTEMPTY:
 303                 return (ENOTEMPTY);
 304         case NFS4ERR_DQUOT:
 305                 return (EDQUOT);
 306         case NFS4ERR_STALE:
 307                 return (ESTALE);
 308         case NFS4ERR_BADHANDLE:
 309                 return (ESTALE);
 310         case NFS4ERR_BAD_COOKIE:
 311                 return (EINVAL);
 312         case NFS4ERR_NOTSUPP:
 313                 return (EOPNOTSUPP);
 314         case NFS4ERR_TOOSMALL:
 315                 return (EINVAL);
 316         case NFS4ERR_SERVERFAULT:
 317                 return (EIO);
 318         case NFS4ERR_BADTYPE:
 319                 return (EINVAL);
 320         case NFS4ERR_DELAY:
 321                 return (ENXIO);
 322         case NFS4ERR_SAME:
 323                 return (EPROTO);
 324         case NFS4ERR_DENIED:
 325                 return (ENOLCK);
 326         case NFS4ERR_EXPIRED:
 327                 return (EPROTO);
 328         case NFS4ERR_LOCKED:
 329                 return (EACCES);
 330         case NFS4ERR_GRACE:
 331                 return (EAGAIN);
 332         case NFS4ERR_FHEXPIRED: /* if got here, failed to get a new fh */
 333                 return (ESTALE);
 334         case NFS4ERR_SHARE_DENIED:
 335                 return (EACCES);
 336         case NFS4ERR_WRONGSEC:
 337                 return (EPERM);
 338         case NFS4ERR_CLID_INUSE:
 339                 return (EAGAIN);
 340         case NFS4ERR_RESOURCE:
 341                 return (EAGAIN);
 342         case NFS4ERR_MOVED:
 343                 return (EPROTO);
 344         case NFS4ERR_NOFILEHANDLE:
 345                 return (EIO);
 346         case NFS4ERR_MINOR_VERS_MISMATCH:
 347                 return (ENOTSUP);
 348         case NFS4ERR_STALE_CLIENTID:
 349                 return (EIO);
 350         case NFS4ERR_STALE_STATEID:
 351                 return (EIO);
 352         case NFS4ERR_OLD_STATEID:
 353                 return (EIO);
 354         case NFS4ERR_BAD_STATEID:
 355                 return (EIO);
 356         case NFS4ERR_BAD_SEQID:
 357                 return (EIO);
 358         case NFS4ERR_NOT_SAME:
 359                 return (EPROTO);
 360         case NFS4ERR_LOCK_RANGE:
 361                 return (EPROTO);
 362         case NFS4ERR_SYMLINK:
 363                 return (EPROTO);
 364         case NFS4ERR_RESTOREFH:
 365                 return (EPROTO);
 366         case NFS4ERR_LEASE_MOVED:
 367                 return (EPROTO);
 368         case NFS4ERR_ATTRNOTSUPP:
 369                 return (ENOTSUP);
 370         case NFS4ERR_NO_GRACE:
 371                 return (EPROTO);
 372         case NFS4ERR_RECLAIM_BAD:
 373                 return (EPROTO);
 374         case NFS4ERR_RECLAIM_CONFLICT:
 375                 return (EPROTO);
 376         case NFS4ERR_BADXDR:
 377                 return (EINVAL);
 378         case NFS4ERR_LOCKS_HELD:
 379                 return (EIO);
 380         case NFS4ERR_OPENMODE:
 381                 return (EACCES);
 382         case NFS4ERR_BADOWNER:
 383                 /*
 384                  * Client and server are in different DNS domains
 385                  * and the NFSMAPID_DOMAIN in /etc/default/nfs
 386                  * doesn't match.  No good answer here.  Return
 387                  * EACCESS, which translates to "permission denied".
 388                  */
 389                 return (EACCES);
 390         case NFS4ERR_BADCHAR:
 391                 return (EINVAL);
 392         case NFS4ERR_BADNAME:
 393                 return (EINVAL);
 394         case NFS4ERR_BAD_RANGE:
 395                 return (EIO);
 396         case NFS4ERR_LOCK_NOTSUPP:
 397                 return (ENOTSUP);
 398         case NFS4ERR_OP_ILLEGAL:
 399                 return (EINVAL);
 400         case NFS4ERR_DEADLOCK:
 401                 return (EDEADLK);
 402         case NFS4ERR_FILE_OPEN:
 403                 return (EACCES);
 404         case NFS4ERR_ADMIN_REVOKED:
 405                 return (EPROTO);
 406         case NFS4ERR_CB_PATH_DOWN:
 407                 return (EPROTO);
 408         case NFS4ERR_BADSESSION:
 409                 return (EIO);
 410         default:
 411 #ifdef DEBUG
 412                 zcmn_err(getzoneid(), CE_WARN, "geterrno4: got status %d",
 413                     status);
 414 #endif
 415                 return ((int)status);
 416         }
 417 }
 418 
 419 void
 420 nfs4_log_badowner(mntinfo4_t *mi, nfs_opnum4 op)
 421 {
 422         nfs4_server_t *server;
 423 
 424         /*
 425          * Return if already printed/queued a msg
 426          * for this mount point.
 427          */
 428         if (mi->mi_flags & MI4_BADOWNER_DEBUG)
 429                 return;
 430         /*
 431          * Happens once per client <-> server pair.
 432          */
 433         if (nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER,
 434             mi->mi_flags & MI4_INT))
 435                 return;
 436 
 437         server = find_nfs4_server(mi);
 438         if (server == NULL) {
 439                 nfs_rw_exit(&mi->mi_recovlock);
 440                 return;
 441         }
 442 
 443         if (!(server->s_flags & N4S_BADOWNER_DEBUG)) {
 444                 zcmn_err(mi->mi_zone->zone_id, CE_WARN,
 445                     "!NFSMAPID_DOMAIN does not match"
 446                     " the server: %s domain.\n"
 447                     "Please check configuration",
 448                     mi->mi_curr_serv->sv_hostname);
 449                 server->s_flags |= N4S_BADOWNER_DEBUG;
 450         }
 451         mutex_exit(&server->s_lock);
 452         nfs4_server_rele(server);
 453         nfs_rw_exit(&mi->mi_recovlock);
 454 
 455         /*
 456          * Happens once per mntinfo4_t.
 457          * This error is deemed as one of the recovery facts "RF_BADOWNER",
 458          * queue this in the mesg queue for this mount_info. This message
 459          * is not printed, meaning its absent from id_to_dump_solo_fact()
 460          * but its there for inspection if the queue is ever dumped/inspected.
 461          */
 462         mutex_enter(&mi->mi_lock);
 463         if (!(mi->mi_flags & MI4_BADOWNER_DEBUG)) {
 464                 nfs4_queue_fact(RF_BADOWNER, mi, NFS4ERR_BADOWNER, 0, op,
 465                     FALSE, NULL, 0, NULL);
 466                 mi->mi_flags |= MI4_BADOWNER_DEBUG;
 467         }
 468         mutex_exit(&mi->mi_lock);
 469 }
 470 
 471 int
 472 nfs4_time_ntov(nfstime4 *ntime, timestruc_t *vatime)
 473 {
 474         int64_t sec;
 475         int32_t nsec;
 476 
 477         /*
 478          * Here check that the nfsv4 time is valid for the system.
 479          * nfsv4 time value is a signed 64-bit, and the system time
 480          * may be either int64_t or int32_t (depends on the kernel),
 481          * so if the kernel is 32-bit, the nfsv4 time value may not fit.
 482          */
 483 #ifndef _LP64
 484         if (! NFS4_TIME_OK(ntime->seconds)) {
 485                 return (EOVERFLOW);
 486         }
 487 #endif
 488 
 489         /* Invalid to specify 1 billion (or more) nsecs */
 490         if (ntime->nseconds >= 1000000000)
 491                 return (EINVAL);
 492 
 493         if (ntime->seconds < 0) {
 494                 sec = ntime->seconds + 1;
 495                 nsec = -1000000000 + ntime->nseconds;
 496         } else {
 497                 sec = ntime->seconds;
 498                 nsec = ntime->nseconds;
 499         }
 500 
 501         vatime->tv_sec = sec;
 502         vatime->tv_nsec = nsec;
 503 
 504         return (0);
 505 }
 506 
 507 int
 508 nfs4_time_vton(timestruc_t *vatime, nfstime4 *ntime)
 509 {
 510         int64_t sec;
 511         uint32_t nsec;
 512 
 513         /*
 514          * nfsv4 time value is a signed 64-bit, and the system time
 515          * may be either int64_t or int32_t (depends on the kernel),
 516          * so all system time values will fit.
 517          */
 518         if (vatime->tv_nsec >= 0) {
 519                 sec = vatime->tv_sec;
 520                 nsec = vatime->tv_nsec;
 521         } else {
 522                 sec = vatime->tv_sec - 1;
 523                 nsec = 1000000000 + vatime->tv_nsec;
 524         }
 525         ntime->seconds = sec;
 526         ntime->nseconds = nsec;
 527 
 528         return (0);
 529 }
 530 
 531 /*
 532  * Converts a utf8 string to a valid null terminated filename string.
 533  *
 534  * XXX - Not actually translating the UTF-8 string as per RFC 2279.
 535  *       For now, just validate that the UTF-8 string off the wire
 536  *       does not have characters that will freak out UFS, and leave
 537  *       it at that.
 538  */
 539 char *
 540 utf8_to_fn(utf8string *u8s, uint_t *lenp, char *s)
 541 {
 542         ASSERT(lenp != NULL);
 543 
 544         if (u8s == NULL || u8s->utf8string_len <= 0 ||
 545             u8s->utf8string_val == NULL)
 546                 return (NULL);
 547 
 548         /*
 549          * Check for obvious illegal filename chars
 550          */
 551         if (utf8_strchr(u8s, '/') != NULL) {
 552 #ifdef DEBUG
 553                 if (nfs4_utf8_debug) {
 554                         char *path;
 555                         int len = u8s->utf8string_len;
 556 
 557                         path = kmem_alloc(len + 1, KM_SLEEP);
 558                         bcopy(u8s->utf8string_val, path, len);
 559                         path[len] = '\0';
 560 
 561                         zcmn_err(getzoneid(), CE_WARN,
 562                             "Invalid UTF-8 filename: %s", path);
 563 
 564                         kmem_free(path, len + 1);
 565                 }
 566 #endif
 567                 return (NULL);
 568         }
 569 
 570         return (utf8_to_str(u8s, lenp, s));
 571 }
 572 
 573 /*
 574  * Converts a utf8 string to a C string.
 575  * kmem_allocs a new string if not supplied
 576  */
 577 char *
 578 utf8_to_str(utf8string *str, uint_t *lenp, char *s)
 579 {
 580         char    *sp;
 581         char    *u8p;
 582         int     len;
 583         int      i;
 584 
 585         ASSERT(lenp != NULL);
 586 
 587         if (str == NULL)
 588                 return (NULL);
 589 
 590         u8p = str->utf8string_val;
 591         len = str->utf8string_len;
 592         if (len <= 0 || u8p == NULL) {
 593                 if (s)
 594                         *s = '\0';
 595                 return (NULL);
 596         }
 597 
 598         sp = s;
 599         if (sp == NULL)
 600                 sp = kmem_alloc(len + 1, KM_SLEEP);
 601 
 602         /*
 603          * At least check for embedded nulls
 604          */
 605         for (i = 0; i < len; i++) {
 606                 sp[i] = u8p[i];
 607                 if (u8p[i] == '\0') {
 608 #ifdef  DEBUG
 609                         zcmn_err(getzoneid(), CE_WARN,
 610                             "Embedded NULL in UTF-8 string");
 611 #endif
 612                         if (s == NULL)
 613                                 kmem_free(sp, len + 1);
 614                         return (NULL);
 615                 }
 616         }
 617         sp[len] = '\0';
 618         *lenp = len + 1;
 619 
 620         return (sp);
 621 }
 622 
 623 /*
 624  * str_to_utf8 - converts a null-terminated C string to a utf8 string
 625  */
 626 utf8string *
 627 str_to_utf8(char *nm, utf8string *str)
 628 {
 629         int len;
 630 
 631         if (str == NULL)
 632                 return (NULL);
 633 
 634         if (nm == NULL || *nm == '\0') {
 635                 str->utf8string_len = 0;
 636                 str->utf8string_val = NULL;
 637         }
 638 
 639         len = strlen(nm);
 640 
 641         str->utf8string_val = kmem_alloc(len, KM_SLEEP);
 642         str->utf8string_len = len;
 643         bcopy(nm, str->utf8string_val, len);
 644 
 645         return (str);
 646 }
 647 
 648 utf8string *
 649 utf8_copy(utf8string *src, utf8string *dest)
 650 {
 651         if (src == NULL)
 652                 return (NULL);
 653         if (dest == NULL)
 654                 return (NULL);
 655 
 656         if (src->utf8string_len > 0) {
 657                 dest->utf8string_val = kmem_alloc(src->utf8string_len,
 658                     KM_SLEEP);
 659                 bcopy(src->utf8string_val, dest->utf8string_val,
 660                     src->utf8string_len);
 661                 dest->utf8string_len = src->utf8string_len;
 662         } else {
 663                 dest->utf8string_val = NULL;
 664                 dest->utf8string_len = 0;
 665         }
 666 
 667         return (dest);
 668 }
 669 
 670 int
 671 utf8_compare(const utf8string *a, const utf8string *b)
 672 {
 673         int mlen, cmp;
 674         int alen, blen;
 675         char *aval, *bval;
 676 
 677         if ((a == NULL) && (b == NULL))
 678                 return (0);
 679         else if (a == NULL)
 680                 return (-1);
 681         else if (b == NULL)
 682                 return (1);
 683 
 684         alen = a->utf8string_len;
 685         blen = b->utf8string_len;
 686         aval = a->utf8string_val;
 687         bval = b->utf8string_val;
 688 
 689         if (((alen == 0) || (aval == NULL)) &&
 690             ((blen == 0) || (bval == NULL)))
 691                 return (0);
 692         else if ((alen == 0) || (aval == NULL))
 693                 return (-1);
 694         else if ((blen == 0) || (bval == NULL))
 695                 return (1);
 696 
 697         mlen = MIN(alen, blen);
 698         cmp = strncmp(aval, bval, mlen);
 699 
 700         if ((cmp == 0) && (alen == blen))
 701                 return (0);
 702         else if ((cmp == 0) && (alen < blen))
 703                 return (-1);
 704         else if (cmp == 0)
 705                 return (1);
 706         else if (cmp < 0)
 707                 return (-1);
 708         return (1);
 709 }
 710 
 711 /*
 712  * utf8_dir_verify - checks that the utf8 string is valid
 713  */
 714 int
 715 utf8_dir_verify(utf8string *str)
 716 {
 717         char *nm;
 718         int len;
 719 
 720         if (str == NULL)
 721                 return (0);
 722 
 723         nm = str->utf8string_val;
 724         len = str->utf8string_len;
 725         if (nm == NULL || len == 0) {
 726                 return (0);
 727         }
 728 
 729         if (len == 1 && nm[0] == '.')
 730                 return (0);
 731         if (len == 2 && nm[0] == '.' && nm[1] == '.')
 732                 return (0);
 733 
 734         if (utf8_strchr(str, '/') != NULL)
 735                 return (0);
 736 
 737         if (utf8_strchr(str, '\0') != NULL)
 738                 return (0);
 739 
 740         return (1);
 741 }
 742 
 743 /*
 744  * from rpcsec module (common/rpcsec)
 745  */
 746 extern int sec_clnt_geth(CLIENT *, struct sec_data *, cred_t *, AUTH **);  /* called by authget() to fill in cl_auth */
 747 extern void sec_clnt_freeh(AUTH *);
 748 extern void sec_clnt_freeinfo(struct sec_data *);
 749 
 750 /*
 751  * authget() gets an auth handle based on the security
 752  * information from the servinfo in mountinfo.
 753  * The auth handle is stored in ch_client->cl_auth.
 754  *
 755  * First security flavor of choice is to use sv_secdata
 756  * which is initiated by the client. If that fails, get
 757  * secinfo from the server and then select one from the
 758  * server secinfo list .
 759  *
 760  * For RPCSEC_GSS flavor, upon success, a secure context is
 761  * established between client and server.
 762  */
 763 int
 764 authget(servinfo4_t *svp, CLIENT *ch_client, cred_t *cr)
 765 {
 766         int error, i;
 767 
 768         /*
 769          * SV4_TRYSECINFO indicates to try the secinfo list from
 770          * sv_secinfo until a successful one is reached. Point
 771          * sv_currsec to the selected security mechanism for
 772          * later sessions.
 773          */
 774         (void) nfs_rw_enter_sig(&svp->sv_lock, RW_WRITER, 0);
 775         if ((svp->sv_flags & SV4_TRYSECINFO) && svp->sv_secinfo) {
 776                 for (i = svp->sv_secinfo->index; i < svp->sv_secinfo->count;
 777                     i++) {
 778                         if (!(error = sec_clnt_geth(ch_client,
 779                             &svp->sv_secinfo->sdata[i],
 780                             cr, &ch_client->cl_auth))) {
 781 
 782                                 svp->sv_currsec = &svp->sv_secinfo->sdata[i];
 783                                 svp->sv_secinfo->index = i;
 784                                 /* done */
 785                                 svp->sv_flags &= ~SV4_TRYSECINFO;
 786                                 break;
 787                         }
 788 
 789                         /*
 790                          * Allow the caller retry with the security flavor
 791                          * pointed by svp->sv_secinfo->index when
 792                          * ETIMEDOUT/ECONNRESET occurs.
 793                          */
 794                         if (error == ETIMEDOUT || error == ECONNRESET) {
 795                                 svp->sv_secinfo->index = i;
 796                                 break;
 797                         }
 798                 }
 799         } else {
 800                 /* sv_currsec points to one of the entries in sv_secinfo */
 801                 if (svp->sv_currsec) {
 802                         error = sec_clnt_geth(ch_client, svp->sv_currsec, cr,
 803                             &ch_client->cl_auth);
 804                 } else {
 805                         /* If it's null, use sv_secdata. */
 806                         error = sec_clnt_geth(ch_client, svp->sv_secdata, cr,
 807                             &ch_client->cl_auth);
 808                 }
 809         }
 810         nfs_rw_exit(&svp->sv_lock);
 811 
 812         return (error);
 813 }
 814 
 815 /*
 816  * Common handle get program for NFS, NFS ACL, and NFS AUTH client.
 817  */
 818 int
 819 clget4(clinfo_t *ci, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
 820     struct chtab **chp, struct nfs4_clnt *nfscl, mntinfo4_t *mi)
 821 {
 822         struct chhead *ch, *newch;
 823         struct chhead **plistp;
 824         struct chtab *cp;
 825         int error;
 826         k_sigset_t smask;
 827 
 828         if (newcl == NULL || chp == NULL || ci == NULL)
 829                 return (EINVAL);
 830 
 831         *newcl = NULL;
 832         *chp = NULL;
 833 
 834         /*
 835          * Find an unused handle or create one
 836          */
 837         newch = NULL;
 838         /*
 839          * Update statistics based on minor version number
 840          */
 841         nfscl->nfscl_stat[NFS4_MINORVERSION(mi)].clgets.value.ui64++;
 842 top:
 843         /*
 844          * Find the correct entry in the cache to check for free
 845          * client handles.  The search is based on the RPC program
 846          * number, program version number, dev_t for the transport
 847          * device, and the protocol family.
 848          */
 849         mutex_enter(&nfscl->nfscl_chtable4_lock);
 850         plistp = &nfscl->nfscl_chtable4;
 851         for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
 852                 if (ch->ch_prog == ci->cl_prog &&
 853                     ch->ch_vers == ci->cl_vers &&
 854                     ch->ch_dev == svp->sv_knconf->knc_rdev &&
 855                     (strcmp(ch->ch_protofmly,
 856                     svp->sv_knconf->knc_protofmly) == 0))
 857                         break;
 858                 plistp = &ch->ch_next;
 859         }
 860 
 861         /*
 862          * If we didn't find a cache entry for this quadruple, then
 863          * create one.  If we don't have one already preallocated,
 864          * then drop the cache lock, create one, and then start over.
 865          * If we did have a preallocated entry, then just add it to
 866          * the front of the list.
 867          */
 868         if (ch == NULL) {
 869                 if (newch == NULL) {
 870                         mutex_exit(&nfscl->nfscl_chtable4_lock);
 871                         newch = kmem_alloc(sizeof (*newch), KM_SLEEP);
 872                         newch->ch_timesused = 0;
 873                         newch->ch_prog = ci->cl_prog;
 874                         newch->ch_vers = ci->cl_vers;
 875                         newch->ch_dev = svp->sv_knconf->knc_rdev;
 876                         newch->ch_protofmly = kmem_alloc(
 877                             strlen(svp->sv_knconf->knc_protofmly) + 1,
 878                             KM_SLEEP);
 879                         (void) strcpy(newch->ch_protofmly,
 880                             svp->sv_knconf->knc_protofmly);
 881                         newch->ch_list = NULL;
 882                         goto top;
 883                 }
 884                 ch = newch;
 885                 newch = NULL;
 886                 ch->ch_next = nfscl->nfscl_chtable4;
 887                 nfscl->nfscl_chtable4 = ch;
 888         /*
 889          * We found a cache entry, but if it isn't on the front of the
 890          * list, then move it to the front of the list to try to take
 891          * advantage of locality of operations.
 892          */
 893         } else if (ch != nfscl->nfscl_chtable4) {
 894                 *plistp = ch->ch_next;
 895                 ch->ch_next = nfscl->nfscl_chtable4;
 896                 nfscl->nfscl_chtable4 = ch;
 897         }
 898 
 899         /*
 900          * If there was a free client handle cached, then remove it
 901          * from the list, init it, and use it.
 902          */
 903         if (ch->ch_list != NULL) {
 904                 cp = ch->ch_list;
 905                 ch->ch_list = cp->ch_list;
 906                 mutex_exit(&nfscl->nfscl_chtable4_lock);
 907                 if (newch != NULL) {
 908                         kmem_free(newch->ch_protofmly,
 909                             strlen(newch->ch_protofmly) + 1);
 910                         kmem_free(newch, sizeof (*newch));
 911                 }
 912                 (void) clnt_tli_kinit(cp->ch_client, svp->sv_knconf,
 913                     &svp->sv_addr, ci->cl_readsize, ci->cl_retrans, cr);
 914 
 915                 /*
 916                  * Get an auth handle.
 917                  */
 918                 error = authget(svp, cp->ch_client, cr);
 919                 if (error || cp->ch_client->cl_auth == NULL) {
 920                         CLNT_DESTROY(cp->ch_client);
 921                         kmem_cache_free(chtab4_cache, cp);
 922                         return ((error != 0) ? error : EINTR);
 923                 }
 924                 ch->ch_timesused++;
 925                 *newcl = cp->ch_client;
 926                 *chp = cp;
 927                 return (0);
 928         }
 929 
 930         /*
 931          * There weren't any free client handles which fit, so allocate a
 932          * new one and use that.
 933          */
 934 #ifdef DEBUG
 935         atomic_add_64(&clstat4_debug.clalloc.value.ui64, 1);
 936 #endif
 937         mutex_exit(&nfscl->nfscl_chtable4_lock);
 938 
 939         nfscl->nfscl_stat[NFS4_MINORVERSION(mi)].cltoomany.value.ui64++;
 940         if (newch != NULL) {
 941                 kmem_free(newch->ch_protofmly, strlen(newch->ch_protofmly) + 1);
 942                 kmem_free(newch, sizeof (*newch));
 943         }
 944 
 945         cp = kmem_cache_alloc(chtab4_cache, KM_SLEEP);
 946         cp->ch_head = ch;
 947 
 948         sigintr(&smask, (int)ci->cl_flags & MI4_INT);
 949         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr, ci->cl_prog,
 950             ci->cl_vers, ci->cl_readsize, ci->cl_retrans, cr, &cp->ch_client);
 951         sigunintr(&smask);
 952 
 953         if (error != 0) {
 954                 kmem_cache_free(chtab4_cache, cp);
 955 #ifdef DEBUG
 956         atomic_add_64(&clstat4_debug.clalloc.value.ui64, -1);
 957 #endif
 958                 /*
 959                  * Warning is unnecessary if error is EINTR.
 960                  */
 961                 if (error != EINTR) {
 962                         nfs_cmn_err(error, CE_WARN,
 963                             "clget: couldn't create handle: %m\n");
 964                 }
 965                 return (error);
 966         }
 967         (void) CLNT_CONTROL(cp->ch_client, CLSET_PROGRESS, NULL);
 968         auth_destroy(cp->ch_client->cl_auth);
 969 
 970 
 971 
 972         /*
 973          * Get an auth handle.
 974          */
 975         error = authget(svp, cp->ch_client, cr);
 976         if (error || cp->ch_client->cl_auth == NULL) {
 977                 CLNT_DESTROY(cp->ch_client);
 978                 kmem_cache_free(chtab4_cache, cp);
 979 #ifdef DEBUG
 980         atomic_add_64(&clstat4_debug.clalloc.value.ui64, -1);
 981 #endif
 982                 return ((error != 0) ? error : EINTR);
 983         }
 984         ch->ch_timesused++;
 985         *newcl = cp->ch_client;
 986         ASSERT(cp->ch_client->cl_nosignal == FALSE);
 987         *chp = cp;
 988         return (0);
 989 }
 990 
 991 int
 992 nfs_clget4(mntinfo4_t *mi, servinfo4_t *svp, cred_t *cr, CLIENT **newcl,
 993     struct chtab **chp, struct nfs4_clnt *nfscl)
 994 {
 995         clinfo_t ci;
 996         bool_t is_recov;
 997         int firstcall, error = 0;
 998 
 999         /*
1000          * Set read buffer size to rsize
1001          * and add room for RPC headers.
1002          */
1003         ci.cl_readsize = mi->mi_tsize;
1004         if (ci.cl_readsize != 0)
1005                 ci.cl_readsize += (RPC_MAXDATASIZE - NFS_MAXDATA);
1006 
1007         /*
1008          * If soft mount and server is down just try once.
1009          * meaning: do not retransmit.
1010          */
1011         if (!(mi->mi_flags & MI4_HARD) && (mi->mi_flags & MI4_DOWN))
1012                 ci.cl_retrans = 0;
1013         else
1014                 ci.cl_retrans = mi->mi_retrans;
1015 
1016         ci.cl_prog = mi->mi_prog;
1017         ci.cl_vers = mi->mi_vers;
1018         ci.cl_flags = mi->mi_flags;
1019 
1020         /*
1021          * clget4 calls authget() to get an auth handle. For RPCSEC_GSS
1022          * security flavor, the client tries to establish a security context
1023          * by contacting the server. If the connection is timed out or reset,
1024          * e.g. server reboot, we will try again.
1025          */
1026 
1027         /*
1028          * XXXrecovery:  We've already captured the nfs4_server_t in
1029          * start_op but we don't (yet) push it down through rfs4call()
1030          * and friends.  We need to do that, especially in the case of
1031          * an operation directed to the data server, so that we can
1032          * determine if this thread may be in recovery (non-pNFS, MDS, or DS).
1033          */
1034         is_recov = (curthread == mi->mi_recovthread);
1035         firstcall = 1;
1036 
1037         do {
1038                 error = clget4(&ci, svp, cr, newcl, chp, nfscl, mi);
1039 
1040                 if (error == 0)
1041                         break;
1042 
1043                 /*
1044                  * For forced unmount and zone shutdown, bail out but
1045                  * let the recovery thread do one more transmission.
1046                  */
1047                 if ((FS_OR_ZONE_GONE4(mi->mi_vfsp)) &&
1048                     (!is_recov || !firstcall)) {
1049                         error = EIO;
1050                         break;
1051                 }
1052 
1053                 /* do not retry for soft mount */
1054                 if (!(mi->mi_flags & MI4_HARD))
1055                         break;
1056 
1057                 /* let the caller deal with the failover case */
1058                 if (FAILOVER_MOUNT4(mi))
1059                         break;
1060 
1061                 firstcall = 0;
1062 
1063         } while (error == ETIMEDOUT || error == ECONNRESET);
1064 
1065         return (error);
1066 }
1067 
1068 void
1069 clfree4(CLIENT *cl, struct chtab *cp, struct nfs4_clnt *nfscl)
1070 {
1071         if (cl->cl_auth != NULL) {
1072                 sec_clnt_freeh(cl->cl_auth);
1073                 cl->cl_auth = NULL;
1074         }
1075 
1076         if (!CLNT_CONTROL(cl, CLSET_TAG_CLEAR, (char *)NULL))
1077                 zcmn_err(getzoneid(), CE_WARN,
1078                     "Failed to clear tag on freed client handle");
1079 
1080         if (!(CLNT_CONTROL(cl, CLSET_BACKCHANNEL_CLEAR, NULL))) {
1081                 zcmn_err(getzoneid(), CE_WARN,
1082                     "Unable to clear backchannel on freed client handle %p",
1083                     (void *)cl);
1084         }
1085 
1086         /*
1087          * Timestamp this cache entry so that we know when it was last
1088          * used.
1089          */
1090         cp->ch_freed = gethrestime_sec();
1091 
1092         /*
1093          * Add the free client handle to the front of the list.
1094          * This way, the list will be sorted in youngest to oldest
1095          * order.
1096          */
1097         mutex_enter(&nfscl->nfscl_chtable4_lock);
1098         cp->ch_list = cp->ch_head->ch_list;
1099         cp->ch_head->ch_list = cp;
1100         mutex_exit(&nfscl->nfscl_chtable4_lock);
1101 }
1102 
/*
 * Hold time, in seconds, that clreclaim4() passes to clreclaim4_zone():
 * cached client handles freed longer ago than this are eligible to be
 * destroyed.
 */
#define CL_HOLDTIME     60      /* time to hold client handles */
1104 
1105 static void
1106 clreclaim4_zone(struct nfs4_clnt *nfscl, uint_t cl_holdtime)
1107 {
1108         struct chhead *ch;
1109         struct chtab *cp;       /* list of objects that can be reclaimed */
1110         struct chtab *cpe;
1111         struct chtab *cpl;
1112         struct chtab **cpp;
1113 #ifdef DEBUG
1114         int n = 0;
1115         clstat4_debug.clreclaim.value.ui64++;
1116 #endif
1117 
1118         /*
1119          * Need to reclaim some memory, so step through the cache
1120          * looking through the lists for entries which can be freed.
1121          */
1122         cp = NULL;
1123 
1124         mutex_enter(&nfscl->nfscl_chtable4_lock);
1125 
1126         /*
1127          * Here we step through each non-NULL quadruple and start to
1128          * construct the reclaim list pointed to by cp.  Note that
1129          * cp will contain all eligible chtab entries.  When this traversal
1130          * completes, chtab entries from the last quadruple will be at the
1131          * front of cp and entries from previously inspected quadruples have
1132          * been appended to the rear of cp.
1133          */
1134         for (ch = nfscl->nfscl_chtable4; ch != NULL; ch = ch->ch_next) {
1135                 if (ch->ch_list == NULL)
1136                         continue;
1137                 /*
1138                  * Search each list for entries older then
1139                  * cl_holdtime seconds.  The lists are maintained
1140                  * in youngest to oldest order so that when the
1141                  * first entry is found which is old enough, then
1142                  * all of the rest of the entries on the list will
1143                  * be old enough as well.
1144                  */
1145                 cpl = ch->ch_list;
1146                 cpp = &ch->ch_list;
1147                 while (cpl != NULL &&
1148                     cpl->ch_freed + cl_holdtime > gethrestime_sec()) {
1149                         cpp = &cpl->ch_list;
1150                         cpl = cpl->ch_list;
1151                 }
1152                 if (cpl != NULL) {
1153                         *cpp = NULL;
1154                         if (cp != NULL) {
1155                                 cpe = cpl;
1156                                 while (cpe->ch_list != NULL)
1157                                         cpe = cpe->ch_list;
1158                                 cpe->ch_list = cp;
1159                         }
1160                         cp = cpl;
1161                 }
1162         }
1163 
1164         mutex_exit(&nfscl->nfscl_chtable4_lock);
1165 
1166         /*
1167          * If cp is empty, then there is nothing to reclaim here.
1168          */
1169         if (cp == NULL)
1170                 return;
1171 
1172         /*
1173          * Step through the list of entries to free, destroying each client
1174          * handle and kmem_free'ing the memory for each entry.
1175          */
1176         while (cp != NULL) {
1177 #ifdef DEBUG
1178                 n++;
1179 #endif
1180                 CLNT_DESTROY(cp->ch_client);
1181                 cpl = cp->ch_list;
1182                 kmem_cache_free(chtab4_cache, cp);
1183                 cp = cpl;
1184         }
1185 
1186 #ifdef DEBUG
1187         /*
1188          * Update clalloc so that nfsstat shows the current number of
1189          * allocated client handles.
1190          */
1191         atomic_add_64(&clstat4_debug.clalloc.value.ui64, -n);
1192 #endif
1193 }
1194 
1195 /* ARGSUSED */
1196 static void
1197 clreclaim4(void *all)
1198 {
1199         struct nfs4_clnt *nfscl;
1200 
1201         /*
1202          * The system is low on memory; go through and try to reclaim some from
1203          * every zone on the system.
1204          */
1205         mutex_enter(&nfs4_clnt_list_lock);
1206         nfscl = list_head(&nfs4_clnt_list);
1207         for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl))
1208                 clreclaim4_zone(nfscl, CL_HOLDTIME);
1209         mutex_exit(&nfs4_clnt_list_lock);
1210 }
1211 
/*
 * Minimum time-out values indexed by call type.
 * These units are in "eighths" of a second to avoid multiplies.
 */
static unsigned int minimum_timeo[] = {
        6, 7, 10
};

/*
 * Upper bound applied to mi_timeo (same units as mi_timeo) by
 * nfs4_rfscall() when the calling process's zone is shutting down.
 */
#define SHORTWAIT       (NFS_COTS_TIMEO / 10)

/*
 * Back off for retransmission timeout.  MAXTIMO is expressed in clock
 * ticks: 20 * hz.
 *
 * backoff(tim) doubles tim, clamping the result to MAXTIMO; a value that
 * is already at or above MAXTIMO is returned unchanged.  Both macros
 * evaluate their argument more than once, so pass only side-effect-free
 * expressions.
 */
#define MAXTIMO (20*hz)
#define backoff(tim)    (((tim) < MAXTIMO) ? dobackoff(tim) : (tim))
#define dobackoff(tim)  ((((tim) << 1) > MAXTIMO) ? MAXTIMO : ((tim) << 1))
1228 
1229 static int
1230 nfs4_rfscall(mntinfo4_t *mi, servinfo4_t *svp,
1231     rpcproc_t which, xdrproc_t xdrargs, caddr_t argsp,
1232     xdrproc_t xdrres, caddr_t resp, cred_t *icr, int *doqueue,
1233     enum clnt_stat *rpc_statusp, int flags, struct nfs4_clnt *nfscl)
1234 {
1235         CLIENT *client;
1236         struct chtab *ch;
1237         cred_t *cr = icr;
1238         struct rpc_err rpcerr;
1239         enum clnt_stat status;
1240         int error;
1241         int ctlret;
1242         struct timeval wait;
1243         int timeo;              /* in units of hz */
1244         bool_t tryagain, is_recov;
1245         bool_t cred_cloned = FALSE;
1246         k_sigset_t smask;
1247 #ifdef DEBUG
1248         char *bufp;
1249 #endif
1250         int firstcall;
1251         struct nfs41_cb_info    *cbi;
1252         struct nfs4_server      *np;
1253 
1254         rpcerr.re_status = RPC_SUCCESS;
1255 
1256         /*
1257          * If we know that we are rebooting then let's
1258          * not bother with doing any over the wireness.
1259          */
1260         mutex_enter(&mi->mi_lock);
1261         if (mi->mi_flags & MI4_SHUTDOWN) {
1262                 mutex_exit(&mi->mi_lock);
1263                 return (EIO);
1264         }
1265         mutex_exit(&mi->mi_lock);
1266 
1267         /* For TSOL, use a new cred which has net_mac_aware flag */
1268         if (!cred_cloned && is_system_labeled()) {
1269                 cred_cloned = TRUE;
1270                 cr = crdup(icr);
1271                 (void) setpflags(NET_MAC_AWARE, 1, cr);
1272         }
1273 
1274         /*
1275          * clget() calls clnt_tli_kinit() which clears the xid, so we
1276          * are guaranteed to reprocess the retry as a new request.
1277          */
1278         if (svp == NULL)
1279                 svp = mi->mi_curr_serv;
1280         rpcerr.re_errno = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
1281         if (rpcerr.re_errno != 0)
1282                 return (rpcerr.re_errno);
1283 
1284         if (NFS4_MINORVERSION(mi) == 1) {
1285                 mutex_enter(&nfs4_server_lst_lock);
1286                 np = servinfo4_to_nfs4_server(svp);
1287                 mutex_exit(&nfs4_server_lst_lock);
1288 
1289                 if (np) {
1290                         if (np->s_program != 0 && (flags & RFS4CALL_SETCB)) {
1291                                 cbi = np->zone_globals->nfs4prog2cbinfo
1292                                     [np->s_program-NFS4_CALLBACK];
1293                                 if (cbi != NULL) {
1294                                         CBSERVER_ARGS  cbargs;
1295                                         cbargs.callback = cbi->cb_dispatch;
1296                                         cbargs.prog = cbi->cb_prog;
1297                                         ctlret =
1298                                             CLNT_CONTROL(client,
1299                                             CLSET_CBSERVER_SETUP,
1300                                             (char *)&cbargs);
1301                                         if (ctlret == 0) {
1302                                                 zcmn_err(getzoneid(), CE_WARN,
1303                                                     "Failed to set client"
1304                                                     " handle as callback");
1305                                         }
1306                                 }
1307 
1308                                 if (!np->ssx.bi_rpc) {
1309                                         ctlret = CLNT_CONTROL(client,
1310                                             CLSET_BACKCHANNEL, NULL);
1311                                         if (ctlret == 0) {
1312                                                 zcmn_err(getzoneid(), CE_WARN,
1313                                                     "Failed to set client"
1314                                                     " handle as callback");
1315                                         }
1316                                 }
1317 
1318                                 /*
1319                                  * In case of non birpc, make sure rpc layer
1320                                  * reflects the same -- the below call sets
1321                                  * the RPC flag  non birpc.
1322                                  */
1323                                 if (NFS41_CHECK(mi, nfs41_birpc) == FALSE) {
1324                                         (void) CLNT_CONTROL(client,
1325                                             CLSET_NON_BIRPC, (char *)NULL);
1326                                 }
1327                         }
1328 
1329                         if (!CLNT_CONTROL(client, CLSET_TAG,
1330                             (char *)(np->ssx.sessionid)))
1331                                 zcmn_err(getzoneid(), CE_WARN,
1332                                     "Failed to set tag on client handle");
1333 
1334                         mutex_exit(&np->s_lock);
1335                         nfs4_server_rele(np);
1336                 }
1337         }
1338 
1339         timeo = (mi->mi_timeo * hz) / 10;
1340 
1341         /*
1342          * If hard mounted fs, retry call forever unless hard error
1343          * occurs.
1344          *
1345          * For forced unmount, let the recovery thread through but return
1346          * an error for all others.  This is so that user processes can
1347          * exit quickly.  The recovery thread bails out after one
1348          * transmission so that it can tell if it needs to continue.
1349          *
1350          * For zone shutdown, behave as above to encourage quick
1351          * process exit, but also fail quickly when servers have
1352          * timed out before and reduce the timeouts.
1353          */
1354 
1355         /*
1356          * XXXrecovery:  We've already captured the nfs4_server_t in
1357          * start_op but we don't (yet) push it down through rfs4call()
1358          * and friends.  We need to do that, especially in the case of
1359          * an operation directed to the data server, so that we can
1360          * determine if this thread may be in recovery (non-pNFS, MDS, or DS).
1361          */
1362         is_recov = (curthread == mi->mi_recovthread);
1363         firstcall = 1;
1364         do {
1365                 tryagain = FALSE;
1366 
1367                 NFS4_DEBUG(nfs4_rfscall_debug, (CE_NOTE,
1368                     "nfs4_rfscall: vfs_flag=0x%x, %s",
1369                     mi->mi_vfsp->vfs_flag,
1370                     is_recov ? "recov thread" : "not recov thread"));
1371 
1372                 /*
1373                  * It's possible while we're retrying the admin
1374                  * decided to reboot.
1375                  */
1376                 mutex_enter(&mi->mi_lock);
1377                 if (mi->mi_flags & MI4_SHUTDOWN) {
1378                         mutex_exit(&mi->mi_lock);
1379                         clfree4(client, ch, nfscl);
1380                         if (cred_cloned)
1381                                 crfree(cr);
1382                         return (EIO);
1383                 }
1384                 mutex_exit(&mi->mi_lock);
1385 
1386                 if ((mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) &&
1387                     (!is_recov || !firstcall)) {
1388                         clfree4(client, ch, nfscl);
1389                         if (cred_cloned)
1390                                 crfree(cr);
1391                         return (EIO);
1392                 }
1393 
1394                 if (zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN) {
1395                         mutex_enter(&mi->mi_lock);
1396                         if ((mi->mi_flags & MI4_TIMEDOUT) ||
1397                             !is_recov || !firstcall) {
1398                                 mutex_exit(&mi->mi_lock);
1399                                 clfree4(client, ch, nfscl);
1400                                 if (cred_cloned)
1401                                         crfree(cr);
1402                                 return (EIO);
1403                         }
1404                         mutex_exit(&mi->mi_lock);
1405                         timeo = (MIN(mi->mi_timeo, SHORTWAIT) * hz) / 10;
1406                 }
1407 
1408                 firstcall = 0;
1409                 TICK_TO_TIMEVAL(timeo, &wait);
1410 
1411                 /*
1412                  * Mask out all signals except SIGHUP, SIGINT, SIGQUIT
1413                  * and SIGTERM. (Preserving the existing masks).
1414                  * Mask out SIGINT if mount option nointr is specified.
1415                  */
1416                 sigintr(&smask, (int)mi->mi_flags & MI4_INT);
1417                 if (!(mi->mi_flags & MI4_INT))
1418                         client->cl_nosignal = TRUE;
1419 
1420                 /*
1421                  * If there is a current signal, then don't bother
1422                  * even trying to send out the request because we
1423                  * won't be able to block waiting for the response.
1424                  * Simply assume RPC_INTR and get on with it.
1425                  */
1426                 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING))
1427                         status = RPC_INTR;
1428                 else {
1429                         status = CLNT_CALL(client, which, xdrargs, argsp,
1430                             xdrres, resp, wait);
1431                 }
1432 
1433                 if (!(mi->mi_flags & MI4_INT))
1434                         client->cl_nosignal = FALSE;
1435                 /*
1436                  * restore original signal mask
1437                  */
1438                 sigunintr(&smask);
1439 
1440                 switch (status) {
1441                 case RPC_SUCCESS:
1442                         break;
1443 
1444                 case RPC_INTR:
1445                         /*
1446                          * There is no way to recover from this error,
1447                          * even if mount option nointr is specified.
1448                          * SIGKILL, for example, cannot be blocked.
1449                          */
1450                         rpcerr.re_status = RPC_INTR;
1451                         rpcerr.re_errno = EINTR;
1452                         break;
1453 
1454                 case RPC_CONN_NOT_BOUND:
1455                         rpcerr.re_status = status;
1456                         rpcerr.re_errno = EIO;
1457                         break;
1458 
1459                 case RPC_UDERROR:
1460                         /*
1461                          * If the NFS server is local (vold) and
1462                          * it goes away then we get RPC_UDERROR.
1463                          * This is a retryable error, so we would
1464                          * loop, so check to see if the specific
1465                          * error was ECONNRESET, indicating that
1466                          * target did not exist at all.  If so,
1467                          * return with RPC_PROGUNAVAIL and
1468                          * ECONNRESET to indicate why.
1469                          */
1470                         CLNT_GETERR(client, &rpcerr);
1471                         if (rpcerr.re_errno == ECONNRESET) {
1472                                 rpcerr.re_status = RPC_PROGUNAVAIL;
1473                                 rpcerr.re_errno = ECONNRESET;
1474                                 break;
1475                         }
1476                         /*FALLTHROUGH*/
1477 
1478                 default:                /* probably RPC_TIMEDOUT */
1479 
1480                         if (IS_UNRECOVERABLE_RPC(status))
1481                                 break;
1482 
1483                         /*
1484                          * increment server not responding count
1485                          */
1486                         mutex_enter(&mi->mi_lock);
1487                         mi->mi_noresponse++;
1488                         mutex_exit(&mi->mi_lock);
1489 #ifdef DEBUG
1490                         clstat4_debug.noresponse.value.ui64++;
1491 #endif
1492                         /*
1493                          * On zone shutdown, mark server dead and move on.
1494                          */
1495                         if (zone_status_get(curproc->p_zone) >=
1496                             ZONE_IS_SHUTTING_DOWN) {
1497                                 mutex_enter(&mi->mi_lock);
1498                                 mi->mi_flags |= MI4_TIMEDOUT;
1499                                 mutex_exit(&mi->mi_lock);
1500                                 clfree4(client, ch, nfscl);
1501                                 if (cred_cloned)
1502                                         crfree(cr);
1503                                 return (EIO);
1504                         }
1505 
1506                         /*
1507                          * NFS client failover support:
1508                          * return and let the caller take care of
1509                          * failover.  We only return for failover mounts
1510                          * because otherwise we want the "not responding"
1511                          * message, the timer updates, etc.
1512                          */
1513                         if (mi->mi_vers == 4 && FAILOVER_MOUNT4(mi) &&
1514                             (error = try_failover(status)) != 0) {
1515                                 clfree4(client, ch, nfscl);
1516                                 if (cred_cloned)
1517                                         crfree(cr);
1518                                 *rpc_statusp = status;
1519                                 return (error);
1520                         }
1521 
1522                         if (flags & RFSCALL_SOFT)
1523                                 break;
1524 
1525                         tryagain = TRUE;
1526 
1527                         /*
1528                          * The call is in progress (over COTS).
1529                          * Try the CLNT_CALL again, but don't
1530                          * print a noisy error message.
1531                          */
1532                         if (status == RPC_INPROGRESS)
1533                                 break;
1534 
1535                         timeo = backoff(timeo);
1536                         mutex_enter(&mi->mi_lock);
1537                         if (!(mi->mi_flags & MI4_PRINTED)) {
1538                                 mi->mi_flags |= MI4_PRINTED;
1539                                 mutex_exit(&mi->mi_lock);
1540                                 nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi, 0, 0, 0,
1541                                     FALSE, NULL, 0, NULL);
1542                         } else
1543                                 mutex_exit(&mi->mi_lock);
1544 
1545                         if (*doqueue && nfs_has_ctty()) {
1546                                 *doqueue = 0;
1547                                 if (!(mi->mi_flags & MI4_NOPRINT))
1548                                         nfs4_queue_fact(RF_SRV_NOT_RESPOND, mi,
1549                                             0, 0, 0, FALSE, NULL, 0, NULL);
1550                         }
1551                 }
1552         } while (tryagain);
1553 
1554         DTRACE_PROBE2(nfs4__rfscall_debug, enum clnt_stat, status,
1555             int, rpcerr.re_errno);
1556 
1557         if (status != RPC_SUCCESS) {
1558                 zoneid_t zoneid = mi->mi_zone->zone_id;
1559 
1560                 /*
1561                  * Let soft mounts use the timed out message.
1562                  */
1563                 if (status == RPC_INPROGRESS)
1564                         status = RPC_TIMEDOUT;
1565                 nfscl->nfscl_stat[NFS4_MINORVERSION(mi)].badcalls.value.ui64++;
1566                 if (status != RPC_INTR) {
1567                         mutex_enter(&mi->mi_lock);
1568                         mi->mi_flags |= MI4_DOWN;
1569                         mutex_exit(&mi->mi_lock);
1570                         CLNT_GETERR(client, &rpcerr);
1571 #ifdef DEBUG
1572                         bufp = clnt_sperror(client, svp->sv_hostname);
1573                         zprintf(zoneid, "NFS%d %s failed for %s\n",
1574                             mi->mi_vers, mi->mi_rfsnames[which], bufp);
1575                         if (nfs_has_ctty()) {
1576                                 if (!(mi->mi_flags & MI4_NOPRINT)) {
1577                                         uprintf("NFS%d %s failed for %s\n",
1578                                             mi->mi_vers, mi->mi_rfsnames[which],
1579                                             bufp);
1580                                 }
1581                         }
1582                         kmem_free(bufp, MAXPATHLEN);
1583 #else
1584                         zprintf(zoneid,
1585                             "NFS %s failed for server %s: error %d (%s)\n",
1586                             mi->mi_rfsnames[which], svp->sv_hostname,
1587                             status, clnt_sperrno(status));
1588                         if (nfs_has_ctty()) {
1589                                 if (!(mi->mi_flags & MI4_NOPRINT)) {
1590                                         uprintf(
1591                                 "NFS %s failed for server %s: error %d (%s)\n",
1592                                             mi->mi_rfsnames[which],
1593                                             svp->sv_hostname, status,
1594                                             clnt_sperrno(status));
1595                                 }
1596                         }
1597 #endif
1598                         /*
1599                          * when CLNT_CALL() fails with RPC_AUTHERROR,
1600                          * re_errno is set appropriately depending on
1601                          * the authentication error
1602                          */
1603                         if (status == RPC_VERSMISMATCH ||
1604                             status == RPC_PROGVERSMISMATCH)
1605                                 rpcerr.re_errno = EIO;
1606                 }
1607         } else {
1608                 /*
1609                  * Test the value of mi_down and mi_printed without
1610                  * holding the mi_lock mutex.  If they are both zero,
1611                  * then it is okay to skip the down and printed
1612                  * processing.  This saves on a mutex_enter and
1613                  * mutex_exit pair for a normal, successful RPC.
1614                  * This was just complete overhead.
1615                  */
1616                 if (mi->mi_flags & (MI4_DOWN | MI4_PRINTED)) {
1617                         mutex_enter(&mi->mi_lock);
1618                         mi->mi_flags &= ~MI4_DOWN;
1619                         if (mi->mi_flags & MI4_PRINTED) {
1620                                 mi->mi_flags &= ~MI4_PRINTED;
1621                                 mutex_exit(&mi->mi_lock);
1622                                 if (!(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1623                                         nfs4_queue_fact(RF_SRV_OK, mi, 0, 0,
1624                                             0, FALSE, NULL, 0, NULL);
1625                         } else
1626                                 mutex_exit(&mi->mi_lock);
1627                 }
1628 
1629                 if (*doqueue == 0) {
1630                         if (!(mi->mi_flags & MI4_NOPRINT) &&
1631                             !(mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED))
1632                                 nfs4_queue_fact(RF_SRV_OK, mi, 0, 0, 0,
1633                                     FALSE, NULL, 0, NULL);
1634 
1635                         *doqueue = 1;
1636                 }
1637         }
1638 
1639         clfree4(client, ch, nfscl);
1640         if (cred_cloned)
1641                 crfree(cr);
1642 
1643         ASSERT(rpcerr.re_status == RPC_SUCCESS || rpcerr.re_errno != 0);
1644 
1645         TRACE_1(TR_FAC_NFS, TR_RFSCALL_END, "nfs4_rfscall_end:errno %d",
1646             rpcerr.re_errno);
1647 
1648         *rpc_statusp = status;
1649         return (rpcerr.re_errno);
1650 }
1651 
1652 /*
1653  * rfs4call - general wrapper for RPC calls initiated by the client
1654  * KLR-make this a nosequence rfs4call which will not add a sequence op
1655  * XXXrsb - External callers now user rfs4call() with RFS4CALL_NOSEQ.
1656  */
1657 static void
1658 rfs4call_nosequence(mntinfo4_t *mi, servinfo4_t *svp, COMPOUND4args_clnt *argsp,
1659     COMPOUND4res_clnt *resp, cred_t *cr, int *doqueue, int flags,
1660     nfs4_error_t *ep)
1661 {
1662         int i, error;
1663         enum clnt_stat rpc_status = NFS4_OK;
1664         int num_resops;
1665         struct nfs4_clnt *nfscl;
1666 
1667         ASSERT(nfs_zone() == mi->mi_zone);
1668         nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
1669         ASSERT(nfscl != NULL);
1670         /*
1671          * Note that the first call will be accounted for the default
1672          * minor version, even if there are no mounts for that minor
1673          * version. The call may result in a minor vesion mismatch and
1674          * subsequent calls will get accounted correctly. It makes sense
1675          * to account the first call for the default minor version,
1676          * because the client thought that this call is for that minor
1677          * version. Same goes for the compound procedure as well.
1678          */
1679         nfscl->nfscl_stat[NFS4_MINORVERSION(mi)].calls.value.ui64++;
1680         mi->mi_reqs[NFSPROC4_COMPOUND].value.ui64++;
1681 
1682                 /* XXX - Set up minorversion */
1683         argsp->minor_vers = NFS4_MINORVERSION(mi);
1684 
1685         /* Set up the results struct for XDR usage */
1686         resp->argsp = argsp;
1687         resp->array = NULL;
1688         resp->status = 0;
1689         resp->decode_len = 0;
1690 
1691         error = nfs4_rfscall(mi, svp, NFSPROC4_COMPOUND,
1692             xdr_COMPOUND4args_clnt, (caddr_t)argsp,
1693             xdr_COMPOUND4res_clnt, (caddr_t)resp, cr,
1694             doqueue, &rpc_status, flags, nfscl);
1695 
1696         /*
1697          * Map the connection not bound rpc error to nfs
1698          * error. Currently with no connection binding enforcement
1699          * by the client, we won't hit this. With connection binding
1700          * enforcement in the future (with SSV), the below method is
1701          * needed to drive a bind_conn_to_session after a connection
1702          * loss by the client (See section - 2.10.10.1.4 of the draft)
1703          */
1704         if (error && rpc_status == RPC_CONN_NOT_BOUND) {
1705                 ep->error = 0;
1706                 ep->rpc_status = 0;
1707                 ep->stat = NFS4ERR_CONN_NOT_BOUND_TO_SESSION;
1708                 return;
1709         }
1710 
1711         /* Return now if it was any other RPC error */
1712         if (error) {
1713                 ep->error = error;
1714                 ep->stat = resp->status;
1715                 ep->rpc_status = rpc_status;
1716                 return;
1717         }
1718         /*
1719          * else we'll count the processed operations. Note that we will
1720          * NOT enter here in case of NFS4ERR_MINOR_VERS_MISMATCH.
1721          */
1722         num_resops = resp->decode_len;
1723         for (i = 0; i < num_resops; i++) {
1724                 /*
1725                  * Count the individual operations
1726                  * processed by the server.
1727                  */
1728                 if (NFS4_MINORVERSION(mi) == NFS4_MINOR_v1) {
1729                         if (resp->array[i].resop >= NFSPROC4_NULL &&
1730                             resp->array[i].resop <= OP_RECLAIM_COMPLETE) {
1731                                 mi->mi_reqs[resp->array[i].resop].value.ui64++;
1732                         }
1733                 } else if (NFS4_MINORVERSION(mi) == NFS4_MINOR_v0) {
1734                         if (resp->array[i].resop >= NFSPROC4_NULL &&
1735                             resp->array[i].resop <= OP_RELEASE_LOCKOWNER) {
1736                                 mi->mi_reqs[resp->array[i].resop].value.ui64++;
1737                         }
1738                 }
1739         }
1740 
1741         ep->error = 0;
1742         ep->stat = resp->status;
1743         ep->rpc_status = rpc_status;
1744 }
1745 
1746 void
1747 rfs41_call(mntinfo4_t *mi, servinfo4_t *svp, COMPOUND4args_clnt *argsp,
1748         COMPOUND4res_clnt *resp, cred_t *cr, int *doqueue, int flags,
1749         nfs4_error_t *ep)
1750 {
1751         nfs4_slot_t             *slot;
1752         SEQUENCE4res            *seqres;
1753         struct nfs4_server      *np;
1754         COMPOUND4args_clnt      rfs_args, *rfsargp;
1755         COMPOUND4res_clnt       rfs_res, *rfsresp;
1756         int                     add_seq = 0;
1757 
1758         /*
1759          * XXXrsb - The following code is likely to change
1760          * For now, we have a pointer from the servinfo4 to the nfs4_server
1761          * If we have a servinfo4 and the pointer is valid, then use it.
1762          * One note, we may have to deal with the "np == NULL" case.
1763          */
1764         if (svp && svp->sv_ds_n4sp) {
1765                 np = svp->sv_ds_n4sp;
1766                 nfs4_server_hold(np);
1767         } else {
1768                 np = find_nfs4_server(mi);
1769                 ASSERT(np != NULL);
1770                 mutex_exit(&np->s_lock);
1771         }
1772 
1773 
1774         /*
1775          * Allocate another args array so we can insert
1776          * a SEQUENCE Op as the first operation, copy already
1777          * built args into it also.
1778          */
1779         if (argsp->array->argop != OP_SEQUENCE) {
1780                 rfs_args.ctag = argsp->ctag;
1781                 rfs_args.array_len = argsp->array_len + 1;
1782                 rfs_args.array = kmem_zalloc(sizeof (nfs_argop4) *
1783                     rfs_args.array_len, KM_SLEEP);
1784 
1785                 bcopy(argsp->array, rfs_args.array + 1,
1786                     sizeof (nfs_argop4) * argsp->array_len);
1787 
1788                 ASSERT(argsp->array_len >= 1);
1789                 rfs_args.array->argop = OP_SEQUENCE;
1790                 rfsargp = &rfs_args;
1791                 rfsresp = &rfs_res;
1792                 add_seq = 1;
1793         } else {
1794                 rfsargp = argsp;
1795                 rfsresp = resp;
1796         }
1797 
1798         /* Set up the sequence OP */
1799 
1800         nfs4sequence_setup(&np->ssx, rfsargp, &slot);
1801 
1802         /*
1803          * Send it using rfs4call_nosequence()
1804          * XXXrsb - this will likely be refactored with the rest of
1805          * the rfs4call() family
1806          */
1807         rfs4call_nosequence(mi, svp, rfsargp, rfsresp, cr, doqueue, flags, ep);
1808 
1809 #if     0
1810         zcmn_err(mi->mi_zone->zone_id, CE_WARN,
1811             "Tag: %x SEQUENCE slot: %x seq: %x estatus: %x nstatus: %x",
1812             rfsargp->ctag,
1813             rfsargp->array->nfs_argop4_u.opsequence.sa_slotid,
1814             rfsargp->array->nfs_argop4_u.opsequence.sa_sequenceid,
1815             ep->error,
1816             rfsresp->array != NULL ?
1817             rfsresp->array->nfs_resop4_u.opsequence.status : 0);
1818 
1819         if (ep->error || ep->stat || ep->rpc_status)
1820                 cmn_err(CE_WARN, "rfs4call failed: %d, %d, %d",
1821                     ep->error, ep->stat, ep->rpc_status);
1822 #endif
1823 
1824         nfs4sequence_fin(&np->ssx, rfsresp, slot, ep);
1825 
1826         /*
1827          * If the OTW call failed completely, or if the
1828          * results array is NULL, just get out
1829          */
1830         if (ep->error || (ep->stat && rfsresp->array == NULL)) {
1831 
1832                 if (ep->error == 0) {
1833                         ep->error = geterrno4(ep->stat);
1834                 }
1835 
1836                 if (add_seq)
1837                         kmem_free(rfs_args.array,
1838                             sizeof (nfs_argop4) * rfs_args.array_len);
1839 
1840                 nfs4_server_rele(np);
1841                 return;
1842         }
1843 
1844         /*
1845          * Check the results of the sequence op.  If it failed and we
1846          * added it for the caller, then we don't have any results
1847          * to return.
1848          */
1849         seqres = &rfsresp->array->nfs_resop4_u.opsequence;
1850         if (seqres->sr_status != NFS4_OK) {
1851 
1852                 cmn_err(CE_WARN, "rfs4call: sequence OP failed %d",
1853                     seqres->sr_status);
1854 
1855                 if (add_seq) {
1856                         kmem_free(rfs_args.array,
1857                             sizeof (nfs_argop4) * rfs_args.array_len);
1858                         resp->status = seqres->sr_status;
1859                         resp->array_len = resp->decode_len = 0;
1860                         resp->array = NULL;
1861                 }
1862                 /* XXX - xdr_free? free cpy */
1863                 nfs4_server_rele(np);
1864                 return;
1865         }
1866 
1867         /*
1868          * Update lease time if we have state since SEQUENCE op was successful
1869          */
1870         mutex_enter(&np->s_lock);
1871         if (np->lease_valid == NFS4_LEASE_VALID && np->state_ref_count)
1872                 np->last_renewal_time = gethrestime_sec();
1873         mutex_exit(&np->s_lock);
1874 
1875         /*
1876          * We some results of interest to the, so
1877          * Allocate an additional response array which doesn't have
1878          * SEQUENCE op results, copy results to it if not just a
1879          * SEQUENCE op for lease renewal.
1880          */
1881         if (add_seq) {
1882                 resp->status = rfsresp->status;
1883                 resp->array_len =
1884                     rfsresp->array_len == 0 ? 0 :rfsresp->array_len - 1;
1885                 resp->decode_len = rfsresp->decode_len == 0 ? 0 :
1886                     rfsresp->decode_len - 1;
1887                 resp->argsp = argsp;
1888                 if (resp->array_len > 0) {
1889                         ASSERT(rfs_res.array != NULL);
1890                         resp->array =
1891                             kmem_alloc(sizeof (nfs_resop4) *
1892                             resp->array_len, KM_SLEEP);
1893                         bcopy(rfsresp->array + 1, resp->array,
1894                             sizeof (nfs_resop4) * resp->array_len);
1895                 } else {
1896                         resp->array = NULL;
1897                 }
1898                 kmem_free(rfs_args.array,
1899                     sizeof (nfs_argop4) * rfs_args.array_len);
1900                 kmem_free(rfs_res.array,
1901                     sizeof (nfs_resop4) * rfs_res.array_len);
1902         }
1903         nfs4_server_rele(np);
1904 }
1905 
1906 void
1907 rfs4call(mntinfo4_t *mi, servinfo4_t *svp, COMPOUND4args_clnt *argsp,
1908         COMPOUND4res_clnt *resp, cred_t *cr, int *doqueue, int flags,
1909         nfs4_error_t *ep)
1910 {
1911         if (NFS4_MINORVERSION(mi) == 0 || (flags & RFS4CALL_NOSEQ)) {
1912                 rfs4call_nosequence(mi, svp, argsp, resp, cr, doqueue,
1913                     flags, ep);
1914                 return;
1915         }
1916         rfs41_call(mi, svp, argsp, resp, cr, doqueue, flags, ep);
1917 }
1918 
1919 /*
1920  * nfs4rename_update - updates stored state after a rename.  Currently this
1921  * is the path of the object and anything under it, and the filehandle of
1922  * the renamed object.
1923  */
1924 void
1925 nfs4rename_update(vnode_t *renvp, vnode_t *ndvp, nfs_fh4 *nfh4p, char *nnm)
1926 {
1927         sfh4_update(VTOR4(renvp)->r_fh, nfh4p);
1928         fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, nnm);
1929 }
1930 
1931 /*
1932  * Routine to look up the filehandle for the given path and rootvp.
1933  *
1934  * Return values:
1935  * - success: returns zero and *statp is set to NFS4_OK, and *fhp is
1936  *   updated.
1937  * - error: return value (errno value) and/or *statp is set appropriately.
1938  */
1939 #define RML_ORDINARY    1
1940 #define RML_NAMED_ATTR  2
1941 #define RML_ATTRDIR     3
1942 
1943 static void
1944 remap_lookup(nfs4_fname_t *fname, vnode_t *rootvp,
1945     int filetype, cred_t *cr,
1946     nfs_fh4 *fhp, nfs4_ga_res_t *garp,          /* fh, attrs for object */
1947     nfs_fh4 *pfhp, nfs4_ga_res_t *pgarp,        /* fh, attrs for parent */
1948     nfs4_error_t *ep)
1949 {
1950         COMPOUND4args_clnt args;
1951         COMPOUND4res_clnt res;
1952         nfs_argop4 *argop;
1953         nfs_resop4 *resop;
1954         int num_argops;
1955         lookup4_param_t lookuparg;
1956         nfs_fh4 *tmpfhp;
1957         int doqueue = 1;
1958         char *path;
1959         mntinfo4_t *mi;
1960 
1961         ASSERT(fname != NULL);
1962         ASSERT(rootvp->v_type == VDIR);
1963 
1964         mi = VTOMI4(rootvp);
1965         path = fn_path(fname);
1966         switch (filetype) {
1967         case RML_NAMED_ATTR:
1968                 lookuparg.l4_getattrs = LKP4_LAST_NAMED_ATTR;
1969                 args.ctag = TAG_REMAP_LOOKUP_NA;
1970                 break;
1971         case RML_ATTRDIR:
1972                 lookuparg.l4_getattrs = LKP4_LAST_ATTRDIR;
1973                 args.ctag = TAG_REMAP_LOOKUP_AD;
1974                 break;
1975         case RML_ORDINARY:
1976                 lookuparg.l4_getattrs = LKP4_ALL_ATTRIBUTES;
1977                 args.ctag = TAG_REMAP_LOOKUP;
1978                 break;
1979         default:
1980                 ep->error = EINVAL;
1981                 return;
1982         }
1983         lookuparg.argsp = &args;
1984         lookuparg.resp = &res;
1985         lookuparg.header_len = 1;       /* Putfh */
1986         lookuparg.trailer_len = 0;
1987         lookuparg.ga_bits = MI4_DEFAULT_ATTRMAP(mi);
1988         lookuparg.mi = VTOMI4(rootvp);
1989 
1990         (void) nfs4lookup_setup(path, &lookuparg, 1);
1991 
1992         /* 0: putfh directory */
1993         argop = args.array;
1994         argop[0].argop = OP_CPUTFH;
1995         argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(rootvp)->r_fh;
1996 
1997         num_argops = args.array_len;
1998 
1999         rfs4call(mi, NULL, &args, &res, cr, &doqueue, RFSCALL_SOFT, ep);
2000 
2001         if (ep->error || res.status != NFS4_OK)
2002                 goto exit;
2003 
2004         /* get the object filehandle */
2005         resop = &res.array[res.array_len - 2];
2006         if (resop->resop != OP_GETFH) {
2007                 nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
2008                     0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
2009                 ep->stat = NFS4ERR_SERVERFAULT;
2010                 goto exit;
2011         }
2012         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
2013         if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
2014                 nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
2015                     tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
2016                     TAG_NONE, 0, 0);
2017                 ep->stat = NFS4ERR_SERVERFAULT;
2018                 goto exit;
2019         }
2020         fhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
2021         nfs_fh4_copy(tmpfhp, fhp);
2022 
2023         /* get the object attributes */
2024         resop = &res.array[res.array_len - 1];
2025         if (garp && resop->resop == OP_GETATTR)
2026                 *garp = resop->nfs_resop4_u.opgetattr.ga_res;
2027 
2028         /* See if there are enough fields in the response for parent info */
2029         if ((int)res.array_len - 5 <= 0)
2030                 goto exit;
2031 
2032         /* get the parent filehandle */
2033         resop = &res.array[res.array_len - 5];
2034         if (resop->resop != OP_GETFH) {
2035                 nfs4_queue_event(RE_FAIL_REMAP_OP, mi, NULL,
2036                     0, NULL, NULL, 0, NULL, 0, TAG_NONE, TAG_NONE, 0, 0);
2037                 ep->stat = NFS4ERR_SERVERFAULT;
2038                 goto exit;
2039         }
2040         tmpfhp = &resop->nfs_resop4_u.opgetfh.object;
2041         if (tmpfhp->nfs_fh4_len > NFS4_FHSIZE) {
2042                 nfs4_queue_event(RE_FAIL_REMAP_LEN, mi, NULL,
2043                     tmpfhp->nfs_fh4_len, NULL, NULL, 0, NULL, 0, TAG_NONE,
2044                     TAG_NONE, 0, 0);
2045                 ep->stat = NFS4ERR_SERVERFAULT;
2046                 goto exit;
2047         }
2048         pfhp->nfs_fh4_val = kmem_alloc(tmpfhp->nfs_fh4_len, KM_SLEEP);
2049         nfs_fh4_copy(tmpfhp, pfhp);
2050 
2051         /* get the parent attributes */
2052         resop = &res.array[res.array_len - 4];
2053         if (pgarp && resop->resop == OP_GETATTR)
2054                 *pgarp = resop->nfs_resop4_u.opgetattr.ga_res;
2055 
2056 exit:
2057         /*
2058          * It is too hard to remember where all the OP_LOOKUPs are
2059          */
2060         nfs4args_lookup_free(argop, num_argops);
2061         kmem_free(argop, lookuparg.arglen * sizeof (nfs_argop4));
2062 
2063         if (!ep->error)
2064                 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2065         kmem_free(path, strlen(path)+1);
2066 }
2067 
2068 /*
2069  * NFS client failover / volatile filehandle support
2070  *
2071  * Recover the filehandle for the given rnode.
2072  *
2073  * Errors are returned via the nfs4_error_t parameter.
2074  */
2075 
2076 void
2077 nfs4_remap_file(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2078 {
2079         int is_stub;
2080         rnode4_t *rp = VTOR4(vp);
2081         vnode_t *rootvp = NULL;
2082         vnode_t *dvp = NULL;
2083         cred_t *cr, *cred_otw;
2084         nfs4_ga_res_t gar, pgar;
2085         nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2086         int filetype = RML_ORDINARY;
2087         nfs4_recov_state_t recov = {NULL, 0, 0};
2088         int badfhcount = 0;
2089         nfs4_open_stream_t *osp = NULL;
2090         bool_t first_time = TRUE;       /* first time getting OTW cred */
2091         bool_t last_time = FALSE;       /* last time getting OTW cred */
2092 
2093         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2094             "nfs4_remap_file: remapping %s", rnode4info(rp)));
2095         ASSERT(nfs4_consistent_type(vp));
2096 
2097         if (vp->v_flag & VROOT) {
2098                 nfs4_remap_root(mi, ep, flags);
2099                 return;
2100         }
2101 
2102         /*
2103          * Given the root fh, use the path stored in
2104          * the rnode to find the fh for the new server.
2105          */
2106         ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2107         if (ep->error != 0)
2108                 return;
2109 
2110         cr = curthread->t_cred;
2111         ASSERT(cr != NULL);
2112 get_remap_cred:
2113         /*
2114          * Releases the osp, if it is provided.
2115          * Puts a hold on the cred_otw and the new osp (if found).
2116          */
2117         cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
2118             &first_time, &last_time);
2119         ASSERT(cred_otw != NULL);
2120 
2121         if (rp->r_flags & R4ISXATTR) {
2122                 filetype = RML_NAMED_ATTR;
2123                 (void) vtodv(vp, &dvp, cred_otw, FALSE);
2124         }
2125 
2126         if (vp->v_flag & V_XATTRDIR) {
2127                 filetype = RML_ATTRDIR;
2128         }
2129 
2130         if (filetype == RML_ORDINARY && rootvp->v_type == VREG) {
2131                 /* file mount, doesn't need a remap */
2132                 goto done;
2133         }
2134 
2135 again:
2136         remap_lookup(rp->r_svnode.sv_name, rootvp, filetype, cred_otw,
2137             &newfh, &gar, &newpfh, &pgar, ep);
2138 
2139         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2140             "nfs4_remap_file: remap_lookup returned %d/%d",
2141             ep->error, ep->stat));
2142 
2143         if (last_time == FALSE && ep->error == EACCES) {
2144                 crfree(cred_otw);
2145                 if (dvp != NULL)
2146                         VN_RELE(dvp);
2147                 goto get_remap_cred;
2148         }
2149         if (ep->error != 0)
2150                 goto done;
2151 
2152         switch (ep->stat) {
2153         case NFS4_OK:
2154                 badfhcount = 0;
2155                 if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
2156                         mutex_enter(&rp->r_statelock);
2157                         rp->r_delay_interval = 0;
2158                         mutex_exit(&rp->r_statelock);
2159                         uprintf("NFS File Available..\n");
2160                 }
2161                 break;
2162         case NFS4ERR_FHEXPIRED:
2163         case NFS4ERR_BADHANDLE:
2164                 /*
2165                  * If we ran into filehandle problems, we should try to
2166                  * remap the root vnode first and hope life gets better.
2167                  * But we need to avoid loops.
2168                  */
2169                 if (badfhcount++ > 0)
2170                         goto done;
2171                 if (newfh.nfs_fh4_len != 0) {
2172                         kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2173                         newfh.nfs_fh4_len = 0;
2174                 }
2175                 if (newpfh.nfs_fh4_len != 0) {
2176                         kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2177                         newpfh.nfs_fh4_len = 0;
2178                 }
2179                 /* relative path - remap rootvp then retry */
2180                 VN_RELE(rootvp);
2181                 rootvp = NULL;
2182                 nfs4_remap_root(mi, ep, flags);
2183                 if (ep->error != 0 || ep->stat != NFS4_OK)
2184                         goto done;
2185                 ep->error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2186                 if (ep->error != 0)
2187                         goto done;
2188                 goto again;
2189         case NFS4ERR_DELAY:
2190                 badfhcount = 0;
2191                 nfs4_set_delay_wait(vp);
2192                 ep->error = nfs4_wait_for_delay(vp, &recov, 0);
2193                 if (ep->error != 0)
2194                         goto done;
2195                 goto again;
2196         case NFS4ERR_ACCESS:
2197                 /* get new cred, try again */
2198                 if (last_time == TRUE)
2199                         goto done;
2200                 if (dvp != NULL)
2201                         VN_RELE(dvp);
2202                 crfree(cred_otw);
2203                 goto get_remap_cred;
2204         default:
2205                 goto done;
2206         }
2207 
2208         /*
2209          * Check on the new and old rnodes before updating;
2210          * if the vnode type or size changes, issue a warning
2211          * and mark the file dead.
2212          */
2213         mutex_enter(&rp->r_statelock);
2214         if (flags & NFS4_REMAP_CKATTRS) {
2215                 if (vp->v_type != gar.n4g_va.va_type ||
2216                     (vp->v_type != VDIR &&
2217                     rp->r_size != gar.n4g_va.va_size)) {
2218                         NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2219                             "nfs4_remap_file: size %d vs. %d, type %d vs. %d",
2220                             (int)rp->r_size, (int)gar.n4g_va.va_size,
2221                             vp->v_type, gar.n4g_va.va_type));
2222                         mutex_exit(&rp->r_statelock);
2223                         nfs4_queue_event(RE_FILE_DIFF, mi,
2224                             rp->r_server->sv_hostname, 0, vp, NULL, 0, NULL, 0,
2225                             TAG_NONE, TAG_NONE, 0, 0);
2226                         nfs4_fail_recov(vp, NULL, 0, NFS4_OK);
2227                         goto done;
2228                 }
2229         }
2230         ASSERT(gar.n4g_va.va_type != VNON);
2231         rp->r_server = mi->mi_curr_serv;
2232 
2233         /*
2234          * Turn this object into a "stub" object if we
2235          * crossed an underlying server fs boundary.
2236          *
2237          * This stub will be for a mirror-mount.
2238          *
2239          * See comment in r4_do_attrcache() for more details.
2240          */
2241         is_stub = 0;
2242         if (gar.n4g_fsid_valid) {
2243                 (void) nfs_rw_enter_sig(&rp->r_server->sv_lock, RW_READER, 0);
2244                 rp->r_srv_fsid = gar.n4g_fsid;
2245                 if (!FATTR4_FSID_EQ(&gar.n4g_fsid, &rp->r_server->sv_fsid))
2246                         is_stub = 1;
2247                 nfs_rw_exit(&rp->r_server->sv_lock);
2248 #ifdef DEBUG
2249         } else {
2250                 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2251                     "remap_file: fsid attr not provided by server.  rp=%p",
2252                     (void *)rp));
2253 #endif
2254         }
2255         if (is_stub)
2256                 r4_stub_mirrormount(rp);
2257         else
2258                 r4_stub_none(rp);
2259         mutex_exit(&rp->r_statelock);
2260         nfs4_attrcache_noinval(vp, &gar, gethrtime()); /* force update */
2261         sfh4_update(rp->r_fh, &newfh);
2262         ASSERT(nfs4_consistent_type(vp));
2263 
2264         /*
2265          * If we got parent info, use it to update the parent
2266          */
2267         if (newpfh.nfs_fh4_len != 0) {
2268                 if (rp->r_svnode.sv_dfh != NULL)
2269                         sfh4_update(rp->r_svnode.sv_dfh, &newpfh);
2270                 if (dvp != NULL) {
2271                         /* force update of attrs */
2272                         nfs4_attrcache_noinval(dvp, &pgar, gethrtime());
2273                 }
2274         }
2275 done:
2276         if (newfh.nfs_fh4_len != 0)
2277                 kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2278         if (newpfh.nfs_fh4_len != 0)
2279                 kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2280         if (cred_otw != NULL)
2281                 crfree(cred_otw);
2282         if (rootvp != NULL)
2283                 VN_RELE(rootvp);
2284         if (dvp != NULL)
2285                 VN_RELE(dvp);
2286         if (osp != NULL)
2287                 open_stream_rele(osp, rp);
2288 }
2289 
2290 /*
2291  * Client-side failover support: remap the filehandle for vp if it appears
2292  * necessary.  errors are returned via the nfs4_error_t parameter; though,
2293  * if there is a problem, we will just try again later.
2294  */
2295 
2296 void
2297 nfs4_check_remap(mntinfo4_t *mi, vnode_t *vp, int flags, nfs4_error_t *ep)
2298 {
2299         if (vp == NULL)
2300                 return;
2301 
2302         if (!(vp->v_vfsp->vfs_flag & VFS_RDONLY))
2303                 return;
2304 
2305         if (VTOR4(vp)->r_server == mi->mi_curr_serv)
2306                 return;
2307 
2308         nfs4_remap_file(mi, vp, flags, ep);
2309 }
2310 
2311 /*
2312  * nfs4_make_dotdot() - find or create a parent vnode of a non-root node.
2313  *
2314  * Our caller has a filehandle for ".." relative to a particular
2315  * directory object.  We want to find or create a parent vnode
2316  * with that filehandle and return it.  We can of course create
2317  * a vnode from this filehandle, but we need to also make sure
2318  * that if ".." is a regular file (i.e. dvp is a V_XATTRDIR)
2319  * that we have a parent FH for future reopens as well.  If
2320  * we have a remap failure, we won't be able to reopen this
2321  * file, but we won't treat that as fatal because a reopen
2322  * is at least unlikely.  Someday nfs4_reopen() should look
2323  * for a missing parent FH and try a remap to recover from it.
2324  *
2325  * need_start_op argument indicates whether this function should
2326  * do a start_op before calling remap_lookup().  This should
2327  * be FALSE, if you are the recovery thread or in an op; otherwise,
2328  * set it to TRUE.
2329  */
2330 int
2331 nfs4_make_dotdot(nfs4_sharedfh_t *fhp, hrtime_t t, vnode_t *dvp,
2332     cred_t *cr, vnode_t **vpp, int need_start_op)
2333 {
2334         mntinfo4_t *mi = VTOMI4(dvp);
2335         nfs4_fname_t *np = NULL, *pnp = NULL;
2336         vnode_t *vp = NULL, *rootvp = NULL;
2337         rnode4_t *rp;
2338         nfs_fh4 newfh = {0, NULL}, newpfh = {0, NULL};
2339         nfs4_ga_res_t gar, pgar;
2340         vattr_t va, pva;
2341         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2342         nfs4_sharedfh_t *sfh = NULL, *psfh = NULL;
2343         nfs4_recov_state_t recov_state;
2344 
2345 #ifdef DEBUG
2346         /*
2347          * ensure need_start_op is correct
2348          */
2349         {
2350                 int no_need_start_op = (tsd_get(nfs4_tsd_key) ||
2351                     (curthread == mi->mi_recovthread));
2352                 /* C needs a ^^ operator! */
2353                 ASSERT(((need_start_op) && (!no_need_start_op)) ||
2354                     ((! need_start_op) && (no_need_start_op)));
2355         }
2356 #endif
2357         ASSERT(VTOMI4(dvp)->mi_zone == nfs_zone());
2358 
2359         NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE,
2360             "nfs4_make_dotdot: called with fhp %p, dvp %s", (void *)fhp,
2361             rnode4info(VTOR4(dvp))));
2362 
2363         /*
2364          * rootvp might be needed eventually. Holding it now will
2365          * ensure that r4find_unlocked() will find it, if ".." is the root.
2366          */
2367         e.error = VFS_ROOT(mi->mi_vfsp, &rootvp);
2368         if (e.error != 0)
2369                 goto out;
2370         rp = r4find_unlocked(fhp, mi->mi_vfsp);
2371         if (rp != NULL) {
2372                 *vpp = RTOV4(rp);
2373                 VN_RELE(rootvp);
2374                 return (0);
2375         }
2376 
2377         /*
2378          * Since we don't have the rnode, we have to go over the wire.
2379          * remap_lookup() can get all of the filehandles and attributes
2380          * we need in one operation.
2381          */
2382         np = fn_parent(VTOSV(dvp)->sv_name);
2383         ASSERT(np != NULL);
2384 
2385         recov_state.rs_flags = 0;
2386         recov_state.rs_num_retry_despite_err = 0;
2387 recov_retry:
2388         if (need_start_op) {
2389                 e.error = nfs4_start_fop(mi, rootvp, NULL, OH_LOOKUP,
2390                     &recov_state, NULL);
2391                 if (e.error != 0) {
2392                         goto out;
2393                 }
2394         }
2395         va.va_type = VNON;
2396         pva.va_type = VNON;
2397         remap_lookup(np, rootvp, RML_ORDINARY, cr,
2398             &newfh, &gar, &newpfh, &pgar, &e);
2399         if (nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp)) {
2400                 if (need_start_op) {
2401                         bool_t abort;
2402 
2403                         abort = nfs4_start_recovery(&e, mi,
2404                             rootvp, NULL, NULL, NULL, OP_LOOKUP, NULL);
2405                         if (abort) {
2406                                 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2407                                     &recov_state, FALSE);
2408                                 if (e.error == 0)
2409                                         e.error = EIO;
2410                                 goto out;
2411                         }
2412                         nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2413                             &recov_state, TRUE);
2414                         goto recov_retry;
2415                 }
2416                 if (e.error == 0)
2417                         e.error = EIO;
2418                 goto out;
2419         }
2420 
2421         if (!e.error) {
2422                 va = gar.n4g_va;
2423                 pva = pgar.n4g_va;
2424         }
2425 
2426         if ((e.error != 0) ||
2427             (va.va_type != VDIR)) {
2428                 if (need_start_op)
2429                         nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2430                             &recov_state, FALSE);
2431                 if (e.error == 0)
2432                         e.error = EIO;
2433                 goto out;
2434         }
2435 
2436         if (e.stat != NFS4_OK) {
2437                 if (need_start_op)
2438                         nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2439                             &recov_state, FALSE);
2440                 e.error = EIO;
2441                 goto out;
2442         }
2443 
2444         /*
2445          * It is possible for remap_lookup() to return with no error,
2446          * but without providing the parent filehandle and attrs.
2447          */
2448         if (pva.va_type != VDIR) {
2449                 /*
2450                  * Call remap_lookup() again, this time with the
2451                  * newpfh and pgar args in the first position.
2452                  */
2453                 pnp = fn_parent(np);
2454                 if (pnp != NULL) {
2455                         remap_lookup(pnp, rootvp, RML_ORDINARY, cr,
2456                             &newpfh, &pgar, NULL, NULL, &e);
2457                         if (nfs4_needs_recovery(&e, FALSE,
2458                             mi->mi_vfsp)) {
2459                                 if (need_start_op) {
2460                                         bool_t abort;
2461 
2462                                         abort = nfs4_start_recovery(&e, mi,
2463                                             rootvp, NULL, NULL, NULL,
2464                                             OP_LOOKUP, NULL);
2465                                         if (abort) {
2466                                                 nfs4_end_fop(mi, rootvp, NULL,
2467                                                     OH_LOOKUP, &recov_state,
2468                                                     FALSE);
2469                                                 if (e.error == 0)
2470                                                         e.error = EIO;
2471                                                 goto out;
2472                                         }
2473                                         nfs4_end_fop(mi, rootvp, NULL,
2474                                             OH_LOOKUP, &recov_state, TRUE);
2475                                         goto recov_retry;
2476                                 }
2477                                 if (e.error == 0)
2478                                         e.error = EIO;
2479                                 goto out;
2480                         }
2481 
2482                         if (e.stat != NFS4_OK) {
2483                                 if (need_start_op)
2484                                         nfs4_end_fop(mi, rootvp, NULL,
2485                                             OH_LOOKUP, &recov_state, FALSE);
2486                                 e.error = EIO;
2487                                 goto out;
2488                         }
2489                 }
2490                 if ((pnp == NULL) ||
2491                     (e.error != 0) ||
2492                     (pva.va_type == VNON)) {
2493                         if (need_start_op)
2494                                 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP,
2495                                     &recov_state, FALSE);
2496                         if (e.error == 0)
2497                                 e.error = EIO;
2498                         goto out;
2499                 }
2500         }
2501         ASSERT(newpfh.nfs_fh4_len != 0);
2502         if (need_start_op)
2503                 nfs4_end_fop(mi, rootvp, NULL, OH_LOOKUP, &recov_state, FALSE);
2504         psfh = sfh4_get(&newpfh, mi);
2505 
2506         sfh = sfh4_get(&newfh, mi);
2507         vp = makenfs4node_by_fh(sfh, psfh, &np, &gar, mi, cr, t);
2508 
2509 out:
2510         if (np != NULL)
2511                 fn_rele(&np);
2512         if (pnp != NULL)
2513                 fn_rele(&pnp);
2514         if (newfh.nfs_fh4_len != 0)
2515                 kmem_free(newfh.nfs_fh4_val, newfh.nfs_fh4_len);
2516         if (newpfh.nfs_fh4_len != 0)
2517                 kmem_free(newpfh.nfs_fh4_val, newpfh.nfs_fh4_len);
2518         if (sfh != NULL)
2519                 sfh4_rele(&sfh);
2520         if (psfh != NULL)
2521                 sfh4_rele(&psfh);
2522         if (rootvp != NULL)
2523                 VN_RELE(rootvp);
2524         *vpp = vp;
2525         return (e.error);
2526 }
2527 
#ifdef DEBUG
/*
 * DEBUG-only global byte counter, starting at zero.  No user of it is
 * visible in this part of the file; presumably it accounts for memory
 * consumed by rnode path strings -- TODO confirm against its users.
 */
size_t r_path_memuse = 0;
#endif
2531 
2532 /*
2533  * NFS client failover support
2534  *
2535  * sv4_free() frees the malloc'd portion of a "servinfo_t".
2536  */
2537 void
2538 sv4_free(servinfo4_t *svp)
2539 {
2540         servinfo4_t *next;
2541         struct knetconfig *knconf;
2542 
2543         while (svp != NULL) {
2544                 next = svp->sv_next;
2545                 if (svp->sv_dhsec)
2546                         sec_clnt_freeinfo(svp->sv_dhsec);
2547                 if (svp->sv_secdata)
2548                         sec_clnt_freeinfo(svp->sv_secdata);
2549                 if (svp->sv_save_secinfo &&
2550                     svp->sv_save_secinfo != svp->sv_secinfo)
2551                         secinfo_free(svp->sv_save_secinfo);
2552                 if (svp->sv_secinfo)
2553                         secinfo_free(svp->sv_secinfo);
2554                 if (svp->sv_hostname && svp->sv_hostnamelen > 0)
2555                         kmem_free(svp->sv_hostname, svp->sv_hostnamelen);
2556                 knconf = svp->sv_knconf;
2557                 if (knconf != NULL) {
2558                         if (knconf->knc_protofmly != NULL)
2559                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2560                         if (knconf->knc_proto != NULL)
2561                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
2562                         kmem_free(knconf, sizeof (*knconf));
2563                 }
2564                 knconf = svp->sv_origknconf;
2565                 if (knconf != NULL) {
2566                         if (knconf->knc_protofmly != NULL)
2567                                 kmem_free(knconf->knc_protofmly, KNC_STRSIZE);
2568                         if (knconf->knc_proto != NULL)
2569                                 kmem_free(knconf->knc_proto, KNC_STRSIZE);
2570                         kmem_free(knconf, sizeof (*knconf));
2571                 }
2572                 if (svp->sv_addr.buf != NULL && svp->sv_addr.maxlen != 0)
2573                         kmem_free(svp->sv_addr.buf, svp->sv_addr.maxlen);
2574                 if (svp->sv_path != NULL) {
2575                         kmem_free(svp->sv_path, svp->sv_pathlen);
2576                 }
2577                 nfs_rw_destroy(&svp->sv_lock);
2578 
2579                 /*
2580                  * If we have an nfs4_server from a pnfs data server...
2581                  * XXXrsb This may go away or change
2582                  */
2583                 if (svp->sv_ds_n4sp)
2584                         nfs4_server_rele(svp->sv_ds_n4sp);
2585 
2586                 kmem_free(svp, sizeof (*svp));
2587                 svp = next;
2588         }
2589 }
2590 
2591 void
2592 nfs4_printfhandle(nfs4_fhandle_t *fhp)
2593 {
2594         int *ip;
2595         char *buf;
2596         size_t bufsize;
2597         char *cp;
2598 
2599         /*
2600          * 13 == "(file handle:"
2601          * maximum of NFS_FHANDLE / sizeof (*ip) elements in fh_buf times
2602          *      1 == ' '
2603          *      8 == maximum strlen of "%x"
2604          * 3 == ")\n\0"
2605          */
2606         bufsize = 13 + ((NFS_FHANDLE_LEN / sizeof (*ip)) * (1 + 8)) + 3;
2607         buf = kmem_alloc(bufsize, KM_NOSLEEP);
2608         if (buf == NULL)
2609                 return;
2610 
2611         cp = buf;
2612         (void) strcpy(cp, "(file handle:");
2613         while (*cp != '\0')
2614                 cp++;
2615         for (ip = (int *)fhp->fh_buf;
2616             ip < (int *)&fhp->fh_buf[fhp->fh_len];
2617             ip++) {
2618                 (void) sprintf(cp, " %x", *ip);
2619                 while (*cp != '\0')
2620                         cp++;
2621         }
2622         (void) strcpy(cp, ")\n");
2623 
2624         zcmn_err(getzoneid(), CE_CONT, "%s", buf);
2625 
2626         kmem_free(buf, bufsize);
2627 }
2628 
2629 /*
2630  * The NFSv4 readdir cache subsystem.
2631  *
2632  * We provide a set of interfaces to allow the rest of the system to utilize
2633  * a caching mechanism while encapsulating the details of the actual
2634  * implementation.  This should allow for better maintainability and
2635  * extensibility by consolidating the implementation details in one location.
2636  */
2637 
2638 /*
2639  * Comparator used by AVL routines.
2640  */
2641 static int
2642 rddir4_cache_compar(const void *x, const void *y)
2643 {
2644         rddir4_cache_impl *ai = (rddir4_cache_impl *)x;
2645         rddir4_cache_impl *bi = (rddir4_cache_impl *)y;
2646         rddir4_cache *a = &ai->rc;
2647         rddir4_cache *b = &bi->rc;
2648 
2649         if (a->nfs4_cookie == b->nfs4_cookie) {
2650                 if (a->buflen == b->buflen)
2651                         return (0);
2652                 if (a->buflen < b->buflen)
2653                         return (-1);
2654                 return (1);
2655         }
2656 
2657         if (a->nfs4_cookie < b->nfs4_cookie)
2658                         return (-1);
2659 
2660         return (1);
2661 }
2662 
2663 /*
2664  * Allocate an opaque handle for the readdir cache.
2665  */
2666 void
2667 rddir4_cache_create(rnode4_t *rp)
2668 {
2669         ASSERT(rp->r_dir == NULL);
2670 
2671         rp->r_dir = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2672 
2673         avl_create(rp->r_dir, rddir4_cache_compar, sizeof (rddir4_cache_impl),
2674             offsetof(rddir4_cache_impl, tree));
2675 }
2676 
2677 /*
2678  *  Purge the cache of all cached readdir responses.
2679  */
2680 void
2681 rddir4_cache_purge(rnode4_t *rp)
2682 {
2683         rddir4_cache_impl       *rdip;
2684         rddir4_cache_impl       *nrdip;
2685 
2686         ASSERT(MUTEX_HELD(&rp->r_statelock));
2687 
2688         if (rp->r_dir == NULL)
2689                 return;
2690 
2691         rdip = avl_first(rp->r_dir);
2692 
2693         while (rdip != NULL) {
2694                 nrdip = AVL_NEXT(rp->r_dir, rdip);
2695                 avl_remove(rp->r_dir, rdip);
2696                 rdip->rc.flags &= ~RDDIRCACHED;
2697                 rddir4_cache_rele(rp, &rdip->rc);
2698                 rdip = nrdip;
2699         }
2700         ASSERT(avl_numnodes(rp->r_dir) == 0);
2701 }
2702 
2703 /*
2704  * Destroy the readdir cache.
2705  */
2706 void
2707 rddir4_cache_destroy(rnode4_t *rp)
2708 {
2709         ASSERT(MUTEX_HELD(&rp->r_statelock));
2710         if (rp->r_dir == NULL)
2711                 return;
2712 
2713         rddir4_cache_purge(rp);
2714         avl_destroy(rp->r_dir);
2715         kmem_free(rp->r_dir, sizeof (avl_tree_t));
2716         rp->r_dir = NULL;
2717 }
2718 
2719 /*
2720  * Locate a readdir response from the readdir cache.
2721  *
2722  * Return values:
2723  *
2724  * NULL - If there is an unrecoverable situation like the operation may have
2725  *        been interrupted.
2726  *
2727  * rddir4_cache * - A pointer to a rddir4_cache is returned to the caller.
2728  *                  The flags are set approprately, such that the caller knows
2729  *                  what state the entry is in.
2730  */
2731 rddir4_cache *
2732 rddir4_cache_lookup(rnode4_t *rp, offset_t cookie, int count)
2733 {
2734         rddir4_cache_impl       *rdip = NULL;
2735         rddir4_cache_impl       srdip;
2736         rddir4_cache            *srdc;
2737         rddir4_cache            *rdc = NULL;
2738         rddir4_cache            *nrdc = NULL;
2739         avl_index_t             where;
2740 
2741 top:
2742         ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2743         ASSERT(MUTEX_HELD(&rp->r_statelock));
2744         /*
2745          * Check to see if the readdir cache has been disabled.  If so, then
2746          * simply allocate an rddir4_cache entry and return it, since caching
2747          * operations do not apply.
2748          */
2749         if (rp->r_dir == NULL) {
2750                 if (nrdc == NULL) {
2751                         /*
2752                          * Drop the lock because we are doing a sleeping
2753                          * allocation.
2754                          */
2755                         mutex_exit(&rp->r_statelock);
2756                         rdc = rddir4_cache_alloc(KM_SLEEP);
2757                         rdc->nfs4_cookie = cookie;
2758                         rdc->buflen = count;
2759                         mutex_enter(&rp->r_statelock);
2760                         return (rdc);
2761                 }
2762                 return (nrdc);
2763         }
2764 
2765         srdc = &srdip.rc;
2766         srdc->nfs4_cookie = cookie;
2767         srdc->buflen = count;
2768 
2769         rdip = avl_find(rp->r_dir, &srdip, &where);
2770 
2771         /*
2772          * If we didn't find an entry then create one and insert it
2773          * into the cache.
2774          */
2775         if (rdip == NULL) {
2776                 /*
2777                  * Check for the case where we have made a second pass through
2778                  * the cache due to a lockless allocation.  If we find that no
2779                  * thread has already inserted this entry, do the insert now
2780                  * and return.
2781                  */
2782                 if (nrdc != NULL) {
2783                         avl_insert(rp->r_dir, nrdc->data, where);
2784                         nrdc->flags |= RDDIRCACHED;
2785                         rddir4_cache_hold(nrdc);
2786                         return (nrdc);
2787                 }
2788 
2789 #ifdef DEBUG
2790                 nfs4_readdir_cache_misses++;
2791 #endif
2792                 /*
2793                  * First, try to allocate an entry without sleeping.  If that
2794                  * fails then drop the lock and do a sleeping allocation.
2795                  */
2796                 nrdc = rddir4_cache_alloc(KM_NOSLEEP);
2797                 if (nrdc != NULL) {
2798                         nrdc->nfs4_cookie = cookie;
2799                         nrdc->buflen = count;
2800                         avl_insert(rp->r_dir, nrdc->data, where);
2801                         nrdc->flags |= RDDIRCACHED;
2802                         rddir4_cache_hold(nrdc);
2803                         return (nrdc);
2804                 }
2805 
2806                 /*
2807                  * Drop the lock and do a sleeping allocation.  We incur
2808                  * additional overhead by having to search the cache again,
2809                  * but this case should be rare.
2810                  */
2811                 mutex_exit(&rp->r_statelock);
2812                 nrdc = rddir4_cache_alloc(KM_SLEEP);
2813                 nrdc->nfs4_cookie = cookie;
2814                 nrdc->buflen = count;
2815                 mutex_enter(&rp->r_statelock);
2816                 /*
2817                  * We need to take another pass through the cache
2818                  * since we dropped our lock to perform the alloc.
2819                  * Another thread may have come by and inserted the
2820                  * entry we are interested in.
2821                  */
2822                 goto top;
2823         }
2824 
2825         /*
2826          * Check to see if we need to free our entry.  This can happen if
2827          * another thread came along beat us to the insert.  We can
2828          * safely call rddir4_cache_free directly because no other thread
2829          * would have a reference to this entry.
2830          */
2831         if (nrdc != NULL)
2832                 rddir4_cache_free((rddir4_cache_impl *)nrdc->data);
2833 
2834 #ifdef DEBUG
2835         nfs4_readdir_cache_hits++;
2836 #endif
2837         /*
2838          * Found something.  Make sure it's ready to return.
2839          */
2840         rdc = &rdip->rc;
2841         rddir4_cache_hold(rdc);
2842         /*
2843          * If the cache entry is in the process of being filled in, wait
2844          * until this completes.  The RDDIRWAIT bit is set to indicate that
2845          * someone is waiting and when the thread currently filling the entry
2846          * is done, it should do a cv_broadcast to wakeup all of the threads
2847          * waiting for it to finish. If the thread wakes up to find that
2848          * someone new is now trying to complete the the entry, go back
2849          * to sleep.
2850          */
2851         while (rdc->flags & RDDIR) {
2852                 /*
2853                  * The entry is not complete.
2854                  */
2855                 nfs_rw_exit(&rp->r_rwlock);
2856                 rdc->flags |= RDDIRWAIT;
2857 #ifdef DEBUG
2858                 nfs4_readdir_cache_waits++;
2859 #endif
2860                 while (rdc->flags & RDDIRWAIT) {
2861                         if (!cv_wait_sig(&rdc->cv, &rp->r_statelock)) {
2862                                 /*
2863                                  * We got interrupted, probably the user
2864                                  * typed ^C or an alarm fired.  We free the
2865                                  * new entry if we allocated one.
2866                                  */
2867                                 rddir4_cache_rele(rp, rdc);
2868                                 mutex_exit(&rp->r_statelock);
2869                                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
2870                                     RW_READER, FALSE);
2871                                 mutex_enter(&rp->r_statelock);
2872                                 return (NULL);
2873                         }
2874                 }
2875                 mutex_exit(&rp->r_statelock);
2876                 (void) nfs_rw_enter_sig(&rp->r_rwlock,
2877                     RW_READER, FALSE);
2878                 mutex_enter(&rp->r_statelock);
2879         }
2880 
2881         /*
2882          * The entry we were waiting on may have been purged from
2883          * the cache and should no longer be used, release it and
2884          * start over.
2885          */
2886         if (!(rdc->flags & RDDIRCACHED)) {
2887                 rddir4_cache_rele(rp, rdc);
2888                 goto top;
2889         }
2890 
2891         /*
2892          * The entry is completed.  Return it.
2893          */
2894         return (rdc);
2895 }
2896 
2897 /*
2898  * Allocate a cache element and return it.  Can return NULL if memory is
2899  * low.
2900  */
2901 static rddir4_cache *
2902 rddir4_cache_alloc(int flags)
2903 {
2904         rddir4_cache_impl       *rdip = NULL;
2905         rddir4_cache            *rc = NULL;
2906 
2907         rdip = kmem_alloc(sizeof (rddir4_cache_impl), flags);
2908 
2909         if (rdip != NULL) {
2910                 rc = &rdip->rc;
2911                 rc->data = (void *)rdip;
2912                 rc->nfs4_cookie = 0;
2913                 rc->nfs4_ncookie = 0;
2914                 rc->entries = NULL;
2915                 rc->eof = 0;
2916                 rc->entlen = 0;
2917                 rc->buflen = 0;
2918                 rc->actlen = 0;
2919                 /*
2920                  * A readdir is required so set the flag.
2921                  */
2922                 rc->flags = RDDIRREQ;
2923                 cv_init(&rc->cv, NULL, CV_DEFAULT, NULL);
2924                 rc->error = 0;
2925                 mutex_init(&rdip->lock, NULL, MUTEX_DEFAULT, NULL);
2926                 rdip->count = 1;
2927 #ifdef DEBUG
2928                 atomic_add_64(&clstat4_debug.dirent.value.ui64, 1);
2929 #endif
2930         }
2931         return (rc);
2932 }
2933 
2934 /*
2935  * Increment the reference count to this cache element.
2936  */
2937 static void
2938 rddir4_cache_hold(rddir4_cache *rc)
2939 {
2940         rddir4_cache_impl *rdip = (rddir4_cache_impl *)rc->data;
2941 
2942         mutex_enter(&rdip->lock);
2943         rdip->count++;
2944         mutex_exit(&rdip->lock);
2945 }
2946 
2947 /*
2948  * Release a reference to this cache element.  If the count is zero then
2949  * free the element.
2950  */
2951 void
2952 rddir4_cache_rele(rnode4_t *rp, rddir4_cache *rdc)
2953 {
2954         rddir4_cache_impl *rdip = (rddir4_cache_impl *)rdc->data;
2955 
2956         ASSERT(MUTEX_HELD(&rp->r_statelock));
2957 
2958         /*
2959          * Check to see if we have any waiters.  If so, we can wake them
2960          * so that they can proceed.
2961          */
2962         if (rdc->flags & RDDIRWAIT) {
2963                 rdc->flags &= ~RDDIRWAIT;
2964                 cv_broadcast(&rdc->cv);
2965         }
2966 
2967         mutex_enter(&rdip->lock);
2968         ASSERT(rdip->count > 0);
2969         if (--rdip->count == 0) {
2970                 mutex_exit(&rdip->lock);
2971                 rddir4_cache_free(rdip);
2972         } else
2973                 mutex_exit(&rdip->lock);
2974 }
2975 
2976 /*
2977  * Free a cache element.
2978  */
2979 static void
2980 rddir4_cache_free(rddir4_cache_impl *rdip)
2981 {
2982         rddir4_cache *rc = &rdip->rc;
2983 
2984 #ifdef DEBUG
2985         atomic_add_64(&clstat4_debug.dirent.value.ui64, -1);
2986 #endif
2987         if (rc->entries != NULL)
2988                 kmem_free(rc->entries, rc->buflen);
2989         cv_destroy(&rc->cv);
2990         mutex_destroy(&rdip->lock);
2991         kmem_free(rdip, sizeof (*rdip));
2992 }
2993 
2994 /*
2995  * Snapshot callback for nfs:0:nfs4_client as registered with the kstat
2996  * framework.
2997  */
2998 static int
2999 cl4_snapshot(kstat_t *ksp, void *buf, int rw)
3000 {
3001         ksp->ks_snaptime = gethrtime();
3002         if (rw == KSTAT_WRITE) {
3003                 bcopy(buf, ksp->ks_private, sizeof (clstat4_tmpl));
3004         } else {
3005                 bcopy(ksp->ks_private, buf, sizeof (clstat4_tmpl));
3006         }
3007         return (0);
3008 }
3009 
#ifdef DEBUG
/*
 * kstat snapshot callback for the system-wide debug statistics held in
 * clstat4_debug.  Reads are served to every zone; writes are honored only
 * when issued from the global zone.  Always returns 0.
 */
static int
cl4_debug_snapshot(kstat_t *ksp, void *buf, int rw)
{
        ksp->ks_snaptime = gethrtime();

        if (rw != KSTAT_WRITE) {
                /*
                 * The "global" debug kstat values apply to the system as
                 * a whole, so they are shown as-is to all zones.
                 */
                bcopy(&clstat4_debug, buf, sizeof (clstat4_debug));
                return (0);
        }

        /*
         * Currently only the global zone can write to kstats, but the
         * check is kept for paranoia.
         */
        if (INGLOBALZONE(curproc))
                bcopy(buf, &clstat4_debug, sizeof (clstat4_debug));

        return (0);
}
#endif
3034 
3035 
3036 
3037 /*
3038  * Zone support
3039  */
3040 static void *
3041 clinit4_zone(zoneid_t zoneid)
3042 {
3043         kstat_t *nfs4_client_kstat;
3044         kstat_t *nfs41_client_kstat;
3045         struct nfs4_clnt *nfscl;
3046         uint_t ndata;
3047 
3048         nfscl = kmem_alloc(sizeof (*nfscl), KM_SLEEP);
3049         mutex_init(&nfscl->nfscl_chtable4_lock, NULL, MUTEX_DEFAULT, NULL);
3050         nfscl->nfscl_chtable4 = NULL;
3051         nfscl->nfscl_zoneid = zoneid;
3052 
3053         bcopy(&clstat4_tmpl, &nfscl->nfscl_stat[NFS4_MINOR_v0],
3054             sizeof (clstat4_tmpl));
3055         ndata = sizeof (clstat4_tmpl) / sizeof (kstat_named_t);
3056         if ((nfs4_client_kstat = kstat_create_zone("nfs", 0, "nfs4_client",
3057             "misc", KSTAT_TYPE_NAMED, ndata,
3058             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3059                 nfs4_client_kstat->ks_private =
3060                     &nfscl->nfscl_stat[NFS4_MINOR_v0];
3061                 nfs4_client_kstat->ks_snapshot = cl4_snapshot;
3062                 kstat_install(nfs4_client_kstat);
3063         }
3064 
3065         bcopy(&clstat4_tmpl, &nfscl->nfscl_stat[NFS4_MINOR_v1],
3066             sizeof (clstat4_tmpl));
3067         if ((nfs41_client_kstat = kstat_create_zone("nfs", 0, "nfs41_client",
3068             "misc", KSTAT_TYPE_NAMED, ndata,
3069             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE, zoneid)) != NULL) {
3070                 nfs41_client_kstat->ks_private =
3071                     &nfscl->nfscl_stat[NFS4_MINOR_v1];
3072                 nfs41_client_kstat->ks_snapshot = cl4_snapshot;
3073                 kstat_install(nfs41_client_kstat);
3074         }
3075 
3076         mutex_enter(&nfs4_clnt_list_lock);
3077         list_insert_head(&nfs4_clnt_list, nfscl);
3078         mutex_exit(&nfs4_clnt_list_lock);
3079         return (nfscl);
3080 }
3081 
3082 /*ARGSUSED*/
3083 static void
3084 clfini4_zone(zoneid_t zoneid, void *arg)
3085 {
3086         struct nfs4_clnt *nfscl = arg;
3087         chhead_t *chp, *next;
3088 
3089         if (nfscl == NULL)
3090                 return;
3091         mutex_enter(&nfs4_clnt_list_lock);
3092         list_remove(&nfs4_clnt_list, nfscl);
3093         mutex_exit(&nfs4_clnt_list_lock);
3094         clreclaim4_zone(nfscl, 0);
3095         for (chp = nfscl->nfscl_chtable4; chp != NULL; chp = next) {
3096                 ASSERT(chp->ch_list == NULL);
3097                 kmem_free(chp->ch_protofmly, strlen(chp->ch_protofmly) + 1);
3098                 next = chp->ch_next;
3099                 kmem_free(chp, sizeof (*chp));
3100         }
3101         kstat_delete_byname_zone("nfs", 0, "nfs4_client", zoneid);
3102         kstat_delete_byname_zone("nfs", 0, "nfs41_client", zoneid);
3103         mutex_destroy(&nfscl->nfscl_chtable4_lock);
3104         kmem_free(nfscl, sizeof (*nfscl));
3105 }
3106 
3107 /*
3108  * Called by endpnt_destructor to make sure the client handles are
3109  * cleaned up before the RPC endpoints.  This becomes a no-op if
3110  * clfini_zone (above) is called first.  This function is needed
3111  * (rather than relying on clfini_zone to clean up) because the ZSD
3112  * callbacks have no ordering mechanism, so we have no way to ensure
3113  * that clfini_zone is called before endpnt_destructor.
3114  */
3115 void
3116 clcleanup4_zone(zoneid_t zoneid)
3117 {
3118         struct nfs4_clnt *nfscl;
3119 
3120         mutex_enter(&nfs4_clnt_list_lock);
3121         nfscl = list_head(&nfs4_clnt_list);
3122         for (; nfscl != NULL; nfscl = list_next(&nfs4_clnt_list, nfscl)) {
3123                 if (nfscl->nfscl_zoneid == zoneid) {
3124                         clreclaim4_zone(nfscl, 0);
3125                         break;
3126                 }
3127         }
3128         mutex_exit(&nfs4_clnt_list_lock);
3129 }
3130 
3131 int
3132 nfs4_subr_init(void)
3133 {
3134         /*
3135          * Allocate and initialize the client handle cache
3136          */
3137 #ifdef DEBUG
3138         uint_t ndata;
3139         kstat_t *nfs4_debug_kstat;
3140 #endif
3141         chtab4_cache = kmem_cache_create("client_handle4_cache",
3142             sizeof (struct chtab), 0, NULL, NULL, clreclaim4, NULL,
3143             NULL, 0);
3144 
3145 #ifdef DEBUG
3146         /*
3147          * Create a kstat to maintain debug statistics across all zones
3148          */
3149         ndata = sizeof (clstat4_debug) / sizeof (kstat_named_t);
3150         if ((nfs4_debug_kstat = kstat_create("nfs", 0, "nfs4_client_debug",
3151             "misc", KSTAT_TYPE_NAMED, ndata,
3152             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE)) != NULL) {
3153                 nfs4_debug_kstat->ks_private = &clstat4_debug;
3154                 nfs4_debug_kstat->ks_snapshot = cl4_debug_snapshot;
3155                 kstat_install(nfs4_debug_kstat);
3156         }
3157 #endif
3158 
3159 
3160         /*
3161          * Initialize the list of per-zone client handles (and associated data).
3162          * This needs to be done before we call zone_key_create().
3163          */
3164         list_create(&nfs4_clnt_list, sizeof (struct nfs4_clnt),
3165             offsetof(struct nfs4_clnt, nfscl_node));
3166 
3167         /*
3168          * Initialize the zone_key for per-zone client handle lists.
3169          */
3170         zone_key_create(&nfs4clnt_zone_key, clinit4_zone, NULL, clfini4_zone);
3171 
3172         if (nfs4err_delay_time == 0)
3173                 nfs4err_delay_time = NFS4ERR_DELAY_TIME;
3174 
3175         return (0);
3176 }
3177 
/*
 * Undo nfs4_subr_init(): destroy the client handle cache, remove the
 * DEBUG-only debug kstat and delete the per-zone key.  The result of
 * zone_key_delete() is deliberately ignored.  Always returns 0.
 */
int
nfs4_subr_fini(void)
{
        /*
         * Deallocate the client handle cache
         */
        kmem_cache_destroy(chtab4_cache);
#ifdef DEBUG
        /* Remove the system-wide debug kstat created at init time. */
        kstat_delete_byname("nfs", 0, "nfs4_client_debug");
#endif

        /*
         * Destroy the zone_key
         */
        (void) zone_key_delete(nfs4clnt_zone_key);

        return (0);
}
3196 /*
3197  * Set or Clear direct I/O flag
3198  * VOP_RWLOCK() is held for write access to prevent a race condition
3199  * which would occur if a process is in the middle of a write when
3200  * directio flag gets set. It is possible that all pages may not get flushed.
3201  *
3202  * This is a copy of nfs_directio, changes here may need to be made
3203  * there and vice versa.
3204  */
3205 
3206 int
3207 nfs4_directio(vnode_t *vp, int cmd, cred_t *cr)
3208 {
3209         int     error = 0;
3210         rnode4_t *rp;
3211 
3212         rp = VTOR4(vp);
3213 
3214         if (cmd == DIRECTIO_ON) {
3215 
3216                 if (rp->r_flags & R4DIRECTIO)
3217                         return (0);
3218 
3219                 /*
3220                  * Flush the page cache.
3221                  */
3222 
3223                 (void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
3224 
3225                 if (rp->r_flags & R4DIRECTIO) {
3226                         VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
3227                         return (0);
3228                 }
3229 
3230                 if (nfs4_has_pages(vp) &&
3231                     ((rp->r_flags & R4DIRTY) || rp->r_awcount > 0)) {
3232                         error = VOP_PUTPAGE(vp, (offset_t)0, (uint_t)0,
3233                             B_INVAL, cr, NULL);
3234                         if (error) {
3235                                 if (error == ENOSPC || error == EDQUOT) {
3236                                         mutex_enter(&rp->r_statelock);
3237                                         if (!rp->r_error)
3238                                                 rp->r_error = error;
3239                                         mutex_exit(&rp->r_statelock);
3240                                 }
3241                                 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
3242                                 return (error);
3243                         }
3244                 }
3245 
3246                 mutex_enter(&rp->r_statelock);
3247                 rp->r_flags |= R4DIRECTIO;
3248                 mutex_exit(&rp->r_statelock);
3249                 VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
3250                 return (0);
3251         }
3252 
3253         if (cmd == DIRECTIO_OFF) {
3254                 mutex_enter(&rp->r_statelock);
3255                 rp->r_flags &= ~R4DIRECTIO;      /* disable direct mode */
3256                 mutex_exit(&rp->r_statelock);
3257                 return (0);
3258         }
3259 
3260         return (EINVAL);
3261 }
3262 
3263 /*
3264  * Return TRUE if the file has any pages.  Always go back to
3265  * the master vnode to check v_pages since none of the shadows
3266  * can have pages.
3267  */
3268 
3269 bool_t
3270 nfs4_has_pages(vnode_t *vp)
3271 {
3272         rnode4_t *rp;
3273 
3274         rp = VTOR4(vp);
3275         if (IS_SHADOW(vp, rp))
3276                 vp = RTOV4(rp); /* RTOV4 always gives the master */
3277 
3278         return (vn_has_cached_data(vp));
3279 }
3280 
/*
 * Failover decision table, indexed directly by the clnt_stat value
 * returned from CLNT_CALL.  A non-zero error in a row means two things
 * at once: that errno is what gets returned, and failover should be
 * attempted.  A zero error means no failover for that status.
 *
 * Special note: each row also records the RPC_ value it was written
 * for.  If the RPC_ values are ever renumbered, direct indexing stops
 * being valid; the stored value lets the lookup code notice the
 * mismatch and issue a warning.  In that case the code always attempts
 * failover as a defensive measure.
 */

static struct try_failover_tab {
        enum clnt_stat  cstat;
        int             error;
} try_failover_table[] = {
        { RPC_SUCCESS,                  0 },
        { RPC_CANTENCODEARGS,           0 },
        { RPC_CANTDECODERES,            0 },
        { RPC_CANTSEND,                 ECOMM },
        { RPC_CANTRECV,                 ECOMM },
        { RPC_TIMEDOUT,                 ETIMEDOUT },
        { RPC_VERSMISMATCH,             0 },
        { RPC_AUTHERROR,                0 },
        { RPC_PROGUNAVAIL,              0 },
        { RPC_PROGVERSMISMATCH,         0 },
        { RPC_PROCUNAVAIL,              0 },
        { RPC_CANTDECODEARGS,           0 },
        { RPC_SYSTEMERROR,              ENOSR },
        { RPC_UNKNOWNHOST,              EHOSTUNREACH },
        { RPC_RPCBFAILURE,              ENETUNREACH },
        { RPC_PROGNOTREGISTERED,        ECONNREFUSED },
        { RPC_FAILED,                   ETIMEDOUT },
        { RPC_UNKNOWNPROTO,             EHOSTUNREACH },
        { RPC_INTR,                     0 },
        { RPC_UNKNOWNADDR,              EHOSTUNREACH },
        { RPC_TLIERROR,                 0 },
        { RPC_NOBROADCAST,              EHOSTUNREACH },
        { RPC_N2AXLATEFAILURE,          ECONNREFUSED },
        { RPC_UDERROR,                  0 },
        { RPC_INPROGRESS,               0 },
        { RPC_STALERACHANDLE,           EINVAL },
        { RPC_CANTCONNECT,              ECONNREFUSED },
        { RPC_XPRTFAILED,               ECONNABORTED },
        { RPC_CANTCREATESTREAM,         ECONNREFUSED },
        { RPC_CANTSTORE,                ENOBUFS },
        { RPC_CONN_NOT_BOUND,           0 }
};
3333 
3334 /*
3335  * nfs4_try_failover - determine whether the client should
3336  * attempt failover based on the values stored in the nfs4_error_t.
3337  */
3338 int
3339 nfs4_try_failover(nfs4_error_t *ep)
3340 {
3341         if (ep->error == ETIMEDOUT || ep->stat == NFS4ERR_RESOURCE)
3342                 return (TRUE);
3343 
3344         if (ep->error && ep->rpc_status != RPC_SUCCESS)
3345                 return (try_failover(ep->rpc_status) != 0 ? TRUE : FALSE);
3346 
3347         return (FALSE);
3348 }
3349 
3350 /*
3351  * try_failover - internal version of nfs4_try_failover, called
3352  * only by rfscall and aclcall.  Determine if failover is warranted
3353  * based on the clnt_stat and return the error number if it is.
3354  */
3355 static int
3356 try_failover(enum clnt_stat rpc_status)
3357 {
3358         int err = 0;
3359 
3360         if (rpc_status == RPC_SUCCESS)
3361                 return (0);
3362 
3363 #ifdef  DEBUG
3364         if (rpc_status != 0 && nfs4_try_failover_any) {
3365                 err = ETIMEDOUT;
3366                 goto done;
3367         }
3368 #endif
3369         /*
3370          * The rpc status is used as an index into the table.
3371          * If the rpc status is outside of the range of the
3372          * table or if the rpc error numbers have been changed
3373          * since the table was constructed, then print a warning
3374          * (DEBUG only) and try failover anyway.  Otherwise, just
3375          * grab the resulting error number out of the table.
3376          */
3377         if (rpc_status < RPC_SUCCESS || rpc_status >=
3378             sizeof (try_failover_table)/sizeof (try_failover_table[0]) ||
3379             try_failover_table[rpc_status].cstat != rpc_status) {
3380 
3381                 err = ETIMEDOUT;
3382 #ifdef  DEBUG
3383                 cmn_err(CE_NOTE, "try_failover: unexpected rpc error %d",
3384                     rpc_status);
3385 #endif
3386         } else
3387                 err = try_failover_table[rpc_status].error;
3388 
3389 done:
3390         if (rpc_status)
3391                 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
3392                     "nfs4_try_failover: %strying failover on error %d",
3393                     err ? "" : "NOT ", rpc_status));
3394 
3395         return (err);
3396 }
3397 
3398 void
3399 nfs4_error_zinit(nfs4_error_t *ep)
3400 {
3401         ep->error = 0;
3402         ep->stat = NFS4_OK;
3403         ep->rpc_status = RPC_SUCCESS;
3404 }
3405 
3406 void
3407 nfs4_error_init(nfs4_error_t *ep, int error)
3408 {
3409         ep->error = error;
3410         ep->stat = NFS4_OK;
3411         ep->rpc_status = RPC_SUCCESS;
3412 }
3413 
3414 
3415 #ifdef DEBUG
3416 
/*
 * Return a 16-bit hash for filehandle, stateid, clientid, owner.
 * Uses the same algorithm as for NFS v3.
 *
 *      p       - start of the buffer to hash
 *      len     - number of bytes at p
 *
 * Whole 4-byte words are folded first (upper half XORed into the lower
 * half); any 1-3 bytes left over after the last whole word are then
 * XORed in individually.  A NULL buffer or a non-positive length
 * hashes to 0.
 *
 * NOTE(review): the word loop reads the buffer through an unsigned int
 * pointer, so p is assumed to be suitably aligned - confirm for callers.
 */
int
hash16(void *p, int len)
{
        unsigned char *bp = p;
        unsigned int *wp = p;
        unsigned int key = 0;
        int i, rem;

        if (p == NULL || len <= 0)
                return (0);

        /* protect against lengths that are not a multiple of a word */
        rem = len & 3;
        len -= rem;

        for (i = 0; i < len; i += 4, wp++)
                key ^= (*wp >> 16) ^ *wp;

        /*
         * Hash the left-over bytes.  They sit right after the whole
         * words, i.e. at offset len, not at the start of the buffer.
         */
        for (i = 0; i < rem; i++)
                key ^= bp[len + i];

        return (key & 0xffff);
}
3443 
3444 /*
3445  * rnode4info - return filehandle and path information for an rnode.
3446  * XXX MT issues: uses a single static buffer, no locking of path.
3447  */
3448 char *
3449 rnode4info(rnode4_t *rp)
3450 {
3451         static char buf[80];
3452         nfs4_fhandle_t fhandle;
3453         char *path;
3454         char *type;
3455 
3456         if (rp == NULL)
3457                 return ("null");
3458         if (rp->r_flags & R4ISXATTR)
3459                 type = "attr";
3460         else if (RTOV4(rp)->v_flag & V_XATTRDIR)
3461                 type = "attrdir";
3462         else if (RTOV4(rp)->v_flag & VROOT)
3463                 type = "root";
3464         else if (RTOV4(rp)->v_type == VDIR)
3465                 type = "dir";
3466         else if (RTOV4(rp)->v_type == VREG)
3467                 type = "file";
3468         else
3469                 type = "other";
3470         sfh4_copyval(rp->r_fh, &fhandle);
3471         path = fn_path(rp->r_svnode.sv_name);
3472         (void) snprintf(buf, 80, "$%p[%s], type=%s, flags=%04X, FH=%04X\n",
3473             (void *)rp, path, type, rp->r_flags,
3474             hash16((void *)&fhandle.fh_buf, fhandle.fh_len));
3475         kmem_free(path, strlen(path)+1);
3476         return (buf);
3477 }
3478 #endif
3479 
/*
 * When non-zero, the slot code logs (via cmn_err) each time a caller has
 * to wait for a slot and each time slots become available again.
 */
int nfs4_sessions_debug;

/*
 * nfs4sequence_setup - claim a session slot and fill in the SEQUENCE
 * arguments for a compound.
 *
 *      np      - session whose slot table is searched
 *      rfsargp - compound args; the SEQUENCE args are written into the
 *                first element of rfsargp->array
 *      slotpp  - out: the slot that was claimed
 *
 * The search starts at np->next_slot and skips slots that are in use or
 * marked bad, wrapping at np->maxslots.  While no slots are available
 * the caller blocks on np->slot_wait.  The claimed slot is marked in use
 * and np->slots_available / np->next_slot are updated before the locks
 * are dropped.  The slot is presumably handed back through
 * nfs4sequence_fin() - confirm against callers.
 */
void
nfs4sequence_setup(nfs4_session_t *np, COMPOUND4args_clnt *rfsargp,
        nfs4_slot_t **slotpp)
{
        int                     slot_id = 0;
        nfs4_slot_t             *slot;

        /* The session id goes into the SEQUENCE args first. */
        bcopy(&np->sessionid,
            rfsargp->array->nfs_argop4_u.opsequence.sa_sessionid,
            sizeof (sessionid4));

        /*
         * Find a slot to use.
         */
        (void) nfs_rw_enter_sig(&np->slot_table_rwlock, RW_READER, 0);
        mutex_enter(&np->slot_lock);
        slot_id = np->next_slot;
        while ((np->slot_table[slot_id]->slot_inuse != 0) ||
            (np->slot_table[slot_id]->slot_bad != 0)) {
                /*
                 * Can drop the rwlock here so we don't hold it over
                 * a possible cv_wait.
                 */
                nfs_rw_exit(&np->slot_table_rwlock);

                /*
                 * This slot is still in use.
                 * Check next slot if there are still some available.
                 */

                while (np->slots_available == 0) {
                        if (nfs4_sessions_debug)
                                cmn_err(CE_WARN, "Waiting for Available Slot");
                        cv_wait(&np->slot_wait, &np->slot_lock);
                }
                slot_id++;
                if (slot_id == np->maxslots)
                        slot_id = 0;
                /*
                 * NOTE(review): the rwlock is re-taken here with
                 * slot_lock already held, the reverse of the order used
                 * on entry - confirm no writer of slot_table_rwlock
                 * also needs slot_lock.
                 */
                (void) nfs_rw_enter_sig(&np->slot_table_rwlock, RW_READER, 0);
        }
        /* Claim the slot and advance the search hint past it (with wrap). */
        *slotpp = slot = np->slot_table[slot_id];
        slot->slot_inuse = 1;
        np->slots_available--;
        np->next_slot = slot_id + 1 == np->maxslots ? 0 : slot_id + 1;

        /*
         * Update SEQUENCE args
         */
        rfsargp->array->nfs_argop4_u.opsequence.sa_sequenceid =
            slot->slot_seqid;
        rfsargp->array->nfs_argop4_u.opsequence.sa_slotid = slot->slot_id;
        /*
         * This is the number of slots not currently available (counted
         * after claiming ours), not the id of a particular slot.
         */
        rfsargp->array->nfs_argop4_u.opsequence.sa_highest_slotid  =
            np->maxslots - np->slots_available;
        /* XXX - rick - need sr_target_highest_slotid */
        mutex_exit(&np->slot_lock);
        nfs_rw_exit(&np->slot_table_rwlock);
}
3539 
3540 void
3541 nfs4sequence_fin(nfs4_session_t *np, COMPOUND4res_clnt *rfsresp,
3542         nfs4_slot_t *slot, nfs4_error_t *ep)
3543 {
3544         SEQUENCE4resok          *seqres;
3545 
3546         mutex_enter(&np->slot_lock);
3547 
3548         ASSERT(slot->slot_inuse);
3549         slot->slot_inuse = 0;
3550 
3551         /* if call started but not completed, mark slot as bad */
3552         if ((ep->error != 0) &&
3553             ((ep->rpc_status == RPC_TIMEDOUT) ||
3554             (ep->rpc_status == RPC_INTR))) {
3555                 cmn_err(CE_WARN, "SEQUENCE failed %d, bad slot %d:%d",
3556                     ep->rpc_status, slot->slot_id, slot->slot_seqid);
3557                 slot->slot_bad = 1;
3558         } else {
3559                 if (slot->slot_id < np->next_slot)
3560                         np->next_slot = slot->slot_id;
3561 
3562                 /* Update slot seqid on successful op_sequence */
3563                 if (ep->error == 0 && (rfsresp->array != NULL &&
3564                     rfsresp->array->nfs_resop4_u.opsequence.sr_status ==
3565                     NFS4_OK))
3566                         slot->slot_seqid++;
3567 
3568                 if (np->slots_available++ == 0) {
3569                         if (nfs4_sessions_debug)
3570                                 cmn_err(CE_WARN, "Slots Available");
3571                         cv_broadcast(&np->slot_wait);
3572                 }
3573         }
3574 
3575         mutex_exit(&np->slot_lock);
3576 
3577         /* SEQUENCE Op Successful? */
3578         if (ep->error != 0 || rfsresp->status != NFS4_OK ||
3579             (rfsresp->array != NULL &&
3580             rfsresp->array->nfs_resop4_u.opsequence.sr_status != NFS4_OK)) {
3581                 /*
3582                  * cmn_err(CE_WARN, "sequence op failed or missing\n");
3583                  */
3584                 return;
3585         }
3586 
3587         seqres = &rfsresp->array->nfs_resop4_u.opsequence.
3588             SEQUENCE4res_u.sr_resok4;
3589 
3590         /*
3591          * Sequence Op Successful, Handle Errors and maxslot changes.
3592          */
3593 
3594         if (seqres->sr_status_flags & SEQ4_STATUS_CB_PATH_DOWN) {
3595 #ifdef  notyet
3596                 nfs4_delegreturn_all(np);
3597 #else
3598                 cmn_err(CE_WARN, "SEQ4_STATUS_CB_PATH_DOWN not handled");
3599 #endif
3600         }
3601 
3602         if (seqres->sr_status_flags & SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRING) {
3603                 cmn_err(CE_WARN, "SEQUENCE got CB_GSS_CONTEXTS_EXPIRING");
3604         }
3605 
3606         if (seqres->sr_status_flags & SEQ4_STATUS_CB_GSS_CONTEXTS_EXPIRED) {
3607                 cmn_err(CE_WARN, "SEQUENCE got CB_GSS_CONTEXTS_EXPIRED");
3608         }
3609 
3610         if (seqres->sr_status_flags & SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED) {
3611                 cmn_err(CE_WARN, "SEQUENCE got EXIPRED_ALL_STATE_REVOKED");
3612         }
3613 
3614         if (seqres->sr_status_flags & SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED) {
3615                 cmn_err(CE_WARN, "SEQUENCE got EXPIRED_SOME_STATE_REVOKED");
3616         }
3617 
3618         if (seqres->sr_status_flags & SEQ4_STATUS_ADMIN_STATE_REVOKED) {
3619                 cmn_err(CE_WARN, "SEQUENCE got ADMIN_STATE_REVOKED");
3620         }
3621 
3622         if (seqres->sr_status_flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) {
3623                 cmn_err(CE_WARN, "SEQUENCE got RECALLABLE_STATE_REVOKED");
3624         }
3625 
3626         if (seqres->sr_status_flags & SEQ4_STATUS_LEASE_MOVED) {
3627                 cmn_err(CE_WARN, "SEQUENCE got LEASE_MOVED");
3628         }
3629 }
3630 
3631 kmutex_t nfs4_session_lst_lock;
3632 list_t nfs4_session_list;
3633 
3634 void
3635 nfs4session_init()
3636 {
3637         mutex_init(&nfs4_session_lst_lock, NULL, MUTEX_DEFAULT, NULL);
3638         list_create(&nfs4_session_list, sizeof (nfs4_session_t),
3639             offsetof(nfs4_session_t, ssx_list));
3640 }
3641 
3642 /*
3643  * Compare 2 netbufs, return true of they match
3644  */
3645 int
3646 netbuf_match(struct netbuf *n1, struct netbuf *n2)
3647 {
3648         if (n1->len == n2->len && bcmp(n1->buf, n2->buf, n1->len) == 0)
3649                 return (1);
3650         return (0);
3651 }
3652 
3653 void *
3654 new_string(void *cur)
3655 {
3656         void *v;
3657 
3658         v = kmem_alloc(strlen(cur)+1, KM_SLEEP);
3659         (void) strcpy(v, cur);
3660         return (v);
3661 }
3662 
3663 servinfo4_t *
3664 new_servinfo4(struct knetconfig *knc, struct netbuf *nb, int flags)
3665 {
3666         servinfo4_t *svp;
3667         struct sec_data *secdata;
3668 
3669         /*
3670          * Allocate a servinfo4 struct.
3671          */
3672         svp = kmem_zalloc(sizeof (*svp), KM_SLEEP);
3673         nfs_rw_init(&svp->sv_lock, NULL, RW_DEFAULT, NULL);
3674         svp->sv_flags = flags;
3675 
3676         svp->sv_knconf = kmem_alloc(sizeof (*knc), KM_SLEEP);
3677         svp->sv_knconf->knc_semantics = knc->knc_semantics;
3678         svp->sv_knconf->knc_protofmly = new_string(knc->knc_protofmly);
3679         svp->sv_knconf->knc_proto = new_string(knc->knc_proto);
3680         svp->sv_knconf->knc_rdev = knc->knc_rdev;
3681         bzero(svp->sv_knconf->knc_unused, sizeof (knc->knc_unused));
3682 
3683         svp->sv_addr.maxlen = nb->maxlen;
3684         svp->sv_addr.len = nb->len;
3685         svp->sv_addr.buf = kmem_alloc(nb->maxlen, KM_SLEEP);
3686         bcopy(nb->buf, svp->sv_addr.buf, nb->len);
3687 
3688         /* XXX, ought to inherit sec data from parent servinfo4 */
3689         secdata = kmem_alloc(sizeof (*secdata), KM_SLEEP);
3690         secdata->secmod = secdata->rpcflavor = AUTH_SYS;
3691         secdata->data = NULL;
3692         svp->sv_secdata = secdata;
3693 
3694         /* XXX */
3695         svp->sv_path = "/";
3696         svp->sv_pathlen = 1;
3697         svp->sv_hostname = "data-server";
3698         svp->sv_hostnamelen = strlen("data-server");
3699 
3700         return (svp);
3701 }
3702 
3703 /*
3704  * XXX - this will be eliminated once everyone is calling rfs4call()
3705  * emulate the behavior of rfs4call for those who call
3706  * CLNT_CALL directly
3707  */
3708 void
3709 nfs4_error_set(nfs4_error_t *ep, enum clnt_stat rpc_status, enum nfsstat4 stat)
3710 {
3711         if (rpc_status == RPC_SUCCESS) {
3712                 ep->error = 0;       /* geterrno4 happens higher up */
3713                 ep->stat = stat;
3714                 ep->rpc_status = RPC_SUCCESS;
3715         } else {
3716                 ep->error = EPROTO;  /* XXX */
3717                 ep->stat = 0;
3718                 ep->rpc_status = rpc_status;
3719         }
3720 }
3721 
3722 /*
3723  * A function to interface with RPC tags.
3724  * Returns 0 on success
3725  */
3726 int
3727 nfs4_tag_ctl(nfs4_server_t *np, mntinfo4_t *mi, servinfo4_t *svp,
3728     sessionid4 oldsid, int cmd, cred_t *cr)
3729 {
3730         int error;
3731         CLIENT *client;
3732         struct chtab *ch;
3733         struct nfs4_clnt *nfscl;
3734 
3735         nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
3736         ASSERT(nfscl != NULL);
3737 
3738         if (svp == NULL) {
3739                 /*
3740                  * We just pick the current servinfo ptr. Even if
3741                  * this changes midstream, we should be alright, since
3742                  * we are not really going OTW. Just used to get a
3743                  * client handle.
3744                  */
3745                 mutex_enter(&mi->mi_lock);
3746                 svp = mi->mi_curr_serv;
3747                 mutex_exit(&mi->mi_lock);
3748         }
3749 
3750         error = nfs_clget4(mi, svp, cr, &client, &ch, nfscl);
3751 
3752         if (error)
3753                 return (error);
3754 
3755         switch (cmd) {
3756         case NFS4_TAG_SWAP:
3757 
3758                 /*
3759                  * To do the sessid swap first set the old tag and
3760                  * then call to swap to the new one
3761                  */
3762 
3763                 if (!CLNT_CONTROL(client, CLSET_TAG, (char *)oldsid)) {
3764                         zcmn_err(getzoneid(), CE_WARN,
3765                             "Failed to set tag on client handle");
3766                         error = EIO;
3767                         break;
3768                 }
3769 
3770                 /*
3771                  * This switches the tag value in the RPC layer
3772                  * The client handle's tag (client->cku_tag) is set
3773                  * to new tag as well.
3774                  */
3775 
3776                 if (!CLNT_CONTROL(client, CLSET_TAG_SWAP,
3777                     (char *)(np->ssx.sessionid))) {
3778                         zcmn_err(getzoneid(), CE_WARN,
3779                             "Failed to swap rpc tags");
3780                         error = EIO;
3781                 }
3782 
3783                 break;
3784 
3785         case NFS4_TAG_DESTROY:
3786 
3787                 if (!CLNT_CONTROL(client, CLSET_TAG_DESTROY,
3788                     (char *)(np->ssx.sessionid))) {
3789                         zcmn_err(getzoneid(), CE_WARN,
3790                             "Failed destroy rpc tags");
3791                         error = EIO;
3792                 }
3793                 break;
3794 
3795         case NFS4_CBSERVER_CLEANUP:
3796                 if (!CLNT_CONTROL(client, CLSET_CBSERVER_CLEANUP,
3797                     (char *)(np->ssx.sessionid))) {
3798                         zcmn_err(getzoneid(), CE_WARN,
3799                             "Failed destroy rpc tags");
3800                         error = EIO;
3801                 }
3802                 break;
3803         }
3804 
3805         clfree4(client, ch, nfscl);
3806         return (error);
3807 }
3808 
3809 /*
3810  * All NFSv4.1 defined errors
3811  */
3812 char *
3813 nfs41_strerror(nfsstat4 err)
3814 {
3815         switch (err) {
3816         case NFS4_OK:
3817                 return ("NFS4_OK");
3818         case NFS4ERR_PERM:
3819                 return ("NFS4ERR_PERM");
3820         case NFS4ERR_NOENT:
3821                 return ("NFS4ERR_NOENT");
3822         case NFS4ERR_IO:
3823                 return ("NFS4ERR_IO");
3824         case NFS4ERR_NXIO:
3825                 return ("NFS4ERR_NXIO");
3826         case NFS4ERR_ACCESS:
3827                 return ("NFS4ERR_ACCESS");
3828         case NFS4ERR_EXIST:
3829                 return ("NFS4ERR_EXIST");
3830         case NFS4ERR_XDEV:
3831                 return ("NFS4ERR_XDEV");
3832         case NFS4ERR_NOTDIR:
3833                 return ("NFS4ERR_NOTDIR");
3834         case NFS4ERR_ISDIR:
3835                 return ("NFS4ERR_ISDIR");
3836         case NFS4ERR_INVAL:
3837                 return ("NFS4ERR_INVAL");
3838         case NFS4ERR_FBIG:
3839                 return ("NFS4ERR_FBIG");
3840         case NFS4ERR_NOSPC:
3841                 return ("NFS4ERR_NOSPC");
3842         case NFS4ERR_ROFS:
3843                 return ("NFS4ERR_ROFS");
3844         case NFS4ERR_MLINK:
3845                 return ("NFS4ERR_MLINK");
3846         case NFS4ERR_NAMETOOLONG:
3847                 return ("NFS4ERR_NAMETOOLONG");
3848         case NFS4ERR_NOTEMPTY:
3849                 return ("NFS4ERR_NOTEMPTY");
3850         case NFS4ERR_DQUOT:
3851                 return ("NFS4ERR_DQUOT");
3852         case NFS4ERR_STALE:
3853                 return ("NFS4ERR_STALE");
3854         case NFS4ERR_BADHANDLE:
3855                 return ("NFS4ERR_BADHANDLE");
3856         case NFS4ERR_BAD_COOKIE:
3857                 return ("NFS4ERR_BAD_COOKIE");
3858         case NFS4ERR_NOTSUPP:
3859                 return ("NFS4ERR_NOTSUPP");
3860         case NFS4ERR_TOOSMALL:
3861                 return ("NFS4ERR_TOOSMALL");
3862         case NFS4ERR_SERVERFAULT:
3863                 return ("NFS4ERR_SERVERFAULT");
3864         case NFS4ERR_BADTYPE:
3865                 return ("NFS4ERR_BADTYPE");
3866         case NFS4ERR_DELAY:
3867                 return ("NFS4ERR_DELAY");
3868         case NFS4ERR_SAME:
3869                 return ("NFS4ERR_SAME");
3870         case NFS4ERR_DENIED:
3871                 return ("NFS4ERR_DENIED");
3872         case NFS4ERR_EXPIRED:
3873                 return ("NFS4ERR_EXPIRED");
3874         case NFS4ERR_LOCKED:
3875                 return ("NFS4ERR_LOCKED");
3876         case NFS4ERR_GRACE:
3877                 return ("NFS4ERR_GRACE");
3878         case NFS4ERR_FHEXPIRED:
3879                 return ("NFS4ERR_FHEXPIRED");
3880         case NFS4ERR_SHARE_DENIED:
3881                 return ("NFS4ERR_SHARE_DENIED");
3882         case NFS4ERR_WRONGSEC:
3883                 return ("NFS4ERR_WRONGSEC");
3884         case NFS4ERR_CLID_INUSE:
3885                 return ("NFS4ERR_CLID_INUSE");
3886         case NFS4ERR_RESOURCE:
3887                 return ("NFS4ERR_RESOURCE");
3888         case NFS4ERR_MOVED:
3889                 return ("NFS4ERR_MOVED");
3890         case NFS4ERR_NOFILEHANDLE:
3891                 return ("NFS4ERR_NOFILEHANDLE");
3892         case NFS4ERR_MINOR_VERS_MISMATCH:
3893                 return ("NFS4ERR_MINOR_VERS_MISMATCH");
3894         case NFS4ERR_STALE_CLIENTID:
3895                 return ("NFS4ERR_STALE_CLIENTID");
3896         case NFS4ERR_STALE_STATEID:
3897                 return ("NFS4ERR_STALE_STATEID");
3898         case NFS4ERR_OLD_STATEID:
3899                 return ("NFS4ERR_OLD_STATEID");
3900         case NFS4ERR_BAD_STATEID:
3901                 return ("NFS4ERR_BAD_STATEID");
3902         case NFS4ERR_BAD_SEQID:
3903                 return ("NFS4ERR_BAD_SEQID");
3904         case NFS4ERR_NOT_SAME:
3905                 return ("NFS4ERR_NOT_SAME");
3906         case NFS4ERR_LOCK_RANGE:
3907                 return ("NFS4ERR_LOCK_RANGE");
3908         case NFS4ERR_SYMLINK:
3909                 return ("NFS4ERR_SYMLINK");
3910         case NFS4ERR_RESTOREFH:
3911                 return ("NFS4ERR_RESTOREFH");
3912         case NFS4ERR_LEASE_MOVED:
3913                 return ("NFS4ERR_LEASE_MOVED");
3914         case NFS4ERR_ATTRNOTSUPP:
3915                 return ("NFS4ERR_ATTRNOTSUPP");
3916         case NFS4ERR_NO_GRACE:
3917                 return ("NFS4ERR_NO_GRACE");
3918         case NFS4ERR_RECLAIM_BAD:
3919                 return ("NFS4ERR_RECLAIM_BAD");
3920         case NFS4ERR_RECLAIM_CONFLICT:
3921                 return ("NFS4ERR_RECLAIM_CONFLICT");
3922         case NFS4ERR_BADXDR:
3923                 return ("NFS4ERR_BADXDR");
3924         case NFS4ERR_LOCKS_HELD:
3925                 return ("NFS4ERR_LOCKS_HELD");
3926         case NFS4ERR_OPENMODE:
3927                 return ("NFS4ERR_OPENMODE");
3928         case NFS4ERR_BADOWNER:
3929                 return ("NFS4ERR_BADOWNER");
3930         case NFS4ERR_BADCHAR:
3931                 return ("NFS4ERR_BADCHAR");
3932         case NFS4ERR_BADNAME:
3933                 return ("NFS4ERR_BADNAME");
3934         case NFS4ERR_BAD_RANGE:
3935                 return ("NFS4ERR_BAD_RANGE");
3936         case NFS4ERR_LOCK_NOTSUPP:
3937                 return ("NFS4ERR_LOCK_NOTSUPP");
3938         case NFS4ERR_OP_ILLEGAL:
3939                 return ("NFS4ERR_OP_ILLEGAL");
3940         case NFS4ERR_DEADLOCK:
3941                 return ("NFS4ERR_DEADLOCK");
3942         case NFS4ERR_FILE_OPEN:
3943                 return ("NFS4ERR_FILE_OPEN");
3944         case NFS4ERR_ADMIN_REVOKED:
3945                 return ("NFS4ERR_ADMIN_REVOKED");
3946         case NFS4ERR_CB_PATH_DOWN:
3947                 return ("NFS4ERR_CB_PATH_DOWN");
3948         case NFS4ERR_BADIOMODE:
3949                 return ("NFS4ERR_BADIOMODE");
3950         case NFS4ERR_BADLAYOUT:
3951                 return ("NFS4ERR_BADLAYOUT");
3952         case NFS4ERR_BAD_SESSION_DIGEST:
3953                 return ("NFS4ERR_BAD_SESSION_DIGEST");
3954         case NFS4ERR_BADSESSION:
3955                 return ("NFS4ERR_BADSESSION");
3956         case NFS4ERR_BADSLOT:
3957                 return ("NFS4ERR_BADSLOT");
3958         case NFS4ERR_COMPLETE_ALREADY:
3959                 return ("NFS4ERR_COMPLETE_ALREADY");
3960         case NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
3961                 return ("NFS4ERR_CONN_NOT_BOUND_TO_SESSION");
3962         case NFS4ERR_DELEG_ALREADY_WANTED:
3963                 return ("NFS4ERR_DELEG_ALREADY_WANTED");
3964         case NFS4ERR_BACK_CHAN_BUSY:
3965                 return ("NFS4ERR_BACK_CHAN_BUSY");
3966         case NFS4ERR_LAYOUTTRYLATER:
3967                 return ("NFS4ERR_LAYOUTTRYLATER");
3968         case NFS4ERR_LAYOUTUNAVAILABLE:
3969                 return ("NFS4ERR_LAYOUTUNAVAILABLE");
3970         case NFS4ERR_NOMATCHING_LAYOUT:
3971                 return ("NFS4ERR_NOMATCHING_LAYOUT");
3972         case NFS4ERR_RECALLCONFLICT:
3973                 return ("NFS4ERR_RECALLCONFLICT");
3974         case NFS4ERR_UNKNOWN_LAYOUTTYPE:
3975                 return ("NFS4ERR_UNKNOWN_LAYOUTTYPE");
3976         case NFS4ERR_SEQ_MISORDERED:
3977                 return ("NFS4ERR_SEQ_MISORDERED");
3978         case NFS4ERR_SEQUENCE_POS:
3979                 return ("NFS4ERR_SEQUENCE_POS");
3980         case NFS4ERR_REQ_TOO_BIG:
3981                 return ("NFS4ERR_REQ_TOO_BIG");
3982         case NFS4ERR_REP_TOO_BIG:
3983                 return ("NFS4ERR_REP_TOO_BIG");
3984         case NFS4ERR_REP_TOO_BIG_TO_CACHE:
3985                 return ("NFS4ERR_REP_TOO_BIG_TO_CACHE");
3986         case NFS4ERR_RETRY_UNCACHED_REP:
3987                 return ("NFS4ERR_RETRY_UNCACHED_REP");
3988         case NFS4ERR_UNSAFE_COMPOUND:
3989                 return ("NFS4ERR_UNSAFE_COMPOUND");
3990         case NFS4ERR_TOO_MANY_OPS:
3991                 return ("NFS4ERR_TOO_MANY_OPS");
3992         case NFS4ERR_OP_NOT_IN_SESSION:
3993                 return ("NFS4ERR_OP_NOT_IN_SESSION");
3994         case NFS4ERR_HASH_ALG_UNSUPP:
3995                 return ("NFS4ERR_HASH_ALG_UNSUPP");
3996         case NFS4ERR_CLIENTID_BUSY:
3997                 return ("NFS4ERR_CLIENTID_BUSY");
3998         case NFS4ERR_PNFS_IO_HOLE:
3999                 return ("NFS4ERR_PNFS_IO_HOLE");
4000         case NFS4ERR_SEQ_FALSE_RETRY:
4001                 return ("NFS4ERR_SEQ_FALSE_RETRY");
4002         case NFS4ERR_BAD_HIGH_SLOT:
4003                 return ("NFS4ERR_BAD_HIGH_SLOT");
4004         case NFS4ERR_DEADSESSION:
4005                 return ("NFS4ERR_DEADSESSION");
4006         case NFS4ERR_ENCR_ALG_UNSUPP:
4007                 return ("NFS4ERR_ENCR_ALG_UNSUPP");
4008         case NFS4ERR_PNFS_NO_LAYOUT:
4009                 return ("NFS4ERR_PNFS_NO_LAYOUT");
4010         case NFS4ERR_NOT_ONLY_OP:
4011                 return ("NFS4ERR_NOT_ONLY_OP");
4012         case NFS4ERR_WRONG_CRED:
4013                 return ("NFS4ERR_WRONG_CRED");
4014         case NFS4ERR_WRONG_TYPE:
4015                 return ("NFS4ERR_WRONG_TYPE");
4016         case NFS4ERR_DIRDELEG_UNAVAIL:
4017                 return ("NFS4ERR_DIRDELEG_UNAVAIL");
4018         case NFS4ERR_REJECT_DELEG:
4019                 return ("NFS4ERR_REJECT_DELEG");
4020         case NFS4ERR_RETURNCONFLICT:
4021                 return ("NFS4ERR_RETURNCONFLICT");
4022         default:
4023                 {
4024                         static char      msg[99];
4025                         static char     *ies = "Unknown NFSv4.1 error";
4026 
4027                         (void) snprintf(msg, 99, "%s: %d", ies, (int)err);
4028                         return (msg);
4029                 }
4030         }
4031 }
--- EOF ---