Move CallBack Server thread creation, initial processing and destruction to RPC
Cleanup some RPC code.
Remove extraneous fields from nfs41_cb_info and clean up the code.
Change KM_SLEEP in mir_nfs41_callback_thread to KM_NOSLEEP.
Fix lint warnings

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /* All Rights Reserved */
  28 
  29 #include <sys/param.h>
  30 #include <sys/types.h>
  31 #include <sys/systm.h>
  32 #include <sys/cred.h>
  33 #include <sys/vfs.h>
  34 #include <sys/vnode.h>
  35 #include <sys/pathname.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/kmem.h>
  38 #include <sys/kstat.h>
  39 #include <sys/mkdev.h>
  40 #include <sys/mount.h>
  41 #include <sys/statvfs.h>
  42 #include <sys/errno.h>
  43 #include <sys/debug.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/utsname.h>
  46 #include <sys/bootconf.h>
  47 #include <sys/modctl.h>
  48 #include <sys/acl.h>
  49 #include <sys/flock.h>
  50 #include <sys/kstr.h>
  51 #include <sys/stropts.h>
  52 #include <sys/strsubr.h>
  53 #include <sys/atomic.h>
  54 #include <sys/disp.h>
  55 #include <sys/policy.h>
  56 #include <sys/list.h>
  57 #include <sys/zone.h>
  58 #include <sys/sdt.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/auth.h>
  62 #include <rpc/rpcsec_gss.h>
  63 #include <rpc/clnt.h>
  64 #include <rpc/xdr.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/nfs_clnt.h>
  68 #include <nfs/mount.h>
  69 #include <nfs/nfs_acl.h>
  70 
  71 #include <fs/fs_subr.h>
  72 
  73 #include <nfs/nfs4.h>
  74 #include <nfs/rnode4.h>
  75 #include <nfs/nfs4_clnt.h>
  76 #include <nfs/nfssys.h>
  77 #include <nfs/nfs4_pnfs.h>
  78 
  79 #ifdef  DEBUG
  80 /*
  81  * These are "special" state IDs and file handles that
  82  * match any delegation state ID or file handled.  This
  83  * is for testing purposes only.
  84  */
  85 
  86 
  87 stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
  88 char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
  89 nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
  90 nfsstat4 cb4_getattr_fail = NFS4_OK;
  91 nfsstat4 cb4_recall_fail = NFS4_OK;
  92 
  93 int nfs4_callback_debug;
  94 int nfs4_recall_debug;
  95 int nfs4_drat_debug;
  96 
  97 #endif
  98 
  99 int     nfs41_birpc = 1;        /* Use bidirectional rpc */
 100 
 101 #define CB_NOTE(x)      NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
 102 #define CB_WARN(x)      NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
 103 #define CB_WARN1(x, y)  NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))
 104 
 105 enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;
 106 
 107 static zone_key_t nfs4_callback_zone_key;
 108 
 109 /*
 110  * NFS4_MAPSIZE is the number of bytes we are willing to consume
 111  * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
 112  * style delegation.
 113  */
 114 
 115 #define NFS4_MAPSIZE    8192
 116 #define NFS4_MAPWORDS   NFS4_MAPSIZE/sizeof (uint_t)
 117 #define NbPW            (NBBY*sizeof (uint_t))
 118 
 119 static int nfs4_num_prognums = 1024;
 120 static SVC_CALLOUT_TABLE nfs4_cb_sct;
 121 
 122 struct nfs4_dnode {
 123         list_node_t     linkage;
 124         rnode4_t        *rnodep;
 125         int             flags;          /* Flags for nfs4delegreturn_impl() */
 126 };
 127 
 128 static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
 129         { "delegations",        KSTAT_DATA_UINT64 },
 130         { "cb_getattr",         KSTAT_DATA_UINT64 },
 131         { "cb_recall",          KSTAT_DATA_UINT64 },
 132         { "cb_null",            KSTAT_DATA_UINT64 },
 133         { "cb_dispatch",        KSTAT_DATA_UINT64 },
 134         { "delegaccept_r",      KSTAT_DATA_UINT64 },
 135         { "delegaccept_rw",     KSTAT_DATA_UINT64 },
 136         { "delegreturn",        KSTAT_DATA_UINT64 },
 137         { "callbacks",          KSTAT_DATA_UINT64 },
 138         { "claim_cur",          KSTAT_DATA_UINT64 },
 139         { "claim_cur_ok",       KSTAT_DATA_UINT64 },
 140         { "recall_trunc",       KSTAT_DATA_UINT64 },
 141         { "recall_failed",      KSTAT_DATA_UINT64 },
 142         { "return_limit_write", KSTAT_DATA_UINT64 },
 143         { "return_limit_addmap", KSTAT_DATA_UINT64 },
 144         { "deleg_recover",      KSTAT_DATA_UINT64 },
 145         { "cb_illegal",         KSTAT_DATA_UINT64 },
 146         { "cb_sequence",        KSTAT_DATA_UINT64 }
 147 };
 148 
 149 struct nfs4_cb_port {
 150         list_node_t             linkage; /* linkage into per-zone port list */
 151         char                    netid[KNC_STRSIZE];
 152         char                    uaddr[KNC_STRSIZE];
 153         char                    protofmly[KNC_STRSIZE];
 154         char                    proto[KNC_STRSIZE];
 155 };
 156 
 157 static int cb_getattr_bytes;
 158 
 159 struct cb_recall_pass {
 160         rnode4_t        *rp;
 161         int             flags;          /* Flags for nfs4delegreturn_impl() */
 162         bool_t          truncate;
 163 };
 164 
 165 static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
 166 static void nfs4delegreturn_thread(struct cb_recall_pass *);
 167 static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
 168     int);
 169 static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
 170 static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
 171 static int nfs4delegreturn_impl(rnode4_t *, int,
 172     struct nfs4_callback_globals *);
 173 static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
 174     struct nfs4_callback_globals *);
 175 
 176 
 177 /*
 178  * Only used for non-bidirectional RPC --Performs a BC2S and
 179  * starts the cbconn_thread.
 180  * (expects np->s_lock to be held)
 181  */
 182 
 183 void
 184 nfs41set_callback(nfs4_server_t *np, servinfo4_t *svp, mntinfo4_t *mi,
 185     cred_t *cr)
 186 {
 187         struct nfs41_cb_info    *cbi;
 188         CLIENT                  *client;
 189         struct nfs4_clnt        *nfscl;
 190         int                     error;
 191 
 192         ASSERT(MUTEX_HELD(&np->s_lock));
 193 
 194         if (nfs4bind_conn_to_session(np, svp, mi, cr, CDFC4_BACK)) {
 195                 zcmn_err(getzoneid(), CE_WARN,
 196                     "Callback Channel Binding Failed");
 197                 return;
 198         }
 199 
 200         /*
 201          * The following below is to create a client handle
 202          * used only by the cbconn_thread to send out NFSPROC4_NULL
 203          * and should not be used for anything else.
 204          */
 205         cbi = np->zone_globals->nfs4prog2cbinfo[np->s_program-NFS4_CALLBACK];
 206         ASSERT(cbi != NULL);
 207         client = cbi->cb_client;
 208 
 209         /*
 210          * If client from a previous session, destroy it first
 211          */
 212         if (client) {
 213                 AUTH_DESTROY(client->cl_auth);
 214                 CLNT_DESTROY(client);
 215         }
 216 
 217         nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
 218         ASSERT(nfscl != NULL);
 219 
 220         /* Get a CLIENT handle */
 221         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
 222             NFS4_PROGRAM, NFS_V4, 0, 0, np->s_cred, &client);
 223 
 224         if (error != 0) {
 225                 zcmn_err(getzoneid(), CE_WARN,
 226                     "Failed to get handle for callback");
 227                 cbi->cb_client = NULL;
 228                 return;
 229         }
 230 
 231         /* Define this handle as a back channel handle */
 232         if (!(CLNT_CONTROL(client, CLSET_BACKCHANNEL, NULL))) {
 233                 zcmn_err(getzoneid(), CE_WARN,
 234                     "Failed to set client handle as callback");
 235                 CLNT_DESTROY(client);
 236                 cbi->cb_client = NULL;
 237                 return;
 238         }
 239 
 240         /* Associate it with the session */
 241         if (!CLNT_CONTROL(client, CLSET_TAG, (char *)(np->ssx.sessionid))) {
 242                 zcmn_err(getzoneid(), CE_WARN,
 243                     "Failed to set tag on client handle");
 244                 CLNT_DESTROY(client);
 245                 cbi->cb_client = NULL;
 246                 return;
 247         }
 248 
 249         cbi->cb_nfscl = nfscl;
 250         cbi->cb_client = client;
 251 
 252         /*
 253          * Now start the cbconn_thread
 254          */
 255 
 256         np->s_refcnt++;
 257         mutex_enter(&cbi->cb_reflock);
 258         cbi->cb_refcnt++;
 259         mutex_exit(&cbi->cb_reflock);
 260         (void) zthread_create(NULL, 0, nfs4_cbconn_thread, np, 0,
 261             minclsyspri);
 262 }
 263 
 264 /*
 265  * nfs4_cbconn_thread is used to send a null op to the server over the
 266  * backchannel connection, to keep the back channel connection up.
 267  * This is not needed for bidirectional rpc as the op_sequence
 268  * heartbeat thread is doing the same thing.
 269  */
 270 void
 271 nfs4_cbconn_thread(nfs4_server_t *np)
 272 {
 273         clock_t                 tick_delay;
 274         callb_cpr_t             cpr_info;
 275         kmutex_t                cpr_lock;
 276         struct nfs41_cb_info    *cbi;
 277         uint32_t                zilch = 0;
 278         int                     timeo;
 279         struct timeval          wait;
 280         enum clnt_stat          rpc_stat;
 281 
 282         cbi = np->zone_globals->nfs4prog2cbinfo[np->s_program-NFS4_CALLBACK];
 283         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
 284         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4cbconn");
 285 
 286         timeo = (NFS_TIMEO * hz) / 10;
 287         timeo = (MIN(NFS_TIMEO, (NFS_COTS_TIMEO / 10)) * hz) / 10;
 288         TICK_TO_TIMEVAL(timeo, &wait);
 289         tick_delay = MSEC_TO_TICK((4 * (60 * 1000L)));
 290 
 291         while (!(cbi->cb_cbconn_exit)) {
 292                 if (!(CLNT_CONTROL(cbi->cb_client, CLSET_XID,
 293                     (char *)&zilch))) {
 294                         zcmn_err(getzoneid(), CE_WARN,
 295                             "Failed to zero xid, cbconn thread exiting");
 296                         break;
 297                 }
 298                 /* Execute remote NULL procedure to establish the connection */
 299                 rpc_stat = CLNT_CALL(cbi->cb_client, NFSPROC4_NULL,
 300                     xdr_void, NULL, xdr_void, NULL, wait);
 301                 if (rpc_stat != RPC_SUCCESS) {
 302                         zcmn_err(getzoneid(), CE_WARN,
 303                             "OP_NULL failed to transmit "
 304                             " on callback connection "
 305                             "status: 0x%x, cbconn thread exiting", rpc_stat);
 306                         break;
 307                 }
 308                 mutex_enter(&cpr_lock);
 309                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
 310                 mutex_exit(&cpr_lock);
 311 
 312                 mutex_enter(&cbi->cb_cbconn_lock);
 313                 (void) cv_timedwait(&cbi->cb_cbconn_wait,
 314                     &cbi->cb_cbconn_lock, tick_delay + lbolt);
 315                 mutex_exit(&cbi->cb_cbconn_lock);
 316 
 317                 mutex_enter(&cpr_lock);
 318                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
 319                 mutex_exit(&cpr_lock);
 320         }
 321 
 322         nfs4_server_rele(np);
 323         nfs41_cbinfo_rele(cbi);
 324         mutex_enter(&cpr_lock);
 325         CALLB_CPR_EXIT(&cpr_info);
 326         cv_signal(&cbi->cb_destroy_wait);
 327         zthread_exit();
 328 }
 329 
 330 static void
 331 cb_sequence(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 332         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 333 {
 334         nfs4_server_t   *np;
 335         nfs41_cb_slot_t *cslot;
 336 
 337         CB_SEQUENCE4args *args = &argop->nfs_cb_argop4_u.opcbsequence;
 338         CB_SEQUENCE4res *resp = &resop->nfs_cb_resop4_u.opcbsequence;
 339 
 340         ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
 341 
 342         mutex_enter(&ncg->nfs4_cb_lock);
 343         np = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 344         mutex_exit(&ncg->nfs4_cb_lock);
 345         if (nfs4_server_vlock(np, 0) == FALSE) {
 346                 CB_WARN("cb_sequence: cannot find server\n");
 347                 *cs->statusp = resp->csr_status = NFS4ERR_BADHANDLE;
 348                 return;
 349         }
 350 
 351         bcopy(&args->csa_sessionid,
 352             &resp->CB_SEQUENCE4res_u.csr_resok4.csr_sessionid,
 353             sizeof (args->csa_sessionid));
 354         resp->CB_SEQUENCE4res_u.csr_resok4.csr_slotid = args->csa_slotid;
 355         resp->CB_SEQUENCE4res_u.csr_resok4.csr_sequenceid =
 356             args->csa_sequenceid;
 357         resp->CB_SEQUENCE4res_u.csr_resok4.csr_highest_slotid =
 358             args->csa_highest_slotid;
 359         resp->CB_SEQUENCE4res_u.csr_resok4.csr_target_highest_slotid =
 360             args->csa_highest_slotid;
 361 
 362         if (bcmp(&args->csa_sessionid, &np->ssx.sessionid,
 363             sizeof (np->ssx.sessionid)) != 0) {
 364                 CB_WARN("cb_sequence: Bad Sequence Id\n");
 365                 *cs->statusp = resp->csr_status = NFS4ERR_BADSESSION;
 366                 mutex_exit(&np->s_lock);
 367                 nfs4_server_rele(np);
 368                 return;
 369         }
 370 
 371         if (args->csa_slotid >= np->ssx.cb_slot_table_size) {
 372                 CB_WARN("cb_sequence: Bad Slotid\n");
 373                 *cs->statusp = resp->csr_status = NFS4ERR_BADSLOT;
 374                 mutex_exit(&np->s_lock);
 375                 nfs4_server_rele(np);
 376                 return;
 377         }
 378 
 379         cslot = np->ssx.cb_slot_table[args->csa_slotid];
 380 
 381         if (args->csa_sequenceid != cslot->cb_seq + 1 || (cslot->cb_inuse)) {
 382                 CB_WARN("cb_sequence: Bad Sequence\n");
 383                 *cs->statusp = resp->csr_status = NFS4ERR_SEQ_MISORDERED;
 384                 mutex_exit(&np->s_lock);
 385                 nfs4_server_rele(np);
 386                 return;
 387         }
 388 
 389         cslot->cb_seq = args->csa_sequenceid;
 390         /*
 391          * todo: need to set inuse and deal with server having
 392          * multiple callbacks in-flight.
 393          */
 394 
 395         *cs->statusp = resp->csr_status = NFS4_OK;
 396         mutex_exit(&np->s_lock);
 397         nfs4_server_rele(np);
 398 }
 399 
 400 static void
 401 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 402         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 403 {
 404         CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
 405         CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
 406         rnode4_t *rp;
 407         vnode_t *vp;
 408         bool_t found = FALSE;
 409         struct nfs4_server *sp;
 410         struct fattr4 *fap;
 411         rpc_inline_t *fdata;
 412         long mapcnt;
 413         fattr4_change change;
 414         fattr4_size size;
 415         uint_t rflag;
 416 
 417         ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
 418 
 419 #ifdef DEBUG
 420         /*
 421          * error injection hook: set cb_getattr_fail global to
 422          * NFS4 pcol error to be returned
 423          */
 424         if (cb4_getattr_fail != NFS4_OK) {
 425                 *cs->statusp = resp->status = cb4_getattr_fail;
 426                 return;
 427         }
 428 #endif
 429 
 430         resp->obj_attributes.attrmask =
 431             NFS4_EMPTY_ATTRMAP(RFS4_ATTRVERS(cs));
 432 
 433         mutex_enter(&ncg->nfs4_cb_lock);
 434         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 435         mutex_exit(&ncg->nfs4_cb_lock);
 436 
 437         if (nfs4_server_vlock(sp, 0) == FALSE) {
 438 
 439                 CB_WARN("cb_getattr: cannot find server\n");
 440 
 441                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 442                 return;
 443         }
 444 
 445         /*
 446          * In cb_compound, callback_ident was validated against rq_prog,
 447          * but we couldn't verify that it was set to the value we provided
 448          * at setclientid time (because we didn't have server struct yet).
 449          * Now we have the server struct, but don't have callback_ident
 450          * handy.  So, validate server struct program number against req
 451          * RPC's prog number.  At this point, we know the RPC prog num
 452          * is valid (else we wouldn't be here); however, we don't know
 453          * that it was the prog number we supplied to this server at
 454          * setclientid time.  If the prog numbers aren't equivalent, then
 455          * log the problem and fail the request because either cbserv
 456          * and/or cbclient are confused.  This will probably never happen.
 457          */
 458         if (sp->s_program != req->rq_prog) {
 459 #ifdef DEBUG
 460                 zcmn_err(getzoneid(), CE_WARN,
 461                     "cb_getattr: wrong server program number srv=%d req=%d\n",
 462                     sp->s_program, req->rq_prog);
 463 #else
 464                 zcmn_err(getzoneid(), CE_WARN,
 465                     "cb_getattr: wrong server program number\n");
 466 #endif
 467                 mutex_exit(&sp->s_lock);
 468                 nfs4_server_rele(sp);
 469                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 470                 return;
 471         }
 472 
 473         /*
 474          * Search the delegation list for a matching file handle;
 475          * mutex on sp prevents the list from changing.
 476          */
 477 
 478         rp = list_head(&sp->s_deleg_list);
 479         for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
 480                 nfs4_fhandle_t fhandle;
 481 
 482                 sfh4_copyval(rp->r_fh, &fhandle);
 483 
 484                 if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
 485                     bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
 486                     fhandle.fh_len) == 0)) {
 487 
 488                         found = TRUE;
 489                         break;
 490                 }
 491 #ifdef  DEBUG
 492                 if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
 493                     bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
 494                     args->fh.nfs_fh4_len) == 0) {
 495 
 496                         found = TRUE;
 497                         break;
 498                 }
 499 #endif
 500         }
 501 
 502         /*
 503          * VN_HOLD the vnode before releasing s_lock to guarantee
 504          * we have a valid vnode reference.
 505          */
 506         if (found == TRUE) {
 507                 vp = RTOV4(rp);
 508                 VN_HOLD(vp);
 509         }
 510 
 511         mutex_exit(&sp->s_lock);
 512         nfs4_server_rele(sp);
 513 
 514         if (found == FALSE) {
 515 
 516                 CB_WARN("cb_getattr: bad fhandle\n");
 517 
 518                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 519                 return;
 520         }
 521 
 522         /*
 523          * Figure out which attributes the server wants.  We only
 524          * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
 525          */
 526         fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
 527 
 528         /*
 529          * Don't actually need to create XDR to encode these
 530          * simple data structures.
 531          * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
 532          */
 533         fap = &resp->obj_attributes;
 534 
 535         fap->attrmask = NFS4_EMPTY_ATTRMAP(RFS4_ATTRVERS(cs));
 536         /* attrlist4_len starts at 0 and increases as attrs are processed */
 537         fap->attrlist4 = (char *)fdata;
 538         fap->attrlist4_len = 0;
 539 
 540         if (ATTR_ISSET(args->attr_request, CHANGE)) {
 541                 /*
 542                  * If the file is mmapped, then increment the change
 543                  * attribute and return it.  This will guarantee that
 544                  * the server will perceive that the file has changed
 545                  * if there is any chance that the client application
 546                  * has changed it.  Otherwise, just return the change
 547                  * attribute as it has been updated by nfs4write_deleg.
 548                  */
 549 
 550                 mutex_enter(&rp->r_statelock);
 551                 mapcnt = rp->r_mapcnt;
 552                 rflag = rp->r_flags;
 553                 mutex_exit(&rp->r_statelock);
 554 
 555                 mutex_enter(&rp->r_statev4_lock);
 556                 /*
 557                  * If object mapped, then always return new change.
 558                  * Otherwise, return change if object has dirty
 559                  * pages.  If object doesn't have any dirty pages,
 560                  * then all changes have been pushed to server, so
 561                  * reset change to grant change.
 562                  */
 563                 if (mapcnt)
 564                         rp->r_deleg_change++;
 565                 else if (! (rflag & R4DIRTY))
 566                 rp->r_deleg_change = rp->r_deleg_change_grant;
 567                 change = rp->r_deleg_change;
 568                 mutex_exit(&rp->r_statev4_lock);
 569 
 570                 /*
 571                  * Use inline XDR code directly, we know that we
 572                  * going to a memory buffer and it has enough
 573                  * space so it cannot fail.
 574                  */
 575                 IXDR_PUT_U_HYPER(fdata, change);
 576                 fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
 577                 ATTR_SET(fap->attrmask, CHANGE);
 578         }
 579 
 580         if (ATTR_ISSET(args->attr_request, SIZE)) {
 581                 /*
 582                  * Use an atomic add of 0 to fetch a consistent view
 583                  * of r_size; this avoids having to take rw_lock
 584                  * which could cause a deadlock.
 585                  */
 586                 size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
 587 
 588                 /*
 589                  * Use inline XDR code directly, we know that we
 590                  * going to a memory buffer and it has enough
 591                  * space so it cannot fail.
 592                  */
 593                 IXDR_PUT_U_HYPER(fdata, size);
 594                 fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
 595                 ATTR_SET(fap->attrmask, SIZE);
 596         }
 597 
 598         VN_RELE(vp);
 599 
 600         *cs->statusp = resp->status = NFS4_OK;
 601 }
 602 
 603 static void
 604 cb_getattr_free(nfs_cb_resop4 *resop)
 605 {
 606         if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
 607                 kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
 608                     obj_attributes.attrlist4, cb_getattr_bytes);
 609 }
 610 
 611 static nfsstat4
 612 layoutrecall_all(nfs4_server_t *np)
 613 {
 614         vnode_t *vp;
 615         rnode4_t *rp;
 616         mntinfo4_t *mi = NULL;
 617         nfs4_fsidlt_t *ltp;
 618         nfsstat4 nstatus = NFS4ERR_NOMATCHING_LAYOUT;
 619 
 620         /*
 621          * Walk thru all of the layout trees, and discard all
 622          * all the layouts, effectively discarding all the layouts
 623          * from this particular server, then do LAYOUTRETURN4_ALL.
 624          */
 625         mutex_enter(&np->s_lt_lock);
 626         for (ltp = avl_first(&np->s_fsidlt); ltp;
 627             ltp = AVL_NEXT(&np->s_fsidlt, ltp)) {
 628                 mutex_enter(&ltp->lt_rlt_lock);
 629                 for (rp = avl_first(&ltp->lt_rlayout_tree); rp;
 630                     rp = AVL_NEXT(&ltp->lt_rlayout_tree, rp)) {
 631 
 632                         vp = RTOV4(rp);
 633                         VN_HOLD(vp);
 634                         pnfs_layout_discard(rp, ltp, np);
 635                         /*
 636                          * Hold the mi to prevent it from disappearing
 637                          * after we drop the reference on the vnode.  This
 638                          * will remain held until we send the request down
 639                          * the taskq.
 640                          */
 641                         if (mi == NULL) {
 642                                 mi = VTOMI4(vp);
 643                                 MI4_HOLD(mi);
 644                         }
 645                         VN_RELE(vp);
 646                         nstatus = NFS4_OK;
 647                 }
 648                 mutex_exit(&ltp->lt_rlt_lock);
 649         }
 650         mutex_exit(&np->s_lt_lock);
 651         if (nstatus == NFS4_OK) {
 652                 pnfs_layoutreturn_bulk(mi, kcred, LAYOUTRETURN4_ALL);
 653                 MI4_RELE(mi);
 654         }
 655         return (nstatus);
 656 }
 657 
 658 
 659 static nfsstat4
 660 layoutrecall_fsid(fsid4 *recallfsid, nfs4_server_t *np)
 661 {
 662         vnode_t *vp;
 663         rnode4_t *rp;
 664         mntinfo4_t *mi = NULL;
 665         nfs4_fsidlt_t *ltp, lt;
 666         nfsstat4 nstatus = NFS4ERR_NOMATCHING_LAYOUT;
 667 
 668         lt.lt_fsid.major = recallfsid->major;
 669         lt.lt_fsid.minor = recallfsid->minor;
 670 
 671         mutex_enter(&np->s_lt_lock);
 672         ltp = avl_find(&np->s_fsidlt, &lt, NULL);
 673 
 674         /*
 675          * If no matching fsid layout tree is found, then no layouts exist
 676          * for this fsid.
 677          */
 678         if (ltp == NULL) {
 679                 mutex_exit(&np->s_lt_lock);
 680                 return (nstatus);
 681         }
 682 
 683         /*
 684          * Found a matching fsid tree, return and free all
 685          * layouts on this tree.
 686          */
 687         mutex_enter(&ltp->lt_rlt_lock);
 688         mutex_exit(&np->s_lt_lock);
 689 
 690         for (rp = avl_first(&ltp->lt_rlayout_tree); rp;
 691             rp = AVL_NEXT(&ltp->lt_rlayout_tree, rp)) {
 692                 /*
 693                  * For each rnode on this fsid's layout tree,
 694                  * discard the layout.  We do not return each
 695                  * layout individually, instead we return in
 696                  * bulk, at the end.
 697                  */
 698                 vp = RTOV4(rp);
 699                 VN_HOLD(vp);
 700                 pnfs_layout_discard(rp, ltp, np);
 701                 if (mi == NULL) {
 702                         mi = VTOMI4(vp);
 703                         MI4_HOLD(mi);
 704                 }
 705                 VN_RELE(vp);
 706                 nstatus = NFS4_OK;
 707         }
 708         mutex_exit(&ltp->lt_rlt_lock);
 709         if (nstatus == NFS4_OK) {
 710                 pnfs_layoutreturn_bulk(mi, kcred, LAYOUTRETURN4_FSID);
 711                 MI4_RELE(mi);
 712         }
 713         return (nstatus);
 714 }
 715 
 716 static nfsstat4
 717 layoutrecall_file(layoutrecall_file4 *lrf, nfs4_server_t *np)
 718 {
 719         nfs_fh4         *rawfh = &lrf->lor_fh;
 720         nfs4_sharedfh_t sfh;
 721         vnode_t         *vp;
 722         rnode4_t        lrp, *rp;
 723         nfs4_fsidlt_t   *ltp;
 724         nfsstat4 nstatus = NFS4ERR_NOMATCHING_LAYOUT;
 725 
 726         bcopy(rawfh, &sfh, sizeof (*rawfh));
 727         lrp.r_fh = &sfh;
 728 
 729         mutex_enter(&np->s_lt_lock);
 730         /*
 731          * Look thru the fsid layout trees until we find a matching
 732          * rnode on an fsid layout tree's rnode layout tree.
 733          */
 734         for (ltp = avl_first(&np->s_fsidlt); ltp;
 735             ltp = AVL_NEXT(&np->s_fsidlt, ltp)) {
 736                 /*
 737                  * Look at this fsid layout tree's rnode layout tree
 738                  * and see if it has the rnode we want based on the
 739                  * file handle.
 740                  */
 741                 mutex_enter(&ltp->lt_rlt_lock);
 742                 rp = avl_find(&ltp->lt_rlayout_tree, &lrp, NULL);
 743                 if (rp != NULL) {
 744                         vp = RTOV4(rp);
 745                         VN_HOLD(vp);
 746                         mutex_enter(&rp->r_statelock);
 747                         /*
 748                          * Since this client will only hold one layout
 749                          * for an rnode at a time, if we get a
 750                          * layoutrecall, the stateid it has should match
 751                          * ours!.
 752                          */
 753                         if (lrf->lor_stateid.seqid !=
 754                             rp->r_lostateid.seqid + 1) {
 755                                 cmn_err(CE_WARN, "our layout stateids are"
 756                                     "out of sync! rnode: %p", (void *)rp);
 757                         }
 758                         pnfs_layout_return(vp, kcred, lrf->lor_stateid,
 759                             LR_ASYNC);
 760                         mutex_exit(&rp->r_statelock);
 761                         mutex_exit(&ltp->lt_rlt_lock);
 762                         VN_RELE(vp);
 763                         nstatus = NFS4_OK;
 764                         break;
 765                 }
 766                 mutex_exit(&ltp->lt_rlt_lock);
 767         }
 768         mutex_exit(&np->s_lt_lock);
 769         return (nstatus);
 770 }
 771 
 772 static void
 773 cb_layoutrecall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 774         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 775 {
 776         CB_LAYOUTRECALL4args *args = &argop->nfs_cb_argop4_u.opcblayoutrecall;
 777         CB_LAYOUTRECALL4res *resp = &resop->nfs_cb_resop4_u.opcblayoutrecall;
 778         struct nfs4_server *sp;
 779 
 780         if (args->clora_type != LAYOUT4_NFSV4_1_FILES) {
 781                 DTRACE_PROBE1(nfsc__i__badlayoutype, int32_t,
 782                     args->clora_type);
 783                 *cs->statusp = resp->clorr_status = NFS4ERR_INVAL;
 784                 return;
 785         }
 786 
 787         mutex_enter(&ncg->nfs4_cb_lock);
 788         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 789         mutex_exit(&ncg->nfs4_cb_lock);
 790 
 791         if (nfs4_server_vlock(sp, 0) == FALSE) {
 792                 DTRACE_PROBE1(nfsc__i__bad_prog, int, req->rq_prog);
 793                 *cs->statusp = resp->clorr_status = NFS4ERR_NOMATCHING_LAYOUT;
 794                 return;
 795         }
 796         mutex_exit(&sp->s_lock);
 797 
 798         switch (args->clora_recall.lor_recalltype) {
 799         case LAYOUTRECALL4_FILE:
 800                 *cs->statusp = resp->clorr_status =
 801                     layoutrecall_file(&args->clora_recall.
 802                     layoutrecall4_u.lor_layout, sp);
 803                 break;
 804         case LAYOUTRECALL4_FSID:
 805                 *cs->statusp = resp->clorr_status =
 806                     layoutrecall_fsid(&args->clora_recall.
 807                     layoutrecall4_u.lor_fsid, sp);
 808                 break;
 809         case LAYOUTRECALL4_ALL:
 810                 *cs->statusp = resp->clorr_status = layoutrecall_all(sp);
 811                 break;
 812         default:
 813                 *cs->statusp = resp->clorr_status = NFS4ERR_INVAL;
 814         }
 815         nfs4_server_rele(sp);
 816 
 817         if (resp->clorr_status != NFS4_OK)
 818                 DTRACE_PROBE2(nfsc__i__cblayouterr,
 819                     nfs4_server_t *, sp, nfsstat, resp->clorr_status);
 820 }
 821 
 822 static nfsstat4
 823 cb_notify_device(nfs4_server_t *sp, notify4 *no)
 824 {
 825         nfsstat4 stat = NFS4_OK;
 826         XDR x;
 827         notify_deviceid_change4 ndc;
 828         notify_deviceid_delete4 ndd;
 829 
 830         /* check for missing or extra bits */
 831         if ((no->notify_mask &
 832             ~(NOTIFY_DEVICEID4_CHANGE_MASK|NOTIFY_DEVICEID4_DELETE_MASK)) ||
 833             (no->notify_mask == 0))
 834                 DTRACE_PROBE1(nfsc__i__bad_mask, bitmap4 *, no->notify_mask);
 835 
 836         xdrmem_create(&x, no->notify_vals.notifylist4_val,
 837             no->notify_vals.notifylist4_len, XDR_DECODE);
 838         /*
 839          * The order of checking is significant.  Oddly, both bits
 840          * could be set.
 841          */
 842         if (no->notify_mask & NOTIFY_DEVICEID4_CHANGE_MASK) {
 843 
 844                 if (!xdr_notify_deviceid_change4(&x, &ndc))
 845                         stat = NFS4ERR_BADXDR;
 846                 else {
 847                         stat = pnfs_change_device(sp, &ndc);
 848                         xdr_free(xdr_notify_deviceid_change4, (caddr_t)&ndc);
 849                 }
 850         }
 851         if (stat == NFS4_OK &&
 852             (no->notify_mask & NOTIFY_DEVICEID4_DELETE_MASK)) {
 853 
 854                 if (!xdr_notify_deviceid_delete4(&x, &ndd))
 855                         stat = NFS4ERR_BADXDR;
 856                 else {
 857                         stat = pnfs_delete_device(sp, &ndd);
 858                         xdr_free(xdr_notify_deviceid_change4, (caddr_t)&ndd);
 859                 }
 860         }
 861 
 862         return (stat);
 863 }
 864 
 865 static void
 866 cb_notify_deviceid(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop,
 867     struct svc_req *req, struct compound_state *cs,
 868     struct nfs4_callback_globals *ncg)
 869 {
 870         CB_NOTIFY_DEVICEID4args *args =
 871             &argop->nfs_cb_argop4_u.opcbnotify_deviceid;
 872         CB_NOTIFY_DEVICEID4res *resp =
 873             &resop->nfs_cb_resop4_u.opcbnotify_deviceid;
 874         struct nfs4_server *sp;
 875         int i;
 876         nfsstat4 stat;
 877 
 878         mutex_enter(&ncg->nfs4_cb_lock);
 879         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 880         mutex_exit(&ncg->nfs4_cb_lock);
 881 
 882         if (nfs4_server_vlock(sp, 0) == FALSE) {
 883                 DTRACE_PROBE1(nfsc__i__bad_prog, int, req->rq_prog);
 884                 *cs->statusp = resp->cndr_status = NFS4ERR_INVAL;
 885                 return;
 886         }
 887         mutex_exit(&sp->s_lock);
 888 
 889         stat = NFS4_OK;
 890         for (i = 0; i < args->cnda_changes.cnda_changes_len; i++)
 891                 if ((stat = cb_notify_device(sp,
 892                     &args->cnda_changes.cnda_changes_val[i])) != NFS4_OK)
 893                         break;
 894 
 895         *cs->statusp = resp->cndr_status = stat;
 896         nfs4_server_rele(sp);
 897 }
 898 
 899 
 900 static void
 901 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 902         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 903 {
 904         CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
 905         CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
 906         rnode4_t *rp;
 907         vnode_t *vp;
 908         struct nfs4_server *sp;
 909         bool_t found = FALSE;
 910 
 911         ncg->nfs4_callback_stats.cb_recall.value.ui64++;
 912 
 913         ASSERT(req->rq_prog >= NFS4_CALLBACK);
 914         ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
 915 
 916 #ifdef DEBUG
 917         /*
 918          * error injection hook: set cb_recall_fail global to
 919          * NFS4 pcol error to be returned
 920          */
 921         if (cb4_recall_fail != NFS4_OK) {
 922                 *cs->statusp = resp->status = cb4_recall_fail;
 923                 return;
 924         }
 925 #endif
 926 
 927         mutex_enter(&ncg->nfs4_cb_lock);
 928         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 929         mutex_exit(&ncg->nfs4_cb_lock);
 930 
 931         if (nfs4_server_vlock(sp, 0) == FALSE) {
 932 
 933                 CB_WARN("cb_recall: cannot find server\n");
 934 
 935                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 936                 return;
 937         }
 938 
 939         /*
 940          * Search the delegation list for a matching file handle
 941          * AND stateid; mutex on sp prevents the list from changing.
 942          */
 943 
 944         rp = list_head(&sp->s_deleg_list);
 945         for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
 946                 mutex_enter(&rp->r_statev4_lock);
 947 
 948                 /* check both state id and file handle! */
 949 
 950                 if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
 951                     sizeof (stateid4)) == 0)) {
 952                         nfs4_fhandle_t fhandle;
 953 
 954                         sfh4_copyval(rp->r_fh, &fhandle);
 955                         if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
 956                             bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
 957                             fhandle.fh_len) == 0)) {
 958 
 959                                 found = TRUE;
 960                                 break;
 961                         } else {
 962 #ifdef  DEBUG
 963                                 CB_WARN("cb_recall: stateid OK, bad fh");
 964 #endif
 965                         }
 966                 }
 967 #ifdef  DEBUG
 968                 if (bcmp(&args->stateid, &nfs4_deleg_any,
 969                     sizeof (stateid4)) == 0) {
 970 
 971                         found = TRUE;
 972                         break;
 973                 }
 974 #endif
 975                 mutex_exit(&rp->r_statev4_lock);
 976         }
 977 
 978         /*
 979          * VN_HOLD the vnode before releasing s_lock to guarantee
 980          * we have a valid vnode reference.  The async thread will
 981          * release the hold when it's done.
 982          */
 983         if (found == TRUE) {
 984                 mutex_exit(&rp->r_statev4_lock);
 985                 vp = RTOV4(rp);
 986                 VN_HOLD(vp);
 987         }
 988         mutex_exit(&sp->s_lock);
 989         nfs4_server_rele(sp);
 990 
 991         if (found == FALSE) {
 992 
 993                 CB_WARN("cb_recall: bad stateid\n");
 994 
 995                 *cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
 996                 return;
 997         }
 998 
 999         /* Fire up a thread to do the delegreturn */
1000         nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
1001             args->truncate);
1002 
1003         *cs->statusp = resp->status = 0;
1004 }
1005 
1006 /* ARGSUSED */
1007 static void
1008 cb_recall_free(nfs_cb_resop4 *resop)
1009 {
1010         /* nothing to do here, cb_recall doesn't kmem_alloc */
1011 }
1012 
1013 /*
1014  * This function handles the CB_NULL proc call from an NFSv4 Server.
1015  *
1016  * We take note that the server has sent a CB_NULL for later processing
1017  * in the recovery logic. It is noted so we may pause slightly after the
1018  * setclientid and before reopening files. The pause is to allow the
1019  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
1020  * its internal structures such that it has the opportunity to grant
1021  * delegations to reopened files.
1022  *
1023  */
1024 
1025 /* ARGSUSED */
1026 static void
1027 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
1028     struct nfs4_callback_globals *ncg)
1029 {
1030         struct nfs4_server *sp;
1031 
1032         ncg->nfs4_callback_stats.cb_null.value.ui64++;
1033 
1034         ASSERT(req->rq_prog >= NFS4_CALLBACK);
1035         ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
1036 
1037         mutex_enter(&ncg->nfs4_cb_lock);
1038         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
1039         mutex_exit(&ncg->nfs4_cb_lock);
1040 
1041         if (nfs4_server_vlock(sp, 0) != FALSE) {
1042                 sp->s_flags |= N4S_CB_PINGED;
1043                 cv_broadcast(&sp->wait_cb_null);
1044                 mutex_exit(&sp->s_lock);
1045                 nfs4_server_rele(sp);
1046         }
1047 }
1048 
1049 /*
1050  * cb_illegal   args: void
1051  *              res : status (NFS4ERR_OP_CB_ILLEGAL)
1052  */
1053 /* ARGSUSED */
1054 static void
1055 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
1056         struct compound_state *cs, struct nfs4_callback_globals *ncg)
1057 {
1058         CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
1059 
1060         ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
1061         resop->resop = OP_CB_ILLEGAL;
1062         *cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
1063 }
1064 
1065 static void
1066 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
1067         struct nfs4_callback_globals *ncg)
1068 {
1069         uint_t i;
1070         struct compound_state cs;
1071         nfs_cb_argop4 *argop;
1072         nfs_cb_resop4 *resop, *new_res;
1073         uint_t op, mvers_0;
1074         boolean_t       sequenced = FALSE;
1075 
1076         bzero(&cs, sizeof (cs));
1077         cs.statusp = &resp->status;
1078         cs.cont = TRUE;
1079 
1080         /*
1081          * Form a reply tag by copying over the reqeuest tag.
1082          */
1083         resp->tag.utf8string_len = args->tag.utf8string_len;
1084         resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
1085             KM_SLEEP);
1086         bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
1087             args->tag.utf8string_len);
1088 
1089         /*
1090          * minorversion should be zero or one
1091          */
1092         if (args->minorversion != CB4_MINOR_v0 &&
1093             args->minorversion != CB4_MINOR_v1) {
1094                 resp->array_len = 0;
1095                 resp->array = NULL;
1096                 resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
1097                 return;
1098         }
1099 
1100         /*
1101          * The XDR code for CB_COMPOUND decodes all cb ops regardless
1102          * of the minorversion of the compound containing the ops.
1103          *
1104          * mvers_0 is used to validate ops according to minor version:
1105          * - only mvers 0 cb ops are allowed in mv 0 cb compounds
1106          * - "is sequenced" checks only apply to mv 1 cb compunds
1107          */
1108         mvers_0 = (args->minorversion == CB4_MINOR_v0);
1109 
1110 #ifdef DEBUG
1111         /*
1112          * Verify callback_ident.  It doesn't really matter if it's wrong
1113          * because we don't really use callback_ident -- we use prog number
1114          * of the RPC request instead.  In this case, just print a DEBUG
1115          * console message to reveal brokenness of cbclient (at bkoff/cthon).
1116          */
1117         if (args->callback_ident != req->rq_prog)
1118                 zcmn_err(getzoneid(), CE_WARN,
1119                     "cb_compound: cb_client using wrong "
1120                     "callback_ident(%d), should be %d",
1121                     args->callback_ident, req->rq_prog);
1122 #endif
1123 
1124         resp->array_len = args->array_len;
1125         resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
1126             KM_SLEEP);
1127 
1128         for (i = 0; i < args->array_len && cs.cont; i++) {
1129 
1130                 argop = &args->array[i];
1131                 resop = &resp->array[i];
1132                 resop->resop = argop->argop;
1133                 op = (uint_t)resop->resop;
1134 
1135                 switch (op) {
1136 
1137                 case OP_CB_SEQUENCE:
1138 
1139                         if (mvers_0) {
1140                                 op = OP_CB_ILLEGAL;
1141                                 cb_illegal(argop, resop, req, &cs, ncg);
1142                                 break;
1143                         }
1144                         cb_sequence(argop, resop, req, &cs, ncg);
1145                         if (*cs.statusp == NFS4_OK)
1146                                 sequenced = TRUE;
1147                         break;
1148 
1149                 case OP_CB_GETATTR:
1150 
1151                         if (!sequenced && !mvers_0) {
1152                                 *cs.statusp = resp->status =
1153                                     NFS4ERR_SEQUENCE_POS;
1154                                 break;
1155                         }
1156                         cb_getattr(argop, resop, req, &cs, ncg);
1157                         break;
1158 
1159                 case OP_CB_RECALL:
1160                         if (!sequenced && !mvers_0) {
1161                                 *cs.statusp = resp->status =
1162                                     NFS4ERR_SEQUENCE_POS;
1163                                 break;
1164                         }
1165                         cb_recall(argop, resop, req, &cs, ncg);
1166                         break;
1167 
1168                 case OP_CB_LAYOUTRECALL:
1169                         if (mvers_0) {
1170                                 op = OP_CB_ILLEGAL;
1171                                 cb_illegal(argop, resop, req, &cs, ncg);
1172                                 break;
1173                         }
1174                         if (!sequenced) {
1175                                 *cs.statusp = resp->status =
1176                                     NFS4ERR_SEQUENCE_POS;
1177                                 break;
1178                         }
1179                         cb_layoutrecall(argop, resop, req, &cs, ncg);
1180                         break;
1181 
1182                 case OP_CB_NOTIFY_DEVICEID:
1183                         if (mvers_0) {
1184                                 op = OP_CB_ILLEGAL;
1185                                 cb_illegal(argop, resop, req, &cs, ncg);
1186                                 break;
1187                         }
1188                         if (!sequenced) {
1189                                 *cs.statusp = resp->status =
1190                                     NFS4ERR_SEQUENCE_POS;
1191                                 break;
1192                         }
1193                         cb_notify_deviceid(argop, resop, req, &cs, ncg);
1194                         break;
1195 
1196                 case OP_CB_ILLEGAL:
1197                         if (!sequenced && !mvers_0) {
1198                                 *cs.statusp = resp->status =
1199                                     NFS4ERR_SEQUENCE_POS;
1200                                 break;
1201                         }
1202                         /* fall through */
1203 
1204                 default:
1205                         /*
1206                          * Handle OP_CB_ILLEGAL and any undefined opcode.
1207                          * Currently, the XDR code will return BADXDR
1208                          * if cb op doesn't decode to legal value, so
1209                          * it really only handles OP_CB_ILLEGAL.
1210                          */
1211                         op = OP_CB_ILLEGAL;
1212                         cb_illegal(argop, resop, req, &cs, ncg);
1213                 }
1214 
1215                 if (*cs.statusp != NFS4_OK)
1216                         cs.cont = FALSE;
1217 
1218                 /*
1219                  * If not at last op, and if we are to stop, then
1220                  * compact the results array.
1221                  */
1222                 if ((i + 1) < args->array_len && !cs.cont) {
1223 
1224                         new_res = kmem_alloc(
1225                             (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
1226                         bcopy(resp->array,
1227                             new_res, (i+1) * sizeof (nfs_cb_resop4));
1228                         kmem_free(resp->array,
1229                             args->array_len * sizeof (nfs_cb_resop4));
1230 
1231                         resp->array_len =  i + 1;
1232                         resp->array = new_res;
1233                 }
1234         }
1235 
1236 }
1237 
1238 static void
1239 cb_compound_free(CB_COMPOUND4res *resp)
1240 {
1241         uint_t i, op;
1242         nfs_cb_resop4 *resop;
1243 
1244         if (resp->tag.utf8string_val) {
1245                 UTF8STRING_FREE(resp->tag)
1246         }
1247 
1248         for (i = 0; i < resp->array_len; i++) {
1249 
1250                 resop = &resp->array[i];
1251                 op = (uint_t)resop->resop;
1252 
1253                 switch (op) {
1254 
1255                 case OP_CB_GETATTR:
1256 
1257                         cb_getattr_free(resop);
1258                         break;
1259 
1260                 case OP_CB_RECALL:
1261 
1262                         cb_recall_free(resop);
1263                         break;
1264 
1265                 default:
1266                         break;
1267                 }
1268         }
1269 
1270         if (resp->array != NULL) {
1271                 kmem_free(resp->array,
1272                     resp->array_len * sizeof (nfs_cb_resop4));
1273         }
1274 }
1275 
1276 static void
1277 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
1278 {
1279         CB_COMPOUND4args args;
1280         CB_COMPOUND4res res;
1281         struct nfs4_callback_globals *ncg;
1282 
1283         bool_t (*xdr_args)(), (*xdr_res)();
1284         void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
1285             struct nfs4_callback_globals *);
1286         void (*freeproc)(CB_COMPOUND4res *);
1287 
1288         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1289         ASSERT(ncg != NULL);
1290 
1291         ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
1292 
1293         switch (req->rq_proc) {
1294         case CB_NULL:
1295                 xdr_args = xdr_void;
1296                 xdr_res = xdr_void;
1297                 proc = cb_null;
1298                 freeproc = NULL;
1299                 break;
1300 
1301         case CB_COMPOUND:
1302                 xdr_args = xdr_CB_COMPOUND4args_clnt;
1303                 xdr_res = xdr_CB_COMPOUND4res;
1304                 proc = cb_compound;
1305                 freeproc = cb_compound_free;
1306                 break;
1307 
1308         default:
1309                 CB_WARN("cb_dispatch: no proc\n");
1310                 svcerr_noproc(xprt);
1311                 return;
1312         }
1313 
1314         args.tag.utf8string_val = NULL;
1315         args.array = NULL;
1316 
1317         if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
1318 
1319                 CB_WARN("cb_dispatch: cannot getargs\n");
1320                 svcerr_decode(xprt);
1321                 return;
1322         }
1323 
1324         (*proc)(&args, &res, req, ncg);
1325 
1326         if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
1327 
1328                 CB_WARN("cb_dispatch: bad sendreply\n");
1329                 svcerr_systemerr(xprt);
1330         }
1331 
1332         if (freeproc)
1333                 (*freeproc)(&res);
1334 
1335         if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
1336 
1337                 CB_WARN("cb_dispatch: bad freeargs\n");
1338         }
1339 }
1340 
1341 static rpcprog_t
1342 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
1343 {
1344         int i, j;
1345 
1346         j = ncg->nfs4_program_hint;
1347         for (i = 0; i < nfs4_num_prognums; i++, j++) {
1348 
1349                 if (j >= nfs4_num_prognums)
1350                         j = 0;
1351 
1352                 if (ncg->nfs4prog2server[j] == NULL) {
1353                         ncg->nfs4_program_hint = j+1;
1354                         return (j+NFS4_CALLBACK);
1355                 }
1356         }
1357 
1358         return (0);
1359 }
1360 
1361 void
1362 nfs4callback_destroy(nfs4_server_t *np)
1363 {
1364         struct nfs4_callback_globals *ncg;
1365         struct nfs41_cb_info *cbi;
1366         int i;
1367 
1368         if (np->s_program == 0)
1369                 return;
1370 
1371         ncg = np->zone_globals;
1372         cbi = ncg->nfs4prog2cbinfo[np->s_program - NFS4_CALLBACK];
1373 
1374         i = np->s_program - NFS4_CALLBACK;
1375 
1376         mutex_enter(&ncg->nfs4_cb_lock);
1377 
1378         ASSERT(ncg->nfs4prog2server[i] == np);
1379 
1380         ncg->nfs4prog2server[i] = NULL;
1381         ncg->nfs4prog2cbinfo[i] = NULL;
1382 
1383         if (i < ncg->nfs4_program_hint)
1384                 ncg->nfs4_program_hint = i;
1385 
1386         mutex_exit(&ncg->nfs4_cb_lock);
1387         np->s_program = 0;
1388         if (cbi != NULL)
1389                 nfs41_cbinfo_rele(cbi);
1390 }
1391 
1392 void
1393 nfs41_cbinfo_rele(struct nfs41_cb_info *cbi)
1394 {
1395         mutex_enter(&cbi->cb_reflock);
1396         cbi->cb_refcnt--;
1397         if (cbi->cb_refcnt > 0) {
1398                 mutex_exit(&cbi->cb_reflock);
1399                 return;
1400         }
1401         ASSERT(cbi->cb_flags & NFS41_CB_THREAD_EXIT);
1402         ASSERT(cbi->cb_cbconn_exit);
1403         mutex_exit(&cbi->cb_reflock);
1404 
1405         cbi->cb_rpc->r_flags |= SVCCB_DEAD;
1406         cv_signal(&cbi->cb_rpc->r_cbwait);    /* XXX - See mir_set_cbinfo */
1407 
1408         if (cbi->cb_client) {
1409                 if (!(CLNT_CONTROL(cbi->cb_client,
1410                     CLSET_BACKCHANNEL_CLEAR, NULL))) {
1411                         zcmn_err(getzoneid(), CE_WARN,
1412                             "Failed To Clear Client Handle Callback %p",
1413                             (void *)cbi->cb_client);
1414                 }
1415                 CLNT_DESTROY(cbi->cb_client);
1416         }
1417         mutex_destroy(&cbi->cb_cbconn_lock);
1418         cv_destroy(&cbi->cb_destroy_wait);
1419         cv_destroy(&cbi->cb_cbconn_wait);
1420         mutex_destroy(&cbi->cb_reflock);
1421         kmem_free(cbi, sizeof (*cbi));
1422 }
1423 
1424 /*
1425  * nfs4_setport - This function saves a netid and univeral address for
1426  * the callback program.  These values will be used during setclientid.
1427  */
1428 static void
1429 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
1430         struct nfs4_callback_globals *ncg)
1431 {
1432         struct nfs4_cb_port *p;
1433         bool_t found = FALSE;
1434 
1435         ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
1436 
1437         p = list_head(&ncg->nfs4_cb_ports);
1438         for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
1439                 if (strcmp(p->netid, netid) == 0) {
1440                         found = TRUE;
1441                         break;
1442                 }
1443         }
1444         if (found == TRUE)
1445                 (void) strcpy(p->uaddr, uaddr);
1446         else {
1447                 p = kmem_alloc(sizeof (*p), KM_SLEEP);
1448 
1449                 (void) strcpy(p->uaddr, uaddr);
1450                 (void) strcpy(p->netid, netid);
1451                 (void) strcpy(p->protofmly, protofmly);
1452                 (void) strcpy(p->proto, proto);
1453                 list_insert_head(&ncg->nfs4_cb_ports, p);
1454         }
1455 }
1456 
1457 static void
1458 nfs41_callback_thread(struct nfs41_cb_info *cbi)
1459 {
1460         callb_cpr_t     cprinfo;
1461         kmutex_t        cpr_lock;
1462         SVCXPRT         *clone_xprt;
1463         mblk_t          *mp;
1464         struct rpc_msg  msg;
1465         struct svc_req  r;
1466         char            *cred_area;
1467         int             rqcred_size = 400;      /* RQCRED_SIZE */
1468         SVCCB           *cb = cbi->cb_rpc;
1469 
1470         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
1471         CALLB_CPR_INIT(&cprinfo, &cpr_lock, callb_generic_cpr,
1472             "nfs41_cb");
1473 
1474         mutex_enter(&cbi->cb_rpc->r_lock);
1475         while (!(cbi->cb_flags & NFS41_CB_THREAD_EXIT)) {
1476                 mutex_enter(&cpr_lock);
1477                 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1478                 mutex_exit(&cpr_lock);
1479 
1480                 cv_wait(&cbi->cb_rpc->r_cbwait, &cbi->cb_rpc->r_lock);
1481 
1482                 mutex_enter(&cpr_lock);
1483                 CALLB_CPR_SAFE_END(&cprinfo, &cpr_lock);
1484                 mutex_exit(&cpr_lock);
1485 
1486                 if (cbi->cb_flags & NFS41_CB_THREAD_EXIT)
1487                         break;
1488 
1489                 mutex_exit(&cbi->cb_rpc->r_lock);
1490 
1491                 mutex_enter(&cb->r_mlock);
1492                 mp = cb->r_mp;
1493                 cb->r_mp = NULL;
1494                 mutex_exit(&cb->r_mlock);
1495 
1496                 clone_xprt = svc_clone_init();
1497 
1498                 svc_init_clone_xprt(clone_xprt, cb->r_q);
1499                 clone_xprt->xp_master = NULL;
1500                 clone_xprt->xp_msg_size = 2048; /* COTS_MAX_ALLOCSIZE */
1501                 cred_area = kmem_zalloc(2 * MAX_AUTH_BYTES + rqcred_size,
1502                     KM_SLEEP);
1503                 msg.rm_call.cb_cred.oa_base = cred_area;
1504                 msg.rm_call.cb_verf.oa_base = &(cred_area[MAX_AUTH_BYTES]);
1505                 r.rq_clntcred = &(cred_area[2 * MAX_AUTH_BYTES]);
1506 
1507                 /*
1508                  * underlying transport recv routine may modify mblk data
1509                  * and make it difficult to extract label afterwards. So
1510                  * get the label from the raw mblk data now.
1511                  */
1512                 if (is_system_labeled()) {
1513                         mblk_t *lmp;
1514 
1515                         r.rq_label = kmem_alloc(sizeof (bslabel_t), KM_SLEEP);
1516                         if (DB_CRED(mp) != NULL)
1517                                 lmp = mp;
1518                 else {
1519                         ASSERT(mp->b_cont != NULL);
1520                         lmp = mp->b_cont;
1521                         ASSERT(DB_CRED(lmp) != NULL);
1522                 }
1523                 bcopy(label2bslabel(crgetlabel(DB_CRED(lmp))), r.rq_label,
1524                     sizeof (bslabel_t));
1525                 } else {
1526                         r.rq_label = NULL;
1527                 }
1528 
1529                 /*
1530                  * Now receive the message.
1531                  */
1532                 if (SVC_RECV(clone_xprt, mp, &msg)) {
1533                         void (*dispatchroutine) (struct svc_req *, SVCXPRT *);
1534                         bool_t no_dispatch;
1535                         enum auth_stat why;
1536 
1537                         /*
1538                          * Find the registered program and call its
1539                          * dispatch routine.
1540                          */
1541                         r.rq_xprt = clone_xprt;
1542                         r.rq_prog = msg.rm_call.cb_prog;
1543                         r.rq_vers = msg.rm_call.cb_vers;
1544                         r.rq_proc = msg.rm_call.cb_proc;
1545                         r.rq_cred = msg.rm_call.cb_cred;
1546 
1547                         if ((why = sec_svc_msg(&r, &msg, &no_dispatch)) !=
1548                             AUTH_OK) {
1549                                 svcerr_auth(clone_xprt, why);
1550                                 (void) SVC_FREEARGS(clone_xprt, NULL, NULL);
1551                         } else if (no_dispatch) {
1552                                 (void) SVC_FREEARGS(clone_xprt, NULL, NULL);
1553                         } else {
1554                                 if (r.rq_vers >= cbi->cb_versmin &&
1555                                     r.rq_vers <= cbi->cb_versmax) {
1556                                         dispatchroutine = cbi->cb_callback;
1557                                         (*dispatchroutine) (&r, clone_xprt);
1558                                 } else {
1559                                         svcerr_progvers(clone_xprt,
1560                                             cbi->cb_versmin,
1561                                             cbi->cb_versmax);
1562                                 }
1563                         (void) SVC_FREEARGS(clone_xprt, NULL, NULL);
1564                         }
1565                         if (r.rq_cred.oa_flavor == RPCSEC_GSS)
1566                                 rpc_gss_cleanup(clone_xprt);
1567                 }
1568                 if (r.rq_label != NULL)
1569                         kmem_free(r.rq_label, sizeof (bslabel_t));
1570                 mutex_enter(&cbi->cb_rpc->r_lock);
1571         }
1572         cbi->cb_thread = NULL;
1573         mutex_exit(&cbi->cb_rpc->r_lock);
1574         mutex_enter(&cpr_lock);
1575         CALLB_CPR_EXIT(&cprinfo);
1576 
1577         nfs41_cbinfo_rele(cbi);
1578 
1579         /*
1580          * Signal destroy_session that we are done.
1581          */
1582         cv_signal(&cbi->cb_destroy_wait);
1583 
1584         zthread_exit();
1585 }
1586 
1587 
1588 /*
1589  * nfs4_cb_args - This function is used to construct the callback
1590  * portion of the arguments needed for setclientid.
1591  */
1592 
1593 void
1594 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
1595 {
1596         struct nfs4_cb_port *p;
1597         bool_t found = FALSE;
1598         rpcprog_t pgm;
1599         struct nfs4_callback_globals *ncg = np->zone_globals;
1600 
1601         /*
1602          * This server structure may already have a program number
1603          * assigned to it.  This happens when the client has to
1604          * re-issue SETCLIENTID.  Just re-use the information.
1605          */
1606         if (np->s_program >= NFS4_CALLBACK &&
1607             np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
1608                 nfs4callback_destroy(np);
1609 
1610         mutex_enter(&ncg->nfs4_cb_lock);
1611 
1612         p = list_head(&ncg->nfs4_cb_ports);
1613         for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
1614                 if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
1615                     strcmp(p->proto, knc->knc_proto) == 0) {
1616                         found = TRUE;
1617                         break;
1618                 }
1619         }
1620 
1621         if (found == FALSE) {
1622 
1623                 NFS4_DEBUG(nfs4_callback_debug,
1624                     (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
1625                     knc->knc_protofmly, knc->knc_proto));
1626 
1627                 args->callback.cb_program = 0;
1628                 args->callback.cb_location.r_netid = NULL;
1629                 args->callback.cb_location.r_addr = NULL;
1630                 args->callback_ident = 0;
1631                 mutex_exit(&ncg->nfs4_cb_lock);
1632                 return;
1633         }
1634 
1635         if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
1636                 CB_WARN("nfs4_cb_args: out of program numbers\n");
1637 
1638                 args->callback.cb_program = 0;
1639                 args->callback.cb_location.r_netid = NULL;
1640                 args->callback.cb_location.r_addr = NULL;
1641                 args->callback_ident = 0;
1642                 mutex_exit(&ncg->nfs4_cb_lock);
1643                 return;
1644         }
1645 
1646         ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
1647         args->callback.cb_program = pgm;
1648         args->callback.cb_location.r_netid = p->netid;
1649         args->callback.cb_location.r_addr = p->uaddr;
1650         args->callback_ident = pgm;
1651 
1652         np->s_program = pgm;
1653 
1654         mutex_exit(&ncg->nfs4_cb_lock);
1655 }
1656 
1657 /*
1658  * nfs41_cb_args - This function is used to construct the callback
1659  * portion of the arguments needed for create_session.
      *
      * A callback program number is obtained for 'np', an nfs41_cb_info
      * for that program number is allocated (or the existing table entry
      * is picked up), and csa_cb_program plus a single AUTH_NONE
      * callback_sec_parms4 entry are stored in 'args'.  If no program
      * number is available, 'args' gets program 0 and an empty security
      * parameter list.  'knc' is not referenced.
      *
      * NOTE(review): when an existing nfs4prog2cbinfo entry is picked up,
      * its condition variables and mutexes are initialized again, its
      * reference count is reset to 2 and a new cb_rpc is allocated over
      * the old pointer -- confirm the teardown path makes that safe.
1660  */
1661 /* ARGSUSED */
1662 void
1663 nfs41_cb_args(nfs4_server_t *np, struct knetconfig *knc,
1664         CREATE_SESSION4args *args)
1665 {
1666         rpcprog_t pgm;
1667         struct nfs4_callback_globals *ncg = np->zone_globals;
1668         struct nfs41_cb_info    *cbi;
1669
1670         /*
1671          * This server structure may already have a program number
1672          * assigned to it.  If that number lies in the callback range,
1673          * hand the server to nfs4callback_destroy() before assigning anew.
1674          */
1675         if (np->s_program >= NFS4_CALLBACK &&
1676             np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
1677                 nfs4callback_destroy(np);
1678
1679         mutex_enter(&ncg->nfs4_cb_lock);
1680
1681         if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
             /* NOTE(review): the message below names nfs4_cb_args. */
1682                 CB_WARN("nfs4_cb_args: out of program numbers\n");
1683
1684                 args->csa_cb_program = 0;
1685                 args->csa_sec_parms.csa_sec_parms_len = 0;
1686                 args->csa_sec_parms.csa_sec_parms_val = NULL;
1687                 mutex_exit(&ncg->nfs4_cb_lock);
1688                 return;
1689         }
1690
             /* Allocate a cb_info for this slot unless one is already there. */
1691         if (ncg->nfs4prog2cbinfo[pgm-NFS4_CALLBACK] == NULL)
1692                 cbi = kmem_zalloc(sizeof (struct nfs41_cb_info), KM_SLEEP);
1693         else
1694                 cbi = ncg->nfs4prog2cbinfo[pgm-NFS4_CALLBACK];
1695
1696         cbi->cb_prog = pgm;
1697         cbi->cb_versmin = NFS_CB;
1698         cbi->cb_versmax = NFS_CB;
1699         cbi->cb_callback = cb_dispatch;
1700
1701         cv_init(&cbi->cb_destroy_wait, NULL, CV_DEFAULT, NULL);
1702         mutex_init(&cbi->cb_reflock, NULL, MUTEX_DEFAULT, NULL);
1703
1704         cv_init(&cbi->cb_cbconn_wait, NULL, CV_DEFAULT, NULL);
1705         mutex_init(&cbi->cb_cbconn_lock, NULL, MUTEX_DEFAULT, NULL);
1706
1707         /*
1708          * set cb_refcnt to 2, 1 to account for it being in the
1709          * nfs4prog2cbinfo table, and another for the nfs41_callback_thread.
1710          */
1711         cbi->cb_refcnt = 2;

1712         ncg->nfs4prog2cbinfo[pgm-NFS4_CALLBACK] = cbi;
1713         ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
1714         np->s_program = pgm;
1715         mutex_exit(&ncg->nfs4_cb_lock);
1716
             /*
              * The remaining setup runs after nfs4_cb_lock has been dropped:
              * fill in the CREATE_SESSION arguments, build the SVCCB for
              * this program and start the callback thread.
              */
1717         args->csa_cb_program = pgm;
1718         args->csa_sec_parms.csa_sec_parms_len = 1;
1719         args->csa_sec_parms.csa_sec_parms_val = (callback_sec_parms4 *)
1720             kmem_zalloc(sizeof (callback_sec_parms4), KM_SLEEP);
1721         args->csa_sec_parms.csa_sec_parms_val->cb_secflavor = AUTH_NONE;
1722         cbi->cb_rpc = kmem_zalloc(sizeof (SVCCB), KM_SLEEP);
1723         mutex_init(&cbi->cb_rpc->r_lock, NULL, MUTEX_DEFAULT, NULL);
1724         mutex_init(&cbi->cb_rpc->r_mlock, NULL, MUTEX_DEFAULT, NULL);
1725         cv_init(&cbi->cb_rpc->r_cbwait, NULL, CV_DEFAULT, NULL);
1726         cbi->cb_rpc->r_prog = pgm;
             /* Create the callback thread only if cb_thread is not yet set. */
1727         if (!cbi->cb_thread) {
1728                 cbi->cb_thread = zthread_create(NULL, 0,
1729                     nfs41_callback_thread,
1730                     cbi, 0, minclsyspri);
1731                 ASSERT(cbi->cb_thread != NULL);
1732         }
1733
1734 }
1735
     /*
      * nfs4_dquery - NFS4_DQUERY handler: report the delegation type of the
      * file open on the caller's descriptor uap->fd.
      *
      * The descriptor must refer to a regular file whose vnode uses
      * nfs4_vnodeops.  The rnode's r_deleg_type is stored as a 32-bit word
      * at the user address carried in the 'netid' field of the arguments.
      *
      * Returns 0 on success, EBADF if the descriptor is not open or is not
      * an NFSv4 regular file, and EFAULT if the store to user space fails.
      */
1736 static int
1737 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
1738 {
1739         file_t *fp;
1740         vnode_t *vp;
1741         rnode4_t *rp;
1742         int error;
1743         STRUCT_HANDLE(nfs4_svc_args, uap);
1744
1745         STRUCT_SET_HANDLE(uap, model, arg);
1746
1747         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
1748                 return (EBADF);
1749
1750         vp = fp->f_vnode;
1751
1752         if (vp == NULL || vp->v_type != VREG ||
1753             !vn_matchops(vp, nfs4_vnodeops)) {
1754                 releasef(STRUCT_FGET(uap, fd));
1755                 return (EBADF);
1756         }
1757
1758         rp = VTOR4(vp);
1759
1760         /*
1761          * I can't convince myself that we need locking here.  The
1762          * rnode cannot disappear and the value returned is instantly
1763          * stale anyway, so why bother?
1764          */
1765
             /*
              * suword32() reports a failed store as -1, which is not an
              * errno; translate it so the caller gets a valid error number.
              */
1766         if (suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type) != 0)
                     error = EFAULT;
             else
                     error = 0;
1767         releasef(STRUCT_FGET(uap, fd));
1768         return (error);
1769 }
1770 
1771 
1772 /*
1773  * NFS4 client system call.  This service does the
1774  * necessary initialization for the callback program.
1775  * This is fashioned after the server side interaction
1776  * between nfsd and the kernel.  On the client, the
1777  * mount command forks and the child process does the
1778  * necessary interaction with the kernel.
1779  *
1780  * uap->fd is the fd of an open transport provider
      *
      * uap->cmd selects the work: NFS4_DQUERY is handed to nfs4_dquery();
      * otherwise NFS4_SETPORT records the callback port via nfs4_setport()
      * and NFS4_KRPC_START creates the kernel RPC transport with
      * svc_tli_kcreate().
      *
      * Returns 0 on success, EPERM if secpolicy_nfs() refuses the caller,
      * EBADF if uap->fd is not open, EINVAL if the address mask length
      * exceeds its buffer size, or the error from a failed copy-in or
      * from svc_tli_kcreate().
1781  */
1782 int
1783 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
1784 {
1785         file_t *fp;
1786         int error;
1787         int readsize;
1788         char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
1789         char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
1790         size_t len;
1791         STRUCT_HANDLE(nfs4_svc_args, uap);
1792         struct netbuf addrmask;
1793         int cmd;
1794         SVCMASTERXPRT *cb_xprt;
1795         struct nfs4_callback_globals *ncg;
1796
1797 #ifdef lint
1798         model = model;          /* STRUCT macros don't always refer to it */
1799 #endif
1800
1801         STRUCT_SET_HANDLE(uap, model, arg);
1802
1803         if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
1804                 return (nfs4_dquery(arg, model));
1805
1806         if (secpolicy_nfs(CRED()) != 0)
1807                 return (EPERM);
1808
1809         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
1810                 return (EBADF);
1811
1812         /*
1813          * Set read buffer size to rsize
1814          * and add room for RPC headers.
1815          */
1816         readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
1817         if (readsize < RPC_MAXDATASIZE)
1818                 readsize = RPC_MAXDATASIZE;
1819
1820         error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
1821             KNC_STRSIZE, &len);
1822         if (error) {
1823                 releasef(STRUCT_FGET(uap, fd));
1824                 return (error);
1825         }
1826
1827         cmd = STRUCT_FGET(uap, cmd);
1828
1829         if (cmd & NFS4_KRPC_START) {
1830                 addrmask.len = STRUCT_FGET(uap, addrmask.len);
1831                 addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
                     /*
                      * The copyin below writes addrmask.len bytes into a
                      * buffer of addrmask.maxlen bytes.  Both values come
                      * from the caller, so reject a length that would run
                      * past the end of the allocation.
                      */
                     if (addrmask.len > addrmask.maxlen) {
                             releasef(STRUCT_FGET(uap, fd));
                             return (EINVAL);
                     }
1832                 addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1833                 error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1834                     addrmask.len);
1835                 if (error) {
1836                         releasef(STRUCT_FGET(uap, fd));
1837                         kmem_free(addrmask.buf, addrmask.maxlen);
1838                         return (error);
1839                 }
1840         }
1841         else
1842                 addrmask.buf = NULL;
1843
1844         error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1845             sizeof (uaddr), &len);
1846         if (error) {
1847                 releasef(STRUCT_FGET(uap, fd));
1848                 if (addrmask.buf)
1849                         kmem_free(addrmask.buf, addrmask.maxlen);
1850                 return (error);
1851         }
1852
1853         error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1854             sizeof (protofmly), &len);
1855         if (error) {
1856                 releasef(STRUCT_FGET(uap, fd));
1857                 if (addrmask.buf)
1858                         kmem_free(addrmask.buf, addrmask.maxlen);
1859                 return (error);
1860         }
1861
1862         error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1863             sizeof (proto), &len);
1864         if (error) {
1865                 releasef(STRUCT_FGET(uap, fd));
1866                 if (addrmask.buf)
1867                         kmem_free(addrmask.buf, addrmask.maxlen);
1868                 return (error);
1869         }
1870
1871         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1872         ASSERT(ncg != NULL);
1873
             /* Port registration and transport creation run under nfs4_cb_lock. */
1874         mutex_enter(&ncg->nfs4_cb_lock);
1875         if (cmd & NFS4_SETPORT)
1876                 nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1877
1878         if (cmd & NFS4_KRPC_START) {
1879                 error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1880                     &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
                     /* The address mask buffer is freed here only on failure. */
1881                 if (error) {
1882                         CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1883                             error);
1884                         kmem_free(addrmask.buf, addrmask.maxlen);
1885                 }
1886         }
1887
1888         mutex_exit(&ncg->nfs4_cb_lock);
1889         releasef(STRUCT_FGET(uap, fd));
1890         return (error);
1891 }
1892 
1893 struct nfs4_callback_globals *
1894 nfs4_get_callback_globals(void)
1895 {
1896         return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1897 }
1898 
1899 static void *
1900 nfs4_callback_init_zone(zoneid_t zoneid)
1901 {
1902         kstat_t *nfs4_callback_kstat;
1903         struct nfs4_callback_globals *ncg;
1904 
1905         ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1906 
1907         ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1908             sizeof (struct nfs4_server *), KM_SLEEP);
1909 
1910         ncg->nfs4prog2cbinfo = kmem_zalloc(nfs4_num_prognums *
1911             sizeof (struct nfs4_cb_info *), KM_SLEEP);
1912 
1913         /* initialize the dlist */
1914         mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1915         list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1916             offsetof(struct nfs4_dnode, linkage));
1917 
1918         /* initialize cb_port list */
1919         mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1920         list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1921             offsetof(struct nfs4_cb_port, linkage));
1922 
1923         /* get our own copy of the kstats */
1924         bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1925             sizeof (nfs4_callback_stats_tmpl));
1926         /* register "nfs:0:nfs4_callback_stats" for this zone */
1927         if ((nfs4_callback_kstat =
1928             kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1929             KSTAT_TYPE_NAMED,
1930             sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1931             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1932             zoneid)) != NULL) {
1933                 nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1934                 kstat_install(nfs4_callback_kstat);
1935         }
1936         return (ncg);
1937 }
1938
     /*
      * nfs4_discard_delegations - empty the delegation list of every server
      * registered in this zone's nfs4prog2server table.
      *
      * For each program slot the server's s_deleg_list is drained.  An rnode
      * whose r_deleg_type is already OPEN_DELEGATE_NONE is unlinked here
      * directly; any other rnode is handed to nfs4delegreturn_cleanup_impl(),
      * which unlinks it.
      */
1939 static void
1940 nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
1941 {
1942         nfs4_server_t *sp;
1943         int i, num_removed;
1944
1945         /*
1946          * It's OK here to just run through the registered "programs", as
1947          * servers without programs won't have any delegations to handle.
1948          */
1949         for (i = 0; i < nfs4_num_prognums; i++) {
1950                 rnode4_t *rp;
1951
1952                 mutex_enter(&ncg->nfs4_cb_lock);
1953                 sp = ncg->nfs4prog2server[i];
1954                 mutex_exit(&ncg->nfs4_cb_lock);
1955
                     /*
                      * NOTE(review): the code below drops sp->s_lock and a
                      * reference without taking either, so
                      * nfs4_server_vlock() presumably returns TRUE with
                      * both held -- confirm.
                      */
1956                 if (nfs4_server_vlock(sp, 1) == FALSE)
1957                         continue;
1958                 num_removed = 0;
1959                 while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
1960                         mutex_enter(&rp->r_statev4_lock);
1961                         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1962                                 /*
1963                                  * We need to take matters into our own hands,
1964                                  * as nfs4delegreturn_cleanup_impl() won't
1965                                  * remove this from the list.
1966                                  */
1967                                 list_remove(&sp->s_deleg_list, rp);
1968                                 mutex_exit(&rp->r_statev4_lock);
1969                                 nfs4_dec_state_ref_count_nolock(sp,
1970                                     VTOMI4(RTOV4(rp)));
1971                                 num_removed++;
1972                                 continue;
1973                         }
1974                         mutex_exit(&rp->r_statev4_lock);
                             /*
                              * Hold the vnode across the window in which
                              * s_lock is dropped for the cleanup call.
                              */
1975                         VN_HOLD(RTOV4(rp));
1976                         mutex_exit(&sp->s_lock);
1977                         /*
1978                          * The following will remove the node from the list.
1979                          */
1980                         nfs4delegreturn_cleanup_impl(rp, sp, ncg);
1981                         VN_RELE(RTOV4(rp));
1982                         mutex_enter(&sp->s_lock);
1983                 }
1984                 mutex_exit(&sp->s_lock);
1985                 /* each removed list node reles a reference */
1986                 while (num_removed-- > 0)
1987                         nfs4_server_rele(sp);
1988                 /* remove our reference for nfs4_server_vlock */
1989                 nfs4_server_rele(sp);
1990         }
1991 }
1992 
1993 /* ARGSUSED */
1994 static void
1995 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1996 {
1997         struct nfs4_callback_globals *ncg = data;
1998 
1999         /*
2000          * Clean pending delegation return list.
2001          */
2002         nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
2003 
2004         /*
2005          * Discard all delegations.
2006          */
2007         nfs4_discard_delegations(ncg);
2008 }
2009
     /*
      * nfs4_callback_fini_zone - tear down the callback globals of a zone.
      *
      * Removes the zone's callback kstat, discards any remaining
      * delegations, drops the list reference on every nfs4_server_t that
      * belongs to 'zoneid', and then frees the per-program tables, the
      * callback port list, the delegation-return list, their locks and
      * finally the globals structure ('data') itself.
      */
2010 static void
2011 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
2012 {
2013         struct nfs4_callback_globals *ncg = data;
2014         struct nfs4_cb_port *p;
2015         nfs4_server_t *sp, *next;
2016         nfs4_server_t freelist;
2017         int i;
2018
2019         kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
2020
2021         /*
2022          * Discard all delegations that may have crept in since we did the
2023          * _shutdown.
2024          */
2025         nfs4_discard_delegations(ncg);
2026         /*
2027          * We're completely done with this zone and all associated
2028          * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
2029          * more reference outstanding -- the reference we didn't release in
2030          * nfs4_renew_lease_thread().
2031          *
2032          * Here we need to run through the global nfs4_server_lst as we need to
2033          * deal with nfs4_server_ts without programs, as they also have threads
2034          * created for them, and so have outstanding references that we need to
2035          * release.
2036          */
             /* Move this zone's servers onto a private list under the lock. */
2037         freelist.forw = &freelist;
2038         freelist.back = &freelist;
2039         mutex_enter(&nfs4_server_lst_lock);
2040         sp = nfs4_server_lst.forw;
2041         while (sp != &nfs4_server_lst) {
2042                 next = sp->forw;
2043                 if (sp->zoneid == zoneid) {
2044                         remque(sp);
2045                         insque(sp, &freelist);
2046                 }
2047                 sp = next;
2048         }
2049         mutex_exit(&nfs4_server_lst_lock);
2050
             /* Release them after nfs4_server_lst_lock has been dropped. */
2051         sp = freelist.forw;
2052         while (sp != &freelist) {
2053                 next = sp->forw;
2054                 nfs4_server_rele(sp);   /* free the list's reference */
2055                 sp = next;
2056         }
2057
2058 #ifdef DEBUG
2059         for (i = 0; i < nfs4_num_prognums; i++) {
2060                 ASSERT(ncg->nfs4prog2server[i] == NULL);
2061         }
2062 #endif
2063         kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
2064             sizeof (struct nfs4_server *));
             /*
              * The nfs4prog2cbinfo table is allocated alongside
              * nfs4prog2server in nfs4_callback_init_zone(); free it with
              * the same size so it is not leaked when ncg goes away.
              * NOTE(review): entries still present in the table are not
              * released here -- confirm they are torn down beforehand.
              */
             kmem_free(ncg->nfs4prog2cbinfo, nfs4_num_prognums *
                 sizeof (struct nfs4_cb_info *));
2065
2066         mutex_enter(&ncg->nfs4_cb_lock);
2067         while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
2068                 list_remove(&ncg->nfs4_cb_ports, p);
2069                 kmem_free(p, sizeof (*p));
2070         }
2071         list_destroy(&ncg->nfs4_cb_ports);
2072         mutex_destroy(&ncg->nfs4_cb_lock);
2073         list_destroy(&ncg->nfs4_dlist);
2074         mutex_destroy(&ncg->nfs4_dlist_lock);
2075         kmem_free(ncg, sizeof (*ncg));
2076 }
2077 
2078 void
2079 nfs4_callback_init(void)
2080 {
2081         int i;
2082         SVC_CALLOUT *nfs4_cb_sc;
2083 
2084         /* initialize the callback table */
2085         nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
2086             sizeof (SVC_CALLOUT), KM_SLEEP);
2087 
2088         for (i = 0; i < nfs4_num_prognums; i++) {
2089                 nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
2090                 nfs4_cb_sc[i].sc_versmin = NFS_CB;
2091                 nfs4_cb_sc[i].sc_versmax = NFS_CB;
2092                 nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
2093         }
2094 
2095         nfs4_cb_sct.sct_size = nfs4_num_prognums;
2096         nfs4_cb_sct.sct_free = FALSE;
2097         nfs4_cb_sct.sct_sc = nfs4_cb_sc;
2098 
2099         /*
2100          * Compute max bytes required for dyamically allocated parts
2101          * of cb_getattr reply.  Only size and change are supported now.
2102          * If CB_GETATTR is changed to reply with additional attrs,
2103          * additional sizes must be added below.
2104          *
2105          * fattr4_change + fattr4_size == uint64_t + uint64_t
2106          */
2107         cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
2108 
2109         zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
2110             nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
2111 }
2112 
/*
 * nfs4_callback_fini - intentionally empty; performs no work.
 */
void
nfs4_callback_fini(void)
{
        /* nothing to do */
}
2117 
2118 /*
      * nfs4delegreturn_cleanup_impl - clear the local delegation state of
      * 'rp' and take it off its server's delegation list.
      *
      * If the rnode no longer holds a delegation this only drops the locks
      * (and the lookup reference, if one was taken).  Otherwise the
      * delegation credential is freed, the r_deleg_* fields are reset, the
      * rnode is removed from s_deleg_list, the corresponding references are
      * released and, when 'ncg' is non-NULL, the delegations counter is
      * decremented.
      *
      * np:  server owning the delegation; if NULL it is looked up with
      *      find_nfs4_server_all().  A non-NULL 'np' is passed in with
      *      s_lock not held; it is acquired here.
      * ncg: zone callback globals, or NULL (see below).
      *
2119  * NB: This function can be called from the *wrong* zone (ie, the zone that
2120  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
2121  * if the zone is going away and we get called from nfs4_async_inactive().  In
2122  * this case the globals will be NULL and we won't update the counters, which
2123  * doesn't matter as the zone is going away anyhow.
2124  */
2125 static void
2126 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
2127         struct nfs4_callback_globals *ncg)
2128 {
2129         mntinfo4_t *mi = VTOMI4(RTOV4(rp));
2130         boolean_t need_rele = B_FALSE;
2131
2132         /*
2133          * Caller must be holding mi_recovlock in read mode
2134          * to call here.  This is provided by start_op.
2135          * Delegation management requires to grab s_lock
2136          * first and then r_statev4_lock.
2137          */
2138
             /*
              * NOTE(review): s_lock is released below on both paths, so
              * find_nfs4_server_all() presumably returns the server with
              * s_lock held and a reference taken -- confirm.
              */
2139         if (np == NULL) {
2140                 np = find_nfs4_server_all(mi, 1);
2141                 ASSERT(np != NULL);
2142                 need_rele = B_TRUE;
2143         } else {
2144                 mutex_enter(&np->s_lock);
2145         }
2146
2147         mutex_enter(&rp->r_statev4_lock);
2148
             /* Delegation already gone: nothing to clean up. */
2149         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2150                 mutex_exit(&rp->r_statev4_lock);
2151                 mutex_exit(&np->s_lock);
2152                 if (need_rele)
2153                         nfs4_server_rele(np);
2154                 return;
2155         }
2156
2157         /*
2158          * Free the cred originally held when
2159          * the delegation was granted.  Caller must
2160          * hold this cred if it wants to use it after
2161          * this call.
2162          */
2163         crfree(rp->r_deleg_cred);
2164         rp->r_deleg_cred = NULL;
2165         rp->r_deleg_type = OPEN_DELEGATE_NONE;
2166         rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2167         rp->r_deleg_needs_recall = FALSE;
2168         rp->r_deleg_return_pending = FALSE;
2169
2170         /*
2171          * Remove the rnode from the server's list and
2172          * update the ref counts.
2173          */
2174         list_remove(&np->s_deleg_list, rp);
2175         mutex_exit(&rp->r_statev4_lock);
2176         nfs4_dec_state_ref_count_nolock(np, mi);
2177         mutex_exit(&np->s_lock);
2178         /* removed list node removes a reference */
2179         nfs4_server_rele(np);
             /* drop the reference taken by the lookup above, if any */
2180         if (need_rele)
2181                 nfs4_server_rele(np);
2182         if (ncg != NULL)
2183                 ncg->nfs4_callback_stats.delegations.value.ui64--;
2184 }
2185 
2186 void
2187 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
2188 {
2189         struct nfs4_callback_globals *ncg;
2190 
2191         if (np != NULL) {
2192                 ncg = np->zone_globals;
2193         } else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
2194                 ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2195                 ASSERT(ncg != NULL);
2196         } else {
2197                 /*
2198                  * Request coming from the wrong zone.
2199                  */
2200                 ASSERT(getzoneid() == GLOBAL_ZONEID);
2201                 ncg = NULL;
2202         }
2203 
2204         nfs4delegreturn_cleanup_impl(rp, np, ncg);
2205 }
2206
     /*
      * nfs4delegreturn_save_lost_rqst - fill in 'lost_rqstp' for a
      * DELEGRETURN that has to be re-driven later.
      *
      * A lost request is recorded only when 'error' is ETIMEDOUT or EINTR,
      * or when NFS4_FRC_UNMT_ERR() is true for it on the vnode's vfs.  For
      * any other error lr_op is set to 0 and nothing else is touched;
      * callers test lr_op against OP_DELEGRETURN to see whether a request
      * was saved.
      */
2207 static void
2208 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2209         cred_t *cr, vnode_t *vp)
2210 {
2211         if (error != ETIMEDOUT && error != EINTR &&
2212             !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2213                 lost_rqstp->lr_op = 0;
2214                 return;
2215         }
2216
             /* The debug message carries this function's own name. */
2217         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2218             "nfs4delegreturn_save_lost_rqst: error %d", error));
2219
2220         lost_rqstp->lr_op = OP_DELEGRETURN;
2221         /*
2222          * The vp is held and rele'd via the recovery code.
2223          * See nfs4_save_lost_rqst.
2224          */
2225         lost_rqstp->lr_vp = vp;
2226         lost_rqstp->lr_dvp = NULL;
2227         lost_rqstp->lr_oop = NULL;
2228         lost_rqstp->lr_osp = NULL;
2229         lost_rqstp->lr_lop = NULL;
2230         lost_rqstp->lr_cr = cr;
2231         lost_rqstp->lr_flk = NULL;
2232         lost_rqstp->lr_putfirst = FALSE;
2233 }
2234 
2235 static void
2236 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
2237 {
2238         COMPOUND4args_clnt args;
2239         COMPOUND4res_clnt res;
2240         nfs_argop4 argops[3];
2241         nfs4_ga_res_t *garp = NULL;
2242         hrtime_t t;
2243         int numops;
2244         int doqueue = 1;
2245         mntinfo4_t *mi = VTOMI4(RTOV4(rp));
2246 
2247         args.ctag = TAG_DELEGRETURN;
2248 
2249         numops = 3;             /* PUTFH, GETATTR, DELEGRETURN */
2250 
2251         args.array = argops;
2252         args.array_len = numops;
2253 
2254         argops[0].argop = OP_CPUTFH;
2255         argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2256 
2257         argops[1].argop = OP_GETATTR;
2258         argops[1].nfs_argop4_u.opgetattr.attr_request =
2259             MI4_DEFAULT_ATTRMAP(mi);
2260         argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
2261 
2262         argops[2].argop = OP_DELEGRETURN;
2263         argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
2264             rp->r_deleg_stateid;
2265 
2266         t = gethrtime();
2267         rfs4call(VTOMI4(RTOV4(rp)), NULL, &args, &res, cr, &doqueue, 0, ep);
2268 
2269         if (ep->error)
2270                 return;
2271 
2272         if (res.status == NFS4_OK) {
2273                 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
2274                 nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
2275 
2276         }
2277         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2278 }
2279
     /*
      * nfs4_do_delegreturn - return the delegation held on 'rp' to the server.
      *
      * Each pass brackets its work with nfs4_start_fop()/nfs4_end_op().  The
      * loop ends when nfs4_start_fop() fails, when the delegation is found
      * to be gone already, when the return is handed to the recovery
      * framework (recovonly), or when the over-the-wire DELEGRETURN finishes
      * without needing recovery.  If recovery is needed it is started and
      * the loop goes around again.
      *
      * flags: only NFS4_DR_FORCE is examined here; if nfs4_start_fop() fails
      *        and it is set, the local delegation state is cleaned up anyway.
      * cr:    credential used for the over-the-wire call.
      * ncg:   zone callback globals; dereferenced unconditionally for the
      *        delegreturn counter.
      *
      * Returns e.error as left by the final pass.
      */
2280 int
2281 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
2282         struct nfs4_callback_globals *ncg)
2283 {
2284         vnode_t *vp = RTOV4(rp);
2285         mntinfo4_t *mi = VTOMI4(vp);
2286         nfs4_lost_rqst_t lost_rqst;
2287         nfs4_recov_state_t recov_state;
2288         bool_t needrecov = FALSE, recovonly, done = FALSE;
2289         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2290
2291         ncg->nfs4_callback_stats.delegreturn.value.ui64++;
2292
2293         while (!done) {
2294                 e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
2295                     &recov_state, &recovonly);
2296
2297                 if (e.error) {
                             /*
                              * The cleanup routine expects mi_recovlock
                              * held as reader; take it here because
                              * start_fop did not succeed.
                              */
2298                         if (flags & NFS4_DR_FORCE) {
2299                                 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
2300                                     RW_READER, 0);
2301                                 nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2302                                 nfs_rw_exit(&mi->mi_recovlock);
2303                         }
2304                         break;
2305                 }
2306
2307                 /*
2308                  * Check to see if the delegation has already been
2309                  * returned by the recovery thread.   The state of
2310                  * the delegation cannot change at this point due
2311                  * to start_fop and the r_deleg_recall_lock.
2312                  */
2313                 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2314                         e.error = 0;
2315                         nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2316                         break;
2317                 }
2318
2319                 if (recovonly) {
2320                         /*
2321                          * Delegation will be returned via the
2322                          * recovery framework.  Build a lost request
2323                          * structure, start recovery and get out.
2324                          */
2325                         nfs4_error_init(&e, EINTR);
2326                         nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
2327                             cr, vp);
2328                         (void) nfs4_start_recovery(&e, mi, vp,
2329                             NULL, &rp->r_deleg_stateid,
2330                             lost_rqst.lr_op == OP_DELEGRETURN ?
2331                             &lost_rqst : NULL, OP_DELEGRETURN, NULL);
2332                         nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2333                         break;
2334                 }
2335
2336                 nfs4delegreturn_otw(rp, cr, &e);
2337
2338                 /*
2339                  * Ignore some errors on delegreturn; no point in marking
2340                  * the file dead on a state destroying operation.
2341                  */
2342                 if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
2343                     e.stat == NFS4ERR_BADHANDLE ||
2344                     e.stat == NFS4ERR_STALE))
2345                         needrecov = FALSE;
2346                 else
2347                         needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
2348
                     /*
                      * Either start recovery and retry, or clean up the
                      * local delegation state and finish.
                      */
2349                 if (needrecov) {
2350                         nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
2351                             cr, vp);
2352                         (void) nfs4_start_recovery(&e, mi, vp,
2353                             NULL, &rp->r_deleg_stateid,
2354                             lost_rqst.lr_op == OP_DELEGRETURN ?
2355                             &lost_rqst : NULL, OP_DELEGRETURN, NULL);
2356                 } else {
2357                         nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2358                         done = TRUE;
2359                 }
2360
2361                 nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2362         }
2363         return (e.error);
2364 }
2365 
2366 /*
2367  * nfs4_resend_delegreturn - used to drive the delegreturn
2368  * operation via the recovery thread.
2369  */
2370 void
2371 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
2372         nfs4_server_t *np)
2373 {
2374         rnode4_t *rp = VTOR4(lorp->lr_vp);
2375 
2376         /* If the file failed recovery, just quit. */
2377         mutex_enter(&rp->r_statelock);
2378         if (rp->r_flags & R4RECOVERR) {
2379                 ep->error = EIO;
2380         }
2381         mutex_exit(&rp->r_statelock);
2382 
2383         if (!ep->error)
2384                 nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
2385 
2386         /*
2387          * If recovery is now needed, then return the error
2388          * and status and let the recovery thread handle it,
2389          * including re-driving another delegreturn.  Otherwise,
2390          * just give up and clean up the delegation.
2391          */
2392         if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
2393                 return;
2394 
2395         if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
2396                 nfs4delegreturn_cleanup(rp, np);
2397 
2398         nfs4_error_zinit(ep);
2399 }
2400 
2401 /*
2402  * nfs4delegreturn - general function to return a delegation.
2403  *
2404  * NFS4_DR_FORCE - return the delegation even if start_op fails
2405  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
2406  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
2407  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
2408  * NFS4_DR_RECALL - delegreturned initiated via CB_RECALL
2409  * NFS4_DR_REOPEN - do file reopens, if applicable
2410  */
2411 static int
2412 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
2413 {
2414         int error = 0;
2415         cred_t *cr = NULL;
2416         vnode_t *vp;
2417         bool_t needrecov = FALSE;
2418         bool_t rw_entered = FALSE;
2419         bool_t do_reopen;
2420 
2421         vp = RTOV4(rp);
2422 
2423         /*
2424          * If NFS4_DR_DISCARD is set by itself, take a short-cut and
2425          * discard without doing an otw DELEGRETURN.  This may only be used
2426          * by the recovery thread because it bypasses the synchronization
2427          * with r_deleg_recall_lock and mi->mi_recovlock.
2428          */
2429         if (flags == NFS4_DR_DISCARD) {
2430                 nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2431                 return (0);
2432         }
2433 
2434         if (flags & NFS4_DR_DID_OP) {
2435                 /*
2436                  * Caller had already done start_op, which means the
2437                  * r_deleg_recall_lock is already held in READ mode
2438                  * so we cannot take it in write mode.  Return the
2439                  * delegation asynchronously.
2440                  *
2441                  * Remove the NFS4_DR_DID_OP flag so we don't
2442                  * get stuck looping through here.
2443                  */
2444                 VN_HOLD(vp);
2445                 nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
2446                 return (0);
2447         }
2448 
2449         /*
2450          * Verify we still have a delegation and crhold the credential.
2451          */
2452         mutex_enter(&rp->r_statev4_lock);
2453         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2454                 mutex_exit(&rp->r_statev4_lock);
2455                 goto out;
2456         }
2457         cr = rp->r_deleg_cred;
2458         ASSERT(cr != NULL);
2459         crhold(cr);
2460         mutex_exit(&rp->r_statev4_lock);
2461 
2462         /*
2463          * Push the modified data back to the server synchronously
2464          * before doing DELEGRETURN.
2465          */
2466         if (flags & NFS4_DR_PUSH)
2467                 (void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2468 
2469         /*
2470          * Take r_deleg_recall_lock in WRITE mode, this will prevent
2471          * nfs4_is_otw_open_necessary from trying to use the delegation
2472          * while the DELEGRETURN is in progress.
2473          */
2474         (void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
2475 
2476         rw_entered = TRUE;
2477 
2478         if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
2479                 goto out;
2480 
2481         if (flags & NFS4_DR_REOPEN) {
2482                 /*
2483                  * If R4RECOVERRP is already set, then skip re-opening
2484                  * the delegation open streams and go straight to doing
2485                  * delegreturn.  (XXX if the file has failed recovery, then the
2486                  * delegreturn attempt is likely to be futile.)
2487                  */
2488                 mutex_enter(&rp->r_statelock);
2489                 do_reopen = !(rp->r_flags & R4RECOVERRP);
2490                 mutex_exit(&rp->r_statelock);
2491 
2492                 if (do_reopen) {
2493                         error = deleg_reopen(vp, &needrecov, ncg, flags);
2494                         if (error != 0) {
2495                                 if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
2496                                     == 0)
2497                                         goto out;
2498                         } else if (needrecov) {
2499                                 if ((flags & NFS4_DR_FORCE) == 0)
2500                                         goto out;
2501                         }
2502                 }
2503         }
2504 
2505         if (flags & NFS4_DR_DISCARD) {
2506                 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
2507 
2508                 mutex_enter(&rp->r_statelock);
2509                 /*
2510                  * deleg_return_pending is cleared inside of delegation_accept
2511                  * when a delegation is accepted.  if this flag has been
2512                  * cleared, then a new delegation has overwritten the one we
2513                  * were about to throw away.
2514                  */
2515                 if (!rp->r_deleg_return_pending) {
2516                         mutex_exit(&rp->r_statelock);
2517                         goto out;
2518                 }
2519                 mutex_exit(&rp->r_statelock);
2520                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
2521                 nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2522                 nfs_rw_exit(&mi->mi_recovlock);
2523         } else {
2524                 error = nfs4_do_delegreturn(rp, flags, cr, ncg);
2525         }
2526 
2527 out:
2528         if (cr)
2529                 crfree(cr);
2530         if (rw_entered)
2531                 nfs_rw_exit(&rp->r_deleg_recall_lock);
2532         return (error);
2533 }
2534 
2535 int
2536 nfs4delegreturn(rnode4_t *rp, int flags)
2537 {
2538         struct nfs4_callback_globals *ncg;
2539 
2540         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2541         ASSERT(ncg != NULL);
2542 
2543         return (nfs4delegreturn_impl(rp, flags, ncg));
2544 }
2545 
2546 void
2547 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
2548 {
2549         struct cb_recall_pass *pp;
2550 
2551         pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2552         pp->rp = rp;
2553         pp->flags = flags;
2554         pp->truncate = trunc;
2555 
2556         /*
2557          * Fire up a thread to do the actual delegreturn
2558          * Caller must guarantee that the rnode doesn't
2559          * vanish (by calling VN_HOLD).
2560          */
2561 
2562         (void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2563             minclsyspri);
2564 }
2565 
2566 static void
2567 delegreturn_all_thread(rpcprog_t *pp)
2568 {
2569         nfs4_server_t *np;
2570         bool_t found = FALSE;
2571         rpcprog_t prog;
2572         rnode4_t *rp;
2573         vnode_t *vp;
2574         zoneid_t zoneid = getzoneid();
2575         struct nfs4_callback_globals *ncg;
2576 
2577         NFS4_DEBUG(nfs4_drat_debug,
2578             (CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
2579 
2580         prog = *pp;
2581         kmem_free(pp, sizeof (*pp));
2582         pp = NULL;
2583 
2584         mutex_enter(&nfs4_server_lst_lock);
2585         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2586                 if (np->zoneid == zoneid && np->s_program == prog) {
2587                         mutex_enter(&np->s_lock);
2588                         found = TRUE;
2589                         break;
2590                 }
2591         }
2592         mutex_exit(&nfs4_server_lst_lock);
2593 
2594         /*
2595          * It's possible that the nfs4_server which was using this
2596          * program number has vanished since this thread is async.
2597          * If so, just return.  Your work here is finished, my friend.
2598          */
2599         if (!found)
2600                 goto out;
2601 
2602         ncg = np->zone_globals;
2603         while ((rp = list_head(&np->s_deleg_list)) != NULL) {
2604                 vp = RTOV4(rp);
2605                 VN_HOLD(vp);
2606                 mutex_exit(&np->s_lock);
2607                 (void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
2608                     ncg);
2609                 VN_RELE(vp);
2610 
2611                 /* retake the s_lock for next trip through the loop */
2612                 mutex_enter(&np->s_lock);
2613         }
2614         mutex_exit(&np->s_lock);
2615 out:
2616         NFS4_DEBUG(nfs4_drat_debug,
2617             (CE_NOTE, "delereturn_all_thread: complete\n"));
2618         zthread_exit();
2619 }
2620 
2621 void
2622 nfs4_delegreturn_all(nfs4_server_t *sp)
2623 {
2624         rpcprog_t pro, *pp;
2625 
2626         mutex_enter(&sp->s_lock);
2627 
2628         /* Check to see if the delegation list is empty */
2629 
2630         if (list_head(&sp->s_deleg_list) == NULL) {
2631                 mutex_exit(&sp->s_lock);
2632                 return;
2633         }
2634         /*
2635          * Grab the program number; the async thread will use this
2636          * to find the nfs4_server.
2637          */
2638         pro = sp->s_program;
2639         mutex_exit(&sp->s_lock);
2640         pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
2641         *pp = pro;
2642         (void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
2643             minclsyspri);
2644 }
2645 
2646 
2647 /*
2648  * Discard any delegations
2649  *
2650  * Iterate over the servers s_deleg_list and
2651  * for matching mount-point rnodes discard
2652  * the delegation.
2653  */
2654 void
2655 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
2656 {
2657         rnode4_t *rp, *next;
2658         mntinfo4_t *r_mi;
2659         struct nfs4_callback_globals *ncg;
2660 
2661         ASSERT(mutex_owned(&sp->s_lock));
2662         ncg = sp->zone_globals;
2663 
2664         for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
2665                 r_mi = VTOMI4(RTOV4(rp));
2666                 next = list_next(&sp->s_deleg_list, rp);
2667 
2668                 if (r_mi != mi) {
2669                         /*
2670                          * Skip if this rnode is in not on the
2671                          * same mount-point
2672                          */
2673                         continue;
2674                 }
2675 
2676                 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
2677 
2678 #ifdef DEBUG
2679                 if (nfs4_client_recov_debug) {
2680                         zprintf(getzoneid(),
2681                             "nfs4_deleg_discard: matched rnode %p "
2682                         "-- discarding delegation\n", (void *)rp);
2683                 }
2684 #endif
2685                 mutex_enter(&rp->r_statev4_lock);
2686                 /*
2687                  * Free the cred originally held when the delegation
2688                  * was granted. Also need to decrement the refcnt
2689                  * on this server for each delegation we discard
2690                  */
2691                 if (rp->r_deleg_cred)
2692                         crfree(rp->r_deleg_cred);
2693                 rp->r_deleg_cred = NULL;
2694                 rp->r_deleg_type = OPEN_DELEGATE_NONE;
2695                 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2696                 rp->r_deleg_needs_recall = FALSE;
2697                 ASSERT(sp->s_refcnt > 1);
2698                 sp->s_refcnt--;
2699                 list_remove(&sp->s_deleg_list, rp);
2700                 mutex_exit(&rp->r_statev4_lock);
2701                 nfs4_dec_state_ref_count_nolock(sp, mi);
2702                 ncg->nfs4_callback_stats.delegations.value.ui64--;
2703         }
2704 }
2705 
2706 /*
2707  * Reopen any open streams that were covered by the given file's
2708  * delegation.
2709  * Returns zero or an errno value.  If there was no error, *recovp
2710  * indicates whether recovery was initiated.
2711  */
2712 
2713 static int
2714 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
2715         int flags)
2716 {
2717         nfs4_open_stream_t *osp;
2718         nfs4_recov_state_t recov_state;
2719         bool_t needrecov = FALSE;
2720         mntinfo4_t *mi;
2721         rnode4_t *rp;
2722         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2723         int claimnull;
2724 
2725         mi = VTOMI4(vp);
2726         rp = VTOR4(vp);
2727 
2728         recov_state.rs_flags = 0;
2729         recov_state.rs_num_retry_despite_err = 0;
2730 
2731 retry:
2732         if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
2733                 return (e.error);
2734         }
2735 
2736         /*
2737          * if we mean to discard the delegation, it must be BAD, so don't
2738          * use it when doing the reopen or it will fail too.
2739          */
2740         claimnull = (flags & NFS4_DR_DISCARD);
2741         /*
2742          * Loop through the open streams for this rnode to find
2743          * all of the ones created using the delegation state ID.
2744          * Each of these needs to be re-opened.
2745          */
2746 
2747         while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
2748 
2749                 if (claimnull) {
2750                         nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
2751                 } else {
2752                         ncg->nfs4_callback_stats.claim_cur.value.ui64++;
2753 
2754                         nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
2755                             FALSE);
2756                         if (e.error == 0 && e.stat == NFS4_OK)
2757                                 ncg->nfs4_callback_stats.
2758                                     claim_cur_ok.value.ui64++;
2759                 }
2760 
2761                 if (e.error == EAGAIN) {
2762                         nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
2763                         goto retry;
2764                 }
2765 
2766                 /*
2767                  * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
2768                  * recovery has already been started inside of nfs4_reopen.
2769                  */
2770                 if (e.error == EINTR || e.error == ETIMEDOUT ||
2771                     NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
2772                         open_stream_rele(osp, rp);
2773                         break;
2774                 }
2775 
2776                 needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
2777 
2778                 if (e.error != 0 && !needrecov) {
2779                         /*
2780                          * Recovery is not possible, but don't give up yet;
2781                          * we'd still like to do delegreturn after
2782                          * reopening as many streams as possible.
2783                          * Continue processing the open streams.
2784                          */
2785 
2786                         ncg->nfs4_callback_stats.recall_failed.value.ui64++;
2787 
2788                 } else if (needrecov) {
2789                         /*
2790                          * Start recovery and bail out.  The recovery
2791                          * thread will take it from here.
2792                          */
2793                         (void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
2794                             NULL, OP_OPEN, NULL);
2795                         open_stream_rele(osp, rp);
2796                         *recovp = TRUE;
2797                         break;
2798                 }
2799 
2800                 open_stream_rele(osp, rp);
2801         }
2802 
2803         nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2804 
2805         return (e.error);
2806 }
2807 
2808 /*
2809  * get_next_deleg_stream - returns the next open stream which
2810  * represents a delegation for this rnode.  In order to assure
2811  * forward progress, the caller must guarantee that each open
2812  * stream returned is changed so that a future call won't return
2813  * it again.
2814  *
2815  * There are several ways for the open stream to change.  If the open
2816  * stream is !os_delegation, then we aren't interested in it.  Also, if
2817  * either os_failed_reopen or !os_valid, then don't return the osp.
2818  *
2819  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
2820  * the osp if it is an os_delegation open stream.  Also, if the rnode still
2821  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
2822  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
2823  * then return the osp.
2824  *
2825  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
2826  * prevents new OPENs from going OTW (as start_fop takes this
2827  * lock in READ mode); thus, no new open streams can be created
2828  * (which inherently means no new delegation open streams are
2829  * being created).
2830  */
2831 
2832 static nfs4_open_stream_t *
2833 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2834 {
2835         nfs4_open_stream_t      *osp;
2836 
2837         ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2838 
2839         /*
2840          * Search through the list of open streams looking for
2841          * one that was created while holding the delegation.
2842          */
2843         mutex_enter(&rp->r_os_lock);
2844         for (osp = list_head(&rp->r_open_streams); osp != NULL;
2845             osp = list_next(&rp->r_open_streams, osp)) {
2846                 mutex_enter(&osp->os_sync_lock);
2847                 if (!osp->os_delegation || osp->os_failed_reopen ||
2848                     !osp->os_valid) {
2849                         mutex_exit(&osp->os_sync_lock);
2850                         continue;
2851                 }
2852                 if (!claimnull || rp->r_deleg_return_pending ||
2853                     !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2854                         osp->os_ref_count++;
2855                         mutex_exit(&osp->os_sync_lock);
2856                         mutex_exit(&rp->r_os_lock);
2857                         return (osp);
2858                 }
2859                 mutex_exit(&osp->os_sync_lock);
2860         }
2861         mutex_exit(&rp->r_os_lock);
2862 
2863         return (NULL);
2864 }
2865 
2866 static void
2867 nfs4delegreturn_thread(struct cb_recall_pass *args)
2868 {
2869         rnode4_t *rp;
2870         vnode_t *vp;
2871         cred_t *cr;
2872         int dtype, error, flags;
2873         bool_t rdirty, rip;
2874         kmutex_t cpr_lock;
2875         callb_cpr_t cpr_info;
2876         struct nfs4_callback_globals *ncg;
2877 
2878         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2879         ASSERT(ncg != NULL);
2880 
2881         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2882 
2883         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2884             "nfsv4delegRtn");
2885 
2886         rp = args->rp;
2887         vp = RTOV4(rp);
2888 
2889         mutex_enter(&rp->r_statev4_lock);
2890         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2891                 mutex_exit(&rp->r_statev4_lock);
2892                 goto out;
2893         }
2894         mutex_exit(&rp->r_statev4_lock);
2895 
2896         /*
2897          * Take the read-write lock in read mode to prevent other
2898          * threads from modifying the data during the recall.  This
2899          * doesn't affect mmappers.
2900          */
2901         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2902 
2903         /* Proceed with delegreturn */
2904 
2905         mutex_enter(&rp->r_statev4_lock);
2906         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2907                 mutex_exit(&rp->r_statev4_lock);
2908                 nfs_rw_exit(&rp->r_rwlock);
2909                 goto out;
2910         }
2911         dtype = rp->r_deleg_type;
2912         cr = rp->r_deleg_cred;
2913         ASSERT(cr != NULL);
2914         crhold(cr);
2915         mutex_exit(&rp->r_statev4_lock);
2916 
2917         flags = args->flags;
2918 
2919         /*
2920          * If the file is being truncated at the server, then throw
2921          * away all of the pages, it doesn't matter what flavor of
2922          * delegation we have.
2923          */
2924 
2925         if (args->truncate) {
2926                 ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2927                 nfs4_invalidate_pages(vp, 0, cr);
2928         } else if (dtype == OPEN_DELEGATE_WRITE) {
2929 
2930                 mutex_enter(&rp->r_statelock);
2931                 rdirty = rp->r_flags & R4DIRTY;
2932                 mutex_exit(&rp->r_statelock);
2933 
2934                 if (rdirty) {
2935                         error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2936 
2937                         if (error)
2938                                 CB_WARN1("nfs4delegreturn_thread:"
2939                                 " VOP_PUTPAGE: %d\n", error);
2940                 }
2941                 /* turn off NFS4_DR_PUSH because we just did that above. */
2942                 flags &= ~NFS4_DR_PUSH;
2943         }
2944 
2945         mutex_enter(&rp->r_statelock);
2946         rip =  rp->r_flags & R4RECOVERRP;
2947         mutex_exit(&rp->r_statelock);
2948 
2949         /* If a failed recovery is indicated, discard the pages */
2950 
2951         if (rip) {
2952 
2953                 error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);
2954 
2955                 if (error)
2956                         CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2957                             error);
2958         }
2959 
2960         /*
2961          * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2962          * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
2963          */
2964         flags &= ~NFS4_DR_DID_OP;
2965 
2966         (void) nfs4delegreturn_impl(rp, flags, ncg);
2967 
2968         nfs_rw_exit(&rp->r_rwlock);
2969         crfree(cr);
2970 out:
2971         kmem_free(args, sizeof (struct cb_recall_pass));
2972         VN_RELE(vp);
2973         mutex_enter(&cpr_lock);
2974         CALLB_CPR_EXIT(&cpr_info);
2975         mutex_destroy(&cpr_lock);
2976         zthread_exit();
2977 }
2978 
2979 /*
2980  * This function has one assumption that the caller of this function is
2981  * either doing recovery (therefore cannot call nfs4_start_op) or has
2982  * already called nfs4_start_op().
2983  */
2984 void
2985 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2986         nfs4_ga_res_t *garp, cred_t *cr)
2987 {
2988         open_read_delegation4 *orp;
2989         open_write_delegation4 *owp;
2990         nfs4_server_t *np;
2991         bool_t already = FALSE;
2992         bool_t recall = FALSE;
2993         bool_t valid_garp = TRUE;
2994         bool_t delegation_granted = FALSE;
2995         bool_t dr_needed = FALSE;
2996         bool_t recov;
2997         int dr_flags = 0;
2998         long mapcnt;
2999         uint_t rflag;
3000         mntinfo4_t *mi;
3001         struct nfs4_callback_globals *ncg;
3002         open_delegation_type4 odt;
3003 
3004         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
3005         ASSERT(ncg != NULL);
3006 
3007         mi = VTOMI4(RTOV4(rp));
3008 
3009         /*
3010          * Accept a delegation granted to the client via an OPEN.
3011          * Set the delegation fields in the rnode and insert the
3012          * rnode onto the list anchored in the nfs4_server_t.  The
3013          * proper locking order requires the nfs4_server_t first,
3014          * even though it may not be needed in all cases.
3015          *
3016          * NB: find_nfs4_server returns with s_lock held.
3017          */
3018 
3019         if ((np = find_nfs4_server(mi)) == NULL)
3020                 return;
3021 
3022         /* grab the statelock too, for examining r_mapcnt */
3023         mutex_enter(&rp->r_statelock);
3024         mutex_enter(&rp->r_statev4_lock);
3025 
3026         if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
3027             rp->r_deleg_type == OPEN_DELEGATE_WRITE)
3028                 already = TRUE;
3029 
3030         odt = res->delegation.delegation_type;
3031 
3032         if (odt == OPEN_DELEGATE_READ) {
3033 
3034                 rp->r_deleg_type = res->delegation.delegation_type;
3035                 orp = &res->delegation.open_delegation4_u.read;
3036                 rp->r_deleg_stateid = orp->stateid;
3037                 rp->r_deleg_perms = orp->permissions;
3038                 if (claim == CLAIM_PREVIOUS)
3039                         if ((recall = orp->recall) != 0)
3040                                 dr_needed = TRUE;
3041 
3042                 delegation_granted = TRUE;
3043 
3044                 ncg->nfs4_callback_stats.delegations.value.ui64++;
3045                 ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
3046 
3047         } else if (odt == OPEN_DELEGATE_WRITE) {
3048 
3049                 rp->r_deleg_type = res->delegation.delegation_type;
3050                 owp = &res->delegation.open_delegation4_u.write;
3051                 rp->r_deleg_stateid = owp->stateid;
3052                 rp->r_deleg_perms = owp->permissions;
3053                 rp->r_deleg_limit = owp->space_limit;
3054                 if (claim == CLAIM_PREVIOUS)
3055                         if ((recall = owp->recall) != 0)
3056                                 dr_needed = TRUE;
3057 
3058                 delegation_granted = TRUE;
3059 
3060                 if (garp == NULL || !garp->n4g_change_valid) {
3061                         valid_garp = FALSE;
3062                         rp->r_deleg_change = 0;
3063                         rp->r_deleg_change_grant = 0;
3064                 } else {
3065                         rp->r_deleg_change = garp->n4g_change;
3066                         rp->r_deleg_change_grant = garp->n4g_change;
3067                 }
3068                 mapcnt = rp->r_mapcnt;
3069                 rflag = rp->r_flags;
3070 
3071                 /*
3072                  * Update the delegation change attribute if
3073                  * there are mappers for the file is dirty.  This
3074                  * might be the case during recovery after server
3075                  * reboot.
3076                  */
3077                 if (mapcnt > 0 || rflag & R4DIRTY)
3078                         rp->r_deleg_change++;
3079 
3080                 NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
3081                     "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
3082                     (int)(rp->r_deleg_change >> 32)));
3083                 NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
3084                     "nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
3085                     (int)(rp->r_deleg_change_grant >> 32)));
3086 
3087 
3088                 ncg->nfs4_callback_stats.delegations.value.ui64++;
3089                 ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
3090         } else if (already) {
3091                 /*
3092                  * No delegation granted.  If the rnode currently has
3093                  * has one, then consider it tainted and return it.
3094                  */
3095                 dr_needed = TRUE;
3096         }
3097 
3098         if (delegation_granted) {
3099                 /* Add the rnode to the list. */
3100                 if (!already) {
3101                         crhold(cr);
3102                         rp->r_deleg_cred = cr;
3103 
3104                         ASSERT(mutex_owned(&np->s_lock));
3105                         list_insert_head(&np->s_deleg_list, rp);
3106                         /* added list node gets a reference */
3107                         np->s_refcnt++;
3108                         nfs4_inc_state_ref_count_nolock(np, mi);
3109                 }
3110                 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
3111         }
3112 
3113         /*
3114          * We've now safely accepted the delegation, if any.  Drop the
3115          * locks and figure out what post-processing is needed.  We'd
3116          * like to retain r_statev4_lock, but nfs4_server_rele takes
3117          * s_lock which would be a lock ordering violation.
3118          */
3119         mutex_exit(&rp->r_statev4_lock);
3120         mutex_exit(&rp->r_statelock);
3121         mutex_exit(&np->s_lock);
3122         nfs4_server_rele(np);
3123 
3124         /*
3125          * Check to see if we are in recovery.  Remember that
3126          * this function is protected by start_op, so a recovery
3127          * cannot begin until we are out of here.
3128          */
3129         mutex_enter(&mi->mi_lock);
3130         recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
3131         mutex_exit(&mi->mi_lock);
3132 
3133         mutex_enter(&rp->r_statev4_lock);
3134 
3135         if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
3136                 dr_needed = TRUE;
3137 
3138         if (dr_needed && rp->r_deleg_return_pending == FALSE) {
3139                 if (recov) {
3140                         /*
3141                          * We cannot call delegreturn from inside
3142                          * of recovery or VOP_PUTPAGE will hang
3143                          * due to nfs4_start_fop call in
3144                          * nfs4write.  Use dlistadd to add the
3145                          * rnode to the list of rnodes needing
3146                          * cleaning.  We do not need to do reopen
3147                          * here because recov_openfiles will do it.
3148                          * In the non-recall case, just discard the
3149                          * delegation as it is no longer valid.
3150                          */
3151                         if (recall)
3152                                 dr_flags = NFS4_DR_PUSH;
3153                         else
3154                                 dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
3155 
3156                         nfs4_dlistadd(rp, ncg, dr_flags);
3157                         dr_flags = 0;
3158                 } else {
3159                         /*
3160                          * Push the modified data back to the server,
3161                          * reopen any delegation open streams, and return
3162                          * the delegation.  Drop the statev4_lock first!
3163                          */
3164                         dr_flags =  NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
3165                 }
3166         }
3167         mutex_exit(&rp->r_statev4_lock);
3168         if (dr_flags)
3169                 (void) nfs4delegreturn_impl(rp, dr_flags, ncg);
3170 }
3171 
3172 /*
3173  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
3174  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
3175  * or BADSEQID and the recovery code is unable to recover.  Push any
3176  * dirty data back to the server and return the delegation (if any).
3177  */
3178 
3179 void
3180 nfs4delegabandon(rnode4_t *rp)
3181 {
3182         vnode_t *vp;
3183         struct cb_recall_pass *pp;
3184         open_delegation_type4 dt;
3185 
3186         mutex_enter(&rp->r_statev4_lock);
3187         dt = rp->r_deleg_type;
3188         mutex_exit(&rp->r_statev4_lock);
3189 
3190         if (dt == OPEN_DELEGATE_NONE)
3191                 return;
3192 
3193         vp = RTOV4(rp);
3194         VN_HOLD(vp);
3195 
3196         pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
3197         pp->rp = rp;
3198         /*
3199          * Recovery on the file has failed and we want to return
3200          * the delegation.  We don't want to reopen files and
3201          * nfs4delegreturn_thread() figures out what to do about
3202          * the data.  The only thing to do is attempt to return
3203          * the delegation.
3204          */
3205         pp->flags = 0;
3206         pp->truncate = FALSE;
3207 
3208         /*
3209          * Fire up a thread to do the delegreturn; this is
3210          * necessary because we could be inside a GETPAGE or
3211          * PUTPAGE and we cannot do another one.
3212          */
3213 
3214         (void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
3215             minclsyspri);
3216 }
3217 
3218 static int
3219 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
3220         int flg)
3221 {
3222         rnode4_t *rp;
3223         int error = 0;
3224 
3225 #ifdef lint
3226         op = op;
3227 #endif
3228 
3229         if (vp && vp->v_type == VREG) {
3230                 rp = VTOR4(vp);
3231 
3232                 /*
3233                  * Take r_deleg_recall_lock in read mode to synchronize
3234                  * with delegreturn.
3235                  */
3236                 error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
3237                     RW_READER, INTR4(vp));
3238 
3239                 if (error == 0)
3240                         rsp->rs_flags |= flg;
3241 
3242         }
3243         return (error);
3244 }
3245 
3246 void
3247 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
3248 {
3249         NFS4_DEBUG(nfs4_recall_debug,
3250             (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
3251             (void *)vp1, (void *)vp2));
3252 
3253         if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
3254                 nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
3255         if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
3256                 nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
3257 }
3258 
3259 int
3260 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
3261         nfs4_recov_state_t *rsp)
3262 {
3263         int error;
3264 
3265         NFS4_DEBUG(nfs4_recall_debug,
3266             (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
3267             (void *)vp1, (void *) vp2));
3268 
3269         rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
3270 
3271         if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
3272                 return (error);
3273 
3274         if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
3275             != 0) {
3276                 if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
3277                         nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
3278                         rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
3279                 }
3280 
3281                 return (error);
3282         }
3283 
3284         return (0);
3285 }
3286 
3287 /*
3288  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
3289  * DELEGRETURN'd at the end of recovery.
3290  */
3291 
3292 static void
3293 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
3294 {
3295         struct nfs4_dnode *dp;
3296 
3297         ASSERT(mutex_owned(&rp->r_statev4_lock));
3298         /*
3299          * Mark the delegation as having a return pending.
3300          * This will prevent the use of the delegation stateID
3301          * by read, write, setattr and open.
3302          */
3303         rp->r_deleg_return_pending = TRUE;
3304         dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
3305         VN_HOLD(RTOV4(rp));
3306         dp->rnodep = rp;
3307         dp->flags = flags;
3308         mutex_enter(&ncg->nfs4_dlist_lock);
3309         list_insert_head(&ncg->nfs4_dlist, dp);
3310 #ifdef  DEBUG
3311         ncg->nfs4_dlistadd_c++;
3312 #endif
3313         mutex_exit(&ncg->nfs4_dlist_lock);
3314 }
3315 
3316 /*
3317  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list.
3318  * of files awaiting cleaning.  If the override_flags are non-zero
3319  * then use them rather than the flags that were set when the rnode
3320  * was added to the dlist.
3321  */
3322 static void
3323 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
3324 {
3325         rnode4_t *rp;
3326         struct nfs4_dnode *dp;
3327         int flags;
3328 
3329         ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
3330 
3331         mutex_enter(&ncg->nfs4_dlist_lock);
3332         while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
3333 #ifdef  DEBUG
3334                 ncg->nfs4_dlistclean_c++;
3335 #endif
3336                 list_remove(&ncg->nfs4_dlist, dp);
3337                 mutex_exit(&ncg->nfs4_dlist_lock);
3338                 rp = dp->rnodep;
3339                 flags = (override_flags != 0) ? override_flags : dp->flags;
3340                 kmem_free(dp, sizeof (*dp));
3341                 (void) nfs4delegreturn_impl(rp, flags, ncg);
3342                 VN_RELE(RTOV4(rp));
3343                 mutex_enter(&ncg->nfs4_dlist_lock);
3344         }
3345         mutex_exit(&ncg->nfs4_dlist_lock);
3346 }
3347 
3348 void
3349 nfs4_dlistclean(void)
3350 {
3351         struct nfs4_callback_globals *ncg;
3352 
3353         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
3354         ASSERT(ncg != NULL);
3355 
3356         nfs4_dlistclean_impl(ncg, 0);
3357 }
--- EOF ---