Move callback server thread creation, initial processing, and destruction to RPC.
Clean up some RPC code.
Remove extraneous fields from nfs41_cb_info and clean up the code.
Change KM_SLEEP in mir_nfs41_callback_thread to KM_NOSLEEP.
Fix lint warnings.

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25 
  26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  27 /* All Rights Reserved */
  28 
  29 #include <sys/param.h>
  30 #include <sys/types.h>
  31 #include <sys/systm.h>
  32 #include <sys/cred.h>
  33 #include <sys/vfs.h>
  34 #include <sys/vnode.h>
  35 #include <sys/pathname.h>
  36 #include <sys/sysmacros.h>
  37 #include <sys/kmem.h>
  38 #include <sys/kstat.h>
  39 #include <sys/mkdev.h>
  40 #include <sys/mount.h>
  41 #include <sys/statvfs.h>
  42 #include <sys/errno.h>
  43 #include <sys/debug.h>
  44 #include <sys/cmn_err.h>
  45 #include <sys/utsname.h>
  46 #include <sys/bootconf.h>
  47 #include <sys/modctl.h>
  48 #include <sys/acl.h>
  49 #include <sys/flock.h>
  50 #include <sys/kstr.h>
  51 #include <sys/stropts.h>
  52 #include <sys/strsubr.h>
  53 #include <sys/atomic.h>
  54 #include <sys/disp.h>
  55 #include <sys/policy.h>
  56 #include <sys/list.h>
  57 #include <sys/zone.h>
  58 #include <sys/sdt.h>
  59 
  60 #include <rpc/types.h>
  61 #include <rpc/auth.h>
  62 #include <rpc/rpcsec_gss.h>
  63 #include <rpc/clnt.h>
  64 #include <rpc/xdr.h>
  65 
  66 #include <nfs/nfs.h>
  67 #include <nfs/nfs_clnt.h>
  68 #include <nfs/mount.h>
  69 #include <nfs/nfs_acl.h>
  70 
  71 #include <fs/fs_subr.h>
  72 
  73 #include <nfs/nfs4.h>
  74 #include <nfs/rnode4.h>
  75 #include <nfs/nfs4_clnt.h>
  76 #include <nfs/nfssys.h>
  77 #include <nfs/nfs4_pnfs.h>
  78 
#ifdef  DEBUG
/*
 * These are "special" state IDs and file handles that
 * match any delegation state ID or file handle.  This
 * is for testing purposes only.
 */


stateid4 nfs4_deleg_any = { 0x7FFFFFF0 };
/*
 * NOTE(review): in C each "\0377" is the three-digit octal escape \037
 * followed by the character '7', so this literal holds 8 bytes before the
 * terminating NUL -- confirm whether "\377\376\375\374" was intended.
 */
char nfs4_deleg_fh[] = "\0377\0376\0375\0374";
/* Length excludes the literal's terminating NUL. */
nfs_fh4 nfs4_deleg_anyfh = { sizeof (nfs4_deleg_fh)-1, nfs4_deleg_fh };
/*
 * Error injection: when cb4_getattr_fail is not NFS4_OK, cb_getattr()
 * returns it immediately.  cb4_recall_fail is presumably the equivalent
 * for the recall path -- its consumer is not in this part of the file.
 */
nfsstat4 cb4_getattr_fail = NFS4_OK;
nfsstat4 cb4_recall_fail = NFS4_OK;

/* Debug switches; nfs4_callback_debug gates the CB_NOTE/CB_WARN macros. */
int nfs4_callback_debug;
int nfs4_recall_debug;
int nfs4_drat_debug;

#endif
  98 
int     nfs41_birpc = 1;        /* Use bidirectional rpc */

/*
 * Debug-message helpers.  Each one hands NFS4_DEBUG the nfs4_callback_debug
 * switch plus a parenthesized argument list of (level, format[, arg]).
 */
#define CB_NOTE(x)      NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE, x))
#define CB_WARN(x)      NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x))
#define CB_WARN1(x, y)  NFS4_DEBUG(nfs4_callback_debug, (CE_WARN, x, y))

/* Delegation-return policy; starts out as INACTIVE. */
enum nfs4_delegreturn_policy nfs4_delegreturn_policy = INACTIVE;

/* NOTE(review): presumably the zone key for the per-zone callback globals -- confirm. */
static zone_key_t nfs4_callback_zone_key;
 108 
 109 /*
 110  * NFS4_MAPSIZE is the number of bytes we are willing to consume
 111  * for the block allocation map when the server grants a NFS_LIMIT_BLOCK
 112  * style delegation.
 113  */
 114 
 115 #define NFS4_MAPSIZE    8192
 116 #define NFS4_MAPWORDS   NFS4_MAPSIZE/sizeof (uint_t)
 117 #define NbPW            (NBBY*sizeof (uint_t))
 118 
/* NOTE(review): presumably the number of callback RPC program numbers managed here -- confirm. */
static int nfs4_num_prognums = 1024;
/* NOTE(review): presumably the callout table registered for the callback service -- confirm. */
static SVC_CALLOUT_TABLE nfs4_cb_sct;
 121 
/*
 * List element pairing an rnode with the flags to hand to
 * nfs4delegreturn_impl().
 * NOTE(review): presumably queued by nfs4_dlistadd() -- confirm.
 */
struct nfs4_dnode {
        list_node_t     linkage;
        rnode4_t        *rnodep;
        int             flags;          /* Flags for nfs4delegreturn_impl() */
};
 127 
/*
 * Initializer for struct nfs4_callback_stats: one named 64-bit unsigned
 * kstat counter per entry.  The initializer is positional, so the entries
 * must stay in the same order as the members of the structure.
 */
static const struct nfs4_callback_stats nfs4_callback_stats_tmpl = {
        { "delegations",        KSTAT_DATA_UINT64 },
        { "cb_getattr",         KSTAT_DATA_UINT64 },
        { "cb_recall",          KSTAT_DATA_UINT64 },
        { "cb_null",            KSTAT_DATA_UINT64 },
        { "cb_dispatch",        KSTAT_DATA_UINT64 },
        { "delegaccept_r",      KSTAT_DATA_UINT64 },
        { "delegaccept_rw",     KSTAT_DATA_UINT64 },
        { "delegreturn",        KSTAT_DATA_UINT64 },
        { "callbacks",          KSTAT_DATA_UINT64 },
        { "claim_cur",          KSTAT_DATA_UINT64 },
        { "claim_cur_ok",       KSTAT_DATA_UINT64 },
        { "recall_trunc",       KSTAT_DATA_UINT64 },
        { "recall_failed",      KSTAT_DATA_UINT64 },
        { "return_limit_write", KSTAT_DATA_UINT64 },
        { "return_limit_addmap", KSTAT_DATA_UINT64 },
        { "deleg_recover",      KSTAT_DATA_UINT64 },
        { "cb_illegal",         KSTAT_DATA_UINT64 },
        { "cb_sequence",        KSTAT_DATA_UINT64 }
};
 148 
/*
 * One entry on the per-zone callback port list.  All four strings are
 * fixed-size buffers of KNC_STRSIZE bytes.
 * NOTE(review): presumably each entry describes one transport endpoint the
 * callback service listens on -- confirm.
 */
struct nfs4_cb_port {
        list_node_t             linkage; /* linkage into per-zone port list */
        char                    netid[KNC_STRSIZE];
        char                    uaddr[KNC_STRSIZE];
        char                    protofmly[KNC_STRSIZE];
        char                    proto[KNC_STRSIZE];
};
 156 
/*
 * Size in bytes of the attribute buffer that cb_getattr() allocates and
 * cb_getattr_free() releases.  It is not assigned in this part of the file.
 */
static int cb_getattr_bytes;

/*
 * Argument bundle handed to nfs4delegreturn_thread().
 */
struct cb_recall_pass {
        rnode4_t        *rp;
        int             flags;          /* Flags for nfs4delegreturn_impl() */
        bool_t          truncate;
};
 164 
/*
 * Forward declarations of file-local (static) helpers.
 */
static nfs4_open_stream_t *get_next_deleg_stream(rnode4_t *, int);
static void nfs4delegreturn_thread(struct cb_recall_pass *);
static int deleg_reopen(vnode_t *, bool_t *, struct nfs4_callback_globals *,
    int);
static void nfs4_dlistadd(rnode4_t *, struct nfs4_callback_globals *, int);
static void nfs4_dlistclean_impl(struct nfs4_callback_globals *, int);
static int nfs4delegreturn_impl(rnode4_t *, int,
    struct nfs4_callback_globals *);
static void nfs4delegreturn_cleanup_impl(rnode4_t *, nfs4_server_t *,
    struct nfs4_callback_globals *);
 175 
 176 
 177 /*
 178  * Only used for non-bidirectional RPC --Performs a BC2S and
 179  * starts the cbconn_thread.
 180  * (expects np->s_lock to be held)
 181  */
 182 
 183 void
 184 nfs41set_callback(nfs4_server_t *np, servinfo4_t *svp, mntinfo4_t *mi,
 185     cred_t *cr)
 186 {
 187         struct nfs41_cb_info    *cbi;
 188         CLIENT                  *client;
 189         struct nfs4_clnt        *nfscl;
 190         int                     error;
 191 
 192         ASSERT(MUTEX_HELD(&np->s_lock));
 193 
 194         if (nfs4bind_conn_to_session(np, svp, mi, cr, CDFC4_BACK)) {
 195                 zcmn_err(getzoneid(), CE_WARN,
 196                     "Callback Channel Binding Failed");
 197                 return;
 198         }
 199 
 200         /*
 201          * The following below is to create a client handle
 202          * used only by the cbconn_thread to send out NFSPROC4_NULL
 203          * and should not be used for anything else.
 204          */
 205         cbi = np->zone_globals->nfs4prog2cbinfo[np->s_program-NFS4_CALLBACK];
 206         ASSERT(cbi != NULL);
 207         client = cbi->cb_client;
 208 
 209         /*
 210          * If client from a previous session, destroy it first
 211          */
 212         if (client) {
 213                 AUTH_DESTROY(client->cl_auth);
 214                 CLNT_DESTROY(client);
 215         }
 216 
 217         nfscl = zone_getspecific(nfs4clnt_zone_key, nfs_zone());
 218         ASSERT(nfscl != NULL);
 219 
 220         /* Get a CLIENT handle */
 221         error = clnt_tli_kcreate(svp->sv_knconf, &svp->sv_addr,
 222             NFS4_PROGRAM, NFS_V4, 0, 0, np->s_cred, &client);
 223 
 224         if (error != 0) {
 225                 zcmn_err(getzoneid(), CE_WARN,
 226                     "Failed to get handle for callback");
 227                 cbi->cb_client = NULL;
 228                 return;
 229         }
 230 
 231         /* Define this handle as a back channel handle */
 232         if (!(CLNT_CONTROL(client, CLSET_BACKCHANNEL, NULL))) {
 233                 zcmn_err(getzoneid(), CE_WARN,
 234                     "Failed to set client handle as callback");
 235                 CLNT_DESTROY(client);
 236                 cbi->cb_client = NULL;
 237                 return;
 238         }
 239 
 240         /* Associate it with the session */
 241         if (!CLNT_CONTROL(client, CLSET_TAG, (char *)(np->ssx.sessionid))) {
 242                 zcmn_err(getzoneid(), CE_WARN,
 243                     "Failed to set tag on client handle");
 244                 CLNT_DESTROY(client);
 245                 cbi->cb_client = NULL;
 246                 return;
 247         }
 248 
 249         cbi->cb_nfscl = nfscl;
 250         cbi->cb_client = client;
 251 
 252         /*
 253          * Now start the cbconn_thread
 254          */
 255 
 256         np->s_refcnt++;
 257         mutex_enter(&cbi->cb_reflock);
 258         cbi->cb_refcnt++;
 259         mutex_exit(&cbi->cb_reflock);
 260         (void) zthread_create(NULL, 0, nfs4_cbconn_thread, np, 0,
 261             minclsyspri);
 262 }
 263 
 264 /*
 265  * nfs4_cbconn_thread is used to send a null op to the server over the
 266  * backchannel connection, to keep the back channel connection up.
 267  * This is not needed for bidirectional rpc as the op_sequence
 268  * heartbeat thread is doing the same thing.
 269  */
 270 void
 271 nfs4_cbconn_thread(nfs4_server_t *np)
 272 {
 273         clock_t                 tick_delay;
 274         callb_cpr_t             cpr_info;
 275         kmutex_t                cpr_lock;
 276         struct nfs41_cb_info    *cbi;
 277         uint32_t                zilch = 0;
 278         int                     timeo;
 279         struct timeval          wait;
 280         enum clnt_stat          rpc_stat;
 281 
 282         cbi = np->zone_globals->nfs4prog2cbinfo[np->s_program-NFS4_CALLBACK];
 283         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
 284         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr, "nfsv4cbconn");
 285 
 286         timeo = (NFS_TIMEO * hz) / 10;
 287         timeo = (MIN(NFS_TIMEO, (NFS_COTS_TIMEO / 10)) * hz) / 10;
 288         TICK_TO_TIMEVAL(timeo, &wait);
 289         tick_delay = MSEC_TO_TICK((4 * (60 * 1000L)));
 290 
 291         while (!(cbi->cb_cbconn_exit)) {
 292                 if (!(CLNT_CONTROL(cbi->cb_client, CLSET_XID,
 293                     (char *)&zilch))) {
 294                         zcmn_err(getzoneid(), CE_WARN,
 295                             "Failed to zero xid, cbconn thread exiting");
 296                         break;
 297                 }
 298                 /* Execute remote NULL procedure to establish the connection */
 299                 rpc_stat = CLNT_CALL(cbi->cb_client, NFSPROC4_NULL,
 300                     xdr_void, NULL, xdr_void, NULL, wait);
 301                 if (rpc_stat != RPC_SUCCESS) {
 302                         zcmn_err(getzoneid(), CE_WARN,
 303                             "OP_NULL failed to transmit "
 304                             " on callback connection "
 305                             "status: 0x%x, cbconn thread exiting", rpc_stat);
 306                         break;
 307                 }
 308                 mutex_enter(&cpr_lock);
 309                 CALLB_CPR_SAFE_BEGIN(&cpr_info);
 310                 mutex_exit(&cpr_lock);
 311 
 312                 mutex_enter(&cbi->cb_cbconn_lock);
 313                 (void) cv_timedwait(&cbi->cb_cbconn_wait,
 314                     &cbi->cb_cbconn_lock, tick_delay + lbolt);
 315                 mutex_exit(&cbi->cb_cbconn_lock);
 316 
 317                 mutex_enter(&cpr_lock);
 318                 CALLB_CPR_SAFE_END(&cpr_info, &cpr_lock);
 319                 mutex_exit(&cpr_lock);
 320         }
 321 
 322         nfs4_server_rele(np);
 323         nfs41_cbinfo_rele(cbi);
 324         mutex_enter(&cpr_lock);
 325         CALLB_CPR_EXIT(&cpr_info);
 326         cv_signal(&cbi->cb_destroy_wait);
 327         zthread_exit();
 328 }
 329 
 330 static void
 331 cb_sequence(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 332         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 333 {
 334         nfs4_server_t   *np;
 335         nfs41_cb_slot_t *cslot;
 336 
 337         CB_SEQUENCE4args *args = &argop->nfs_cb_argop4_u.opcbsequence;
 338         CB_SEQUENCE4res *resp = &resop->nfs_cb_resop4_u.opcbsequence;
 339 
 340         ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
 341 
 342         mutex_enter(&ncg->nfs4_cb_lock);
 343         np = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 344         mutex_exit(&ncg->nfs4_cb_lock);
 345         if (nfs4_server_vlock(np, 0) == FALSE) {
 346                 CB_WARN("cb_sequence: cannot find server\n");
 347                 *cs->statusp = resp->csr_status = NFS4ERR_BADHANDLE;
 348                 return;
 349         }
 350 
 351         bcopy(&args->csa_sessionid,
 352             &resp->CB_SEQUENCE4res_u.csr_resok4.csr_sessionid,
 353             sizeof (args->csa_sessionid));
 354         resp->CB_SEQUENCE4res_u.csr_resok4.csr_slotid = args->csa_slotid;
 355         resp->CB_SEQUENCE4res_u.csr_resok4.csr_sequenceid =
 356             args->csa_sequenceid;
 357         resp->CB_SEQUENCE4res_u.csr_resok4.csr_highest_slotid =
 358             args->csa_highest_slotid;
 359         resp->CB_SEQUENCE4res_u.csr_resok4.csr_target_highest_slotid =
 360             args->csa_highest_slotid;
 361 
 362         if (bcmp(&args->csa_sessionid, &np->ssx.sessionid,
 363             sizeof (np->ssx.sessionid)) != 0) {
 364                 CB_WARN("cb_sequence: Bad Sequence Id\n");
 365                 *cs->statusp = resp->csr_status = NFS4ERR_BADSESSION;
 366                 mutex_exit(&np->s_lock);
 367                 nfs4_server_rele(np);
 368                 return;
 369         }
 370 
 371         if (args->csa_slotid >= np->ssx.cb_slot_table_size) {
 372                 CB_WARN("cb_sequence: Bad Slotid\n");
 373                 *cs->statusp = resp->csr_status = NFS4ERR_BADSLOT;
 374                 mutex_exit(&np->s_lock);
 375                 nfs4_server_rele(np);
 376                 return;
 377         }
 378 
 379         cslot = np->ssx.cb_slot_table[args->csa_slotid];
 380 
 381         if (args->csa_sequenceid != cslot->cb_seq + 1 || (cslot->cb_inuse)) {
 382                 CB_WARN("cb_sequence: Bad Sequence\n");
 383                 *cs->statusp = resp->csr_status = NFS4ERR_SEQ_MISORDERED;
 384                 mutex_exit(&np->s_lock);
 385                 nfs4_server_rele(np);
 386                 return;
 387         }
 388 
 389         cslot->cb_seq = args->csa_sequenceid;
 390         /*
 391          * todo: need to set inuse and deal with server having
 392          * multiple callbacks in-flight.
 393          */
 394 
 395         *cs->statusp = resp->csr_status = NFS4_OK;
 396         mutex_exit(&np->s_lock);
 397         nfs4_server_rele(np);
 398 }
 399 
 400 static void
 401 cb_getattr(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 402         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 403 {
 404         CB_GETATTR4args *args = &argop->nfs_cb_argop4_u.opcbgetattr;
 405         CB_GETATTR4res *resp = &resop->nfs_cb_resop4_u.opcbgetattr;
 406         rnode4_t *rp;
 407         vnode_t *vp;
 408         bool_t found = FALSE;
 409         struct nfs4_server *sp;
 410         struct fattr4 *fap;
 411         rpc_inline_t *fdata;
 412         long mapcnt;
 413         fattr4_change change;
 414         fattr4_size size;
 415         uint_t rflag;
 416 
 417         ncg->nfs4_callback_stats.cb_getattr.value.ui64++;
 418 
 419 #ifdef DEBUG
 420         /*
 421          * error injection hook: set cb_getattr_fail global to
 422          * NFS4 pcol error to be returned
 423          */
 424         if (cb4_getattr_fail != NFS4_OK) {
 425                 *cs->statusp = resp->status = cb4_getattr_fail;
 426                 return;
 427         }
 428 #endif
 429 
 430         resp->obj_attributes.attrmask =
 431             NFS4_EMPTY_ATTRMAP(RFS4_ATTRVERS(cs));
 432 
 433         mutex_enter(&ncg->nfs4_cb_lock);
 434         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 435         mutex_exit(&ncg->nfs4_cb_lock);
 436 
 437         if (nfs4_server_vlock(sp, 0) == FALSE) {
 438 
 439                 CB_WARN("cb_getattr: cannot find server\n");
 440 
 441                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 442                 return;
 443         }
 444 
 445         /*
 446          * In cb_compound, callback_ident was validated against rq_prog,
 447          * but we couldn't verify that it was set to the value we provided
 448          * at setclientid time (because we didn't have server struct yet).
 449          * Now we have the server struct, but don't have callback_ident
 450          * handy.  So, validate server struct program number against req
 451          * RPC's prog number.  At this point, we know the RPC prog num
 452          * is valid (else we wouldn't be here); however, we don't know
 453          * that it was the prog number we supplied to this server at
 454          * setclientid time.  If the prog numbers aren't equivalent, then
 455          * log the problem and fail the request because either cbserv
 456          * and/or cbclient are confused.  This will probably never happen.
 457          */
 458         if (sp->s_program != req->rq_prog) {
 459 #ifdef DEBUG
 460                 zcmn_err(getzoneid(), CE_WARN,
 461                     "cb_getattr: wrong server program number srv=%d req=%d\n",
 462                     sp->s_program, req->rq_prog);
 463 #else
 464                 zcmn_err(getzoneid(), CE_WARN,
 465                     "cb_getattr: wrong server program number\n");
 466 #endif
 467                 mutex_exit(&sp->s_lock);
 468                 nfs4_server_rele(sp);
 469                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 470                 return;
 471         }
 472 
 473         /*
 474          * Search the delegation list for a matching file handle;
 475          * mutex on sp prevents the list from changing.
 476          */
 477 
 478         rp = list_head(&sp->s_deleg_list);
 479         for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
 480                 nfs4_fhandle_t fhandle;
 481 
 482                 sfh4_copyval(rp->r_fh, &fhandle);
 483 
 484                 if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
 485                     bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
 486                     fhandle.fh_len) == 0)) {
 487 
 488                         found = TRUE;
 489                         break;
 490                 }
 491 #ifdef  DEBUG
 492                 if (nfs4_deleg_anyfh.nfs_fh4_len == args->fh.nfs_fh4_len &&
 493                     bcmp(nfs4_deleg_anyfh.nfs_fh4_val, args->fh.nfs_fh4_val,
 494                     args->fh.nfs_fh4_len) == 0) {
 495 
 496                         found = TRUE;
 497                         break;
 498                 }
 499 #endif
 500         }
 501 
 502         /*
 503          * VN_HOLD the vnode before releasing s_lock to guarantee
 504          * we have a valid vnode reference.
 505          */
 506         if (found == TRUE) {
 507                 vp = RTOV4(rp);
 508                 VN_HOLD(vp);
 509         }
 510 
 511         mutex_exit(&sp->s_lock);
 512         nfs4_server_rele(sp);
 513 
 514         if (found == FALSE) {
 515 
 516                 CB_WARN("cb_getattr: bad fhandle\n");
 517 
 518                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 519                 return;
 520         }
 521 
 522         /*
 523          * Figure out which attributes the server wants.  We only
 524          * offer FATTR4_CHANGE & FATTR4_SIZE; ignore the rest.
 525          */
 526         fdata = kmem_alloc(cb_getattr_bytes, KM_SLEEP);
 527 
 528         /*
 529          * Don't actually need to create XDR to encode these
 530          * simple data structures.
 531          * xdrmem_create(&xdr, fdata, cb_getattr_bytes, XDR_ENCODE);
 532          */
 533         fap = &resp->obj_attributes;
 534 
 535         fap->attrmask = NFS4_EMPTY_ATTRMAP(RFS4_ATTRVERS(cs));
 536         /* attrlist4_len starts at 0 and increases as attrs are processed */
 537         fap->attrlist4 = (char *)fdata;
 538         fap->attrlist4_len = 0;
 539 
 540         if (ATTR_ISSET(args->attr_request, CHANGE)) {
 541                 /*
 542                  * If the file is mmapped, then increment the change
 543                  * attribute and return it.  This will guarantee that
 544                  * the server will perceive that the file has changed
 545                  * if there is any chance that the client application
 546                  * has changed it.  Otherwise, just return the change
 547                  * attribute as it has been updated by nfs4write_deleg.
 548                  */
 549 
 550                 mutex_enter(&rp->r_statelock);
 551                 mapcnt = rp->r_mapcnt;
 552                 rflag = rp->r_flags;
 553                 mutex_exit(&rp->r_statelock);
 554 
 555                 mutex_enter(&rp->r_statev4_lock);
 556                 /*
 557                  * If object mapped, then always return new change.
 558                  * Otherwise, return change if object has dirty
 559                  * pages.  If object doesn't have any dirty pages,
 560                  * then all changes have been pushed to server, so
 561                  * reset change to grant change.
 562                  */
 563                 if (mapcnt)
 564                         rp->r_deleg_change++;
 565                 else if (! (rflag & R4DIRTY))
 566                 rp->r_deleg_change = rp->r_deleg_change_grant;
 567                 change = rp->r_deleg_change;
 568                 mutex_exit(&rp->r_statev4_lock);
 569 
 570                 /*
 571                  * Use inline XDR code directly, we know that we
 572                  * going to a memory buffer and it has enough
 573                  * space so it cannot fail.
 574                  */
 575                 IXDR_PUT_U_HYPER(fdata, change);
 576                 fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
 577                 ATTR_SET(fap->attrmask, CHANGE);
 578         }
 579 
 580         if (ATTR_ISSET(args->attr_request, SIZE)) {
 581                 /*
 582                  * Use an atomic add of 0 to fetch a consistent view
 583                  * of r_size; this avoids having to take rw_lock
 584                  * which could cause a deadlock.
 585                  */
 586                 size = atomic_add_64_nv((uint64_t *)&rp->r_size, 0);
 587 
 588                 /*
 589                  * Use inline XDR code directly, we know that we
 590                  * going to a memory buffer and it has enough
 591                  * space so it cannot fail.
 592                  */
 593                 IXDR_PUT_U_HYPER(fdata, size);
 594                 fap->attrlist4_len += 2 * BYTES_PER_XDR_UNIT;
 595                 ATTR_SET(fap->attrmask, SIZE);
 596         }
 597 
 598         VN_RELE(vp);
 599 
 600         *cs->statusp = resp->status = NFS4_OK;
 601 }
 602 
 603 static void
 604 cb_getattr_free(nfs_cb_resop4 *resop)
 605 {
 606         if (resop->nfs_cb_resop4_u.opcbgetattr.obj_attributes.attrlist4)
 607                 kmem_free(resop->nfs_cb_resop4_u.opcbgetattr.
 608                     obj_attributes.attrlist4, cb_getattr_bytes);
 609 }
 610 
 611 static nfsstat4
 612 layoutrecall_all(nfs4_server_t *np)
 613 {
 614         vnode_t *vp;
 615         rnode4_t *rp;
 616         mntinfo4_t *mi = NULL;
 617         nfs4_fsidlt_t *ltp;
 618         nfsstat4 nstatus = NFS4ERR_NOMATCHING_LAYOUT;
 619 
 620         /*
 621          * Walk thru all of the layout trees, and discard all
 622          * all the layouts, effectively discarding all the layouts
 623          * from this particular server, then do LAYOUTRETURN4_ALL.
 624          */
 625         mutex_enter(&np->s_lt_lock);
 626         for (ltp = avl_first(&np->s_fsidlt); ltp;
 627             ltp = AVL_NEXT(&np->s_fsidlt, ltp)) {
 628                 mutex_enter(&ltp->lt_rlt_lock);
 629                 for (rp = avl_first(&ltp->lt_rlayout_tree); rp;
 630                     rp = AVL_NEXT(&ltp->lt_rlayout_tree, rp)) {
 631 
 632                         vp = RTOV4(rp);
 633                         VN_HOLD(vp);
 634                         pnfs_layout_discard(rp, ltp, np);
 635                         /*
 636                          * Hold the mi to prevent it from disappearing
 637                          * after we drop the reference on the vnode.  This
 638                          * will remain held until we send the request down
 639                          * the taskq.
 640                          */
 641                         if (mi == NULL) {
 642                                 mi = VTOMI4(vp);
 643                                 MI4_HOLD(mi);
 644                         }
 645                         VN_RELE(vp);
 646                         nstatus = NFS4_OK;
 647                 }
 648                 mutex_exit(&ltp->lt_rlt_lock);
 649         }
 650         mutex_exit(&np->s_lt_lock);
 651         if (nstatus == NFS4_OK) {
 652                 pnfs_layoutreturn_bulk(mi, kcred, LAYOUTRETURN4_ALL);
 653                 MI4_RELE(mi);
 654         }
 655         return (nstatus);
 656 }
 657 
 658 
 659 static nfsstat4
 660 layoutrecall_fsid(fsid4 *recallfsid, nfs4_server_t *np)
 661 {
 662         vnode_t *vp;
 663         rnode4_t *rp;
 664         mntinfo4_t *mi = NULL;
 665         nfs4_fsidlt_t *ltp, lt;
 666         nfsstat4 nstatus = NFS4ERR_NOMATCHING_LAYOUT;
 667 
 668         lt.lt_fsid.major = recallfsid->major;
 669         lt.lt_fsid.minor = recallfsid->minor;
 670 
 671         mutex_enter(&np->s_lt_lock);
 672         ltp = avl_find(&np->s_fsidlt, &lt, NULL);
 673 
 674         /*
 675          * If no matching fsid layout tree is found, then no layouts exist
 676          * for this fsid.
 677          */
 678         if (ltp == NULL) {
 679                 mutex_exit(&np->s_lt_lock);
 680                 return (nstatus);
 681         }
 682 
 683         /*
 684          * Found a matching fsid tree, return and free all
 685          * layouts on this tree.
 686          */
 687         mutex_enter(&ltp->lt_rlt_lock);
 688         mutex_exit(&np->s_lt_lock);
 689 
 690         for (rp = avl_first(&ltp->lt_rlayout_tree); rp;
 691             rp = AVL_NEXT(&ltp->lt_rlayout_tree, rp)) {
 692                 /*
 693                  * For each rnode on this fsid's layout tree,
 694                  * discard the layout.  We do not return each
 695                  * layout individually, instead we return in
 696                  * bulk, at the end.
 697                  */
 698                 vp = RTOV4(rp);
 699                 VN_HOLD(vp);
 700                 pnfs_layout_discard(rp, ltp, np);
 701                 if (mi == NULL) {
 702                         mi = VTOMI4(vp);
 703                         MI4_HOLD(mi);
 704                 }
 705                 VN_RELE(vp);
 706                 nstatus = NFS4_OK;
 707         }
 708         mutex_exit(&ltp->lt_rlt_lock);
 709         if (nstatus == NFS4_OK) {
 710                 pnfs_layoutreturn_bulk(mi, kcred, LAYOUTRETURN4_FSID);
 711                 MI4_RELE(mi);
 712         }
 713         return (nstatus);
 714 }
 715 
 716 static nfsstat4
 717 layoutrecall_file(layoutrecall_file4 *lrf, nfs4_server_t *np)
 718 {
 719         nfs_fh4         *rawfh = &lrf->lor_fh;
 720         nfs4_sharedfh_t sfh;
 721         vnode_t         *vp;
 722         rnode4_t        lrp, *rp;
 723         nfs4_fsidlt_t   *ltp;
 724         nfsstat4 nstatus = NFS4ERR_NOMATCHING_LAYOUT;
 725 
 726         bcopy(rawfh, &sfh, sizeof (*rawfh));
 727         lrp.r_fh = &sfh;
 728 
 729         mutex_enter(&np->s_lt_lock);
 730         /*
 731          * Look thru the fsid layout trees until we find a matching
 732          * rnode on an fsid layout tree's rnode layout tree.
 733          */
 734         for (ltp = avl_first(&np->s_fsidlt); ltp;
 735             ltp = AVL_NEXT(&np->s_fsidlt, ltp)) {
 736                 /*
 737                  * Look at this fsid layout tree's rnode layout tree
 738                  * and see if it has the rnode we want based on the
 739                  * file handle.
 740                  */
 741                 mutex_enter(&ltp->lt_rlt_lock);
 742                 rp = avl_find(&ltp->lt_rlayout_tree, &lrp, NULL);
 743                 if (rp != NULL) {
 744                         vp = RTOV4(rp);
 745                         VN_HOLD(vp);
 746                         mutex_enter(&rp->r_statelock);
 747                         /*
 748                          * Since this client will only hold one layout
 749                          * for an rnode at a time, if we get a
 750                          * layoutrecall, the stateid it has should match
 751                          * ours!.
 752                          */
 753                         if (lrf->lor_stateid.seqid !=
 754                             rp->r_lostateid.seqid + 1) {
 755                                 cmn_err(CE_WARN, "our layout stateids are"
 756                                     "out of sync! rnode: %p", (void *)rp);
 757                         }
 758                         pnfs_layout_return(vp, kcred, lrf->lor_stateid,
 759                             LR_ASYNC);
 760                         mutex_exit(&rp->r_statelock);
 761                         mutex_exit(&ltp->lt_rlt_lock);
 762                         VN_RELE(vp);
 763                         nstatus = NFS4_OK;
 764                         break;
 765                 }
 766                 mutex_exit(&ltp->lt_rlt_lock);
 767         }
 768         mutex_exit(&np->s_lt_lock);
 769         return (nstatus);
 770 }
 771 
 772 static void
 773 cb_layoutrecall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 774         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 775 {
 776         CB_LAYOUTRECALL4args *args = &argop->nfs_cb_argop4_u.opcblayoutrecall;
 777         CB_LAYOUTRECALL4res *resp = &resop->nfs_cb_resop4_u.opcblayoutrecall;
 778         struct nfs4_server *sp;
 779 
 780         if (args->clora_type != LAYOUT4_NFSV4_1_FILES) {
 781                 DTRACE_PROBE1(nfsc__i__badlayoutype, int32_t,
 782                     args->clora_type);
 783                 *cs->statusp = resp->clorr_status = NFS4ERR_INVAL;
 784                 return;
 785         }
 786 
 787         mutex_enter(&ncg->nfs4_cb_lock);
 788         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 789         mutex_exit(&ncg->nfs4_cb_lock);
 790 
 791         if (nfs4_server_vlock(sp, 0) == FALSE) {
 792                 DTRACE_PROBE1(nfsc__i__bad_prog, int, req->rq_prog);
 793                 *cs->statusp = resp->clorr_status = NFS4ERR_NOMATCHING_LAYOUT;
 794                 return;
 795         }
 796         mutex_exit(&sp->s_lock);
 797 
 798         switch (args->clora_recall.lor_recalltype) {
 799         case LAYOUTRECALL4_FILE:
 800                 *cs->statusp = resp->clorr_status =
 801                     layoutrecall_file(&args->clora_recall.
 802                     layoutrecall4_u.lor_layout, sp);
 803                 break;
 804         case LAYOUTRECALL4_FSID:
 805                 *cs->statusp = resp->clorr_status =
 806                     layoutrecall_fsid(&args->clora_recall.
 807                     layoutrecall4_u.lor_fsid, sp);
 808                 break;
 809         case LAYOUTRECALL4_ALL:
 810                 *cs->statusp = resp->clorr_status = layoutrecall_all(sp);
 811                 break;
 812         default:
 813                 *cs->statusp = resp->clorr_status = NFS4ERR_INVAL;
 814         }
 815         nfs4_server_rele(sp);
 816 
 817         if (resp->clorr_status != NFS4_OK)
 818                 DTRACE_PROBE2(nfsc__i__cblayouterr,
 819                     nfs4_server_t *, sp, nfsstat, resp->clorr_status);
 820 }
 821 
 822 static nfsstat4
 823 cb_notify_device(nfs4_server_t *sp, notify4 *no)
 824 {
 825         nfsstat4 stat = NFS4_OK;
 826         XDR x;
 827         notify_deviceid_change4 ndc;
 828         notify_deviceid_delete4 ndd;
 829 
 830         /* check for missing or extra bits */
 831         if ((no->notify_mask &
 832             ~(NOTIFY_DEVICEID4_CHANGE_MASK|NOTIFY_DEVICEID4_DELETE_MASK)) ||
 833             (no->notify_mask == 0))
 834                 DTRACE_PROBE1(nfsc__i__bad_mask, bitmap4 *, no->notify_mask);
 835 
 836         xdrmem_create(&x, no->notify_vals.notifylist4_val,
 837             no->notify_vals.notifylist4_len, XDR_DECODE);
 838         /*
 839          * The order of checking is significant.  Oddly, both bits
 840          * could be set.
 841          */
 842         if (no->notify_mask & NOTIFY_DEVICEID4_CHANGE_MASK) {
 843 
 844                 if (!xdr_notify_deviceid_change4(&x, &ndc))
 845                         stat = NFS4ERR_BADXDR;
 846                 else {
 847                         stat = pnfs_change_device(sp, &ndc);
 848                         xdr_free(xdr_notify_deviceid_change4, (caddr_t)&ndc);
 849                 }
 850         }
 851         if (stat == NFS4_OK &&
 852             (no->notify_mask & NOTIFY_DEVICEID4_DELETE_MASK)) {
 853 
 854                 if (!xdr_notify_deviceid_delete4(&x, &ndd))
 855                         stat = NFS4ERR_BADXDR;
 856                 else {
 857                         stat = pnfs_delete_device(sp, &ndd);
 858                         xdr_free(xdr_notify_deviceid_change4, (caddr_t)&ndd);
 859                 }
 860         }
 861 
 862         return (stat);
 863 }
 864 
 865 static void
 866 cb_notify_deviceid(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop,
 867     struct svc_req *req, struct compound_state *cs,
 868     struct nfs4_callback_globals *ncg)
 869 {
 870         CB_NOTIFY_DEVICEID4args *args =
 871             &argop->nfs_cb_argop4_u.opcbnotify_deviceid;
 872         CB_NOTIFY_DEVICEID4res *resp =
 873             &resop->nfs_cb_resop4_u.opcbnotify_deviceid;
 874         struct nfs4_server *sp;
 875         int i;
 876         nfsstat4 stat;
 877 
 878         mutex_enter(&ncg->nfs4_cb_lock);
 879         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 880         mutex_exit(&ncg->nfs4_cb_lock);
 881 
 882         if (nfs4_server_vlock(sp, 0) == FALSE) {
 883                 DTRACE_PROBE1(nfsc__i__bad_prog, int, req->rq_prog);
 884                 *cs->statusp = resp->cndr_status = NFS4ERR_INVAL;
 885                 return;
 886         }
 887         mutex_exit(&sp->s_lock);
 888 
 889         stat = NFS4_OK;
 890         for (i = 0; i < args->cnda_changes.cnda_changes_len; i++)
 891                 if ((stat = cb_notify_device(sp,
 892                     &args->cnda_changes.cnda_changes_val[i])) != NFS4_OK)
 893                         break;
 894 
 895         *cs->statusp = resp->cndr_status = stat;
 896         nfs4_server_rele(sp);
 897 }
 898 
 899 
 900 static void
 901 cb_recall(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
 902         struct compound_state *cs, struct nfs4_callback_globals *ncg)
 903 {
 904         CB_RECALL4args * args = &argop->nfs_cb_argop4_u.opcbrecall;
 905         CB_RECALL4res *resp = &resop->nfs_cb_resop4_u.opcbrecall;
 906         rnode4_t *rp;
 907         vnode_t *vp;
 908         struct nfs4_server *sp;
 909         bool_t found = FALSE;
 910 
 911         ncg->nfs4_callback_stats.cb_recall.value.ui64++;
 912 
 913         ASSERT(req->rq_prog >= NFS4_CALLBACK);
 914         ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
 915 
 916 #ifdef DEBUG
 917         /*
 918          * error injection hook: set cb_recall_fail global to
 919          * NFS4 pcol error to be returned
 920          */
 921         if (cb4_recall_fail != NFS4_OK) {
 922                 *cs->statusp = resp->status = cb4_recall_fail;
 923                 return;
 924         }
 925 #endif
 926 
 927         mutex_enter(&ncg->nfs4_cb_lock);
 928         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
 929         mutex_exit(&ncg->nfs4_cb_lock);
 930 
 931         if (nfs4_server_vlock(sp, 0) == FALSE) {
 932 
 933                 CB_WARN("cb_recall: cannot find server\n");
 934 
 935                 *cs->statusp = resp->status = NFS4ERR_BADHANDLE;
 936                 return;
 937         }
 938 
 939         /*
 940          * Search the delegation list for a matching file handle
 941          * AND stateid; mutex on sp prevents the list from changing.
 942          */
 943 
 944         rp = list_head(&sp->s_deleg_list);
 945         for (; rp != NULL; rp = list_next(&sp->s_deleg_list, rp)) {
 946                 mutex_enter(&rp->r_statev4_lock);
 947 
 948                 /* check both state id and file handle! */
 949 
 950                 if ((bcmp(&rp->r_deleg_stateid, &args->stateid,
 951                     sizeof (stateid4)) == 0)) {
 952                         nfs4_fhandle_t fhandle;
 953 
 954                         sfh4_copyval(rp->r_fh, &fhandle);
 955                         if ((fhandle.fh_len == args->fh.nfs_fh4_len &&
 956                             bcmp(fhandle.fh_buf, args->fh.nfs_fh4_val,
 957                             fhandle.fh_len) == 0)) {
 958 
 959                                 found = TRUE;
 960                                 break;
 961                         } else {
 962 #ifdef  DEBUG
 963                                 CB_WARN("cb_recall: stateid OK, bad fh");
 964 #endif
 965                         }
 966                 }
 967 #ifdef  DEBUG
 968                 if (bcmp(&args->stateid, &nfs4_deleg_any,
 969                     sizeof (stateid4)) == 0) {
 970 
 971                         found = TRUE;
 972                         break;
 973                 }
 974 #endif
 975                 mutex_exit(&rp->r_statev4_lock);
 976         }
 977 
 978         /*
 979          * VN_HOLD the vnode before releasing s_lock to guarantee
 980          * we have a valid vnode reference.  The async thread will
 981          * release the hold when it's done.
 982          */
 983         if (found == TRUE) {
 984                 mutex_exit(&rp->r_statev4_lock);
 985                 vp = RTOV4(rp);
 986                 VN_HOLD(vp);
 987         }
 988         mutex_exit(&sp->s_lock);
 989         nfs4_server_rele(sp);
 990 
 991         if (found == FALSE) {
 992 
 993                 CB_WARN("cb_recall: bad stateid\n");
 994 
 995                 *cs->statusp = resp->status = NFS4ERR_BAD_STATEID;
 996                 return;
 997         }
 998 
 999         /* Fire up a thread to do the delegreturn */
1000         nfs4delegreturn_async(rp, NFS4_DR_RECALL|NFS4_DR_REOPEN,
1001             args->truncate);
1002 
1003         *cs->statusp = resp->status = 0;
1004 }
1005 
1006 /* ARGSUSED */
1007 static void
1008 cb_recall_free(nfs_cb_resop4 *resop)
1009 {
1010         /* nothing to do here, cb_recall doesn't kmem_alloc */
1011 }
1012 
1013 /*
1014  * This function handles the CB_NULL proc call from an NFSv4 Server.
1015  *
1016  * We take note that the server has sent a CB_NULL for later processing
1017  * in the recovery logic. It is noted so we may pause slightly after the
1018  * setclientid and before reopening files. The pause is to allow the
1019  * NFSv4 Server time to receive the CB_NULL reply and adjust any of
1020  * its internal structures such that it has the opportunity to grant
1021  * delegations to reopened files.
1022  *
1023  */
1024 
1025 /* ARGSUSED */
1026 static void
1027 cb_null(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
1028     struct nfs4_callback_globals *ncg)
1029 {
1030         struct nfs4_server *sp;
1031 
1032         ncg->nfs4_callback_stats.cb_null.value.ui64++;
1033 
1034         ASSERT(req->rq_prog >= NFS4_CALLBACK);
1035         ASSERT(req->rq_prog < NFS4_CALLBACK+nfs4_num_prognums);
1036 
1037         mutex_enter(&ncg->nfs4_cb_lock);
1038         sp = ncg->nfs4prog2server[req->rq_prog - NFS4_CALLBACK];
1039         mutex_exit(&ncg->nfs4_cb_lock);
1040 
1041         if (nfs4_server_vlock(sp, 0) != FALSE) {
1042                 sp->s_flags |= N4S_CB_PINGED;
1043                 cv_broadcast(&sp->wait_cb_null);
1044                 mutex_exit(&sp->s_lock);
1045                 nfs4_server_rele(sp);
1046         }
1047 }
1048 
1049 /*
1050  * cb_illegal   args: void
1051  *              res : status (NFS4ERR_OP_CB_ILLEGAL)
1052  */
1053 /* ARGSUSED */
1054 static void
1055 cb_illegal(nfs_cb_argop4 *argop, nfs_cb_resop4 *resop, struct svc_req *req,
1056         struct compound_state *cs, struct nfs4_callback_globals *ncg)
1057 {
1058         CB_ILLEGAL4res *resp = &resop->nfs_cb_resop4_u.opcbillegal;
1059 
1060         ncg->nfs4_callback_stats.cb_illegal.value.ui64++;
1061         resop->resop = OP_CB_ILLEGAL;
1062         *cs->statusp = resp->status = NFS4ERR_OP_ILLEGAL;
1063 }
1064 
1065 static void
1066 cb_compound(CB_COMPOUND4args *args, CB_COMPOUND4res *resp, struct svc_req *req,
1067         struct nfs4_callback_globals *ncg)
1068 {
1069         uint_t i;
1070         struct compound_state cs;
1071         nfs_cb_argop4 *argop;
1072         nfs_cb_resop4 *resop, *new_res;
1073         uint_t op, mvers_0;
1074         boolean_t       sequenced = FALSE;
1075 
1076         bzero(&cs, sizeof (cs));
1077         cs.statusp = &resp->status;
1078         cs.cont = TRUE;
1079 
1080         /*
1081          * Form a reply tag by copying over the reqeuest tag.
1082          */
1083         resp->tag.utf8string_len = args->tag.utf8string_len;
1084         resp->tag.utf8string_val = kmem_alloc(resp->tag.utf8string_len,
1085             KM_SLEEP);
1086         bcopy(args->tag.utf8string_val, resp->tag.utf8string_val,
1087             args->tag.utf8string_len);
1088 
1089         /*
1090          * minorversion should be zero or one
1091          */
1092         if (args->minorversion != CB4_MINOR_v0 &&
1093             args->minorversion != CB4_MINOR_v1) {
1094                 resp->array_len = 0;
1095                 resp->array = NULL;
1096                 resp->status = NFS4ERR_MINOR_VERS_MISMATCH;
1097                 return;
1098         }
1099 
1100         /*
1101          * The XDR code for CB_COMPOUND decodes all cb ops regardless
1102          * of the minorversion of the compound containing the ops.
1103          *
1104          * mvers_0 is used to validate ops according to minor version:
1105          * - only mvers 0 cb ops are allowed in mv 0 cb compounds
1106          * - "is sequenced" checks only apply to mv 1 cb compunds
1107          */
1108         mvers_0 = (args->minorversion == CB4_MINOR_v0);
1109 
1110 #ifdef DEBUG
1111         /*
1112          * Verify callback_ident.  It doesn't really matter if it's wrong
1113          * because we don't really use callback_ident -- we use prog number
1114          * of the RPC request instead.  In this case, just print a DEBUG
1115          * console message to reveal brokenness of cbclient (at bkoff/cthon).
1116          */
1117         if (args->callback_ident != req->rq_prog)
1118                 zcmn_err(getzoneid(), CE_WARN,
1119                     "cb_compound: cb_client using wrong "
1120                     "callback_ident(%d), should be %d",
1121                     args->callback_ident, req->rq_prog);
1122 #endif
1123 
1124         resp->array_len = args->array_len;
1125         resp->array = kmem_zalloc(args->array_len * sizeof (nfs_cb_resop4),
1126             KM_SLEEP);
1127 
1128         for (i = 0; i < args->array_len && cs.cont; i++) {
1129 
1130                 argop = &args->array[i];
1131                 resop = &resp->array[i];
1132                 resop->resop = argop->argop;
1133                 op = (uint_t)resop->resop;
1134 
1135                 switch (op) {
1136 
1137                 case OP_CB_SEQUENCE:
1138 
1139                         if (mvers_0) {
1140                                 op = OP_CB_ILLEGAL;
1141                                 cb_illegal(argop, resop, req, &cs, ncg);
1142                                 break;
1143                         }
1144                         cb_sequence(argop, resop, req, &cs, ncg);
1145                         if (*cs.statusp == NFS4_OK)
1146                                 sequenced = TRUE;
1147                         break;
1148 
1149                 case OP_CB_GETATTR:
1150 
1151                         if (!sequenced && !mvers_0) {
1152                                 *cs.statusp = resp->status =
1153                                     NFS4ERR_SEQUENCE_POS;
1154                                 break;
1155                         }
1156                         cb_getattr(argop, resop, req, &cs, ncg);
1157                         break;
1158 
1159                 case OP_CB_RECALL:
1160                         if (!sequenced && !mvers_0) {
1161                                 *cs.statusp = resp->status =
1162                                     NFS4ERR_SEQUENCE_POS;
1163                                 break;
1164                         }
1165                         cb_recall(argop, resop, req, &cs, ncg);
1166                         break;
1167 
1168                 case OP_CB_LAYOUTRECALL:
1169                         if (mvers_0) {
1170                                 op = OP_CB_ILLEGAL;
1171                                 cb_illegal(argop, resop, req, &cs, ncg);
1172                                 break;
1173                         }
1174                         if (!sequenced) {
1175                                 *cs.statusp = resp->status =
1176                                     NFS4ERR_SEQUENCE_POS;
1177                                 break;
1178                         }
1179                         cb_layoutrecall(argop, resop, req, &cs, ncg);
1180                         break;
1181 
1182                 case OP_CB_NOTIFY_DEVICEID:
1183                         if (mvers_0) {
1184                                 op = OP_CB_ILLEGAL;
1185                                 cb_illegal(argop, resop, req, &cs, ncg);
1186                                 break;
1187                         }
1188                         if (!sequenced) {
1189                                 *cs.statusp = resp->status =
1190                                     NFS4ERR_SEQUENCE_POS;
1191                                 break;
1192                         }
1193                         cb_notify_deviceid(argop, resop, req, &cs, ncg);
1194                         break;
1195 
1196                 case OP_CB_ILLEGAL:
1197                         if (!sequenced && !mvers_0) {
1198                                 *cs.statusp = resp->status =
1199                                     NFS4ERR_SEQUENCE_POS;
1200                                 break;
1201                         }
1202                         /* fall through */
1203 
1204                 default:
1205                         /*
1206                          * Handle OP_CB_ILLEGAL and any undefined opcode.
1207                          * Currently, the XDR code will return BADXDR
1208                          * if cb op doesn't decode to legal value, so
1209                          * it really only handles OP_CB_ILLEGAL.
1210                          */
1211                         op = OP_CB_ILLEGAL;
1212                         cb_illegal(argop, resop, req, &cs, ncg);
1213                 }
1214 
1215                 if (*cs.statusp != NFS4_OK)
1216                         cs.cont = FALSE;
1217 
1218                 /*
1219                  * If not at last op, and if we are to stop, then
1220                  * compact the results array.
1221                  */
1222                 if ((i + 1) < args->array_len && !cs.cont) {
1223 
1224                         new_res = kmem_alloc(
1225                             (i+1) * sizeof (nfs_cb_resop4), KM_SLEEP);
1226                         bcopy(resp->array,
1227                             new_res, (i+1) * sizeof (nfs_cb_resop4));
1228                         kmem_free(resp->array,
1229                             args->array_len * sizeof (nfs_cb_resop4));
1230 
1231                         resp->array_len =  i + 1;
1232                         resp->array = new_res;
1233                 }
1234         }
1235 
1236 }
1237 
1238 static void
1239 cb_compound_free(CB_COMPOUND4res *resp)
1240 {
1241         uint_t i, op;
1242         nfs_cb_resop4 *resop;
1243 
1244         if (resp->tag.utf8string_val) {
1245                 UTF8STRING_FREE(resp->tag)
1246         }
1247 
1248         for (i = 0; i < resp->array_len; i++) {
1249 
1250                 resop = &resp->array[i];
1251                 op = (uint_t)resop->resop;
1252 
1253                 switch (op) {
1254 
1255                 case OP_CB_GETATTR:
1256 
1257                         cb_getattr_free(resop);
1258                         break;
1259 
1260                 case OP_CB_RECALL:
1261 
1262                         cb_recall_free(resop);
1263                         break;
1264 
1265                 default:
1266                         break;
1267                 }
1268         }
1269 
1270         if (resp->array != NULL) {
1271                 kmem_free(resp->array,
1272                     resp->array_len * sizeof (nfs_cb_resop4));
1273         }
1274 }
1275 
1276 static void
1277 cb_dispatch(struct svc_req *req, SVCXPRT *xprt)
1278 {
1279         CB_COMPOUND4args args;
1280         CB_COMPOUND4res res;
1281         struct nfs4_callback_globals *ncg;
1282 
1283         bool_t (*xdr_args)(), (*xdr_res)();
1284         void (*proc)(CB_COMPOUND4args *, CB_COMPOUND4res *, struct svc_req *,
1285             struct nfs4_callback_globals *);
1286         void (*freeproc)(CB_COMPOUND4res *);
1287 
1288         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1289         ASSERT(ncg != NULL);
1290 
1291         ncg->nfs4_callback_stats.cb_dispatch.value.ui64++;
1292 
1293         switch (req->rq_proc) {
1294         case CB_NULL:
1295                 xdr_args = xdr_void;
1296                 xdr_res = xdr_void;
1297                 proc = cb_null;
1298                 freeproc = NULL;
1299                 break;
1300 
1301         case CB_COMPOUND:
1302                 xdr_args = xdr_CB_COMPOUND4args_clnt;
1303                 xdr_res = xdr_CB_COMPOUND4res;
1304                 proc = cb_compound;
1305                 freeproc = cb_compound_free;
1306                 break;
1307 
1308         default:
1309                 CB_WARN("cb_dispatch: no proc\n");
1310                 svcerr_noproc(xprt);
1311                 return;
1312         }
1313 
1314         args.tag.utf8string_val = NULL;
1315         args.array = NULL;
1316 
1317         if (!SVC_GETARGS(xprt, xdr_args, (caddr_t)&args)) {
1318 
1319                 CB_WARN("cb_dispatch: cannot getargs\n");
1320                 svcerr_decode(xprt);
1321                 return;
1322         }
1323 
1324         (*proc)(&args, &res, req, ncg);
1325 
1326         if (svc_sendreply(xprt, xdr_res, (caddr_t)&res) == FALSE) {
1327 
1328                 CB_WARN("cb_dispatch: bad sendreply\n");
1329                 svcerr_systemerr(xprt);
1330         }
1331 
1332         if (freeproc)
1333                 (*freeproc)(&res);
1334 
1335         if (!SVC_FREEARGS(xprt, xdr_args, (caddr_t)&args)) {
1336 
1337                 CB_WARN("cb_dispatch: bad freeargs\n");
1338         }
1339 }
1340 
1341 static rpcprog_t
1342 nfs4_getnextprogram(struct nfs4_callback_globals *ncg)
1343 {
1344         int i, j;
1345 
1346         j = ncg->nfs4_program_hint;
1347         for (i = 0; i < nfs4_num_prognums; i++, j++) {
1348 
1349                 if (j >= nfs4_num_prognums)
1350                         j = 0;
1351 
1352                 if (ncg->nfs4prog2server[j] == NULL) {
1353                         ncg->nfs4_program_hint = j+1;
1354                         return (j+NFS4_CALLBACK);
1355                 }
1356         }
1357 
1358         return (0);
1359 }
1360 
1361 void
1362 nfs4callback_destroy(nfs4_server_t *np)
1363 {
1364         struct nfs4_callback_globals *ncg;
1365         struct nfs41_cb_info *cbi;
1366         int i;
1367 
1368         if (np->s_program == 0)
1369                 return;
1370 
1371         ncg = np->zone_globals;
1372         cbi = ncg->nfs4prog2cbinfo[np->s_program - NFS4_CALLBACK];
1373 
1374         i = np->s_program - NFS4_CALLBACK;
1375 
1376         mutex_enter(&ncg->nfs4_cb_lock);
1377 
1378         ASSERT(ncg->nfs4prog2server[i] == np);
1379 
1380         ncg->nfs4prog2server[i] = NULL;
1381         ncg->nfs4prog2cbinfo[i] = NULL;
1382 
1383         if (i < ncg->nfs4_program_hint)
1384                 ncg->nfs4_program_hint = i;
1385 
1386         mutex_exit(&ncg->nfs4_cb_lock);
1387         np->s_program = 0;
1388         if (cbi != NULL)
1389                 nfs41_cbinfo_rele(cbi);
1390 }
1391 
1392 void
1393 nfs41_cbinfo_rele(struct nfs41_cb_info *cbi)
1394 {
1395         mutex_enter(&cbi->cb_reflock);
1396         cbi->cb_refcnt--;
1397         if (cbi->cb_refcnt > 0) {
1398                 mutex_exit(&cbi->cb_reflock);
1399                 return;
1400         }

1401         ASSERT(cbi->cb_cbconn_exit);
1402         mutex_exit(&cbi->cb_reflock);
1403 



1404         if (cbi->cb_client) {
1405                 if (!(CLNT_CONTROL(cbi->cb_client,
1406                     CLSET_BACKCHANNEL_CLEAR, NULL))) {
1407                         zcmn_err(getzoneid(), CE_WARN,
1408                             "Failed To Clear Client Handle Callback %p",
1409                             (void *)cbi->cb_client);
1410                 }
1411                 CLNT_DESTROY(cbi->cb_client);
1412         }
1413         mutex_destroy(&cbi->cb_cbconn_lock);
1414         cv_destroy(&cbi->cb_destroy_wait);
1415         cv_destroy(&cbi->cb_cbconn_wait);
1416         mutex_destroy(&cbi->cb_reflock);
1417         kmem_free(cbi, sizeof (*cbi));
1418 }
1419 
1420 /*
1421  * nfs4_setport - This function saves a netid and univeral address for
1422  * the callback program.  These values will be used during setclientid.
1423  */
1424 static void
1425 nfs4_setport(char *netid, char *uaddr, char *protofmly, char *proto,
1426         struct nfs4_callback_globals *ncg)
1427 {
1428         struct nfs4_cb_port *p;
1429         bool_t found = FALSE;
1430 
1431         ASSERT(MUTEX_HELD(&ncg->nfs4_cb_lock));
1432 
1433         p = list_head(&ncg->nfs4_cb_ports);
1434         for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
1435                 if (strcmp(p->netid, netid) == 0) {
1436                         found = TRUE;
1437                         break;
1438                 }
1439         }
1440         if (found == TRUE)
1441                 (void) strcpy(p->uaddr, uaddr);
1442         else {
1443                 p = kmem_alloc(sizeof (*p), KM_SLEEP);
1444 
1445                 (void) strcpy(p->uaddr, uaddr);
1446                 (void) strcpy(p->netid, netid);
1447                 (void) strcpy(p->protofmly, protofmly);
1448                 (void) strcpy(p->proto, proto);
1449                 list_insert_head(&ncg->nfs4_cb_ports, p);
1450         }
1451 }
1452 



































































































































1453 /*
1454  * nfs4_cb_args - This function is used to construct the callback
1455  * portion of the arguments needed for setclientid.
1456  */
1457 
1458 void
1459 nfs4_cb_args(nfs4_server_t *np, struct knetconfig *knc, SETCLIENTID4args *args)
1460 {
1461         struct nfs4_cb_port *p;
1462         bool_t found = FALSE;
1463         rpcprog_t pgm;
1464         struct nfs4_callback_globals *ncg = np->zone_globals;
1465 
1466         /*
1467          * This server structure may already have a program number
1468          * assigned to it.  This happens when the client has to
1469          * re-issue SETCLIENTID.  Just re-use the information.
1470          */
1471         if (np->s_program >= NFS4_CALLBACK &&
1472             np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
1473                 nfs4callback_destroy(np);
1474 
1475         mutex_enter(&ncg->nfs4_cb_lock);
1476 
1477         p = list_head(&ncg->nfs4_cb_ports);
1478         for (; p != NULL; p = list_next(&ncg->nfs4_cb_ports, p)) {
1479                 if (strcmp(p->protofmly, knc->knc_protofmly) == 0 &&
1480                     strcmp(p->proto, knc->knc_proto) == 0) {
1481                         found = TRUE;
1482                         break;
1483                 }
1484         }
1485 
1486         if (found == FALSE) {
1487 
1488                 NFS4_DEBUG(nfs4_callback_debug,
1489                     (CE_WARN, "nfs4_cb_args: could not find netid for %s/%s\n",
1490                     knc->knc_protofmly, knc->knc_proto));
1491 
1492                 args->callback.cb_program = 0;
1493                 args->callback.cb_location.r_netid = NULL;
1494                 args->callback.cb_location.r_addr = NULL;
1495                 args->callback_ident = 0;
1496                 mutex_exit(&ncg->nfs4_cb_lock);
1497                 return;
1498         }
1499 
1500         if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
1501                 CB_WARN("nfs4_cb_args: out of program numbers\n");
1502 
1503                 args->callback.cb_program = 0;
1504                 args->callback.cb_location.r_netid = NULL;
1505                 args->callback.cb_location.r_addr = NULL;
1506                 args->callback_ident = 0;
1507                 mutex_exit(&ncg->nfs4_cb_lock);
1508                 return;
1509         }
1510 
1511         ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
1512         args->callback.cb_program = pgm;
1513         args->callback.cb_location.r_netid = p->netid;
1514         args->callback.cb_location.r_addr = p->uaddr;
1515         args->callback_ident = pgm;
1516 
1517         np->s_program = pgm;
1518 
1519         mutex_exit(&ncg->nfs4_cb_lock);
1520 }
1521 
1522 /*
1523  * nfs4_cb_args - This function is used to construct the callback
1524  * portion of the arguments needed for create_session.
1525  */
1526 /* ARGSUSED */
1527 void
1528 nfs41_cb_args(nfs4_server_t *np, struct knetconfig *knc,
1529         CREATE_SESSION4args *args)
1530 {
1531         rpcprog_t pgm;
1532         struct nfs4_callback_globals *ncg = np->zone_globals;
1533         struct nfs41_cb_info    *cbi;
1534 
1535         /*
1536          * This server structure may already have a program number
1537          * assigned to it.  This happens when the client has to
1538          * re-issue SETCLIENTID.  Just re-use the information.
1539          */
1540         if (np->s_program >= NFS4_CALLBACK &&
1541             np->s_program < NFS4_CALLBACK + nfs4_num_prognums)
1542                 nfs4callback_destroy(np);
1543 
1544         mutex_enter(&ncg->nfs4_cb_lock);
1545 
1546         if ((pgm = nfs4_getnextprogram(ncg)) == 0) {
1547                 CB_WARN("nfs4_cb_args: out of program numbers\n");
1548 
1549                 args->csa_cb_program = 0;
1550                 args->csa_sec_parms.csa_sec_parms_len = 0;
1551                 args->csa_sec_parms.csa_sec_parms_val = NULL;
1552                 mutex_exit(&ncg->nfs4_cb_lock);
1553                 return;
1554         }
1555 
1556         if (ncg->nfs4prog2cbinfo[pgm-NFS4_CALLBACK] == NULL)
1557                 cbi = kmem_zalloc(sizeof (struct nfs41_cb_info), KM_SLEEP);
1558         else
1559                 cbi = ncg->nfs4prog2cbinfo[pgm-NFS4_CALLBACK];
1560 
1561         cbi->cb_prog = pgm;
1562         cbi->cb_dispatch = cb_dispatch;


1563 
1564         cv_init(&cbi->cb_destroy_wait, NULL, CV_DEFAULT, NULL);
1565         mutex_init(&cbi->cb_reflock, NULL, MUTEX_DEFAULT, NULL);
1566 
1567         cv_init(&cbi->cb_cbconn_wait, NULL, CV_DEFAULT, NULL);
1568         mutex_init(&cbi->cb_cbconn_lock, NULL, MUTEX_DEFAULT, NULL);
1569 
1570         /*
1571          * set cb_refcnt to 1, to account for it being in the
1572          * nfs4prog2cbinfo table
1573          */
1574         cbi->cb_refcnt = 1;
1575 
1576         ncg->nfs4prog2cbinfo[pgm-NFS4_CALLBACK] = cbi;
1577         ncg->nfs4prog2server[pgm-NFS4_CALLBACK] = np;
1578         np->s_program = pgm;
1579         mutex_exit(&ncg->nfs4_cb_lock);
1580 
1581         args->csa_cb_program = pgm;
1582         args->csa_sec_parms.csa_sec_parms_len = 1;
1583         args->csa_sec_parms.csa_sec_parms_val = (callback_sec_parms4 *)
1584             kmem_zalloc(sizeof (callback_sec_parms4), KM_SLEEP);
1585         args->csa_sec_parms.csa_sec_parms_val->cb_secflavor = AUTH_NONE;












1586 }
1587 
1588 static int
1589 nfs4_dquery(struct nfs4_svc_args *arg, model_t model)
1590 {
1591         file_t *fp;
1592         vnode_t *vp;
1593         rnode4_t *rp;
1594         int error;
1595         STRUCT_HANDLE(nfs4_svc_args, uap);
1596 
1597         STRUCT_SET_HANDLE(uap, model, arg);
1598 
1599         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
1600                 return (EBADF);
1601 
1602         vp = fp->f_vnode;
1603 
1604         if (vp == NULL || vp->v_type != VREG ||
1605             !vn_matchops(vp, nfs4_vnodeops)) {
1606                 releasef(STRUCT_FGET(uap, fd));
1607                 return (EBADF);
1608         }
1609 
1610         rp = VTOR4(vp);
1611 
1612         /*
1613          * I can't convince myself that we need locking here.  The
1614          * rnode cannot disappear and the value returned is instantly
1615          * stale anway, so why bother?
1616          */
1617 
1618         error = suword32(STRUCT_FGETP(uap, netid), rp->r_deleg_type);
1619         releasef(STRUCT_FGET(uap, fd));
1620         return (error);
1621 }
1622 
1623 
1624 /*
1625  * NFS4 client system call.  This service does the
1626  * necessary initialization for the callback program.
1627  * This is fashioned after the server side interaction
1628  * between nfsd and the kernel.  On the client, the
1629  * mount command forks and the child process does the
1630  * necessary interaction with the kernel.
1631  *
1632  * uap->fd is the fd of an open transport provider
1633  */
1634 int
1635 nfs4_svc(struct nfs4_svc_args *arg, model_t model)
1636 {
1637         file_t *fp;
1638         int error;
1639         int readsize;
1640         char buf[KNC_STRSIZE], uaddr[KNC_STRSIZE];
1641         char protofmly[KNC_STRSIZE], proto[KNC_STRSIZE];
1642         size_t len;
1643         STRUCT_HANDLE(nfs4_svc_args, uap);
1644         struct netbuf addrmask;
1645         int cmd;
1646         SVCMASTERXPRT *cb_xprt;
1647         struct nfs4_callback_globals *ncg;
1648 
1649 #ifdef lint
1650         model = model;          /* STRUCT macros don't always refer to it */
1651 #endif
1652 
1653         STRUCT_SET_HANDLE(uap, model, arg);
1654 
1655         if (STRUCT_FGET(uap, cmd) == NFS4_DQUERY)
1656                 return (nfs4_dquery(arg, model));
1657 
1658         if (secpolicy_nfs(CRED()) != 0)
1659                 return (EPERM);
1660 
1661         if ((fp = getf(STRUCT_FGET(uap, fd))) == NULL)
1662                 return (EBADF);
1663 
1664         /*
1665          * Set read buffer size to rsize
1666          * and add room for RPC headers.
1667          */
1668         readsize = nfs3tsize() + (RPC_MAXDATASIZE - NFS_MAXDATA);
1669         if (readsize < RPC_MAXDATASIZE)
1670                 readsize = RPC_MAXDATASIZE;
1671 
1672         error = copyinstr((const char *)STRUCT_FGETP(uap, netid), buf,
1673             KNC_STRSIZE, &len);
1674         if (error) {
1675                 releasef(STRUCT_FGET(uap, fd));
1676                 return (error);
1677         }
1678 
1679         cmd = STRUCT_FGET(uap, cmd);
1680 
1681         if (cmd & NFS4_KRPC_START) {
1682                 addrmask.len = STRUCT_FGET(uap, addrmask.len);
1683                 addrmask.maxlen = STRUCT_FGET(uap, addrmask.maxlen);
1684                 addrmask.buf = kmem_alloc(addrmask.maxlen, KM_SLEEP);
1685                 error = copyin(STRUCT_FGETP(uap, addrmask.buf), addrmask.buf,
1686                     addrmask.len);
1687                 if (error) {
1688                         releasef(STRUCT_FGET(uap, fd));
1689                         kmem_free(addrmask.buf, addrmask.maxlen);
1690                         return (error);
1691                 }
1692         }
1693         else
1694                 addrmask.buf = NULL;
1695 
1696         error = copyinstr((const char *)STRUCT_FGETP(uap, addr), uaddr,
1697             sizeof (uaddr), &len);
1698         if (error) {
1699                 releasef(STRUCT_FGET(uap, fd));
1700                 if (addrmask.buf)
1701                         kmem_free(addrmask.buf, addrmask.maxlen);
1702                 return (error);
1703         }
1704 
1705         error = copyinstr((const char *)STRUCT_FGETP(uap, protofmly), protofmly,
1706             sizeof (protofmly), &len);
1707         if (error) {
1708                 releasef(STRUCT_FGET(uap, fd));
1709                 if (addrmask.buf)
1710                         kmem_free(addrmask.buf, addrmask.maxlen);
1711                 return (error);
1712         }
1713 
1714         error = copyinstr((const char *)STRUCT_FGETP(uap, proto), proto,
1715             sizeof (proto), &len);
1716         if (error) {
1717                 releasef(STRUCT_FGET(uap, fd));
1718                 if (addrmask.buf)
1719                         kmem_free(addrmask.buf, addrmask.maxlen);
1720                 return (error);
1721         }
1722 
1723         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
1724         ASSERT(ncg != NULL);
1725 
1726         mutex_enter(&ncg->nfs4_cb_lock);
1727         if (cmd & NFS4_SETPORT)
1728                 nfs4_setport(buf, uaddr, protofmly, proto, ncg);
1729 
1730         if (cmd & NFS4_KRPC_START) {
1731                 error = svc_tli_kcreate(fp, readsize, buf, &addrmask, &cb_xprt,
1732                     &nfs4_cb_sct, NULL, NFS_CB_SVCPOOL_ID, FALSE);
1733                 if (error) {
1734                         CB_WARN1("nfs4_svc: svc_tli_kcreate failed %d\n",
1735                             error);
1736                         kmem_free(addrmask.buf, addrmask.maxlen);
1737                 }
1738         }
1739 
1740         mutex_exit(&ncg->nfs4_cb_lock);
1741         releasef(STRUCT_FGET(uap, fd));
1742         return (error);
1743 }
1744 
1745 struct nfs4_callback_globals *
1746 nfs4_get_callback_globals(void)
1747 {
1748         return (zone_getspecific(nfs4_callback_zone_key, nfs_zone()));
1749 }
1750 
1751 static void *
1752 nfs4_callback_init_zone(zoneid_t zoneid)
1753 {
1754         kstat_t *nfs4_callback_kstat;
1755         struct nfs4_callback_globals *ncg;
1756 
1757         ncg = kmem_zalloc(sizeof (*ncg), KM_SLEEP);
1758 
1759         ncg->nfs4prog2server = kmem_zalloc(nfs4_num_prognums *
1760             sizeof (struct nfs4_server *), KM_SLEEP);
1761 
1762         ncg->nfs4prog2cbinfo = kmem_zalloc(nfs4_num_prognums *
1763             sizeof (struct nfs4_cb_info *), KM_SLEEP);
1764 
1765         /* initialize the dlist */
1766         mutex_init(&ncg->nfs4_dlist_lock, NULL, MUTEX_DEFAULT, NULL);
1767         list_create(&ncg->nfs4_dlist, sizeof (struct nfs4_dnode),
1768             offsetof(struct nfs4_dnode, linkage));
1769 
1770         /* initialize cb_port list */
1771         mutex_init(&ncg->nfs4_cb_lock, NULL, MUTEX_DEFAULT, NULL);
1772         list_create(&ncg->nfs4_cb_ports, sizeof (struct nfs4_cb_port),
1773             offsetof(struct nfs4_cb_port, linkage));
1774 
1775         /* get our own copy of the kstats */
1776         bcopy(&nfs4_callback_stats_tmpl, &ncg->nfs4_callback_stats,
1777             sizeof (nfs4_callback_stats_tmpl));
1778         /* register "nfs:0:nfs4_callback_stats" for this zone */
1779         if ((nfs4_callback_kstat =
1780             kstat_create_zone("nfs", 0, "nfs4_callback_stats", "misc",
1781             KSTAT_TYPE_NAMED,
1782             sizeof (ncg->nfs4_callback_stats) / sizeof (kstat_named_t),
1783             KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
1784             zoneid)) != NULL) {
1785                 nfs4_callback_kstat->ks_data = &ncg->nfs4_callback_stats;
1786                 kstat_install(nfs4_callback_kstat);
1787         }
1788         return (ncg);
1789 }
1790 
1791 static void
1792 nfs4_discard_delegations(struct nfs4_callback_globals *ncg)
1793 {
1794         nfs4_server_t *sp;
1795         int i, num_removed;
1796 
1797         /*
1798          * It's OK here to just run through the registered "programs", as
1799          * servers without programs won't have any delegations to handle.
1800          */
1801         for (i = 0; i < nfs4_num_prognums; i++) {
1802                 rnode4_t *rp;
1803 
1804                 mutex_enter(&ncg->nfs4_cb_lock);
1805                 sp = ncg->nfs4prog2server[i];
1806                 mutex_exit(&ncg->nfs4_cb_lock);
1807 
1808                 if (nfs4_server_vlock(sp, 1) == FALSE)
1809                         continue;
1810                 num_removed = 0;
1811                 while ((rp = list_head(&sp->s_deleg_list)) != NULL) {
1812                         mutex_enter(&rp->r_statev4_lock);
1813                         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
1814                                 /*
1815                                  * We need to take matters into our own hands,
1816                                  * as nfs4delegreturn_cleanup_impl() won't
1817                                  * remove this from the list.
1818                                  */
1819                                 list_remove(&sp->s_deleg_list, rp);
1820                                 mutex_exit(&rp->r_statev4_lock);
1821                                 nfs4_dec_state_ref_count_nolock(sp,
1822                                     VTOMI4(RTOV4(rp)));
1823                                 num_removed++;
1824                                 continue;
1825                         }
1826                         mutex_exit(&rp->r_statev4_lock);
1827                         VN_HOLD(RTOV4(rp));
1828                         mutex_exit(&sp->s_lock);
1829                         /*
1830                          * The following will remove the node from the list.
1831                          */
1832                         nfs4delegreturn_cleanup_impl(rp, sp, ncg);
1833                         VN_RELE(RTOV4(rp));
1834                         mutex_enter(&sp->s_lock);
1835                 }
1836                 mutex_exit(&sp->s_lock);
1837                 /* each removed list node reles a reference */
1838                 while (num_removed-- > 0)
1839                         nfs4_server_rele(sp);
1840                 /* remove our reference for nfs4_server_vlock */
1841                 nfs4_server_rele(sp);
1842         }
1843 }
1844 
1845 /* ARGSUSED */
1846 static void
1847 nfs4_callback_shutdown_zone(zoneid_t zoneid, void *data)
1848 {
1849         struct nfs4_callback_globals *ncg = data;
1850 
1851         /*
1852          * Clean pending delegation return list.
1853          */
1854         nfs4_dlistclean_impl(ncg, NFS4_DR_DISCARD);
1855 
1856         /*
1857          * Discard all delegations.
1858          */
1859         nfs4_discard_delegations(ncg);
1860 }
1861 
1862 static void
1863 nfs4_callback_fini_zone(zoneid_t zoneid, void *data)
1864 {
1865         struct nfs4_callback_globals *ncg = data;
1866         struct nfs4_cb_port *p;
1867         nfs4_server_t *sp, *next;
1868         nfs4_server_t freelist;
1869         int i;
1870 
1871         kstat_delete_byname_zone("nfs", 0, "nfs4_callback_stats", zoneid);
1872 
1873         /*
1874          * Discard all delegations that may have crept in since we did the
1875          * _shutdown.
1876          */
1877         nfs4_discard_delegations(ncg);
1878         /*
1879          * We're completely done with this zone and all associated
1880          * nfs4_server_t's.  Any remaining nfs4_server_ts should only have one
1881          * more reference outstanding -- the reference we didn't release in
1882          * nfs4_renew_lease_thread().
1883          *
1884          * Here we need to run through the global nfs4_server_lst as we need to
1885          * deal with nfs4_server_ts without programs, as they also have threads
1886          * created for them, and so have outstanding references that we need to
1887          * release.
1888          */
1889         freelist.forw = &freelist;
1890         freelist.back = &freelist;
1891         mutex_enter(&nfs4_server_lst_lock);
1892         sp = nfs4_server_lst.forw;
1893         while (sp != &nfs4_server_lst) {
1894                 next = sp->forw;
1895                 if (sp->zoneid == zoneid) {
1896                         remque(sp);
1897                         insque(sp, &freelist);
1898                 }
1899                 sp = next;
1900         }
1901         mutex_exit(&nfs4_server_lst_lock);
1902 
1903         sp = freelist.forw;
1904         while (sp != &freelist) {
1905                 next = sp->forw;
1906                 nfs4_server_rele(sp);   /* free the list's reference */
1907                 sp = next;
1908         }
1909 
1910 #ifdef DEBUG
1911         for (i = 0; i < nfs4_num_prognums; i++) {
1912                 ASSERT(ncg->nfs4prog2server[i] == NULL);
1913         }
1914 #endif
1915         kmem_free(ncg->nfs4prog2server, nfs4_num_prognums *
1916             sizeof (struct nfs4_server *));
1917 
1918         mutex_enter(&ncg->nfs4_cb_lock);
1919         while ((p = list_head(&ncg->nfs4_cb_ports)) != NULL) {
1920                 list_remove(&ncg->nfs4_cb_ports, p);
1921                 kmem_free(p, sizeof (*p));
1922         }
1923         list_destroy(&ncg->nfs4_cb_ports);
1924         mutex_destroy(&ncg->nfs4_cb_lock);
1925         list_destroy(&ncg->nfs4_dlist);
1926         mutex_destroy(&ncg->nfs4_dlist_lock);
1927         kmem_free(ncg, sizeof (*ncg));
1928 }
1929 
1930 void
1931 nfs4_callback_init(void)
1932 {
1933         int i;
1934         SVC_CALLOUT *nfs4_cb_sc;
1935 
1936         /* initialize the callback table */
1937         nfs4_cb_sc = kmem_alloc(nfs4_num_prognums *
1938             sizeof (SVC_CALLOUT), KM_SLEEP);
1939 
1940         for (i = 0; i < nfs4_num_prognums; i++) {
1941                 nfs4_cb_sc[i].sc_prog = NFS4_CALLBACK+i;
1942                 nfs4_cb_sc[i].sc_versmin = NFS_CB;
1943                 nfs4_cb_sc[i].sc_versmax = NFS_CB;
1944                 nfs4_cb_sc[i].sc_dispatch = cb_dispatch;
1945         }
1946 
1947         nfs4_cb_sct.sct_size = nfs4_num_prognums;
1948         nfs4_cb_sct.sct_free = FALSE;
1949         nfs4_cb_sct.sct_sc = nfs4_cb_sc;
1950 
1951         /*
1952          * Compute max bytes required for dyamically allocated parts
1953          * of cb_getattr reply.  Only size and change are supported now.
1954          * If CB_GETATTR is changed to reply with additional attrs,
1955          * additional sizes must be added below.
1956          *
1957          * fattr4_change + fattr4_size == uint64_t + uint64_t
1958          */
1959         cb_getattr_bytes = 2 * BYTES_PER_XDR_UNIT + 2 * BYTES_PER_XDR_UNIT;
1960 
1961         zone_key_create(&nfs4_callback_zone_key, nfs4_callback_init_zone,
1962             nfs4_callback_shutdown_zone, nfs4_callback_fini_zone);
1963 }
1964 
/*
 * Counterpart of nfs4_callback_init().  Intentionally empty: no teardown
 * work is performed here.
 */
void
nfs4_callback_fini(void)
{
        /* nothing to do */
}
1969 
1970 /*
1971  * NB: This function can be called from the *wrong* zone (ie, the zone that
1972  * 'rp' belongs to and the caller's zone may not be the same).  This can happen
1973  * if the zone is going away and we get called from nfs4_async_inactive().  In
1974  * this case the globals will be NULL and we won't update the counters, which
1975  * doesn't matter as the zone is going away anyhow.
1976  */
1977 static void
1978 nfs4delegreturn_cleanup_impl(rnode4_t *rp, nfs4_server_t *np,
1979         struct nfs4_callback_globals *ncg)
1980 {
1981         mntinfo4_t *mi = VTOMI4(RTOV4(rp));
1982         boolean_t need_rele = B_FALSE;
1983 
1984         /*
1985          * Caller must be holding mi_recovlock in read mode
1986          * to call here.  This is provided by start_op.
1987          * Delegation management requires to grab s_lock
1988          * first and then r_statev4_lock.
1989          */
1990 
1991         if (np == NULL) {
1992                 np = find_nfs4_server_all(mi, 1);
1993                 ASSERT(np != NULL);
1994                 need_rele = B_TRUE;
1995         } else {
1996                 mutex_enter(&np->s_lock);
1997         }
1998 
1999         mutex_enter(&rp->r_statev4_lock);
2000 
2001         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2002                 mutex_exit(&rp->r_statev4_lock);
2003                 mutex_exit(&np->s_lock);
2004                 if (need_rele)
2005                         nfs4_server_rele(np);
2006                 return;
2007         }
2008 
2009         /*
2010          * Free the cred originally held when
2011          * the delegation was granted.  Caller must
2012          * hold this cred if it wants to use it after
2013          * this call.
2014          */
2015         crfree(rp->r_deleg_cred);
2016         rp->r_deleg_cred = NULL;
2017         rp->r_deleg_type = OPEN_DELEGATE_NONE;
2018         rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2019         rp->r_deleg_needs_recall = FALSE;
2020         rp->r_deleg_return_pending = FALSE;
2021 
2022         /*
2023          * Remove the rnode from the server's list and
2024          * update the ref counts.
2025          */
2026         list_remove(&np->s_deleg_list, rp);
2027         mutex_exit(&rp->r_statev4_lock);
2028         nfs4_dec_state_ref_count_nolock(np, mi);
2029         mutex_exit(&np->s_lock);
2030         /* removed list node removes a reference */
2031         nfs4_server_rele(np);
2032         if (need_rele)
2033                 nfs4_server_rele(np);
2034         if (ncg != NULL)
2035                 ncg->nfs4_callback_stats.delegations.value.ui64--;
2036 }
2037 
2038 void
2039 nfs4delegreturn_cleanup(rnode4_t *rp, nfs4_server_t *np)
2040 {
2041         struct nfs4_callback_globals *ncg;
2042 
2043         if (np != NULL) {
2044                 ncg = np->zone_globals;
2045         } else if (nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone) {
2046                 ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2047                 ASSERT(ncg != NULL);
2048         } else {
2049                 /*
2050                  * Request coming from the wrong zone.
2051                  */
2052                 ASSERT(getzoneid() == GLOBAL_ZONEID);
2053                 ncg = NULL;
2054         }
2055 
2056         nfs4delegreturn_cleanup_impl(rp, np, ncg);
2057 }
2058 
2059 static void
2060 nfs4delegreturn_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2061         cred_t *cr, vnode_t *vp)
2062 {
2063         if (error != ETIMEDOUT && error != EINTR &&
2064             !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2065                 lost_rqstp->lr_op = 0;
2066                 return;
2067         }
2068 
2069         NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2070             "nfs4close_save_lost_rqst: error %d", error));
2071 
2072         lost_rqstp->lr_op = OP_DELEGRETURN;
2073         /*
2074          * The vp is held and rele'd via the recovery code.
2075          * See nfs4_save_lost_rqst.
2076          */
2077         lost_rqstp->lr_vp = vp;
2078         lost_rqstp->lr_dvp = NULL;
2079         lost_rqstp->lr_oop = NULL;
2080         lost_rqstp->lr_osp = NULL;
2081         lost_rqstp->lr_lop = NULL;
2082         lost_rqstp->lr_cr = cr;
2083         lost_rqstp->lr_flk = NULL;
2084         lost_rqstp->lr_putfirst = FALSE;
2085 }
2086 
2087 static void
2088 nfs4delegreturn_otw(rnode4_t *rp, cred_t *cr, nfs4_error_t *ep)
2089 {
2090         COMPOUND4args_clnt args;
2091         COMPOUND4res_clnt res;
2092         nfs_argop4 argops[3];
2093         nfs4_ga_res_t *garp = NULL;
2094         hrtime_t t;
2095         int numops;
2096         int doqueue = 1;
2097         mntinfo4_t *mi = VTOMI4(RTOV4(rp));
2098 
2099         args.ctag = TAG_DELEGRETURN;
2100 
2101         numops = 3;             /* PUTFH, GETATTR, DELEGRETURN */
2102 
2103         args.array = argops;
2104         args.array_len = numops;
2105 
2106         argops[0].argop = OP_CPUTFH;
2107         argops[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2108 
2109         argops[1].argop = OP_GETATTR;
2110         argops[1].nfs_argop4_u.opgetattr.attr_request =
2111             MI4_DEFAULT_ATTRMAP(mi);
2112         argops[1].nfs_argop4_u.opgetattr.mi = VTOMI4(RTOV4(rp));
2113 
2114         argops[2].argop = OP_DELEGRETURN;
2115         argops[2].nfs_argop4_u.opdelegreturn.deleg_stateid =
2116             rp->r_deleg_stateid;
2117 
2118         t = gethrtime();
2119         rfs4call(VTOMI4(RTOV4(rp)), NULL, &args, &res, cr, &doqueue, 0, ep);
2120 
2121         if (ep->error)
2122                 return;
2123 
2124         if (res.status == NFS4_OK) {
2125                 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
2126                 nfs4_attr_cache(RTOV4(rp), garp, t, cr, TRUE, NULL);
2127 
2128         }
2129         (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2130 }
2131 
2132 int
2133 nfs4_do_delegreturn(rnode4_t *rp, int flags, cred_t *cr,
2134         struct nfs4_callback_globals *ncg)
2135 {
2136         vnode_t *vp = RTOV4(rp);
2137         mntinfo4_t *mi = VTOMI4(vp);
2138         nfs4_lost_rqst_t lost_rqst;
2139         nfs4_recov_state_t recov_state;
2140         bool_t needrecov = FALSE, recovonly, done = FALSE;
2141         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2142 
2143         ncg->nfs4_callback_stats.delegreturn.value.ui64++;
2144 
2145         while (!done) {
2146                 e.error = nfs4_start_fop(mi, vp, NULL, OH_DELEGRETURN,
2147                     &recov_state, &recovonly);
2148 
2149                 if (e.error) {
2150                         if (flags & NFS4_DR_FORCE) {
2151                                 (void) nfs_rw_enter_sig(&mi->mi_recovlock,
2152                                     RW_READER, 0);
2153                                 nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2154                                 nfs_rw_exit(&mi->mi_recovlock);
2155                         }
2156                         break;
2157                 }
2158 
2159                 /*
2160                  * Check to see if the delegation has already been
2161                  * returned by the recovery thread.   The state of
2162                  * the delegation cannot change at this point due
2163                  * to start_fop and the r_deleg_recall_lock.
2164                  */
2165                 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2166                         e.error = 0;
2167                         nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2168                         break;
2169                 }
2170 
2171                 if (recovonly) {
2172                         /*
2173                          * Delegation will be returned via the
2174                          * recovery framework.  Build a lost request
2175                          * structure, start recovery and get out.
2176                          */
2177                         nfs4_error_init(&e, EINTR);
2178                         nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
2179                             cr, vp);
2180                         (void) nfs4_start_recovery(&e, mi, vp,
2181                             NULL, &rp->r_deleg_stateid,
2182                             lost_rqst.lr_op == OP_DELEGRETURN ?
2183                             &lost_rqst : NULL, OP_DELEGRETURN, NULL);
2184                         nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2185                         break;
2186                 }
2187 
2188                 nfs4delegreturn_otw(rp, cr, &e);
2189 
2190                 /*
2191                  * Ignore some errors on delegreturn; no point in marking
2192                  * the file dead on a state destroying operation.
2193                  */
2194                 if (e.error == 0 && (nfs4_recov_marks_dead(e.stat) ||
2195                     e.stat == NFS4ERR_BADHANDLE ||
2196                     e.stat == NFS4ERR_STALE))
2197                         needrecov = FALSE;
2198                 else
2199                         needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
2200 
2201                 if (needrecov) {
2202                         nfs4delegreturn_save_lost_rqst(e.error, &lost_rqst,
2203                             cr, vp);
2204                         (void) nfs4_start_recovery(&e, mi, vp,
2205                             NULL, &rp->r_deleg_stateid,
2206                             lost_rqst.lr_op == OP_DELEGRETURN ?
2207                             &lost_rqst : NULL, OP_DELEGRETURN, NULL);
2208                 } else {
2209                         nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2210                         done = TRUE;
2211                 }
2212 
2213                 nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2214         }
2215         return (e.error);
2216 }
2217 
2218 /*
2219  * nfs4_resend_delegreturn - used to drive the delegreturn
2220  * operation via the recovery thread.
2221  */
2222 void
2223 nfs4_resend_delegreturn(nfs4_lost_rqst_t *lorp, nfs4_error_t *ep,
2224         nfs4_server_t *np)
2225 {
2226         rnode4_t *rp = VTOR4(lorp->lr_vp);
2227 
2228         /* If the file failed recovery, just quit. */
2229         mutex_enter(&rp->r_statelock);
2230         if (rp->r_flags & R4RECOVERR) {
2231                 ep->error = EIO;
2232         }
2233         mutex_exit(&rp->r_statelock);
2234 
2235         if (!ep->error)
2236                 nfs4delegreturn_otw(rp, lorp->lr_cr, ep);
2237 
2238         /*
2239          * If recovery is now needed, then return the error
2240          * and status and let the recovery thread handle it,
2241          * including re-driving another delegreturn.  Otherwise,
2242          * just give up and clean up the delegation.
2243          */
2244         if (nfs4_needs_recovery(ep, TRUE, lorp->lr_vp->v_vfsp))
2245                 return;
2246 
2247         if (rp->r_deleg_type != OPEN_DELEGATE_NONE)
2248                 nfs4delegreturn_cleanup(rp, np);
2249 
2250         nfs4_error_zinit(ep);
2251 }
2252 
2253 /*
2254  * nfs4delegreturn - general function to return a delegation.
2255  *
2256  * NFS4_DR_FORCE - return the delegation even if start_op fails
2257  * NFS4_DR_PUSH - push modified data back to the server via VOP_PUTPAGE
2258  * NFS4_DR_DISCARD - discard the delegation w/o delegreturn
2259  * NFS4_DR_DID_OP - calling function already did nfs4_start_op
2260  * NFS4_DR_RECALL - delegreturned initiated via CB_RECALL
2261  * NFS4_DR_REOPEN - do file reopens, if applicable
2262  */
2263 static int
2264 nfs4delegreturn_impl(rnode4_t *rp, int flags, struct nfs4_callback_globals *ncg)
2265 {
2266         int error = 0;
2267         cred_t *cr = NULL;
2268         vnode_t *vp;
2269         bool_t needrecov = FALSE;
2270         bool_t rw_entered = FALSE;
2271         bool_t do_reopen;
2272 
2273         vp = RTOV4(rp);
2274 
2275         /*
2276          * If NFS4_DR_DISCARD is set by itself, take a short-cut and
2277          * discard without doing an otw DELEGRETURN.  This may only be used
2278          * by the recovery thread because it bypasses the synchronization
2279          * with r_deleg_recall_lock and mi->mi_recovlock.
2280          */
2281         if (flags == NFS4_DR_DISCARD) {
2282                 nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2283                 return (0);
2284         }
2285 
2286         if (flags & NFS4_DR_DID_OP) {
2287                 /*
2288                  * Caller had already done start_op, which means the
2289                  * r_deleg_recall_lock is already held in READ mode
2290                  * so we cannot take it in write mode.  Return the
2291                  * delegation asynchronously.
2292                  *
2293                  * Remove the NFS4_DR_DID_OP flag so we don't
2294                  * get stuck looping through here.
2295                  */
2296                 VN_HOLD(vp);
2297                 nfs4delegreturn_async(rp, (flags & ~NFS4_DR_DID_OP), FALSE);
2298                 return (0);
2299         }
2300 
2301         /*
2302          * Verify we still have a delegation and crhold the credential.
2303          */
2304         mutex_enter(&rp->r_statev4_lock);
2305         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2306                 mutex_exit(&rp->r_statev4_lock);
2307                 goto out;
2308         }
2309         cr = rp->r_deleg_cred;
2310         ASSERT(cr != NULL);
2311         crhold(cr);
2312         mutex_exit(&rp->r_statev4_lock);
2313 
2314         /*
2315          * Push the modified data back to the server synchronously
2316          * before doing DELEGRETURN.
2317          */
2318         if (flags & NFS4_DR_PUSH)
2319                 (void) VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2320 
2321         /*
2322          * Take r_deleg_recall_lock in WRITE mode, this will prevent
2323          * nfs4_is_otw_open_necessary from trying to use the delegation
2324          * while the DELEGRETURN is in progress.
2325          */
2326         (void) nfs_rw_enter_sig(&rp->r_deleg_recall_lock, RW_WRITER, FALSE);
2327 
2328         rw_entered = TRUE;
2329 
2330         if (rp->r_deleg_type == OPEN_DELEGATE_NONE)
2331                 goto out;
2332 
2333         if (flags & NFS4_DR_REOPEN) {
2334                 /*
2335                  * If R4RECOVERRP is already set, then skip re-opening
2336                  * the delegation open streams and go straight to doing
2337                  * delegreturn.  (XXX if the file has failed recovery, then the
2338                  * delegreturn attempt is likely to be futile.)
2339                  */
2340                 mutex_enter(&rp->r_statelock);
2341                 do_reopen = !(rp->r_flags & R4RECOVERRP);
2342                 mutex_exit(&rp->r_statelock);
2343 
2344                 if (do_reopen) {
2345                         error = deleg_reopen(vp, &needrecov, ncg, flags);
2346                         if (error != 0) {
2347                                 if ((flags & (NFS4_DR_FORCE | NFS4_DR_RECALL))
2348                                     == 0)
2349                                         goto out;
2350                         } else if (needrecov) {
2351                                 if ((flags & NFS4_DR_FORCE) == 0)
2352                                         goto out;
2353                         }
2354                 }
2355         }
2356 
2357         if (flags & NFS4_DR_DISCARD) {
2358                 mntinfo4_t *mi = VTOMI4(RTOV4(rp));
2359 
2360                 mutex_enter(&rp->r_statelock);
2361                 /*
2362                  * deleg_return_pending is cleared inside of delegation_accept
2363                  * when a delegation is accepted.  if this flag has been
2364                  * cleared, then a new delegation has overwritten the one we
2365                  * were about to throw away.
2366                  */
2367                 if (!rp->r_deleg_return_pending) {
2368                         mutex_exit(&rp->r_statelock);
2369                         goto out;
2370                 }
2371                 mutex_exit(&rp->r_statelock);
2372                 (void) nfs_rw_enter_sig(&mi->mi_recovlock, RW_READER, FALSE);
2373                 nfs4delegreturn_cleanup_impl(rp, NULL, ncg);
2374                 nfs_rw_exit(&mi->mi_recovlock);
2375         } else {
2376                 error = nfs4_do_delegreturn(rp, flags, cr, ncg);
2377         }
2378 
2379 out:
2380         if (cr)
2381                 crfree(cr);
2382         if (rw_entered)
2383                 nfs_rw_exit(&rp->r_deleg_recall_lock);
2384         return (error);
2385 }
2386 
2387 int
2388 nfs4delegreturn(rnode4_t *rp, int flags)
2389 {
2390         struct nfs4_callback_globals *ncg;
2391 
2392         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2393         ASSERT(ncg != NULL);
2394 
2395         return (nfs4delegreturn_impl(rp, flags, ncg));
2396 }
2397 
2398 void
2399 nfs4delegreturn_async(rnode4_t *rp, int flags, bool_t trunc)
2400 {
2401         struct cb_recall_pass *pp;
2402 
2403         pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
2404         pp->rp = rp;
2405         pp->flags = flags;
2406         pp->truncate = trunc;
2407 
2408         /*
2409          * Fire up a thread to do the actual delegreturn
2410          * Caller must guarantee that the rnode doesn't
2411          * vanish (by calling VN_HOLD).
2412          */
2413 
2414         (void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
2415             minclsyspri);
2416 }
2417 
2418 static void
2419 delegreturn_all_thread(rpcprog_t *pp)
2420 {
2421         nfs4_server_t *np;
2422         bool_t found = FALSE;
2423         rpcprog_t prog;
2424         rnode4_t *rp;
2425         vnode_t *vp;
2426         zoneid_t zoneid = getzoneid();
2427         struct nfs4_callback_globals *ncg;
2428 
2429         NFS4_DEBUG(nfs4_drat_debug,
2430             (CE_NOTE, "delereturn_all_thread: prog %d\n", *pp));
2431 
2432         prog = *pp;
2433         kmem_free(pp, sizeof (*pp));
2434         pp = NULL;
2435 
2436         mutex_enter(&nfs4_server_lst_lock);
2437         for (np = nfs4_server_lst.forw; np != &nfs4_server_lst; np = np->forw) {
2438                 if (np->zoneid == zoneid && np->s_program == prog) {
2439                         mutex_enter(&np->s_lock);
2440                         found = TRUE;
2441                         break;
2442                 }
2443         }
2444         mutex_exit(&nfs4_server_lst_lock);
2445 
2446         /*
2447          * It's possible that the nfs4_server which was using this
2448          * program number has vanished since this thread is async.
2449          * If so, just return.  Your work here is finished, my friend.
2450          */
2451         if (!found)
2452                 goto out;
2453 
2454         ncg = np->zone_globals;
2455         while ((rp = list_head(&np->s_deleg_list)) != NULL) {
2456                 vp = RTOV4(rp);
2457                 VN_HOLD(vp);
2458                 mutex_exit(&np->s_lock);
2459                 (void) nfs4delegreturn_impl(rp, NFS4_DR_PUSH|NFS4_DR_REOPEN,
2460                     ncg);
2461                 VN_RELE(vp);
2462 
2463                 /* retake the s_lock for next trip through the loop */
2464                 mutex_enter(&np->s_lock);
2465         }
2466         mutex_exit(&np->s_lock);
2467 out:
2468         NFS4_DEBUG(nfs4_drat_debug,
2469             (CE_NOTE, "delereturn_all_thread: complete\n"));
2470         zthread_exit();
2471 }
2472 
2473 void
2474 nfs4_delegreturn_all(nfs4_server_t *sp)
2475 {
2476         rpcprog_t pro, *pp;
2477 
2478         mutex_enter(&sp->s_lock);
2479 
2480         /* Check to see if the delegation list is empty */
2481 
2482         if (list_head(&sp->s_deleg_list) == NULL) {
2483                 mutex_exit(&sp->s_lock);
2484                 return;
2485         }
2486         /*
2487          * Grab the program number; the async thread will use this
2488          * to find the nfs4_server.
2489          */
2490         pro = sp->s_program;
2491         mutex_exit(&sp->s_lock);
2492         pp = kmem_alloc(sizeof (rpcprog_t), KM_SLEEP);
2493         *pp = pro;
2494         (void) zthread_create(NULL, 0, delegreturn_all_thread, pp, 0,
2495             minclsyspri);
2496 }
2497 
2498 
2499 /*
2500  * Discard any delegations
2501  *
2502  * Iterate over the servers s_deleg_list and
2503  * for matching mount-point rnodes discard
2504  * the delegation.
2505  */
2506 void
2507 nfs4_deleg_discard(mntinfo4_t *mi, nfs4_server_t *sp)
2508 {
2509         rnode4_t *rp, *next;
2510         mntinfo4_t *r_mi;
2511         struct nfs4_callback_globals *ncg;
2512 
2513         ASSERT(mutex_owned(&sp->s_lock));
2514         ncg = sp->zone_globals;
2515 
2516         for (rp = list_head(&sp->s_deleg_list); rp != NULL; rp = next) {
2517                 r_mi = VTOMI4(RTOV4(rp));
2518                 next = list_next(&sp->s_deleg_list, rp);
2519 
2520                 if (r_mi != mi) {
2521                         /*
2522                          * Skip if this rnode is in not on the
2523                          * same mount-point
2524                          */
2525                         continue;
2526                 }
2527 
2528                 ASSERT(rp->r_deleg_type == OPEN_DELEGATE_READ);
2529 
2530 #ifdef DEBUG
2531                 if (nfs4_client_recov_debug) {
2532                         zprintf(getzoneid(),
2533                             "nfs4_deleg_discard: matched rnode %p "
2534                         "-- discarding delegation\n", (void *)rp);
2535                 }
2536 #endif
2537                 mutex_enter(&rp->r_statev4_lock);
2538                 /*
2539                  * Free the cred originally held when the delegation
2540                  * was granted. Also need to decrement the refcnt
2541                  * on this server for each delegation we discard
2542                  */
2543                 if (rp->r_deleg_cred)
2544                         crfree(rp->r_deleg_cred);
2545                 rp->r_deleg_cred = NULL;
2546                 rp->r_deleg_type = OPEN_DELEGATE_NONE;
2547                 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2548                 rp->r_deleg_needs_recall = FALSE;
2549                 ASSERT(sp->s_refcnt > 1);
2550                 sp->s_refcnt--;
2551                 list_remove(&sp->s_deleg_list, rp);
2552                 mutex_exit(&rp->r_statev4_lock);
2553                 nfs4_dec_state_ref_count_nolock(sp, mi);
2554                 ncg->nfs4_callback_stats.delegations.value.ui64--;
2555         }
2556 }
2557 
2558 /*
2559  * Reopen any open streams that were covered by the given file's
2560  * delegation.
2561  * Returns zero or an errno value.  If there was no error, *recovp
2562  * indicates whether recovery was initiated.
2563  */
2564 
2565 static int
2566 deleg_reopen(vnode_t *vp, bool_t *recovp, struct nfs4_callback_globals *ncg,
2567         int flags)
2568 {
2569         nfs4_open_stream_t *osp;
2570         nfs4_recov_state_t recov_state;
2571         bool_t needrecov = FALSE;
2572         mntinfo4_t *mi;
2573         rnode4_t *rp;
2574         nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2575         int claimnull;
2576 
2577         mi = VTOMI4(vp);
2578         rp = VTOR4(vp);
2579 
2580         recov_state.rs_flags = 0;
2581         recov_state.rs_num_retry_despite_err = 0;
2582 
2583 retry:
2584         if ((e.error = nfs4_start_op(mi, vp, NULL, &recov_state)) != 0) {
2585                 return (e.error);
2586         }
2587 
2588         /*
2589          * if we mean to discard the delegation, it must be BAD, so don't
2590          * use it when doing the reopen or it will fail too.
2591          */
2592         claimnull = (flags & NFS4_DR_DISCARD);
2593         /*
2594          * Loop through the open streams for this rnode to find
2595          * all of the ones created using the delegation state ID.
2596          * Each of these needs to be re-opened.
2597          */
2598 
2599         while ((osp = get_next_deleg_stream(rp, claimnull)) != NULL) {
2600 
2601                 if (claimnull) {
2602                         nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
2603                 } else {
2604                         ncg->nfs4_callback_stats.claim_cur.value.ui64++;
2605 
2606                         nfs4_reopen(vp, osp, &e, CLAIM_DELEGATE_CUR, FALSE,
2607                             FALSE);
2608                         if (e.error == 0 && e.stat == NFS4_OK)
2609                                 ncg->nfs4_callback_stats.
2610                                     claim_cur_ok.value.ui64++;
2611                 }
2612 
2613                 if (e.error == EAGAIN) {
2614                         nfs4_end_op(mi, vp, NULL, &recov_state, TRUE);
2615                         goto retry;
2616                 }
2617 
2618                 /*
2619                  * if error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, then
2620                  * recovery has already been started inside of nfs4_reopen.
2621                  */
2622                 if (e.error == EINTR || e.error == ETIMEDOUT ||
2623                     NFS4_FRC_UNMT_ERR(e.error, vp->v_vfsp)) {
2624                         open_stream_rele(osp, rp);
2625                         break;
2626                 }
2627 
2628                 needrecov = nfs4_needs_recovery(&e, TRUE, vp->v_vfsp);
2629 
2630                 if (e.error != 0 && !needrecov) {
2631                         /*
2632                          * Recovery is not possible, but don't give up yet;
2633                          * we'd still like to do delegreturn after
2634                          * reopening as many streams as possible.
2635                          * Continue processing the open streams.
2636                          */
2637 
2638                         ncg->nfs4_callback_stats.recall_failed.value.ui64++;
2639 
2640                 } else if (needrecov) {
2641                         /*
2642                          * Start recovery and bail out.  The recovery
2643                          * thread will take it from here.
2644                          */
2645                         (void) nfs4_start_recovery(&e, mi, vp, NULL, NULL,
2646                             NULL, OP_OPEN, NULL);
2647                         open_stream_rele(osp, rp);
2648                         *recovp = TRUE;
2649                         break;
2650                 }
2651 
2652                 open_stream_rele(osp, rp);
2653         }
2654 
2655         nfs4_end_op(mi, vp, NULL, &recov_state, needrecov);
2656 
2657         return (e.error);
2658 }
2659 
2660 /*
2661  * get_next_deleg_stream - returns the next open stream which
2662  * represents a delegation for this rnode.  In order to assure
2663  * forward progress, the caller must guarantee that each open
2664  * stream returned is changed so that a future call won't return
2665  * it again.
2666  *
2667  * There are several ways for the open stream to change.  If the open
2668  * stream is !os_delegation, then we aren't interested in it.  Also, if
2669  * either os_failed_reopen or !os_valid, then don't return the osp.
2670  *
2671  * If claimnull is false (doing reopen CLAIM_DELEGATE_CUR) then return
2672  * the osp if it is an os_delegation open stream.  Also, if the rnode still
2673  * has r_deleg_return_pending, then return the os_delegation osp.  Lastly,
2674  * if the rnode's r_deleg_stateid is different from the osp's open_stateid,
2675  * then return the osp.
2676  *
2677  * We have already taken the 'r_deleg_recall_lock' as WRITER, which
2678  * prevents new OPENs from going OTW (as start_fop takes this
2679  * lock in READ mode); thus, no new open streams can be created
2680  * (which inherently means no new delegation open streams are
2681  * being created).
2682  */
2683 
2684 static nfs4_open_stream_t *
2685 get_next_deleg_stream(rnode4_t *rp, int claimnull)
2686 {
2687         nfs4_open_stream_t      *osp;
2688 
2689         ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_WRITER));
2690 
2691         /*
2692          * Search through the list of open streams looking for
2693          * one that was created while holding the delegation.
2694          */
2695         mutex_enter(&rp->r_os_lock);
2696         for (osp = list_head(&rp->r_open_streams); osp != NULL;
2697             osp = list_next(&rp->r_open_streams, osp)) {
2698                 mutex_enter(&osp->os_sync_lock);
2699                 if (!osp->os_delegation || osp->os_failed_reopen ||
2700                     !osp->os_valid) {
2701                         mutex_exit(&osp->os_sync_lock);
2702                         continue;
2703                 }
2704                 if (!claimnull || rp->r_deleg_return_pending ||
2705                     !stateid4_cmp(&osp->open_stateid, &rp->r_deleg_stateid)) {
2706                         osp->os_ref_count++;
2707                         mutex_exit(&osp->os_sync_lock);
2708                         mutex_exit(&rp->r_os_lock);
2709                         return (osp);
2710                 }
2711                 mutex_exit(&osp->os_sync_lock);
2712         }
2713         mutex_exit(&rp->r_os_lock);
2714 
2715         return (NULL);
2716 }
2717 
2718 static void
2719 nfs4delegreturn_thread(struct cb_recall_pass *args)
2720 {
2721         rnode4_t *rp;
2722         vnode_t *vp;
2723         cred_t *cr;
2724         int dtype, error, flags;
2725         bool_t rdirty, rip;
2726         kmutex_t cpr_lock;
2727         callb_cpr_t cpr_info;
2728         struct nfs4_callback_globals *ncg;
2729 
2730         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2731         ASSERT(ncg != NULL);
2732 
2733         mutex_init(&cpr_lock, NULL, MUTEX_DEFAULT, NULL);
2734 
2735         CALLB_CPR_INIT(&cpr_info, &cpr_lock, callb_generic_cpr,
2736             "nfsv4delegRtn");
2737 
2738         rp = args->rp;
2739         vp = RTOV4(rp);
2740 
2741         mutex_enter(&rp->r_statev4_lock);
2742         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2743                 mutex_exit(&rp->r_statev4_lock);
2744                 goto out;
2745         }
2746         mutex_exit(&rp->r_statev4_lock);
2747 
2748         /*
2749          * Take the read-write lock in read mode to prevent other
2750          * threads from modifying the data during the recall.  This
2751          * doesn't affect mmappers.
2752          */
2753         (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
2754 
2755         /* Proceed with delegreturn */
2756 
2757         mutex_enter(&rp->r_statev4_lock);
2758         if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
2759                 mutex_exit(&rp->r_statev4_lock);
2760                 nfs_rw_exit(&rp->r_rwlock);
2761                 goto out;
2762         }
2763         dtype = rp->r_deleg_type;
2764         cr = rp->r_deleg_cred;
2765         ASSERT(cr != NULL);
2766         crhold(cr);
2767         mutex_exit(&rp->r_statev4_lock);
2768 
2769         flags = args->flags;
2770 
2771         /*
2772          * If the file is being truncated at the server, then throw
2773          * away all of the pages, it doesn't matter what flavor of
2774          * delegation we have.
2775          */
2776 
2777         if (args->truncate) {
2778                 ncg->nfs4_callback_stats.recall_trunc.value.ui64++;
2779                 nfs4_invalidate_pages(vp, 0, cr);
2780         } else if (dtype == OPEN_DELEGATE_WRITE) {
2781 
2782                 mutex_enter(&rp->r_statelock);
2783                 rdirty = rp->r_flags & R4DIRTY;
2784                 mutex_exit(&rp->r_statelock);
2785 
2786                 if (rdirty) {
2787                         error = VOP_PUTPAGE(vp, 0, 0, 0, cr, NULL);
2788 
2789                         if (error)
2790                                 CB_WARN1("nfs4delegreturn_thread:"
2791                                 " VOP_PUTPAGE: %d\n", error);
2792                 }
2793                 /* turn off NFS4_DR_PUSH because we just did that above. */
2794                 flags &= ~NFS4_DR_PUSH;
2795         }
2796 
2797         mutex_enter(&rp->r_statelock);
2798         rip =  rp->r_flags & R4RECOVERRP;
2799         mutex_exit(&rp->r_statelock);
2800 
2801         /* If a failed recovery is indicated, discard the pages */
2802 
2803         if (rip) {
2804 
2805                 error = VOP_PUTPAGE(vp, 0, 0, B_INVAL, cr, NULL);
2806 
2807                 if (error)
2808                         CB_WARN1("nfs4delegreturn_thread: VOP_PUTPAGE: %d\n",
2809                             error);
2810         }
2811 
2812         /*
2813          * Pass the flags to nfs4delegreturn_impl, but be sure not to pass
2814          * NFS4_DR_DID_OP, which just calls nfs4delegreturn_async again.
2815          */
2816         flags &= ~NFS4_DR_DID_OP;
2817 
2818         (void) nfs4delegreturn_impl(rp, flags, ncg);
2819 
2820         nfs_rw_exit(&rp->r_rwlock);
2821         crfree(cr);
2822 out:
2823         kmem_free(args, sizeof (struct cb_recall_pass));
2824         VN_RELE(vp);
2825         mutex_enter(&cpr_lock);
2826         CALLB_CPR_EXIT(&cpr_info);
2827         mutex_destroy(&cpr_lock);
2828         zthread_exit();
2829 }
2830 
2831 /*
2832  * This function has one assumption that the caller of this function is
2833  * either doing recovery (therefore cannot call nfs4_start_op) or has
2834  * already called nfs4_start_op().
2835  */
2836 void
2837 nfs4_delegation_accept(rnode4_t *rp, open_claim_type4 claim, OPEN4res *res,
2838         nfs4_ga_res_t *garp, cred_t *cr)
2839 {
2840         open_read_delegation4 *orp;
2841         open_write_delegation4 *owp;
2842         nfs4_server_t *np;
2843         bool_t already = FALSE;
2844         bool_t recall = FALSE;
2845         bool_t valid_garp = TRUE;
2846         bool_t delegation_granted = FALSE;
2847         bool_t dr_needed = FALSE;
2848         bool_t recov;
2849         int dr_flags = 0;
2850         long mapcnt;
2851         uint_t rflag;
2852         mntinfo4_t *mi;
2853         struct nfs4_callback_globals *ncg;
2854         open_delegation_type4 odt;
2855 
2856         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
2857         ASSERT(ncg != NULL);
2858 
2859         mi = VTOMI4(RTOV4(rp));
2860 
2861         /*
2862          * Accept a delegation granted to the client via an OPEN.
2863          * Set the delegation fields in the rnode and insert the
2864          * rnode onto the list anchored in the nfs4_server_t.  The
2865          * proper locking order requires the nfs4_server_t first,
2866          * even though it may not be needed in all cases.
2867          *
2868          * NB: find_nfs4_server returns with s_lock held.
2869          */
2870 
2871         if ((np = find_nfs4_server(mi)) == NULL)
2872                 return;
2873 
2874         /* grab the statelock too, for examining r_mapcnt */
2875         mutex_enter(&rp->r_statelock);
2876         mutex_enter(&rp->r_statev4_lock);
2877 
2878         if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
2879             rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2880                 already = TRUE;
2881 
2882         odt = res->delegation.delegation_type;
2883 
2884         if (odt == OPEN_DELEGATE_READ) {
2885 
2886                 rp->r_deleg_type = res->delegation.delegation_type;
2887                 orp = &res->delegation.open_delegation4_u.read;
2888                 rp->r_deleg_stateid = orp->stateid;
2889                 rp->r_deleg_perms = orp->permissions;
2890                 if (claim == CLAIM_PREVIOUS)
2891                         if ((recall = orp->recall) != 0)
2892                                 dr_needed = TRUE;
2893 
2894                 delegation_granted = TRUE;
2895 
2896                 ncg->nfs4_callback_stats.delegations.value.ui64++;
2897                 ncg->nfs4_callback_stats.delegaccept_r.value.ui64++;
2898 
2899         } else if (odt == OPEN_DELEGATE_WRITE) {
2900 
2901                 rp->r_deleg_type = res->delegation.delegation_type;
2902                 owp = &res->delegation.open_delegation4_u.write;
2903                 rp->r_deleg_stateid = owp->stateid;
2904                 rp->r_deleg_perms = owp->permissions;
2905                 rp->r_deleg_limit = owp->space_limit;
2906                 if (claim == CLAIM_PREVIOUS)
2907                         if ((recall = owp->recall) != 0)
2908                                 dr_needed = TRUE;
2909 
2910                 delegation_granted = TRUE;
2911 
2912                 if (garp == NULL || !garp->n4g_change_valid) {
2913                         valid_garp = FALSE;
2914                         rp->r_deleg_change = 0;
2915                         rp->r_deleg_change_grant = 0;
2916                 } else {
2917                         rp->r_deleg_change = garp->n4g_change;
2918                         rp->r_deleg_change_grant = garp->n4g_change;
2919                 }
2920                 mapcnt = rp->r_mapcnt;
2921                 rflag = rp->r_flags;
2922 
2923                 /*
2924                  * Update the delegation change attribute if
2925                  * there are mappers for the file is dirty.  This
2926                  * might be the case during recovery after server
2927                  * reboot.
2928                  */
2929                 if (mapcnt > 0 || rflag & R4DIRTY)
2930                         rp->r_deleg_change++;
2931 
2932                 NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2933                     "nfs4_delegation_accept: r_deleg_change: 0x%x\n",
2934                     (int)(rp->r_deleg_change >> 32)));
2935                 NFS4_DEBUG(nfs4_callback_debug, (CE_NOTE,
2936                     "nfs4_delegation_accept: r_delg_change_grant: 0x%x\n",
2937                     (int)(rp->r_deleg_change_grant >> 32)));
2938 
2939 
2940                 ncg->nfs4_callback_stats.delegations.value.ui64++;
2941                 ncg->nfs4_callback_stats.delegaccept_rw.value.ui64++;
2942         } else if (already) {
2943                 /*
2944                  * No delegation granted.  If the rnode currently has
2945                  * has one, then consider it tainted and return it.
2946                  */
2947                 dr_needed = TRUE;
2948         }
2949 
2950         if (delegation_granted) {
2951                 /* Add the rnode to the list. */
2952                 if (!already) {
2953                         crhold(cr);
2954                         rp->r_deleg_cred = cr;
2955 
2956                         ASSERT(mutex_owned(&np->s_lock));
2957                         list_insert_head(&np->s_deleg_list, rp);
2958                         /* added list node gets a reference */
2959                         np->s_refcnt++;
2960                         nfs4_inc_state_ref_count_nolock(np, mi);
2961                 }
2962                 rp->r_deleg_needs_recovery = OPEN_DELEGATE_NONE;
2963         }
2964 
2965         /*
2966          * We've now safely accepted the delegation, if any.  Drop the
2967          * locks and figure out what post-processing is needed.  We'd
2968          * like to retain r_statev4_lock, but nfs4_server_rele takes
2969          * s_lock which would be a lock ordering violation.
2970          */
2971         mutex_exit(&rp->r_statev4_lock);
2972         mutex_exit(&rp->r_statelock);
2973         mutex_exit(&np->s_lock);
2974         nfs4_server_rele(np);
2975 
2976         /*
2977          * Check to see if we are in recovery.  Remember that
2978          * this function is protected by start_op, so a recovery
2979          * cannot begin until we are out of here.
2980          */
2981         mutex_enter(&mi->mi_lock);
2982         recov = mi->mi_recovflags & MI4_RECOV_ACTIV;
2983         mutex_exit(&mi->mi_lock);
2984 
2985         mutex_enter(&rp->r_statev4_lock);
2986 
2987         if (nfs4_delegreturn_policy == IMMEDIATE || !valid_garp)
2988                 dr_needed = TRUE;
2989 
2990         if (dr_needed && rp->r_deleg_return_pending == FALSE) {
2991                 if (recov) {
2992                         /*
2993                          * We cannot call delegreturn from inside
2994                          * of recovery or VOP_PUTPAGE will hang
2995                          * due to nfs4_start_fop call in
2996                          * nfs4write.  Use dlistadd to add the
2997                          * rnode to the list of rnodes needing
2998                          * cleaning.  We do not need to do reopen
2999                          * here because recov_openfiles will do it.
3000                          * In the non-recall case, just discard the
3001                          * delegation as it is no longer valid.
3002                          */
3003                         if (recall)
3004                                 dr_flags = NFS4_DR_PUSH;
3005                         else
3006                                 dr_flags = NFS4_DR_PUSH|NFS4_DR_DISCARD;
3007 
3008                         nfs4_dlistadd(rp, ncg, dr_flags);
3009                         dr_flags = 0;
3010                 } else {
3011                         /*
3012                          * Push the modified data back to the server,
3013                          * reopen any delegation open streams, and return
3014                          * the delegation.  Drop the statev4_lock first!
3015                          */
3016                         dr_flags =  NFS4_DR_PUSH|NFS4_DR_DID_OP|NFS4_DR_REOPEN;
3017                 }
3018         }
3019         mutex_exit(&rp->r_statev4_lock);
3020         if (dr_flags)
3021                 (void) nfs4delegreturn_impl(rp, dr_flags, ncg);
3022 }
3023 
3024 /*
3025  * nfs4delegabandon - Abandon the delegation on an rnode4.  This code
3026  * is called when the client receives EXPIRED, BAD_STATEID, OLD_STATEID
3027  * or BADSEQID and the recovery code is unable to recover.  Push any
3028  * dirty data back to the server and return the delegation (if any).
3029  */
3030 
3031 void
3032 nfs4delegabandon(rnode4_t *rp)
3033 {
3034         vnode_t *vp;
3035         struct cb_recall_pass *pp;
3036         open_delegation_type4 dt;
3037 
3038         mutex_enter(&rp->r_statev4_lock);
3039         dt = rp->r_deleg_type;
3040         mutex_exit(&rp->r_statev4_lock);
3041 
3042         if (dt == OPEN_DELEGATE_NONE)
3043                 return;
3044 
3045         vp = RTOV4(rp);
3046         VN_HOLD(vp);
3047 
3048         pp = kmem_alloc(sizeof (struct cb_recall_pass), KM_SLEEP);
3049         pp->rp = rp;
3050         /*
3051          * Recovery on the file has failed and we want to return
3052          * the delegation.  We don't want to reopen files and
3053          * nfs4delegreturn_thread() figures out what to do about
3054          * the data.  The only thing to do is attempt to return
3055          * the delegation.
3056          */
3057         pp->flags = 0;
3058         pp->truncate = FALSE;
3059 
3060         /*
3061          * Fire up a thread to do the delegreturn; this is
3062          * necessary because we could be inside a GETPAGE or
3063          * PUTPAGE and we cannot do another one.
3064          */
3065 
3066         (void) zthread_create(NULL, 0, nfs4delegreturn_thread, pp, 0,
3067             minclsyspri);
3068 }
3069 
3070 static int
3071 wait_for_recall1(vnode_t *vp, nfs4_op_hint_t op, nfs4_recov_state_t *rsp,
3072         int flg)
3073 {
3074         rnode4_t *rp;
3075         int error = 0;
3076 
3077 #ifdef lint
3078         op = op;
3079 #endif
3080 
3081         if (vp && vp->v_type == VREG) {
3082                 rp = VTOR4(vp);
3083 
3084                 /*
3085                  * Take r_deleg_recall_lock in read mode to synchronize
3086                  * with delegreturn.
3087                  */
3088                 error = nfs_rw_enter_sig(&rp->r_deleg_recall_lock,
3089                     RW_READER, INTR4(vp));
3090 
3091                 if (error == 0)
3092                         rsp->rs_flags |= flg;
3093 
3094         }
3095         return (error);
3096 }
3097 
3098 void
3099 nfs4_end_op_recall(vnode_t *vp1, vnode_t *vp2, nfs4_recov_state_t *rsp)
3100 {
3101         NFS4_DEBUG(nfs4_recall_debug,
3102             (CE_NOTE, "nfs4_end_op_recall: 0x%p, 0x%p\n",
3103             (void *)vp1, (void *)vp2));
3104 
3105         if (vp2 && rsp->rs_flags & NFS4_RS_RECALL_HELD2)
3106                 nfs_rw_exit(&VTOR4(vp2)->r_deleg_recall_lock);
3107         if (vp1 && rsp->rs_flags & NFS4_RS_RECALL_HELD1)
3108                 nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
3109 }
3110 
3111 int
3112 wait_for_recall(vnode_t *vp1, vnode_t *vp2, nfs4_op_hint_t op,
3113         nfs4_recov_state_t *rsp)
3114 {
3115         int error;
3116 
3117         NFS4_DEBUG(nfs4_recall_debug,
3118             (CE_NOTE, "wait_for_recall:    0x%p, 0x%p\n",
3119             (void *)vp1, (void *) vp2));
3120 
3121         rsp->rs_flags &= ~(NFS4_RS_RECALL_HELD1|NFS4_RS_RECALL_HELD2);
3122 
3123         if ((error = wait_for_recall1(vp1, op, rsp, NFS4_RS_RECALL_HELD1)) != 0)
3124                 return (error);
3125 
3126         if ((error = wait_for_recall1(vp2, op, rsp, NFS4_RS_RECALL_HELD2))
3127             != 0) {
3128                 if (rsp->rs_flags & NFS4_RS_RECALL_HELD1) {
3129                         nfs_rw_exit(&VTOR4(vp1)->r_deleg_recall_lock);
3130                         rsp->rs_flags &= ~NFS4_RS_RECALL_HELD1;
3131                 }
3132 
3133                 return (error);
3134         }
3135 
3136         return (0);
3137 }
3138 
3139 /*
3140  * nfs4_dlistadd - Add this rnode to a list of rnodes to be
3141  * DELEGRETURN'd at the end of recovery.
3142  */
3143 
3144 static void
3145 nfs4_dlistadd(rnode4_t *rp, struct nfs4_callback_globals *ncg, int flags)
3146 {
3147         struct nfs4_dnode *dp;
3148 
3149         ASSERT(mutex_owned(&rp->r_statev4_lock));
3150         /*
3151          * Mark the delegation as having a return pending.
3152          * This will prevent the use of the delegation stateID
3153          * by read, write, setattr and open.
3154          */
3155         rp->r_deleg_return_pending = TRUE;
3156         dp = kmem_alloc(sizeof (*dp), KM_SLEEP);
3157         VN_HOLD(RTOV4(rp));
3158         dp->rnodep = rp;
3159         dp->flags = flags;
3160         mutex_enter(&ncg->nfs4_dlist_lock);
3161         list_insert_head(&ncg->nfs4_dlist, dp);
3162 #ifdef  DEBUG
3163         ncg->nfs4_dlistadd_c++;
3164 #endif
3165         mutex_exit(&ncg->nfs4_dlist_lock);
3166 }
3167 
3168 /*
3169  * nfs4_dlistclean_impl - Do DELEGRETURN for each rnode on the list.
3170  * of files awaiting cleaning.  If the override_flags are non-zero
3171  * then use them rather than the flags that were set when the rnode
3172  * was added to the dlist.
3173  */
3174 static void
3175 nfs4_dlistclean_impl(struct nfs4_callback_globals *ncg, int override_flags)
3176 {
3177         rnode4_t *rp;
3178         struct nfs4_dnode *dp;
3179         int flags;
3180 
3181         ASSERT(override_flags == 0 || override_flags == NFS4_DR_DISCARD);
3182 
3183         mutex_enter(&ncg->nfs4_dlist_lock);
3184         while ((dp = list_head(&ncg->nfs4_dlist)) != NULL) {
3185 #ifdef  DEBUG
3186                 ncg->nfs4_dlistclean_c++;
3187 #endif
3188                 list_remove(&ncg->nfs4_dlist, dp);
3189                 mutex_exit(&ncg->nfs4_dlist_lock);
3190                 rp = dp->rnodep;
3191                 flags = (override_flags != 0) ? override_flags : dp->flags;
3192                 kmem_free(dp, sizeof (*dp));
3193                 (void) nfs4delegreturn_impl(rp, flags, ncg);
3194                 VN_RELE(RTOV4(rp));
3195                 mutex_enter(&ncg->nfs4_dlist_lock);
3196         }
3197         mutex_exit(&ncg->nfs4_dlist_lock);
3198 }
3199 
3200 void
3201 nfs4_dlistclean(void)
3202 {
3203         struct nfs4_callback_globals *ncg;
3204 
3205         ncg = zone_getspecific(nfs4_callback_zone_key, nfs_zone());
3206         ASSERT(ncg != NULL);
3207 
3208         nfs4_dlistclean_impl(ncg, 0);
3209 }
--- EOF ---