1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T
28 * All Rights Reserved
29 */
30
31 /*
32 * Portions of this source code were derived from Berkeley 4.3 BSD
33 * under license from the Regents of the University of California.
34 */
35
36
37 /*
38 * Implements a kernel based, client side RPC over Connection Oriented
39 * Transports (COTS).
40 */
41
42 /*
43 * Much of this file has been re-written to let NFS work better over slow
44 * transports. A description follows.
45 *
46 * One of the annoying things about kRPC/COTS is that it will temporarily
47 * create more than one connection between a client and server. This
48 * happens because when a connection is made, the end-points entry in the
49 * linked list of connections (headed by cm_hd), is removed so that other
50 * threads don't mess with it. Went ahead and bit the bullet by keeping
51 * the endpoint on the connection list and introducing state bits,
52 * condition variables etc. to the connection entry data structure (struct
53 * cm_xprt).
54 *
 * Here is a summary of the changes to cm_xprt:
56 *
57 * x_ctime is the timestamp of when the endpoint was last
58 * connected or disconnected. If an end-point is ever disconnected
59 * or re-connected, then any outstanding RPC request is presumed
60 * lost, telling clnt_cots_kcallit that it needs to re-send the
61 * request, not just wait for the original request's reply to
62 * arrive.
63 *
64 * x_thread flag which tells us if a thread is doing a connection attempt.
65 *
66 * x_waitdis flag which tells us we are waiting a disconnect ACK.
67 *
68 * x_needdis flag which tells us we need to send a T_DISCONN_REQ
69 * to kill the connection.
70 *
71 * x_needrel flag which tells us we need to send a T_ORDREL_REQ to
72 * gracefully close the connection.
73 *
 *	#defined bitmasks for all the b_* bits so that more
 *	efficient (and at times less clumsy) masks can be used to
 *	manipulate state in cases where multiple bits have to be
 *	set/cleared/checked in the same critical section.
78 *
 *	x_conn_cv and x_dis_cv are new condition variables to let
 *	threads know when the connection attempt is done, and to let
 *	the connecting thread know when the disconnect handshake is
 *	done.
83 *
84 * Added the CONN_HOLD() macro so that all reference holds have the same
85 * look and feel.
86 *
87 * In the private (cku_private) portion of the client handle,
88 *
 *	cku_flags replaces cku_sent, which was a boolean. cku_flags keeps
 *	track of whether a request has been sent, and whether the
 *	client handle's call record is on the dispatch list (so that
 *	the reply can be matched by XID to the right client handle).
 *	The idea of CKU_ONQUEUE is that we can exit clnt_cots_kcallit()
 *	and still have the response find the right client handle so
 *	that the retry of CLNT_CALL() gets the result. In testing, we
 *	found situations where if the timeout was increased, performance
 *	degraded. This was due to us hitting a window where the thread
 *	was back in rfscall() (probably printing server not responding)
 *	while the response came back but had no place to be put.
100 *
 *	cku_ctime is just a cache of x_ctime. If they match,
 *	clnt_cots_kcallit() won't send a retry (unless the maximum
 *	receive count limit has been reached). If they don't match, then
 *	we assume the request has been lost, and a retry of the request
 *	is needed.
106 *
107 * cku_recv_attempts counts the number of receive count attempts
108 * after one try is sent on the wire.
109 *
110 * Added the clnt_delay() routine so that interruptible and
111 * noninterruptible delays are possible.
112 *
 * CLNT_MIN_TIMEOUT has been bumped to 10 seconds from 3. This is used to
 * control how long the client delays before returning after getting
 * ECONNREFUSED. At 3 seconds, 8 client threads per mount really do bash
 * a server that may be booting and not yet started nfsd.
117 *
118 * CLNT_MAXRECV_WITHOUT_RETRY is a new macro (value of 3) (with a tunable)
119 * Why don't we just wait forever (receive an infinite # of times)?
120 * Because the server may have rebooted. More insidious is that some
121 * servers (ours) will drop NFS/TCP requests in some cases. This is bad,
122 * but it is a reality.
123 *
 * The case of a server doing orderly release really messes up the
 * client's recovery, especially if the server's TCP implementation is
 * buggy. It was found that the kRPC/COTS client was breaking some
 * TPI rules, such as not waiting for the acknowledgement of a
 * T_DISCON_REQ (hence the added case statements T_ERROR_ACK, T_OK_ACK and
 * T_DISCON_REQ in clnt_dispatch_notifyall()).
130 *
 * One of the things that we've seen is that a kRPC TCP endpoint goes into
 * TIMEWAIT and thus a reconnect takes a long time to satisfy because
 * the TIMEWAIT state takes a while to finish. If a server sends a
134 * T_ORDREL_IND, there is little point in an RPC client doing a
135 * T_ORDREL_REQ, because the RPC request isn't going to make it (the
136 * server is saying that it won't accept any more data). So kRPC was
137 * changed to send a T_DISCON_REQ when we get a T_ORDREL_IND. So now the
138 * connection skips the TIMEWAIT state and goes straight to a bound state
139 * that kRPC can quickly switch to connected.
140 *
141 * Code that issues TPI request must use waitforack() to wait for the
142 * corresponding ack (assuming there is one) in any future modifications.
143 * This works around problems that may be introduced by breaking TPI rules
144 * (by submitting new calls before earlier requests have been acked) in the
145 * case of a signal or other early return. waitforack() depends on
146 * clnt_dispatch_notifyconn() to issue the wakeup when the ack
147 * arrives, so adding new TPI calls may require corresponding changes
148 * to clnt_dispatch_notifyconn(). Presently, the timeout period is based on
149 * CLNT_MIN_TIMEOUT which is 10 seconds. If you modify this value, be sure
150 * not to set it too low or TPI ACKS will be lost.
151 */
152
153 #include <sys/param.h>
154 #include <sys/types.h>
155 #include <sys/user.h>
156 #include <sys/systm.h>
157 #include <sys/sysmacros.h>
158 #include <sys/proc.h>
159 #include <sys/socket.h>
160 #include <sys/file.h>
161 #include <sys/stream.h>
162 #include <sys/strsubr.h>
163 #include <sys/stropts.h>
164 #include <sys/strsun.h>
165 #include <sys/timod.h>
166 #include <sys/tiuser.h>
167 #include <sys/tihdr.h>
168 #include <sys/t_kuser.h>
169 #include <sys/fcntl.h>
170 #include <sys/errno.h>
171 #include <sys/kmem.h>
172 #include <sys/debug.h>
173 #include <sys/systm.h>
174 #include <sys/kstat.h>
175 #include <sys/t_lock.h>
176 #include <sys/ddi.h>
177 #include <sys/cmn_err.h>
178 #include <sys/time.h>
179 #include <sys/isa_defs.h>
180 #include <sys/callb.h>
181 #include <sys/sunddi.h>
182 #include <sys/atomic.h>
183 #include <sys/sdt.h>
184
185 #include <netinet/in.h>
186 #include <netinet/tcp.h>
187
188 #include <rpc/types.h>
189 #include <rpc/xdr.h>
190 #include <rpc/auth.h>
191 #include <rpc/clnt.h>
192 #include <rpc/rpc_msg.h>
193
/* Value stored in cku_outbuflen, the default output mblk length. */
#define	COTS_DEFAULT_ALLOCSIZE	2048

/* Size of the serialized call header, sans proc number. */
#define	WIRE_HDR_SIZE	20

/* Offset of the call into the mblk. */
#define	MSG_OFFSET	128
198
199 const char *kinet_ntop6(uchar_t *, char *, size_t);
200
201 static int clnt_cots_ksettimers(CLIENT *, struct rpc_timers *,
202 struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t);
203 static enum clnt_stat clnt_cots_kcallit(CLIENT *, rpcproc_t, xdrproc_t,
204 caddr_t, xdrproc_t, caddr_t, struct timeval);
205 static void clnt_cots_kabort(CLIENT *);
206 static void clnt_cots_kerror(CLIENT *, struct rpc_err *);
207 static bool_t clnt_cots_kfreeres(CLIENT *, xdrproc_t, caddr_t);
208 static void clnt_cots_kdestroy(CLIENT *);
209 static bool_t clnt_cots_kcontrol(CLIENT *, int, char *);
210
211
212 /* List of transports managed by the connection manager. */
213 struct cm_xprt {
214 TIUSER *x_tiptr; /* transport handle */
215 queue_t *x_wq; /* send queue */
216 clock_t x_time; /* last time we handed this xprt out */
217 clock_t x_ctime; /* time we went to CONNECTED */
218 int x_tidu_size; /* TIDU size of this transport */
219 union {
220 struct {
221 unsigned int
222 #ifdef _BIT_FIELDS_HTOL
223 b_closing: 1, /* we've sent a ord rel on this conn */
224 b_dead: 1, /* transport is closed or disconn */
225 b_doomed: 1, /* too many conns, let this go idle */
226 b_connected: 1, /* this connection is connected */
227
228 b_ordrel: 1, /* do an orderly release? */
229 b_thread: 1, /* thread doing connect */
230 b_waitdis: 1, /* waiting for disconnect ACK */
231 b_needdis: 1, /* need T_DISCON_REQ */
232
233 b_needrel: 1, /* need T_ORDREL_REQ */
234 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */
235 /* disconnect during connect */
236
237 b_pad: 22;
238
239 #endif
240
241 #ifdef _BIT_FIELDS_LTOH
242 b_pad: 22,
243
244 b_early_disc: 1, /* got a T_ORDREL_IND or T_DISCON_IND */
245 /* disconnect during connect */
246 b_needrel: 1, /* need T_ORDREL_REQ */
247
248 b_needdis: 1, /* need T_DISCON_REQ */
249 b_waitdis: 1, /* waiting for disconnect ACK */
250 b_thread: 1, /* thread doing connect */
251 b_ordrel: 1, /* do an orderly release? */
252
253 b_connected: 1, /* this connection is connected */
254 b_doomed: 1, /* too many conns, let this go idle */
255 b_dead: 1, /* transport is closed or disconn */
256 b_closing: 1; /* we've sent a ord rel on this conn */
257 #endif
258 } bit; unsigned int word;
259
260 #define x_closing x_state.bit.b_closing
261 #define x_dead x_state.bit.b_dead
262 #define x_doomed x_state.bit.b_doomed
263 #define x_connected x_state.bit.b_connected
264
265 #define x_ordrel x_state.bit.b_ordrel
266 #define x_thread x_state.bit.b_thread
267 #define x_waitdis x_state.bit.b_waitdis
268 #define x_needdis x_state.bit.b_needdis
269
270 #define x_needrel x_state.bit.b_needrel
271 #define x_early_disc x_state.bit.b_early_disc
272
273 #define x_state_flags x_state.word
274
275 #define X_CLOSING 0x80000000
276 #define X_DEAD 0x40000000
277 #define X_DOOMED 0x20000000
278 #define X_CONNECTED 0x10000000
279
280 #define X_ORDREL 0x08000000
281 #define X_THREAD 0x04000000
282 #define X_WAITDIS 0x02000000
283 #define X_NEEDDIS 0x01000000
284
285 #define X_NEEDREL 0x00800000
286 #define X_EARLYDISC 0x00400000
287
288 #define X_BADSTATES (X_CLOSING | X_DEAD | X_DOOMED)
289
290 } x_state;
291 int x_ref; /* number of users of this xprt */
292 int x_family; /* address family of transport */
293 dev_t x_rdev; /* device number of transport */
294 struct cm_xprt *x_next;
295
296 struct netbuf x_server; /* destination address */
297 struct netbuf x_src; /* src address (for retries) */
298 kmutex_t x_lock; /* lock on this entry */
299 kcondvar_t x_cv; /* to signal when can be closed */
300 kcondvar_t x_conn_cv; /* to signal when connection attempt */
301 /* is complete */
302 kstat_t *x_ksp;
303
304 kcondvar_t x_dis_cv; /* to signal when disconnect attempt */
305 /* is complete */
306 zoneid_t x_zoneid; /* zone this xprt belongs to */
307 };
308
309 typedef struct cm_kstat_xprt {
310 kstat_named_t x_wq;
311 kstat_named_t x_server;
312 kstat_named_t x_family;
313 kstat_named_t x_rdev;
314 kstat_named_t x_time;
315 kstat_named_t x_state;
316 kstat_named_t x_ref;
317 kstat_named_t x_port;
318 } cm_kstat_xprt_t;
319
320 static cm_kstat_xprt_t cm_kstat_template = {
321 { "write_queue", KSTAT_DATA_UINT32 },
322 { "server", KSTAT_DATA_STRING },
323 { "addr_family", KSTAT_DATA_UINT32 },
324 { "device", KSTAT_DATA_UINT32 },
325 { "time_stamp", KSTAT_DATA_UINT32 },
326 { "status", KSTAT_DATA_UINT32 },
327 { "ref_count", KSTAT_DATA_INT32 },
328 { "port", KSTAT_DATA_UINT32 },
329 };
330
/*
 * Take a reference on a connection manager entry: bump x_ref while
 * holding the entry's x_lock.  The inverse of this is connmgr_release().
 *
 * The body is wrapped in do { } while (0) so that "CONN_HOLD(e);"
 * behaves as exactly one statement, including as the unbraced body of
 * an if that is followed by an else.  Callers must supply the trailing
 * semicolon.  Cm_entry is evaluated more than once, so it must not have
 * side effects.
 */
#define	CONN_HOLD(Cm_entry)	do {					\
	mutex_enter(&(Cm_entry)->x_lock);				\
	(Cm_entry)->x_ref++;						\
	mutex_exit(&(Cm_entry)->x_lock);				\
} while (0)
339
340
341 /*
342 * Private data per rpc handle. This structure is allocated by
343 * clnt_cots_kcreate, and freed by clnt_cots_kdestroy.
344 */
345 typedef struct cku_private_s {
346 CLIENT cku_client; /* client handle */
347 calllist_t cku_call; /* for dispatching calls */
348 struct rpc_err cku_err; /* error status */
349
350 struct netbuf cku_srcaddr; /* source address for retries */
351 int cku_addrfmly; /* for binding port */
352 struct netbuf cku_addr; /* remote address */
353 dev_t cku_device; /* device to use */
354 uint_t cku_flags;
355 #define CKU_ONQUEUE 0x1
356 #define CKU_SENT 0x2
357
358 bool_t cku_progress; /* for CLSET_PROGRESS */
359 uint32_t cku_xid; /* current XID */
360 clock_t cku_ctime; /* time stamp of when */
361 /* connection was created */
362 uint_t cku_recv_attempts;
363 XDR cku_outxdr; /* xdr routine for output */
364 XDR cku_inxdr; /* xdr routine for input */
365 char cku_rpchdr[WIRE_HDR_SIZE + 4];
366 /* pre-serialized rpc header */
367
368 uint_t cku_outbuflen; /* default output mblk length */
369 struct cred *cku_cred; /* credentials */
370 bool_t cku_nodelayonerr;
371 /* for CLSET_NODELAYONERR */
372 int cku_useresvport; /* Use reserved port */
373 struct rpc_cots_client *cku_stats; /* stats for zone */
374 } cku_private_t;
375
376 static struct cm_xprt *connmgr_wrapconnect(struct cm_xprt *,
377 const struct timeval *, struct netbuf *, int, struct netbuf *,
378 struct rpc_err *, bool_t, bool_t, cred_t *);
379
380 static bool_t connmgr_connect(struct cm_xprt *, queue_t *, struct netbuf *,
381 int, calllist_t *, int *, bool_t reconnect,
382 const struct timeval *, bool_t, cred_t *);
383
384 static bool_t connmgr_setopt(queue_t *, int, int, calllist_t *, cred_t *cr);
385 static void connmgr_sndrel(struct cm_xprt *);
386 static void connmgr_snddis(struct cm_xprt *);
387 static void connmgr_close(struct cm_xprt *);
388 static void connmgr_release(struct cm_xprt *);
389 static struct cm_xprt *connmgr_wrapget(struct netbuf *, const struct timeval *,
390 cku_private_t *);
391
392 static struct cm_xprt *connmgr_get(struct netbuf *, const struct timeval *,
393 struct netbuf *, int, struct netbuf *, struct rpc_err *, dev_t,
394 bool_t, int, cred_t *);
395
396 static void connmgr_cancelconn(struct cm_xprt *);
397 static enum clnt_stat connmgr_cwait(struct cm_xprt *, const struct timeval *,
398 bool_t);
399 static void connmgr_dis_and_wait(struct cm_xprt *);
400
401 static int clnt_dispatch_send(queue_t *, mblk_t *, calllist_t *, uint_t,
402 uint_t);
403
404 static int clnt_delay(clock_t, bool_t);
405
406 static int waitforack(calllist_t *, t_scalar_t, const struct timeval *, bool_t);
407
408 /*
409 * Operations vector for TCP/IP based RPC
410 */
411 static struct clnt_ops tcp_ops = {
412 clnt_cots_kcallit, /* do rpc call */
413 clnt_cots_kabort, /* abort call */
414 clnt_cots_kerror, /* return error status */
415 clnt_cots_kfreeres, /* free results */
416 clnt_cots_kdestroy, /* destroy rpc handle */
417 clnt_cots_kcontrol, /* the ioctl() of rpc */
418 clnt_cots_ksettimers, /* set retry timers */
419 };
420
421 static int rpc_kstat_instance = 0; /* keeps the current instance */
422 /* number for the next kstat_create */
423
424 static struct cm_xprt *cm_hd = NULL;
425 static kmutex_t connmgr_lock; /* for connection mngr's list of transports */
426
427 extern kmutex_t clnt_max_msg_lock;
428
429 static calllist_t *clnt_pending = NULL;
430 extern kmutex_t clnt_pending_lock;
431
432 static int clnt_cots_hash_size = DEFAULT_HASH_SIZE;
433
434 static call_table_t *cots_call_ht;
435
436 static const struct rpc_cots_client {
437 kstat_named_t rccalls;
438 kstat_named_t rcbadcalls;
439 kstat_named_t rcbadxids;
440 kstat_named_t rctimeouts;
441 kstat_named_t rcnewcreds;
442 kstat_named_t rcbadverfs;
443 kstat_named_t rctimers;
444 kstat_named_t rccantconn;
445 kstat_named_t rcnomem;
446 kstat_named_t rcintrs;
447 } cots_rcstat_tmpl = {
448 { "calls", KSTAT_DATA_UINT64 },
449 { "badcalls", KSTAT_DATA_UINT64 },
450 { "badxids", KSTAT_DATA_UINT64 },
451 { "timeouts", KSTAT_DATA_UINT64 },
452 { "newcreds", KSTAT_DATA_UINT64 },
453 { "badverfs", KSTAT_DATA_UINT64 },
454 { "timers", KSTAT_DATA_UINT64 },
455 { "cantconn", KSTAT_DATA_UINT64 },
456 { "nomem", KSTAT_DATA_UINT64 },
457 { "interrupts", KSTAT_DATA_UINT64 }
458 };
459
460 #define COTSRCSTAT_INCR(p, x) \
461 atomic_add_64(&(p)->x.value.ui64, 1)
462
463 #define CLNT_MAX_CONNS 1 /* concurrent connections between clnt/srvr */
464 int clnt_max_conns = CLNT_MAX_CONNS;
465
466 #define CLNT_MIN_TIMEOUT 10 /* seconds to wait after we get a */
467 /* connection reset */
468 #define CLNT_MIN_CONNTIMEOUT 5 /* seconds to wait for a connection */
469
470
471 int clnt_cots_min_tout = CLNT_MIN_TIMEOUT;
472 int clnt_cots_min_conntout = CLNT_MIN_CONNTIMEOUT;
473
474 /*
475 * Limit the number of times we will attempt to receive a reply without
476 * re-sending a response.
477 */
478 #define CLNT_MAXRECV_WITHOUT_RETRY 3
479 uint_t clnt_cots_maxrecv = CLNT_MAXRECV_WITHOUT_RETRY;
480
481 uint_t *clnt_max_msg_sizep;
482 void (*clnt_stop_idle)(queue_t *wq);
483
484 #define ptoh(p) (&((p)->cku_client))
485 #define htop(h) ((cku_private_t *)((h)->cl_private))
486
487 /*
488 * Times to retry
489 */
490 #define REFRESHES 2 /* authentication refreshes */
491
492 /*
493 * The following is used to determine the global default behavior for
494 * COTS when binding to a local port.
495 *
496 * If the value is set to 1 the default will be to select a reserved
497 * (aka privileged) port, if the value is zero the default will be to
498 * use non-reserved ports. Users of kRPC may override this by using
499 * CLNT_CONTROL() and CLSET_BINDRESVPORT.
500 */
501 int clnt_cots_do_bindresvport = 1;
502
503 static zone_key_t zone_cots_key;
504
505 /*
506 * We need to do this after all kernel threads in the zone have exited.
507 */
508 /* ARGSUSED */
509 static void
510 clnt_zone_destroy(zoneid_t zoneid, void *unused)
511 {
512 struct cm_xprt **cmp;
513 struct cm_xprt *cm_entry;
514 struct cm_xprt *freelist = NULL;
515
516 mutex_enter(&connmgr_lock);
517 cmp = &cm_hd;
518 while ((cm_entry = *cmp) != NULL) {
519 if (cm_entry->x_zoneid == zoneid) {
520 *cmp = cm_entry->x_next;
521 cm_entry->x_next = freelist;
522 freelist = cm_entry;
523 } else {
524 cmp = &cm_entry->x_next;
525 }
526 }
527 mutex_exit(&connmgr_lock);
528 while ((cm_entry = freelist) != NULL) {
529 freelist = cm_entry->x_next;
530 connmgr_close(cm_entry);
531 }
532 }
533
534 int
535 clnt_cots_kcreate(dev_t dev, struct netbuf *addr, int family, rpcprog_t prog,
536 rpcvers_t vers, uint_t max_msgsize, cred_t *cred, CLIENT **ncl)
537 {
538 CLIENT *h;
539 cku_private_t *p;
540 struct rpc_msg call_msg;
541 struct rpcstat *rpcstat;
542
543 RPCLOG(8, "clnt_cots_kcreate: prog %u\n", prog);
544
545 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
546 ASSERT(rpcstat != NULL);
547
548 /* Allocate and intialize the client handle. */
549 p = kmem_zalloc(sizeof (*p), KM_SLEEP);
550
551 h = ptoh(p);
552
553 h->cl_private = (caddr_t)p;
554 h->cl_auth = authkern_create();
555 h->cl_ops = &tcp_ops;
556
557 cv_init(&p->cku_call.call_cv, NULL, CV_DEFAULT, NULL);
558 mutex_init(&p->cku_call.call_lock, NULL, MUTEX_DEFAULT, NULL);
559
560 /*
561 * If the current sanity check size in rpcmod is smaller
562 * than the size needed, then increase the sanity check.
563 */
564 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
565 max_msgsize > *clnt_max_msg_sizep) {
566 mutex_enter(&clnt_max_msg_lock);
567 if (max_msgsize > *clnt_max_msg_sizep)
568 *clnt_max_msg_sizep = max_msgsize;
569 mutex_exit(&clnt_max_msg_lock);
570 }
571
572 p->cku_outbuflen = COTS_DEFAULT_ALLOCSIZE;
573
574 /* Preserialize the call message header */
575
576 call_msg.rm_xid = 0;
577 call_msg.rm_direction = CALL;
578 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION;
579 call_msg.rm_call.cb_prog = prog;
580 call_msg.rm_call.cb_vers = vers;
581
582 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, WIRE_HDR_SIZE, XDR_ENCODE);
583
584 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) {
585 RPCLOG0(1, "clnt_cots_kcreate - Fatal header serialization "
586 "error\n");
587 auth_destroy(h->cl_auth);
588 kmem_free(p, sizeof (cku_private_t));
589 RPCLOG0(1, "clnt_cots_kcreate: create failed error EINVAL\n");
590 return (EINVAL); /* XXX */
591 }
592
593 /*
594 * The zalloc initialized the fields below.
595 * p->cku_xid = 0;
596 * p->cku_flags = 0;
597 * p->cku_srcaddr.len = 0;
598 * p->cku_srcaddr.maxlen = 0;
599 */
600
601 p->cku_cred = cred;
602 p->cku_device = dev;
603 p->cku_addrfmly = family;
604 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
605 p->cku_addr.maxlen = addr->maxlen;
606 p->cku_addr.len = addr->len;
607 bcopy(addr->buf, p->cku_addr.buf, addr->len);
608 p->cku_stats = rpcstat->rpc_cots_client;
609 p->cku_useresvport = -1; /* value is has not been set */
610
611 *ncl = h;
612 return (0);
613 }
614
615 /*ARGSUSED*/
616 static void
617 clnt_cots_kabort(CLIENT *h)
618 {
619 }
620
621 /*
622 * Return error info on this handle.
623 */
624 static void
625 clnt_cots_kerror(CLIENT *h, struct rpc_err *err)
626 {
627 /* LINTED pointer alignment */
628 cku_private_t *p = htop(h);
629
630 *err = p->cku_err;
631 }
632
633 static bool_t
634 clnt_cots_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr)
635 {
636 /* LINTED pointer alignment */
637 cku_private_t *p = htop(h);
638 XDR *xdrs;
639
640 xdrs = &(p->cku_outxdr);
641 xdrs->x_op = XDR_FREE;
642 return ((*xdr_res)(xdrs, res_ptr));
643 }
644
645 static bool_t
646 clnt_cots_kcontrol(CLIENT *h, int cmd, char *arg)
647 {
648 cku_private_t *p = htop(h);
649
650 switch (cmd) {
651 case CLSET_PROGRESS:
652 p->cku_progress = TRUE;
653 return (TRUE);
654
655 case CLSET_XID:
656 if (arg == NULL)
657 return (FALSE);
658
659 p->cku_xid = *((uint32_t *)arg);
660 return (TRUE);
661
662 case CLGET_XID:
663 if (arg == NULL)
664 return (FALSE);
665
666 *((uint32_t *)arg) = p->cku_xid;
667 return (TRUE);
668
669 case CLSET_NODELAYONERR:
670 if (arg == NULL)
671 return (FALSE);
672
673 if (*((bool_t *)arg) == TRUE) {
674 p->cku_nodelayonerr = TRUE;
675 return (TRUE);
676 }
677 if (*((bool_t *)arg) == FALSE) {
678 p->cku_nodelayonerr = FALSE;
679 return (TRUE);
680 }
681 return (FALSE);
682
683 case CLGET_NODELAYONERR:
684 if (arg == NULL)
685 return (FALSE);
686
687 *((bool_t *)arg) = p->cku_nodelayonerr;
688 return (TRUE);
689
690 case CLSET_BINDRESVPORT:
691 if (arg == NULL)
692 return (FALSE);
693
694 if (*(int *)arg != 1 && *(int *)arg != 0)
695 return (FALSE);
696
697 p->cku_useresvport = *(int *)arg;
698
699 return (TRUE);
700
701 case CLGET_BINDRESVPORT:
702 if (arg == NULL)
703 return (FALSE);
704
705 *(int *)arg = p->cku_useresvport;
706
707 return (TRUE);
708
709 default:
710 return (FALSE);
711 }
712 }
713
714 /*
715 * Destroy rpc handle. Frees the space used for output buffer,
716 * private data, and handle structure.
717 */
718 static void
719 clnt_cots_kdestroy(CLIENT *h)
720 {
721 /* LINTED pointer alignment */
722 cku_private_t *p = htop(h);
723 calllist_t *call = &p->cku_call;
724
725 RPCLOG(8, "clnt_cots_kdestroy h: %p\n", (void *)h);
726 RPCLOG(8, "clnt_cots_kdestroy h: xid=0x%x\n", p->cku_xid);
727
728 if (p->cku_flags & CKU_ONQUEUE) {
729 RPCLOG(64, "clnt_cots_kdestroy h: removing call for xid 0x%x "
730 "from dispatch list\n", p->cku_xid);
731 call_table_remove(call);
732 }
733
734 if (call->call_reply)
735 freemsg(call->call_reply);
736 cv_destroy(&call->call_cv);
737 mutex_destroy(&call->call_lock);
738
739 kmem_free(p->cku_srcaddr.buf, p->cku_srcaddr.maxlen);
740 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
741 kmem_free(p, sizeof (*p));
742 }
743
static int clnt_cots_pulls;

/* Record mark header size. */
#define	RM_HDR_SIZE	4
746
747 /*
748 * Call remote procedure.
749 */
750 static enum clnt_stat
751 clnt_cots_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args,
752 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait)
753 {
754 /* LINTED pointer alignment */
755 cku_private_t *p = htop(h);
756 calllist_t *call = &p->cku_call;
757 XDR *xdrs;
758 struct rpc_msg reply_msg;
759 mblk_t *mp;
760 #ifdef RPCDEBUG
761 clock_t time_sent;
762 #endif
763 struct netbuf *retryaddr;
764 struct cm_xprt *cm_entry = NULL;
765 queue_t *wq;
766 int len, waitsecs, max_waitsecs;
767 int mpsize;
768 int refreshes = REFRESHES;
769 int interrupted;
770 int tidu_size;
771 enum clnt_stat status;
772 struct timeval cwait;
773 bool_t delay_first = FALSE;
774 clock_t ticks;
775
776 RPCLOG(2, "clnt_cots_kcallit, procnum %u\n", procnum);
777 COTSRCSTAT_INCR(p->cku_stats, rccalls);
778
779 RPCLOG(2, "clnt_cots_kcallit: wait.tv_sec: %ld\n", wait.tv_sec);
780 RPCLOG(2, "clnt_cots_kcallit: wait.tv_usec: %ld\n", wait.tv_usec);
781 /*
782 * Bug ID 1240234:
783 * Look out for zero length timeouts. We don't want to
784 * wait zero seconds for a connection to be established.
785 */
786 if (wait.tv_sec < clnt_cots_min_conntout) {
787 cwait.tv_sec = clnt_cots_min_conntout;
788 cwait.tv_usec = 0;
789 RPCLOG(8, "clnt_cots_kcallit: wait.tv_sec (%ld) too low,",
790 wait.tv_sec);
791 RPCLOG(8, " setting to: %d\n", clnt_cots_min_conntout);
792 } else {
793 cwait = wait;
794 }
795
796 call_again:
797 if (cm_entry) {
798 connmgr_release(cm_entry);
799 cm_entry = NULL;
800 }
801
802 mp = NULL;
803
804 /*
805 * If the call is not a retry, allocate a new xid and cache it
806 * for future retries.
807 * Bug ID 1246045:
808 * Treat call as a retry for purposes of binding the source
809 * port only if we actually attempted to send anything on
810 * the previous call.
811 */
812 if (p->cku_xid == 0) {
813 p->cku_xid = alloc_xid();
814 call->call_zoneid = rpc_zoneid();
815
816 /*
817 * We need to ASSERT here that our xid != 0 because this
818 * determines whether or not our call record gets placed on
819 * the hash table or the linked list. By design, we mandate
820 * that RPC calls over cots must have xid's != 0, so we can
821 * ensure proper management of the hash table.
822 */
823 ASSERT(p->cku_xid != 0);
824
825 retryaddr = NULL;
826 p->cku_flags &= ~CKU_SENT;
827
828 if (p->cku_flags & CKU_ONQUEUE) {
829 RPCLOG(8, "clnt_cots_kcallit: new call, dequeuing old"
830 " one (%p)\n", (void *)call);
831 call_table_remove(call);
832 p->cku_flags &= ~CKU_ONQUEUE;
833 RPCLOG(64, "clnt_cots_kcallit: removing call from "
834 "dispatch list because xid was zero (now 0x%x)\n",
835 p->cku_xid);
836 }
837
838 if (call->call_reply != NULL) {
839 freemsg(call->call_reply);
840 call->call_reply = NULL;
841 }
842 } else if (p->cku_srcaddr.buf == NULL || p->cku_srcaddr.len == 0) {
843 retryaddr = NULL;
844
845 } else if (p->cku_flags & CKU_SENT) {
846 retryaddr = &p->cku_srcaddr;
847
848 } else {
849 /*
850 * Bug ID 1246045: Nothing was sent, so set retryaddr to
851 * NULL and let connmgr_get() bind to any source port it
852 * can get.
853 */
854 retryaddr = NULL;
855 }
856
857 RPCLOG(64, "clnt_cots_kcallit: xid = 0x%x", p->cku_xid);
858 RPCLOG(64, " flags = 0x%x\n", p->cku_flags);
859
860 p->cku_err.re_status = RPC_TIMEDOUT;
861 p->cku_err.re_errno = p->cku_err.re_terrno = 0;
862
863 cm_entry = connmgr_wrapget(retryaddr, &cwait, p);
864
865 if (cm_entry == NULL) {
866 RPCLOG(1, "clnt_cots_kcallit: can't connect status %s\n",
867 clnt_sperrno(p->cku_err.re_status));
868
869 /*
870 * The reasons why we fail to create a connection are
871 * varied. In most cases we don't want the caller to
872 * immediately retry. This could have one or more
873 * bad effects. This includes flooding the net with
874 * connect requests to ports with no listener; a hard
875 * kernel loop due to all the "reserved" TCP ports being
876 * in use.
877 */
878 delay_first = TRUE;
879
880 /*
881 * Even if we end up returning EINTR, we still count a
882 * a "can't connect", because the connection manager
883 * might have been committed to waiting for or timing out on
884 * a connection.
885 */
886 COTSRCSTAT_INCR(p->cku_stats, rccantconn);
887 switch (p->cku_err.re_status) {
888 case RPC_INTR:
889 p->cku_err.re_errno = EINTR;
890
891 /*
892 * No need to delay because a UNIX signal(2)
893 * interrupted us. The caller likely won't
894 * retry the CLNT_CALL() and even if it does,
895 * we assume the caller knows what it is doing.
896 */
897 delay_first = FALSE;
898 break;
899
900 case RPC_TIMEDOUT:
901 p->cku_err.re_errno = ETIMEDOUT;
902
903 /*
904 * No need to delay because timed out already
905 * on the connection request and assume that the
906 * transport time out is longer than our minimum
907 * timeout, or least not too much smaller.
908 */
909 delay_first = FALSE;
910 break;
911
912 case RPC_SYSTEMERROR:
913 case RPC_TLIERROR:
914 /*
915 * We want to delay here because a transient
916 * system error has a better chance of going away
917 * if we delay a bit. If it's not transient, then
918 * we don't want end up in a hard kernel loop
919 * due to retries.
920 */
921 ASSERT(p->cku_err.re_errno != 0);
922 break;
923
924
925 case RPC_CANTCONNECT:
926 /*
927 * RPC_CANTCONNECT is set on T_ERROR_ACK which
928 * implies some error down in the TCP layer or
929 * below. If cku_nodelayonerror is set then we
930 * assume the caller knows not to try too hard.
931 */
932 RPCLOG0(8, "clnt_cots_kcallit: connection failed,");
933 RPCLOG0(8, " re_status=RPC_CANTCONNECT,");
934 RPCLOG(8, " re_errno=%d,", p->cku_err.re_errno);
935 RPCLOG(8, " cku_nodelayonerr=%d", p->cku_nodelayonerr);
936 if (p->cku_nodelayonerr == TRUE)
937 delay_first = FALSE;
938
939 p->cku_err.re_errno = EIO;
940
941 break;
942
943 case RPC_XPRTFAILED:
944 /*
945 * We want to delay here because we likely
946 * got a refused connection.
947 */
948 if (p->cku_err.re_errno == 0)
949 p->cku_err.re_errno = EIO;
950
951 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
952 p->cku_err.re_errno);
953
954 break;
955
956 default:
957 /*
958 * We delay here because it is better to err
959 * on the side of caution. If we got here then
960 * status could have been RPC_SUCCESS, but we
961 * know that we did not get a connection, so
962 * force the rpc status to RPC_CANTCONNECT.
963 */
964 p->cku_err.re_status = RPC_CANTCONNECT;
965 p->cku_err.re_errno = EIO;
966 break;
967 }
968 if (delay_first == TRUE)
969 ticks = clnt_cots_min_tout * drv_usectohz(1000000);
970 goto cots_done;
971 }
972
973 /*
974 * If we've never sent any request on this connection (send count
975 * is zero, or the connection has been reset), cache the
976 * the connection's create time and send a request (possibly a retry)
977 */
978 if ((p->cku_flags & CKU_SENT) == 0 ||
979 p->cku_ctime != cm_entry->x_ctime) {
980 p->cku_ctime = cm_entry->x_ctime;
981
982 } else if ((p->cku_flags & CKU_SENT) && (p->cku_flags & CKU_ONQUEUE) &&
983 (call->call_reply != NULL ||
984 p->cku_recv_attempts < clnt_cots_maxrecv)) {
985
986 /*
987 * If we've sent a request and our call is on the dispatch
988 * queue and we haven't made too many receive attempts, then
989 * don't re-send, just receive.
990 */
991 p->cku_recv_attempts++;
992 goto read_again;
993 }
994
995 /*
996 * Now we create the RPC request in a STREAMS message. We have to do
997 * this after the call to connmgr_get so that we have the correct
998 * TIDU size for the transport.
999 */
1000 tidu_size = cm_entry->x_tidu_size;
1001 len = MSG_OFFSET + MAX(tidu_size, RM_HDR_SIZE + WIRE_HDR_SIZE);
1002
1003 while ((mp = allocb(len, BPRI_MED)) == NULL) {
1004 if (strwaitbuf(len, BPRI_MED)) {
1005 p->cku_err.re_status = RPC_SYSTEMERROR;
1006 p->cku_err.re_errno = ENOSR;
1007 COTSRCSTAT_INCR(p->cku_stats, rcnomem);
1008 goto cots_done;
1009 }
1010 }
1011 xdrs = &p->cku_outxdr;
1012 xdrmblk_init(xdrs, mp, XDR_ENCODE, tidu_size);
1013 mpsize = MBLKSIZE(mp);
1014 ASSERT(mpsize >= len);
1015 ASSERT(mp->b_rptr == mp->b_datap->db_base);
1016
1017 /*
1018 * If the size of mblk is not appreciably larger than what we
1019 * asked, then resize the mblk to exactly len bytes. The reason for
1020 * this: suppose len is 1600 bytes, the tidu is 1460 bytes
1021 * (from TCP over ethernet), and the arguments to the RPC require
1022 * 2800 bytes. Ideally we want the protocol to render two
1023 * ~1400 byte segments over the wire. However if allocb() gives us a 2k
1024 * mblk, and we allocate a second mblk for the remainder, the protocol
1025 * module may generate 3 segments over the wire:
1026 * 1460 bytes for the first, 448 (2048 - 1600) for the second, and
1027 * 892 for the third. If we "waste" 448 bytes in the first mblk,
1028 * the XDR encoding will generate two ~1400 byte mblks, and the
1029 * protocol module is more likely to produce properly sized segments.
1030 */
1031 if ((mpsize >> 1) <= len)
1032 mp->b_rptr += (mpsize - len);
1033
1034 /*
1035 * Adjust b_rptr to reserve space for the non-data protocol headers
1036 * any downstream modules might like to add, and for the
1037 * record marking header.
1038 */
1039 mp->b_rptr += (MSG_OFFSET + RM_HDR_SIZE);
1040
1041 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) {
1042 /* Copy in the preserialized RPC header information. */
1043 bcopy(p->cku_rpchdr, mp->b_rptr, WIRE_HDR_SIZE);
1044
1045 /* Use XDR_SETPOS() to set the b_wptr to past the RPC header. */
1046 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base +
1047 WIRE_HDR_SIZE));
1048
1049 ASSERT((mp->b_wptr - mp->b_rptr) == WIRE_HDR_SIZE);
1050
1051 /* Serialize the procedure number and the arguments. */
1052 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) ||
1053 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) ||
1054 (!(*xdr_args)(xdrs, argsp))) {
1055 p->cku_err.re_status = RPC_CANTENCODEARGS;
1056 p->cku_err.re_errno = EIO;
1057 goto cots_done;
1058 }
1059
1060 (*(uint32_t *)(mp->b_rptr)) = p->cku_xid;
1061 } else {
1062 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[WIRE_HDR_SIZE];
1063 IXDR_PUT_U_INT32(uproc, procnum);
1064
1065 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid;
1066
1067 /* Use XDR_SETPOS() to set the b_wptr. */
1068 XDR_SETPOS(xdrs, (uint_t)(mp->b_rptr - mp->b_datap->db_base));
1069
1070 /* Serialize the procedure number and the arguments. */
1071 if (!AUTH_WRAP(h->cl_auth, p->cku_rpchdr, WIRE_HDR_SIZE+4,
1072 xdrs, xdr_args, argsp)) {
1073 p->cku_err.re_status = RPC_CANTENCODEARGS;
1074 p->cku_err.re_errno = EIO;
1075 goto cots_done;
1076 }
1077 }
1078
1079 RPCLOG(2, "clnt_cots_kcallit: connected, sending call, tidu_size %d\n",
1080 tidu_size);
1081
1082 wq = cm_entry->x_wq;
1083 waitsecs = 0;
1084
1085 dispatch_again:
1086 status = clnt_dispatch_send(wq, mp, call, p->cku_xid,
1087 (p->cku_flags & CKU_ONQUEUE));
1088
1089 if ((status == RPC_CANTSEND) && (call->call_reason == ENOBUFS)) {
1090 /*
1091 * QFULL condition, allow some time for queue to drain
1092 * and try again. Give up after waiting for all timeout
1093 * specified for the call, or zone is going away.
1094 */
1095 max_waitsecs = wait.tv_sec ? wait.tv_sec : clnt_cots_min_tout;
1096 if ((waitsecs++ < max_waitsecs) &&
1097 !(zone_status_get(curproc->p_zone) >=
1098 ZONE_IS_SHUTTING_DOWN)) {
1099
1100 /* wait 1 sec for queue to drain */
1101 if (clnt_delay(drv_usectohz(1000000),
1102 h->cl_nosignal) == EINTR) {
1103 p->cku_err.re_errno = EINTR;
1104 p->cku_err.re_status = RPC_INTR;
1105
1106 goto cots_done;
1107 }
1108
1109 /* and try again */
1110 goto dispatch_again;
1111 }
1112 p->cku_err.re_status = status;
1113 p->cku_err.re_errno = call->call_reason;
1114 DTRACE_PROBE(krpc__e__clntcots__kcallit__cantsend);
1115
1116 goto cots_done;
1117 }
1118
1119 if (waitsecs) {
1120 /* adjust timeout to account for time wait to send */
1121 wait.tv_sec -= waitsecs;
1122 if (wait.tv_sec < 0) {
1123 /* pick up reply on next retry */
1124 wait.tv_sec = 0;
1125 }
1126 DTRACE_PROBE2(clnt_cots__sendwait, CLIENT *, h,
1127 int, waitsecs);
1128 }
1129
1130 RPCLOG(64, "clnt_cots_kcallit: sent call for xid 0x%x\n",
1131 (uint_t)p->cku_xid);
1132 p->cku_flags = (CKU_ONQUEUE|CKU_SENT);
1133 p->cku_recv_attempts = 1;
1134
1135 #ifdef RPCDEBUG
1136 time_sent = lbolt;
1137 #endif
1138
1139 /*
1140 * Wait for a reply or a timeout. If there is no error or timeout,
1141 * (both indicated by call_status), call->call_reply will contain
1142 * the RPC reply message.
1143 */
1144 read_again:
1145 mutex_enter(&call->call_lock);
1146 interrupted = 0;
1147 if (call->call_status == RPC_TIMEDOUT) {
1148 /*
1149 * Indicate that the lwp is not to be stopped while waiting
1150 * for this network traffic. This is to avoid deadlock while
1151 * debugging a process via /proc and also to avoid recursive
1152 * mutex_enter()s due to NFS page faults while stopping
1153 * (NFS holds locks when it calls here).
1154 */
1155 clock_t cv_wait_ret;
1156 clock_t timout;
1157 clock_t oldlbolt;
1158
1159 klwp_t *lwp = ttolwp(curthread);
1160
1161 if (lwp != NULL)
1162 lwp->lwp_nostop++;
1163
1164 oldlbolt = lbolt;
1165 timout = wait.tv_sec * drv_usectohz(1000000) +
1166 drv_usectohz(wait.tv_usec) + oldlbolt;
1167 /*
1168 * Iterate until the call_status is changed to something
1169 * other than RPC_TIMEDOUT, or until cv_timedwait_sig() returns
1170 * something <= 0. Zero means that we were interrupted by a
1171 * signal; -1 means that we timed out.
1172 */
1173 if (h->cl_nosignal)
1174 while ((cv_wait_ret = cv_timedwait(&call->call_cv,
1175 &call->call_lock, timout)) > 0 &&
1176 call->call_status == RPC_TIMEDOUT)
1177 ;
1178 else
1179 while ((cv_wait_ret = cv_timedwait_sig(
1180 &call->call_cv,
1181 &call->call_lock, timout)) > 0 &&
1182 call->call_status == RPC_TIMEDOUT)
1183 ;
1184
1185 switch (cv_wait_ret) {
1186 case 0:
1187 /*
1188 * If we got out of the above loop with
1189 * cv_timedwait_sig() returning 0, then we were
1190 * interrupted regardless what call_status is.
1191 */
1192 interrupted = 1;
1193 break;
1194 case -1:
1195 /* cv_timedwait_sig() timed out */
1196 break;
1197 default:
1198
1199 /*
1200 * We were cv_signaled(). If we didn't
1201 * get a successful call_status and returned
1202 * before time expired, delay up to clnt_cots_min_tout
1203 * seconds so that the caller doesn't immediately
1204 * try to call us again and thus force the
1205 * same condition that got us here (such
1206 * as a RPC_XPRTFAILED due to the server not
1207 * listening on the end-point).
1208 */
1209 if (call->call_status != RPC_SUCCESS) {
1210 clock_t curlbolt;
1211 clock_t diff;
1212
1213 curlbolt = ddi_get_lbolt();
1214 ticks = clnt_cots_min_tout *
1215 drv_usectohz(1000000);
1216 diff = curlbolt - oldlbolt;
1217 if (diff < ticks) {
1218 delay_first = TRUE;
1219 if (diff > 0)
1220 ticks -= diff;
1221 }
1222 }
1223 break;
1224 }
1225
1226 if (lwp != NULL)
1227 lwp->lwp_nostop--;
1228 }
1229 /*
1230 * Get the reply message, if any. This will be freed at the end
1231 * whether or not an error occurred.
1232 */
1233 mp = call->call_reply;
1234 call->call_reply = NULL;
1235
1236 /*
1237 * call_err is the error info when the call is on dispatch queue.
1238 * cku_err is the error info returned to the caller.
1239 * Sync cku_err with call_err for local message processing.
1240 */
1241
1242 status = call->call_status;
1243 p->cku_err = call->call_err;
1244 mutex_exit(&call->call_lock);
1245
1246 if (status != RPC_SUCCESS) {
1247 switch (status) {
1248 case RPC_TIMEDOUT:
1249 if (interrupted) {
1250 COTSRCSTAT_INCR(p->cku_stats, rcintrs);
1251 p->cku_err.re_status = RPC_INTR;
1252 p->cku_err.re_errno = EINTR;
1253 RPCLOG(1, "clnt_cots_kcallit: xid 0x%x",
1254 p->cku_xid);
1255 RPCLOG(1, "signal interrupted at %ld", lbolt);
1256 RPCLOG(1, ", was sent at %ld\n", time_sent);
1257 } else {
1258 COTSRCSTAT_INCR(p->cku_stats, rctimeouts);
1259 p->cku_err.re_errno = ETIMEDOUT;
1260 RPCLOG(1, "clnt_cots_kcallit: timed out at %ld",
1261 lbolt);
1262 RPCLOG(1, ", was sent at %ld\n", time_sent);
1263 }
1264 break;
1265
1266 case RPC_XPRTFAILED:
1267 if (p->cku_err.re_errno == 0)
1268 p->cku_err.re_errno = EIO;
1269
1270 RPCLOG(1, "clnt_cots_kcallit: transport failed: %d\n",
1271 p->cku_err.re_errno);
1272 break;
1273
1274 case RPC_SYSTEMERROR:
1275 ASSERT(p->cku_err.re_errno);
1276 RPCLOG(1, "clnt_cots_kcallit: system error: %d\n",
1277 p->cku_err.re_errno);
1278 break;
1279
1280 default:
1281 p->cku_err.re_status = RPC_SYSTEMERROR;
1282 p->cku_err.re_errno = EIO;
1283 RPCLOG(1, "clnt_cots_kcallit: error: %s\n",
1284 clnt_sperrno(status));
1285 break;
1286 }
1287 if (p->cku_err.re_status != RPC_TIMEDOUT) {
1288
1289 if (p->cku_flags & CKU_ONQUEUE) {
1290 call_table_remove(call);
1291 p->cku_flags &= ~CKU_ONQUEUE;
1292 }
1293
1294 RPCLOG(64, "clnt_cots_kcallit: non TIMEOUT so xid 0x%x "
1295 "taken off dispatch list\n", p->cku_xid);
1296 if (call->call_reply) {
1297 freemsg(call->call_reply);
1298 call->call_reply = NULL;
1299 }
1300 } else if (wait.tv_sec != 0) {
1301 /*
1302 * We've sent the request over TCP and so we have
1303 * every reason to believe it will get
1304 * delivered. In which case returning a timeout is not
1305 * appropriate.
1306 */
1307 if (p->cku_progress == TRUE &&
1308 p->cku_recv_attempts < clnt_cots_maxrecv) {
1309 p->cku_err.re_status = RPC_INPROGRESS;
1310 }
1311 }
1312 goto cots_done;
1313 }
1314
1315 xdrs = &p->cku_inxdr;
1316 xdrmblk_init(xdrs, mp, XDR_DECODE, 0);
1317
1318 reply_msg.rm_direction = REPLY;
1319 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED;
1320 reply_msg.acpted_rply.ar_stat = SUCCESS;
1321
1322 reply_msg.acpted_rply.ar_verf = _null_auth;
1323 /*
1324 * xdr_results will be done in AUTH_UNWRAP.
1325 */
1326 reply_msg.acpted_rply.ar_results.where = NULL;
1327 reply_msg.acpted_rply.ar_results.proc = xdr_void;
1328
1329 if (xdr_replymsg(xdrs, &reply_msg)) {
1330 enum clnt_stat re_status;
1331
1332 _seterr_reply(&reply_msg, &p->cku_err);
1333
1334 re_status = p->cku_err.re_status;
1335 if (re_status == RPC_SUCCESS) {
1336 /*
1337 * Reply is good, check auth.
1338 */
1339 if (!AUTH_VALIDATE(h->cl_auth,
1340 &reply_msg.acpted_rply.ar_verf)) {
1341 COTSRCSTAT_INCR(p->cku_stats, rcbadverfs);
1342 RPCLOG0(1, "clnt_cots_kcallit: validation "
1343 "failure\n");
1344 freemsg(mp);
1345 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
1346 mutex_enter(&call->call_lock);
1347 if (call->call_reply == NULL)
1348 call->call_status = RPC_TIMEDOUT;
1349 mutex_exit(&call->call_lock);
1350 goto read_again;
1351 } else if (!AUTH_UNWRAP(h->cl_auth, xdrs,
1352 xdr_results, resultsp)) {
1353 RPCLOG0(1, "clnt_cots_kcallit: validation "
1354 "failure (unwrap)\n");
1355 p->cku_err.re_status = RPC_CANTDECODERES;
1356 p->cku_err.re_errno = EIO;
1357 }
1358 } else {
1359 /* set errno in case we can't recover */
1360 if (re_status != RPC_VERSMISMATCH &&
1361 re_status != RPC_AUTHERROR &&
1362 re_status != RPC_PROGVERSMISMATCH)
1363 p->cku_err.re_errno = EIO;
1364
1365 if (re_status == RPC_AUTHERROR) {
1366 /*
1367 * Maybe our credential need to be refreshed
1368 */
1369 if (cm_entry) {
1370 /*
1371 * There is the potential that the
1372 * cm_entry has/will be marked dead,
1373 * so drop the connection altogether,
1374 * force REFRESH to establish new
1375 * connection.
1376 */
1377 connmgr_cancelconn(cm_entry);
1378 cm_entry = NULL;
1379 }
1380
1381 if ((refreshes > 0) &&
1382 AUTH_REFRESH(h->cl_auth, &reply_msg,
1383 p->cku_cred)) {
1384 refreshes--;
1385 (void) xdr_rpc_free_verifier(xdrs,
1386 &reply_msg);
1387 freemsg(mp);
1388 mp = NULL;
1389
1390 if (p->cku_flags & CKU_ONQUEUE) {
1391 call_table_remove(call);
1392 p->cku_flags &= ~CKU_ONQUEUE;
1393 }
1394
1395 RPCLOG(64,
1396 "clnt_cots_kcallit: AUTH_ERROR, xid"
1397 " 0x%x removed off dispatch list\n",
1398 p->cku_xid);
1399 if (call->call_reply) {
1400 freemsg(call->call_reply);
1401 call->call_reply = NULL;
1402 }
1403
1404 COTSRCSTAT_INCR(p->cku_stats,
1405 rcbadcalls);
1406 COTSRCSTAT_INCR(p->cku_stats,
1407 rcnewcreds);
1408 goto call_again;
1409 }
1410
1411 /*
1412 * We have used the client handle to
1413 * do an AUTH_REFRESH and the RPC status may
1414 * be set to RPC_SUCCESS; Let's make sure to
1415 * set it to RPC_AUTHERROR.
1416 */
1417 p->cku_err.re_status = RPC_AUTHERROR;
1418
1419 /*
1420 * Map recoverable and unrecoverable
1421 * authentication errors to appropriate errno
1422 */
1423 switch (p->cku_err.re_why) {
1424 case AUTH_TOOWEAK:
1425 /*
1426 * This could be a failure where the
1427 * server requires use of a reserved
1428 * port, check and optionally set the
1429 * client handle useresvport trying
1430 * one more time. Next go round we
1431 * fall out with the tooweak error.
1432 */
1433 if (p->cku_useresvport != 1) {
1434 p->cku_useresvport = 1;
1435 p->cku_xid = 0;
1436 (void) xdr_rpc_free_verifier
1437 (xdrs, &reply_msg);
1438 freemsg(mp);
1439 goto call_again;
1440 }
1441 /* FALLTHRU */
1442 case AUTH_BADCRED:
1443 case AUTH_BADVERF:
1444 case AUTH_INVALIDRESP:
1445 case AUTH_FAILED:
1446 case RPCSEC_GSS_NOCRED:
1447 case RPCSEC_GSS_FAILED:
1448 p->cku_err.re_errno = EACCES;
1449 break;
1450 case AUTH_REJECTEDCRED:
1451 case AUTH_REJECTEDVERF:
1452 default: p->cku_err.re_errno = EIO;
1453 break;
1454 }
1455 RPCLOG(1, "clnt_cots_kcallit : authentication"
1456 " failed with RPC_AUTHERROR of type %d\n",
1457 (int)p->cku_err.re_why);
1458 }
1459 }
1460 } else {
1461 /* reply didn't decode properly. */
1462 p->cku_err.re_status = RPC_CANTDECODERES;
1463 p->cku_err.re_errno = EIO;
1464 RPCLOG0(1, "clnt_cots_kcallit: decode failure\n");
1465 }
1466
1467 (void) xdr_rpc_free_verifier(xdrs, &reply_msg);
1468
1469 if (p->cku_flags & CKU_ONQUEUE) {
1470 call_table_remove(call);
1471 p->cku_flags &= ~CKU_ONQUEUE;
1472 }
1473
1474 RPCLOG(64, "clnt_cots_kcallit: xid 0x%x taken off dispatch list",
1475 p->cku_xid);
1476 RPCLOG(64, " status is %s\n", clnt_sperrno(p->cku_err.re_status));
1477 cots_done:
1478 if (cm_entry)
1479 connmgr_release(cm_entry);
1480
1481 if (mp != NULL)
1482 freemsg(mp);
1483 if ((p->cku_flags & CKU_ONQUEUE) == 0 && call->call_reply) {
1484 freemsg(call->call_reply);
1485 call->call_reply = NULL;
1486 }
1487 if (p->cku_err.re_status != RPC_SUCCESS) {
1488 RPCLOG0(1, "clnt_cots_kcallit: tail-end failure\n");
1489 COTSRCSTAT_INCR(p->cku_stats, rcbadcalls);
1490 }
1491
1492 /*
1493 * No point in delaying if the zone is going away.
1494 */
1495 if (delay_first == TRUE &&
1496 !(zone_status_get(curproc->p_zone) >= ZONE_IS_SHUTTING_DOWN)) {
1497 if (clnt_delay(ticks, h->cl_nosignal) == EINTR) {
1498 p->cku_err.re_errno = EINTR;
1499 p->cku_err.re_status = RPC_INTR;
1500 }
1501 }
1502 return (p->cku_err.re_status);
1503 }
1504
1505 /*
1506 * Kinit routine for cots. This sets up the correct operations in
1507 * the client handle, as the handle may have previously been a clts
1508 * handle, and clears the xid field so there is no way a new call
1509 * could be mistaken for a retry. It also sets in the handle the
1510 * information that is passed at create/kinit time but needed at
1511 * call time, as cots creates the transport at call time - device,
1512 * address of the server, protocol family.
1513 */
1514 void
1515 clnt_cots_kinit(CLIENT *h, dev_t dev, int family, struct netbuf *addr,
1516 int max_msgsize, cred_t *cred)
1517 {
1518 /* LINTED pointer alignment */
1519 cku_private_t *p = htop(h);
1520 calllist_t *call = &p->cku_call;
1521
1522 h->cl_ops = &tcp_ops;
1523 if (p->cku_flags & CKU_ONQUEUE) {
1524 call_table_remove(call);
1525 p->cku_flags &= ~CKU_ONQUEUE;
1526 RPCLOG(64, "clnt_cots_kinit: removing call for xid 0x%x from"
1527 " dispatch list\n", p->cku_xid);
1528 }
1529
1530 if (call->call_reply != NULL) {
1531 freemsg(call->call_reply);
1532 call->call_reply = NULL;
1533 }
1534
1535 call->call_bucket = NULL;
1536 call->call_hash = 0;
1537
1538 /*
1539 * We don't clear cku_flags here, because clnt_cots_kcallit()
1540 * takes care of handling the cku_flags reset.
1541 */
1542 p->cku_xid = 0;
1543 p->cku_device = dev;
1544 p->cku_addrfmly = family;
1545 p->cku_cred = cred;
1546
1547 if (p->cku_addr.maxlen < addr->len) {
1548 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL)
1549 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen);
1550 p->cku_addr.buf = kmem_zalloc(addr->maxlen, KM_SLEEP);
1551 p->cku_addr.maxlen = addr->maxlen;
1552 }
1553
1554 p->cku_addr.len = addr->len;
1555 bcopy(addr->buf, p->cku_addr.buf, addr->len);
1556
1557 /*
1558 * If the current sanity check size in rpcmod is smaller
1559 * than the size needed, then increase the sanity check.
1560 */
1561 if (max_msgsize != 0 && clnt_max_msg_sizep != NULL &&
1562 max_msgsize > *clnt_max_msg_sizep) {
1563 mutex_enter(&clnt_max_msg_lock);
1564 if (max_msgsize > *clnt_max_msg_sizep)
1565 *clnt_max_msg_sizep = max_msgsize;
1566 mutex_exit(&clnt_max_msg_lock);
1567 }
1568 }
1569
1570 /*
1571 * ksettimers is a no-op for cots, with the exception of setting the xid.
1572 */
1573 /* ARGSUSED */
1574 static int
1575 clnt_cots_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all,
1576 int minimum, void (*feedback)(int, int, caddr_t), caddr_t arg,
1577 uint32_t xid)
1578 {
1579 /* LINTED pointer alignment */
1580 cku_private_t *p = htop(h);
1581
1582 if (xid)
1583 p->cku_xid = xid;
1584 COTSRCSTAT_INCR(p->cku_stats, rctimers);
1585 return (0);
1586 }
1587
1588 extern void rpc_poptimod(struct vnode *);
1589 extern int kstr_push(struct vnode *, char *);
1590
1591 int
1592 conn_kstat_update(kstat_t *ksp, int rw)
1593 {
1594 struct cm_xprt *cm_entry;
1595 struct cm_kstat_xprt *cm_ksp_data;
1596 uchar_t *b;
1597 char *fbuf;
1598
1599 if (rw == KSTAT_WRITE)
1600 return (EACCES);
1601 if (ksp == NULL || ksp->ks_private == NULL)
1602 return (EIO);
1603 cm_entry = (struct cm_xprt *)ksp->ks_private;
1604 cm_ksp_data = (struct cm_kstat_xprt *)ksp->ks_data;
1605
1606 cm_ksp_data->x_wq.value.ui32 = (uint32_t)(uintptr_t)cm_entry->x_wq;
1607 cm_ksp_data->x_family.value.ui32 = cm_entry->x_family;
1608 cm_ksp_data->x_rdev.value.ui32 = (uint32_t)cm_entry->x_rdev;
1609 cm_ksp_data->x_time.value.ui32 = cm_entry->x_time;
1610 cm_ksp_data->x_ref.value.ui32 = cm_entry->x_ref;
1611 cm_ksp_data->x_state.value.ui32 = cm_entry->x_state_flags;
1612
1613 if (cm_entry->x_server.buf) {
1614 fbuf = cm_ksp_data->x_server.value.str.addr.ptr;
1615 if (cm_entry->x_family == AF_INET &&
1616 cm_entry->x_server.len ==
1617 sizeof (struct sockaddr_in)) {
1618 struct sockaddr_in *sa;
1619 sa = (struct sockaddr_in *)
1620 cm_entry->x_server.buf;
1621 b = (uchar_t *)&sa->sin_addr;
1622 (void) sprintf(fbuf,
1623 "%03d.%03d.%03d.%03d", b[0] & 0xFF, b[1] & 0xFF,
1624 b[2] & 0xFF, b[3] & 0xFF);
1625 cm_ksp_data->x_port.value.ui32 =
1626 (uint32_t)sa->sin_port;
1627 } else if (cm_entry->x_family == AF_INET6 &&
1628 cm_entry->x_server.len >=
1629 sizeof (struct sockaddr_in6)) {
1630 /* extract server IP address & port */
1631 struct sockaddr_in6 *sin6;
1632 sin6 = (struct sockaddr_in6 *)cm_entry->x_server.buf;
1633 (void) kinet_ntop6((uchar_t *)&sin6->sin6_addr, fbuf,
1634 INET6_ADDRSTRLEN);
1635 cm_ksp_data->x_port.value.ui32 = sin6->sin6_port;
1636 } else {
1637 struct sockaddr_in *sa;
1638
1639 sa = (struct sockaddr_in *)cm_entry->x_server.buf;
1640 b = (uchar_t *)&sa->sin_addr;
1641 (void) sprintf(fbuf,
1642 "%03d.%03d.%03d.%03d", b[0] & 0xFF, b[1] & 0xFF,
1643 b[2] & 0xFF, b[3] & 0xFF);
1644 }
1645 KSTAT_NAMED_STR_BUFLEN(&cm_ksp_data->x_server) =
1646 strlen(fbuf) + 1;
1647 }
1648
1649 return (0);
1650 }
1651
1652
1653 /*
1654 * We want a version of delay which is interruptible by a UNIX signal
1655 * Return EINTR if an interrupt occured.
1656 */
1657 static int
1658 clnt_delay(clock_t ticks, bool_t nosignal)
1659 {
1660 if (nosignal == TRUE) {
1661 delay(ticks);
1662 return (0);
1663 }
1664 return (delay_sig(ticks));
1665 }
1666
1667 /*
1668 * Wait for a connection until a timeout, or until we are
1669 * signalled that there has been a connection state change.
1670 */
1671 static enum clnt_stat
1672 connmgr_cwait(struct cm_xprt *cm_entry, const struct timeval *waitp,
1673 bool_t nosignal)
1674 {
1675 bool_t interrupted;
1676 clock_t timout, cv_stat;
1677 enum clnt_stat clstat;
1678 unsigned int old_state;
1679
1680 ASSERT(MUTEX_HELD(&connmgr_lock));
1681 /*
1682 * We wait for the transport connection to be made, or an
1683 * indication that it could not be made.
1684 */
1685 clstat = RPC_TIMEDOUT;
1686 interrupted = FALSE;
1687
1688 old_state = cm_entry->x_state_flags;
1689 /*
1690 * Now loop until cv_timedwait{_sig} returns because of
1691 * a signal(0) or timeout(-1) or cv_signal(>0). But it may be
1692 * cv_signalled for various other reasons too. So loop
1693 * until there is a state change on the connection.
1694 */
1695
1696 timout = waitp->tv_sec * drv_usectohz(1000000) +
1697 drv_usectohz(waitp->tv_usec) + lbolt;
1698
1699 if (nosignal) {
1700 while ((cv_stat = cv_timedwait(&cm_entry->x_conn_cv,
1701 &connmgr_lock, timout)) > 0 &&
1702 cm_entry->x_state_flags == old_state)
1703 ;
1704 } else {
1705 while ((cv_stat = cv_timedwait_sig(&cm_entry->x_conn_cv,
1706 &connmgr_lock, timout)) > 0 &&
1707 cm_entry->x_state_flags == old_state)
1708 ;
1709
1710 if (cv_stat == 0) /* got intr signal? */
1711 interrupted = TRUE;
1712 }
1713
1714 if ((cm_entry->x_state_flags & (X_BADSTATES|X_CONNECTED)) ==
1715 X_CONNECTED) {
1716 clstat = RPC_SUCCESS;
1717 } else {
1718 if (interrupted == TRUE)
1719 clstat = RPC_INTR;
1720 RPCLOG(1, "connmgr_cwait: can't connect, error: %s\n",
1721 clnt_sperrno(clstat));
1722 }
1723
1724 return (clstat);
1725 }
1726
1727 /*
1728 * Primary interface for how RPC grabs a connection.
1729 */
1730 static struct cm_xprt *
1731 connmgr_wrapget(
1732 struct netbuf *retryaddr,
1733 const struct timeval *waitp,
1734 cku_private_t *p)
1735 {
1736 struct cm_xprt *cm_entry;
1737
1738 cm_entry = connmgr_get(retryaddr, waitp, &p->cku_addr, p->cku_addrfmly,
1739 &p->cku_srcaddr, &p->cku_err, p->cku_device,
1740 p->cku_client.cl_nosignal, p->cku_useresvport, p->cku_cred);
1741
1742 if (cm_entry == NULL) {
1743 /*
1744 * Re-map the call status to RPC_INTR if the err code is
1745 * EINTR. This can happen if calls status is RPC_TLIERROR.
1746 * However, don't re-map if signalling has been turned off.
1747 * XXX Really need to create a separate thread whenever
1748 * there isn't an existing connection.
1749 */
1750 if (p->cku_err.re_errno == EINTR) {
1751 if (p->cku_client.cl_nosignal == TRUE)
1752 p->cku_err.re_errno = EIO;
1753 else
1754 p->cku_err.re_status = RPC_INTR;
1755 }
1756 }
1757
1758 return (cm_entry);
1759 }
1760
1761 /*
1762 * Obtains a transport to the server specified in addr. If a suitable transport
1763 * does not already exist in the list of cached transports, a new connection
1764 * is created, connected, and added to the list. The connection is for sending
1765 * only - the reply message may come back on another transport connection.
1766 *
1767 * To implement round-robin load balancing with multiple client connections,
1768 * the last entry on the list is always selected. Once the entry is selected
1769 * it's re-inserted to the head of the list.
1770 */
1771 static struct cm_xprt *
1772 connmgr_get(
1773 struct netbuf *retryaddr,
1774 const struct timeval *waitp, /* changed to a ptr to converse stack */
1775 struct netbuf *destaddr,
1776 int addrfmly,
1777 struct netbuf *srcaddr,
1778 struct rpc_err *rpcerr,
1779 dev_t device,
1780 bool_t nosignal,
1781 int useresvport,
1782 cred_t *cr)
1783 {
1784 struct cm_xprt *cm_entry;
1785 struct cm_xprt *lru_entry;
1786 struct cm_xprt **cmp, **prev;
1787 queue_t *wq;
1788 TIUSER *tiptr;
1789 int i;
1790 int retval;
1791 int tidu_size;
1792 bool_t connected;
1793 zoneid_t zoneid = rpc_zoneid();
1794
1795 /*
1796 * If the call is not a retry, look for a transport entry that
1797 * goes to the server of interest.
1798 */
1799 mutex_enter(&connmgr_lock);
1800
1801 if (retryaddr == NULL) {
1802 use_new_conn:
1803 i = 0;
1804 cm_entry = lru_entry = NULL;
1805
1806 prev = cmp = &cm_hd;
1807 while ((cm_entry = *cmp) != NULL) {
1808 ASSERT(cm_entry != cm_entry->x_next);
1809 /*
1810 * Garbage collect connections that are marked
1811 * for needs disconnect.
1812 */
1813 if (cm_entry->x_needdis) {
1814 CONN_HOLD(cm_entry);
1815 connmgr_dis_and_wait(cm_entry);
1816 connmgr_release(cm_entry);
1817 /*
1818 * connmgr_lock could have been
1819 * dropped for the disconnect
1820 * processing so start over.
1821 */
1822 goto use_new_conn;
1823 }
1824
1825 /*
1826 * Garbage collect the dead connections that have
1827 * no threads working on them.
1828 */
1829 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) ==
1830 X_DEAD) {
1831 mutex_enter(&cm_entry->x_lock);
1832 if (cm_entry->x_ref != 0) {
1833 /*
1834 * Currently in use.
1835 * Cleanup later.
1836 */
1837 cmp = &cm_entry->x_next;
1838 mutex_exit(&cm_entry->x_lock);
1839 continue;
1840 }
1841 mutex_exit(&cm_entry->x_lock);
1842 *cmp = cm_entry->x_next;
1843 mutex_exit(&connmgr_lock);
1844 connmgr_close(cm_entry);
1845 mutex_enter(&connmgr_lock);
1846 goto use_new_conn;
1847 }
1848
1849
1850 if ((cm_entry->x_state_flags & X_BADSTATES) == 0 &&
1851 cm_entry->x_zoneid == zoneid &&
1852 cm_entry->x_rdev == device &&
1853 destaddr->len == cm_entry->x_server.len &&
1854 bcmp(destaddr->buf, cm_entry->x_server.buf,
1855 destaddr->len) == 0) {
1856 /*
1857 * If the matching entry isn't connected,
1858 * attempt to reconnect it.
1859 */
1860 if (cm_entry->x_connected == FALSE) {
1861 /*
1862 * We don't go through trying
1863 * to find the least recently
1864 * used connected because
1865 * connmgr_reconnect() briefly
1866 * dropped the connmgr_lock,
1867 * allowing a window for our
1868 * accounting to be messed up.
1869 * In any case, a re-connected
1870 * connection is as good as
1871 * a LRU connection.
1872 */
1873 return (connmgr_wrapconnect(cm_entry,
1874 waitp, destaddr, addrfmly, srcaddr,
1875 rpcerr, TRUE, nosignal, cr));
1876 }
1877 i++;
1878
1879 /* keep track of the last entry */
1880 lru_entry = cm_entry;
1881 prev = cmp;
1882 }
1883 cmp = &cm_entry->x_next;
1884 }
1885
1886 if (i > clnt_max_conns) {
1887 RPCLOG(8, "connmgr_get: too many conns, dooming entry"
1888 " %p\n", (void *)lru_entry->x_tiptr);
1889 lru_entry->x_doomed = TRUE;
1890 goto use_new_conn;
1891 }
1892
1893 /*
1894 * If we are at the maximum number of connections to
1895 * the server, hand back the least recently used one.
1896 */
1897 if (i == clnt_max_conns) {
1898 /*
1899 * Copy into the handle the source address of
1900 * the connection, which we will use in case of
1901 * a later retry.
1902 */
1903 if (srcaddr->len != lru_entry->x_src.len) {
1904 if (srcaddr->len > 0)
1905 kmem_free(srcaddr->buf,
1906 srcaddr->maxlen);
1907 srcaddr->buf = kmem_zalloc(
1908 lru_entry->x_src.len, KM_SLEEP);
1909 srcaddr->maxlen = srcaddr->len =
1910 lru_entry->x_src.len;
1911 }
1912 bcopy(lru_entry->x_src.buf, srcaddr->buf, srcaddr->len);
1913 RPCLOG(2, "connmgr_get: call going out on %p\n",
1914 (void *)lru_entry);
1915 lru_entry->x_time = lbolt;
1916 CONN_HOLD(lru_entry);
1917
1918 if ((i > 1) && (prev != &cm_hd)) {
1919 /*
1920 * remove and re-insert entry at head of list.
1921 */
1922 *prev = lru_entry->x_next;
1923 lru_entry->x_next = cm_hd;
1924 cm_hd = lru_entry;
1925 }
1926
1927 mutex_exit(&connmgr_lock);
1928 return (lru_entry);
1929 }
1930
1931 } else {
1932 /*
1933 * This is the retry case (retryaddr != NULL). Retries must
1934 * be sent on the same source port as the original call.
1935 */
1936
1937 /*
1938 * Walk the list looking for a connection with a source address
1939 * that matches the retry address.
1940 */
1941 start_retry_loop:
1942 cmp = &cm_hd;
1943 while ((cm_entry = *cmp) != NULL) {
1944 ASSERT(cm_entry != cm_entry->x_next);
1945
1946 /*
1947 * determine if this connection matches the passed
1948 * in retry address. If it does not match, advance
1949 * to the next element on the list.
1950 */
1951 if (zoneid != cm_entry->x_zoneid ||
1952 device != cm_entry->x_rdev ||
1953 retryaddr->len != cm_entry->x_src.len ||
1954 bcmp(retryaddr->buf, cm_entry->x_src.buf,
1955 retryaddr->len) != 0) {
1956 cmp = &cm_entry->x_next;
1957 continue;
1958 }
1959 /*
1960 * Garbage collect connections that are marked
1961 * for needs disconnect.
1962 */
1963 if (cm_entry->x_needdis) {
1964 CONN_HOLD(cm_entry);
1965 connmgr_dis_and_wait(cm_entry);
1966 connmgr_release(cm_entry);
1967 /*
1968 * connmgr_lock could have been
1969 * dropped for the disconnect
1970 * processing so start over.
1971 */
1972 goto start_retry_loop;
1973 }
1974 /*
1975 * Garbage collect the dead connections that have
1976 * no threads working on them.
1977 */
1978 if ((cm_entry->x_state_flags & (X_DEAD|X_THREAD)) ==
1979 X_DEAD) {
1980 mutex_enter(&cm_entry->x_lock);
1981 if (cm_entry->x_ref != 0) {
1982 /*
1983 * Currently in use.
1984 * Cleanup later.
1985 */
1986 cmp = &cm_entry->x_next;
1987 mutex_exit(&cm_entry->x_lock);
1988 continue;
1989 }
1990 mutex_exit(&cm_entry->x_lock);
1991 *cmp = cm_entry->x_next;
1992 mutex_exit(&connmgr_lock);
1993 connmgr_close(cm_entry);
1994 mutex_enter(&connmgr_lock);
1995 goto start_retry_loop;
1996 }
1997
1998 /*
1999 * Sanity check: if the connection with our source
2000 * port is going to some other server, something went
2001 * wrong, as we never delete connections (i.e. release
2002 * ports) unless they have been idle. In this case,
2003 * it is probably better to send the call out using
2004 * a new source address than to fail it altogether,
2005 * since that port may never be released.
2006 */
2007 if (destaddr->len != cm_entry->x_server.len ||
2008 bcmp(destaddr->buf, cm_entry->x_server.buf,
2009 destaddr->len) != 0) {
2010 RPCLOG(1, "connmgr_get: tiptr %p"
2011 " is going to a different server"
2012 " with the port that belongs"
2013 " to us!\n", (void *)cm_entry->x_tiptr);
2014 retryaddr = NULL;
2015 goto use_new_conn;
2016 }
2017
2018 /*
2019 * If the connection of interest is not connected and we
2020 * can't reconnect it, then the server is probably
2021 * still down. Return NULL to the caller and let it
2022 * retry later if it wants to. We have a delay so the
2023 * machine doesn't go into a tight retry loop. If the
2024 * entry was already connected, or the reconnected was
2025 * successful, return this entry.
2026 */
2027 if (cm_entry->x_connected == FALSE) {
2028 return (connmgr_wrapconnect(cm_entry,
2029 waitp, destaddr, addrfmly, NULL,
2030 rpcerr, TRUE, nosignal, cr));
2031 } else {
2032 CONN_HOLD(cm_entry);
2033
2034 cm_entry->x_time = lbolt;
2035 mutex_exit(&connmgr_lock);
2036 RPCLOG(2, "connmgr_get: found old "
2037 "transport %p for retry\n",
2038 (void *)cm_entry);
2039 return (cm_entry);
2040 }
2041 }
2042
2043 /*
2044 * We cannot find an entry in the list for this retry.
2045 * Either the entry has been removed temporarily to be
2046 * reconnected by another thread, or the original call
2047 * got a port but never got connected,
2048 * and hence the transport never got put in the
2049 * list. Fall through to the "create new connection" code -
2050 * the former case will fail there trying to rebind the port,
2051 * and the later case (and any other pathological cases) will
2052 * rebind and reconnect and not hang the client machine.
2053 */
2054 RPCLOG0(8, "connmgr_get: no entry in list for retry\n");
2055 }
2056 /*
2057 * Set up a transport entry in the connection manager's list.
2058 */
2059 cm_entry = (struct cm_xprt *)
2060 kmem_zalloc(sizeof (struct cm_xprt), KM_SLEEP);
2061
2062 cm_entry->x_server.buf = kmem_zalloc(destaddr->len, KM_SLEEP);
2063 bcopy(destaddr->buf, cm_entry->x_server.buf, destaddr->len);
2064 cm_entry->x_server.len = cm_entry->x_server.maxlen = destaddr->len;
2065
2066 cm_entry->x_state_flags = X_THREAD;
2067 cm_entry->x_ref = 1;
2068 cm_entry->x_family = addrfmly;
2069 cm_entry->x_rdev = device;
2070 cm_entry->x_zoneid = zoneid;
2071 mutex_init(&cm_entry->x_lock, NULL, MUTEX_DEFAULT, NULL);
2072 cv_init(&cm_entry->x_cv, NULL, CV_DEFAULT, NULL);
2073 cv_init(&cm_entry->x_conn_cv, NULL, CV_DEFAULT, NULL);
2074 cv_init(&cm_entry->x_dis_cv, NULL, CV_DEFAULT, NULL);
2075
2076 /*
2077 * Note that we add this partially initialized entry to the
2078 * connection list. This is so that we don't have connections to
2079 * the same server.
2080 *
2081 * Note that x_src is not initialized at this point. This is because
2082 * retryaddr might be NULL in which case x_src is whatever
2083 * t_kbind/bindresvport gives us. If another thread wants a
2084 * connection to the same server, seemingly we have an issue, but we
2085 * don't. If the other thread comes in with retryaddr == NULL, then it
2086 * will never look at x_src, and it will end up waiting in
2087 * connmgr_cwait() for the first thread to finish the connection
2088 * attempt. If the other thread comes in with retryaddr != NULL, then
2089 * that means there was a request sent on a connection, in which case
2090 * the connection should already exist. Thus the first thread
2091 * never gets here ... it finds the connection to its server in the
2092 * connection list.
2093 *
2094 * But even if theory is wrong, in the retryaddr != NULL case, the 2nd
2095 * thread will skip us because x_src.len == 0.
2096 */
2097 cm_entry->x_next = cm_hd;
2098 cm_hd = cm_entry;
2099 mutex_exit(&connmgr_lock);
2100
2101 /*
2102 * Either we didn't find an entry to the server of interest, or we
2103 * don't have the maximum number of connections to that server -
2104 * create a new connection.
2105 */
2106 RPCLOG0(8, "connmgr_get: creating new connection\n");
2107 rpcerr->re_status = RPC_TLIERROR;
2108
2109 i = t_kopen(NULL, device, FREAD|FWRITE|FNDELAY, &tiptr, zone_kcred());
2110 if (i) {
2111 RPCLOG(1, "connmgr_get: can't open cots device, error %d\n", i);
2112 rpcerr->re_errno = i;
2113 connmgr_cancelconn(cm_entry);
2114 return (NULL);
2115 }
2116 rpc_poptimod(tiptr->fp->f_vnode);
2117
2118 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"rpcmod", 0,
2119 K_TO_K, kcred, &retval)) {
2120 RPCLOG(1, "connmgr_get: can't push cots module, %d\n", i);
2121 (void) t_kclose(tiptr, 1);
2122 rpcerr->re_errno = i;
2123 connmgr_cancelconn(cm_entry);
2124 return (NULL);
2125 }
2126
2127 if (i = strioctl(tiptr->fp->f_vnode, RPC_CLIENT, 0, 0, K_TO_K,
2128 kcred, &retval)) {
2129 RPCLOG(1, "connmgr_get: can't set client status with cots "
2130 "module, %d\n", i);
2131 (void) t_kclose(tiptr, 1);
2132 rpcerr->re_errno = i;
2133 connmgr_cancelconn(cm_entry);
2134 return (NULL);
2135 }
2136
2137 mutex_enter(&connmgr_lock);
2138
2139 wq = tiptr->fp->f_vnode->v_stream->sd_wrq->q_next;
2140 cm_entry->x_wq = wq;
2141
2142 mutex_exit(&connmgr_lock);
2143
2144 if (i = strioctl(tiptr->fp->f_vnode, I_PUSH, (intptr_t)"timod", 0,
2145 K_TO_K, kcred, &retval)) {
2146 RPCLOG(1, "connmgr_get: can't push timod, %d\n", i);
2147 (void) t_kclose(tiptr, 1);
2148 rpcerr->re_errno = i;
2149 connmgr_cancelconn(cm_entry);
2150 return (NULL);
2151 }
2152
2153 /*
2154 * If the caller has not specified reserved port usage then
2155 * take the system default.
2156 */
2157 if (useresvport == -1)
2158 useresvport = clnt_cots_do_bindresvport;
2159
2160 if ((useresvport || retryaddr != NULL) &&
2161 (addrfmly == AF_INET || addrfmly == AF_INET6)) {
2162 bool_t alloc_src = FALSE;
2163
2164 if (srcaddr->len != destaddr->len) {
2165 kmem_free(srcaddr->buf, srcaddr->maxlen);
2166 srcaddr->buf = kmem_zalloc(destaddr->len, KM_SLEEP);
2167 srcaddr->maxlen = destaddr->len;
2168 srcaddr->len = destaddr->len;
2169 alloc_src = TRUE;
2170 }
2171
2172 if ((i = bindresvport(tiptr, retryaddr, srcaddr, TRUE)) != 0) {
2173 (void) t_kclose(tiptr, 1);
2174 RPCLOG(1, "connmgr_get: couldn't bind, retryaddr: "
2175 "%p\n", (void *)retryaddr);
2176
2177 /*
2178 * 1225408: If we allocated a source address, then it
2179 * is either garbage or all zeroes. In that case
2180 * we need to clear srcaddr.
2181 */
2182 if (alloc_src == TRUE) {
2183 kmem_free(srcaddr->buf, srcaddr->maxlen);
2184 srcaddr->maxlen = srcaddr->len = 0;
2185 srcaddr->buf = NULL;
2186 }
2187 rpcerr->re_errno = i;
2188 connmgr_cancelconn(cm_entry);
2189 return (NULL);
2190 }
2191 } else {
2192 if ((i = t_kbind(tiptr, NULL, NULL)) != 0) {
2193 RPCLOG(1, "clnt_cots_kcreate: t_kbind: %d\n", i);
2194 (void) t_kclose(tiptr, 1);
2195 rpcerr->re_errno = i;
2196 connmgr_cancelconn(cm_entry);
2197 return (NULL);
2198 }
2199 }
2200
2201 {
2202 /*
2203 * Keep the kernel stack lean. Don't move this call
2204 * declaration to the top of this function because a
2205 * call is declared in connmgr_wrapconnect()
2206 */
2207 calllist_t call;
2208
2209 bzero(&call, sizeof (call));
2210 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL);
2211
2212 /*
2213 * This is a bound end-point so don't close its stream.
2214 */
2215 connected = connmgr_connect(cm_entry, wq, destaddr, addrfmly,
2216 &call, &tidu_size, FALSE, waitp, nosignal, cr);
2217 *rpcerr = call.call_err;
2218 cv_destroy(&call.call_cv);
2219
2220 }
2221
2222 mutex_enter(&connmgr_lock);
2223
2224 /*
2225 * Set up a transport entry in the connection manager's list.
2226 */
2227 cm_entry->x_src.buf = kmem_zalloc(srcaddr->len, KM_SLEEP);
2228 bcopy(srcaddr->buf, cm_entry->x_src.buf, srcaddr->len);
2229 cm_entry->x_src.len = cm_entry->x_src.maxlen = srcaddr->len;
2230
2231 cm_entry->x_tiptr = tiptr;
2232 cm_entry->x_time = lbolt;
2233
2234 if (tiptr->tp_info.servtype == T_COTS_ORD)
2235 cm_entry->x_ordrel = TRUE;
2236 else
2237 cm_entry->x_ordrel = FALSE;
2238
2239 cm_entry->x_tidu_size = tidu_size;
2240
2241 if (cm_entry->x_early_disc) {
2242 /*
2243 * We need to check if a disconnect request has come
2244 * while we are connected, if so, then we need to
2245 * set rpcerr->re_status appropriately before returning
2246 * NULL to caller.
2247 */
2248 if (rpcerr->re_status == RPC_SUCCESS)
2249 rpcerr->re_status = RPC_XPRTFAILED;
2250 cm_entry->x_connected = FALSE;
2251 } else
2252 cm_entry->x_connected = connected;
2253
2254 /*
2255 * There could be a discrepancy here such that
2256 * x_early_disc is TRUE yet connected is TRUE as well
2257 * and the connection is actually connected. In that case
2258 * lets be conservative and declare the connection as not
2259 * connected.
2260 */
2261 cm_entry->x_early_disc = FALSE;
2262 cm_entry->x_needdis = (cm_entry->x_connected == FALSE);
2263 cm_entry->x_ctime = lbolt;
2264
2265 /*
2266 * Notify any threads waiting that the connection attempt is done.
2267 */
2268 cm_entry->x_thread = FALSE;
2269 cv_broadcast(&cm_entry->x_conn_cv);
2270
2271 if (cm_entry->x_connected == FALSE) {
2272 mutex_exit(&connmgr_lock);
2273 connmgr_release(cm_entry);
2274 return (NULL);
2275 }
2276
2277 mutex_exit(&connmgr_lock);
2278
2279 return (cm_entry);
2280 }
2281
2282 /*
2283 * Keep the cm_xprt entry on the connecton list when making a connection. This
2284 * is to prevent multiple connections to a slow server from appearing.
2285 * We use the bit field x_thread to tell if a thread is doing a connection
2286 * which keeps other interested threads from messing with connection.
2287 * Those other threads just wait if x_thread is set.
2288 *
2289 * If x_thread is not set, then we do the actual work of connecting via
2290 * connmgr_connect().
2291 *
2292 * mutex convention: called with connmgr_lock held, returns with it released.
2293 */
2294 static struct cm_xprt *
2295 connmgr_wrapconnect(
2296 struct cm_xprt *cm_entry,
2297 const struct timeval *waitp,
2298 struct netbuf *destaddr,
2299 int addrfmly,
2300 struct netbuf *srcaddr,
2301 struct rpc_err *rpcerr,
2302 bool_t reconnect,
2303 bool_t nosignal,
2304 cred_t *cr)
2305 {
2306 ASSERT(MUTEX_HELD(&connmgr_lock));
2307 /*
2308 * Hold this entry as we are about to drop connmgr_lock.
2309 */
2310 CONN_HOLD(cm_entry);
2311
2312 /*
2313 * If there is a thread already making a connection for us, then
2314 * wait for it to complete the connection.
2315 */
2316 if (cm_entry->x_thread == TRUE) {
2317 rpcerr->re_status = connmgr_cwait(cm_entry, waitp, nosignal);
2318
2319 if (rpcerr->re_status != RPC_SUCCESS) {
2320 mutex_exit(&connmgr_lock);
2321 connmgr_release(cm_entry);
2322 return (NULL);
2323 }
2324 } else {
2325 bool_t connected;
2326 calllist_t call;
2327
2328 cm_entry->x_thread = TRUE;
2329
2330 while (cm_entry->x_needrel == TRUE) {
2331 cm_entry->x_needrel = FALSE;
2332
2333 connmgr_sndrel(cm_entry);
2334 delay(drv_usectohz(1000000));
2335
2336 mutex_enter(&connmgr_lock);
2337 }
2338
2339 /*
2340 * If we need to send a T_DISCON_REQ, send one.
2341 */
2342 connmgr_dis_and_wait(cm_entry);
2343
2344 mutex_exit(&connmgr_lock);
2345
2346 bzero(&call, sizeof (call));
2347 cv_init(&call.call_cv, NULL, CV_DEFAULT, NULL);
2348
2349 connected = connmgr_connect(cm_entry, cm_entry->x_wq,
2350 destaddr, addrfmly, &call, &cm_entry->x_tidu_size,
2351 reconnect, waitp, nosignal, cr);
2352
2353 *rpcerr = call.call_err;
2354 cv_destroy(&call.call_cv);
2355
2356 mutex_enter(&connmgr_lock);
2357
2358
2359 if (cm_entry->x_early_disc) {
2360 /*
2361 * We need to check if a disconnect request has come
2362 * while we are connected, if so, then we need to
2363 * set rpcerr->re_status appropriately before returning
2364 * NULL to caller.
2365 */
2366 if (rpcerr->re_status == RPC_SUCCESS)
2367 rpcerr->re_status = RPC_XPRTFAILED;
2368 cm_entry->x_connected = FALSE;
2369 } else
2370 cm_entry->x_connected = connected;
2371
2372 /*
2373 * There could be a discrepancy here such that
2374 * x_early_disc is TRUE yet connected is TRUE as well
2375 * and the connection is actually connected. In that case
2376 * lets be conservative and declare the connection as not
2377 * connected.
2378 */
2379
2380 cm_entry->x_early_disc = FALSE;
2381 cm_entry->x_needdis = (cm_entry->x_connected == FALSE);
2382
2383
2384 /*
2385 * connmgr_connect() may have given up before the connection
2386 * actually timed out. So ensure that before the next
2387 * connection attempt we do a disconnect.
2388 */
2389 cm_entry->x_ctime = lbolt;
2390 cm_entry->x_thread = FALSE;
2391
2392 cv_broadcast(&cm_entry->x_conn_cv);
2393
2394 if (cm_entry->x_connected == FALSE) {
2395 mutex_exit(&connmgr_lock);
2396 connmgr_release(cm_entry);
2397 return (NULL);
2398 }
2399 }
2400
2401 if (srcaddr != NULL) {
2402 /*
2403 * Copy into the handle the
2404 * source address of the
2405 * connection, which we will use
2406 * in case of a later retry.
2407 */
2408 if (srcaddr->len != cm_entry->x_src.len) {
2409 if (srcaddr->maxlen > 0)
2410 kmem_free(srcaddr->buf, srcaddr->maxlen);
2411 srcaddr->buf = kmem_zalloc(cm_entry->x_src.len,
2412 KM_SLEEP);
2413 srcaddr->maxlen = srcaddr->len =
2414 cm_entry->x_src.len;
2415 }
2416 bcopy(cm_entry->x_src.buf, srcaddr->buf, srcaddr->len);
2417 }
2418 cm_entry->x_time = lbolt;
2419 mutex_exit(&connmgr_lock);
2420 return (cm_entry);
2421 }
2422
2423 /*
2424 * If we need to send a T_DISCON_REQ, send one.
2425 */
2426 static void
2427 connmgr_dis_and_wait(struct cm_xprt *cm_entry)
2428 {
2429 ASSERT(MUTEX_HELD(&connmgr_lock));
2430 for (;;) {
2431 while (cm_entry->x_needdis == TRUE) {
2432 RPCLOG(8, "connmgr_dis_and_wait: need "
2433 "T_DISCON_REQ for connection 0x%p\n",
2434 (void *)cm_entry);
2435 cm_entry->x_needdis = FALSE;
2436 cm_entry->x_waitdis = TRUE;
2437
2438 connmgr_snddis(cm_entry);
2439
2440 mutex_enter(&connmgr_lock);
2441 }
2442
2443 if (cm_entry->x_waitdis == TRUE) {
2444 clock_t curlbolt;
2445 clock_t timout;
2446
2447 RPCLOG(8, "connmgr_dis_and_wait waiting for "
2448 "T_DISCON_REQ's ACK for connection %p\n",
2449 (void *)cm_entry);
2450 curlbolt = ddi_get_lbolt();
2451
2452 timout = clnt_cots_min_conntout *
2453 drv_usectohz(1000000) + curlbolt;
2454
2455 /*
2456 * The TPI spec says that the T_DISCON_REQ
2457 * will get acknowledged, but in practice
2458 * the ACK may never get sent. So don't
2459 * block forever.
2460 */
2461 (void) cv_timedwait(&cm_entry->x_dis_cv,
2462 &connmgr_lock, timout);
2463 }
2464 /*
2465 * If we got the ACK, break. If we didn't,
2466 * then send another T_DISCON_REQ.
2467 */
2468 if (cm_entry->x_waitdis == FALSE) {
2469 break;
2470 } else {
2471 RPCLOG(8, "connmgr_dis_and_wait: did"
2472 "not get T_DISCON_REQ's ACK for "
2473 "connection %p\n", (void *)cm_entry);
2474 cm_entry->x_needdis = TRUE;
2475 }
2476 }
2477 }
2478
2479 static void
2480 connmgr_cancelconn(struct cm_xprt *cm_entry)
2481 {
2482 /*
2483 * Mark the connection table entry as dead; the next thread that
2484 * goes through connmgr_release() will notice this and deal with it.
2485 */
2486 mutex_enter(&connmgr_lock);
2487 cm_entry->x_dead = TRUE;
2488
2489 /*
2490 * Notify any threads waiting for the connection that it isn't
2491 * going to happen.
2492 */
2493 cm_entry->x_thread = FALSE;
2494 cv_broadcast(&cm_entry->x_conn_cv);
2495 mutex_exit(&connmgr_lock);
2496
2497 connmgr_release(cm_entry);
2498 }
2499
2500 static void
2501 connmgr_close(struct cm_xprt *cm_entry)
2502 {
2503 mutex_enter(&cm_entry->x_lock);
2504 while (cm_entry->x_ref != 0) {
2505 /*
2506 * Must be a noninterruptible wait.
2507 */
2508 cv_wait(&cm_entry->x_cv, &cm_entry->x_lock);
2509 }
2510
2511 if (cm_entry->x_tiptr != NULL)
2512 (void) t_kclose(cm_entry->x_tiptr, 1);
2513
2514 mutex_exit(&cm_entry->x_lock);
2515 if (cm_entry->x_ksp != NULL) {
2516 mutex_enter(&connmgr_lock);
2517 cm_entry->x_ksp->ks_private = NULL;
2518 mutex_exit(&connmgr_lock);
2519
2520 /*
2521 * Must free the buffer we allocated for the
2522 * server address in the update function
2523 */
2524 if (((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))->
2525 x_server.value.str.addr.ptr != NULL)
2526 kmem_free(((struct cm_kstat_xprt *)(cm_entry->x_ksp->
2527 ks_data))->x_server.value.str.addr.ptr,
2528 INET6_ADDRSTRLEN);
2529 kmem_free(cm_entry->x_ksp->ks_data,
2530 cm_entry->x_ksp->ks_data_size);
2531 kstat_delete(cm_entry->x_ksp);
2532 }
2533
2534 mutex_destroy(&cm_entry->x_lock);
2535 cv_destroy(&cm_entry->x_cv);
2536 cv_destroy(&cm_entry->x_conn_cv);
2537 cv_destroy(&cm_entry->x_dis_cv);
2538
2539 if (cm_entry->x_server.buf != NULL)
2540 kmem_free(cm_entry->x_server.buf, cm_entry->x_server.maxlen);
2541 if (cm_entry->x_src.buf != NULL)
2542 kmem_free(cm_entry->x_src.buf, cm_entry->x_src.maxlen);
2543 kmem_free(cm_entry, sizeof (struct cm_xprt));
2544 }
2545
2546 /*
2547 * Called by KRPC after sending the call message to release the connection
2548 * it was using.
2549 */
2550 static void
2551 connmgr_release(struct cm_xprt *cm_entry)
2552 {
2553 mutex_enter(&cm_entry->x_lock);
2554 cm_entry->x_ref--;
2555 if (cm_entry->x_ref == 0)
2556 cv_signal(&cm_entry->x_cv);
2557 mutex_exit(&cm_entry->x_lock);
2558 }
2559
2560 /*
2561 * Given an open stream, connect to the remote. Returns true if connected,
2562 * false otherwise.
2563 */
2564 static bool_t
2565 connmgr_connect(
2566 struct cm_xprt *cm_entry,
2567 queue_t *wq,
2568 struct netbuf *addr,
2569 int addrfmly,
2570 calllist_t *e,
2571 int *tidu_ptr,
2572 bool_t reconnect,
2573 const struct timeval *waitp,
2574 bool_t nosignal,
2575 cred_t *cr)
2576 {
2577 mblk_t *mp;
2578 struct T_conn_req *tcr;
2579 struct T_info_ack *tinfo;
2580 int interrupted, error;
2581 int tidu_size, kstat_instance;
2582
2583 /* if it's a reconnect, flush any lingering data messages */
2584 if (reconnect)
2585 (void) putctl1(wq, M_FLUSH, FLUSHRW);
2586
2587 /*
2588 * Note: if the receiver uses SCM_UCRED/getpeerucred the pid will
2589 * appear as -1.
2590 */
2591 mp = allocb_cred(sizeof (*tcr) + addr->len, cr, NOPID);
2592 if (mp == NULL) {
2593 /*
2594 * This is unfortunate, but we need to look up the stats for
2595 * this zone to increment the "memory allocation failed"
2596 * counter. curproc->p_zone is safe since we're initiating a
2597 * connection and not in some strange streams context.
2598 */
2599 struct rpcstat *rpcstat;
2600
2601 rpcstat = zone_getspecific(rpcstat_zone_key, rpc_zone());
2602 ASSERT(rpcstat != NULL);
2603
2604 RPCLOG0(1, "connmgr_connect: cannot alloc mp for "
2605 "sending conn request\n");
2606 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcnomem);
2607 e->call_status = RPC_SYSTEMERROR;
2608 e->call_reason = ENOSR;
2609 return (FALSE);
2610 }
2611
2612 mp->b_datap->db_type = M_PROTO;
2613 tcr = (struct T_conn_req *)mp->b_rptr;
2614 bzero(tcr, sizeof (*tcr));
2615 tcr->PRIM_type = T_CONN_REQ;
2616 tcr->DEST_length = addr->len;
2617 tcr->DEST_offset = sizeof (struct T_conn_req);
2618 mp->b_wptr = mp->b_rptr + sizeof (*tcr);
2619
2620 bcopy(addr->buf, mp->b_wptr, tcr->DEST_length);
2621 mp->b_wptr += tcr->DEST_length;
2622
2623 RPCLOG(8, "connmgr_connect: sending conn request on queue "
2624 "%p", (void *)wq);
2625 RPCLOG(8, " call %p\n", (void *)wq);
2626 /*
2627 * We use the entry in the handle that is normally used for
2628 * waiting for RPC replies to wait for the connection accept.
2629 */
2630 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
2631 DTRACE_PROBE(krpc__e__connmgr__connect__cantsend);
2632 freemsg(mp);
2633 return (FALSE);
2634 }
2635
2636 mutex_enter(&clnt_pending_lock);
2637
2638 /*
2639 * We wait for the transport connection to be made, or an
2640 * indication that it could not be made.
2641 */
2642 interrupted = 0;
2643
2644 /*
2645 * waitforack should have been called with T_OK_ACK, but the
2646 * present implementation needs to be passed T_INFO_ACK to
2647 * work correctly.
2648 */
2649 error = waitforack(e, T_INFO_ACK, waitp, nosignal);
2650 if (error == EINTR)
2651 interrupted = 1;
2652 if (zone_status_get(curproc->p_zone) >= ZONE_IS_EMPTY) {
2653 /*
2654 * No time to lose; we essentially have been signaled to
2655 * quit.
2656 */
2657 interrupted = 1;
2658 }
2659 #ifdef RPCDEBUG
2660 if (error == ETIME)
2661 RPCLOG0(8, "connmgr_connect: giving up "
2662 "on connection attempt; "
2663 "clnt_dispatch notifyconn "
2664 "diagnostic 'no one waiting for "
2665 "connection' should not be "
2666 "unexpected\n");
2667 #endif
2668 if (e->call_prev)
2669 e->call_prev->call_next = e->call_next;
2670 else
2671 clnt_pending = e->call_next;
2672 if (e->call_next)
2673 e->call_next->call_prev = e->call_prev;
2674 mutex_exit(&clnt_pending_lock);
2675
2676 if (e->call_status != RPC_SUCCESS || error != 0) {
2677 if (interrupted)
2678 e->call_status = RPC_INTR;
2679 else if (error == ETIME)
2680 e->call_status = RPC_TIMEDOUT;
2681 else if (error == EPROTO) {
2682 e->call_status = RPC_SYSTEMERROR;
2683 e->call_reason = EPROTO;
2684 }
2685
2686 RPCLOG(8, "connmgr_connect: can't connect, status: "
2687 "%s\n", clnt_sperrno(e->call_status));
2688
2689 if (e->call_reply) {
2690 freemsg(e->call_reply);
2691 e->call_reply = NULL;
2692 }
2693
2694 return (FALSE);
2695 }
2696 /*
2697 * The result of the "connection accept" is a T_info_ack
2698 * in the call_reply field.
2699 */
2700 ASSERT(e->call_reply != NULL);
2701 mp = e->call_reply;
2702 e->call_reply = NULL;
2703 tinfo = (struct T_info_ack *)mp->b_rptr;
2704
2705 tidu_size = tinfo->TIDU_size;
2706 tidu_size -= (tidu_size % BYTES_PER_XDR_UNIT);
2707 if (tidu_size > COTS_DEFAULT_ALLOCSIZE || (tidu_size <= 0))
2708 tidu_size = COTS_DEFAULT_ALLOCSIZE;
2709 *tidu_ptr = tidu_size;
2710
2711 freemsg(mp);
2712
2713 /*
2714 * Set up the pertinent options. NODELAY is so the transport doesn't
2715 * buffer up RPC messages on either end. This may not be valid for
2716 * all transports. Failure to set this option is not cause to
2717 * bail out so we return success anyway. Note that lack of NODELAY
2718 * or some other way to flush the message on both ends will cause
2719 * lots of retries and terrible performance.
2720 */
2721 if (addrfmly == AF_INET || addrfmly == AF_INET6) {
2722 (void) connmgr_setopt(wq, IPPROTO_TCP, TCP_NODELAY, e, cr);
2723 if (e->call_status == RPC_XPRTFAILED)
2724 return (FALSE);
2725 }
2726
2727 /*
2728 * Since we have a connection, we now need to figure out if
2729 * we need to create a kstat. If x_ksp is not NULL then we
2730 * are reusing a connection and so we do not need to create
2731 * another kstat -- lets just return.
2732 */
2733 if (cm_entry->x_ksp != NULL)
2734 return (TRUE);
2735
2736 /*
2737 * We need to increment rpc_kstat_instance atomically to prevent
2738 * two kstats being created with the same instance.
2739 */
2740 kstat_instance = atomic_add_32_nv((uint32_t *)&rpc_kstat_instance, 1);
2741
2742 if ((cm_entry->x_ksp = kstat_create_zone("unix", kstat_instance,
2743 "rpc_cots_connections", "rpc", KSTAT_TYPE_NAMED,
2744 (uint_t)(sizeof (cm_kstat_xprt_t) / sizeof (kstat_named_t)),
2745 KSTAT_FLAG_VIRTUAL, cm_entry->x_zoneid)) == NULL) {
2746 return (TRUE);
2747 }
2748
2749 cm_entry->x_ksp->ks_lock = &connmgr_lock;
2750 cm_entry->x_ksp->ks_private = cm_entry;
2751 cm_entry->x_ksp->ks_data_size = ((INET6_ADDRSTRLEN * sizeof (char))
2752 + sizeof (cm_kstat_template));
2753 cm_entry->x_ksp->ks_data = kmem_alloc(cm_entry->x_ksp->ks_data_size,
2754 KM_SLEEP);
2755 bcopy(&cm_kstat_template, cm_entry->x_ksp->ks_data,
2756 cm_entry->x_ksp->ks_data_size);
2757 ((struct cm_kstat_xprt *)(cm_entry->x_ksp->ks_data))->
2758 x_server.value.str.addr.ptr =
2759 kmem_alloc(INET6_ADDRSTRLEN, KM_SLEEP);
2760
2761 cm_entry->x_ksp->ks_update = conn_kstat_update;
2762 kstat_install(cm_entry->x_ksp);
2763 return (TRUE);
2764 }
2765
2766 /*
2767 * Called by connmgr_connect to set an option on the new stream.
2768 */
2769 static bool_t
2770 connmgr_setopt(queue_t *wq, int level, int name, calllist_t *e, cred_t *cr)
2771 {
2772 mblk_t *mp;
2773 struct opthdr *opt;
2774 struct T_optmgmt_req *tor;
2775 struct timeval waitp;
2776 int error;
2777
2778 mp = allocb_cred(sizeof (struct T_optmgmt_req) +
2779 sizeof (struct opthdr) + sizeof (int), cr, NOPID);
2780 if (mp == NULL) {
2781 RPCLOG0(1, "connmgr_setopt: cannot alloc mp for option "
2782 "request\n");
2783 return (FALSE);
2784 }
2785
2786 mp->b_datap->db_type = M_PROTO;
2787 tor = (struct T_optmgmt_req *)(mp->b_rptr);
2788 tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
2789 tor->MGMT_flags = T_NEGOTIATE;
2790 tor->OPT_length = sizeof (struct opthdr) + sizeof (int);
2791 tor->OPT_offset = sizeof (struct T_optmgmt_req);
2792
2793 opt = (struct opthdr *)(mp->b_rptr + sizeof (struct T_optmgmt_req));
2794 opt->level = level;
2795 opt->name = name;
2796 opt->len = sizeof (int);
2797 *(int *)((char *)opt + sizeof (*opt)) = 1;
2798 mp->b_wptr += sizeof (struct T_optmgmt_req) + sizeof (struct opthdr) +
2799 sizeof (int);
2800
2801 /*
2802 * We will use this connection regardless
2803 * of whether or not the option is settable.
2804 */
2805 if (clnt_dispatch_send(wq, mp, e, 0, 0) != RPC_SUCCESS) {
2806 DTRACE_PROBE(krpc__e__connmgr__setopt__cantsend);
2807 freemsg(mp);
2808 return (FALSE);
2809 }
2810
2811 mutex_enter(&clnt_pending_lock);
2812
2813 waitp.tv_sec = clnt_cots_min_conntout;
2814 waitp.tv_usec = 0;
2815 error = waitforack(e, T_OPTMGMT_ACK, &waitp, 1);
2816
2817 if (e->call_prev)
2818 e->call_prev->call_next = e->call_next;
2819 else
2820 clnt_pending = e->call_next;
2821 if (e->call_next)
2822 e->call_next->call_prev = e->call_prev;
2823 mutex_exit(&clnt_pending_lock);
2824
2825 if (e->call_reply != NULL) {
2826 freemsg(e->call_reply);
2827 e->call_reply = NULL;
2828 }
2829
2830 if (e->call_status != RPC_SUCCESS || error != 0) {
2831 RPCLOG(1, "connmgr_setopt: can't set option: %d\n", name);
2832 return (FALSE);
2833 }
2834 RPCLOG(8, "connmgr_setopt: successfully set option: %d\n", name);
2835 return (TRUE);
2836 }
2837
#ifdef DEBUG

/*
 * This is a knob to let us force code coverage in allocation failure
 * case.  While connmgr_failsnd is positive, each CONN_SND_ALLOC() fails
 * (evaluates to NULL) and consumes one count.  Once the knob has reached
 * zero it is left alone, so it never drifts negative no matter how many
 * allocations follow.
 */
static int connmgr_failsnd;
#define	CONN_SND_ALLOC(Size, Pri)	\
	((connmgr_failsnd > 0 && connmgr_failsnd-- > 0) ? \
	NULL : allocb(Size, Pri))

#else

/* Non-DEBUG: plain message block allocation. */
#define	CONN_SND_ALLOC(Size, Pri)	allocb(Size, Pri)

#endif
2853
2854 /*
2855 * Sends an orderly release on the specified queue.
2856 * Entered with connmgr_lock. Exited without connmgr_lock
2857 */
2858 static void
2859 connmgr_sndrel(struct cm_xprt *cm_entry)
2860 {
2861 struct T_ordrel_req *torr;
2862 mblk_t *mp;
2863 queue_t *q = cm_entry->x_wq;
2864 ASSERT(MUTEX_HELD(&connmgr_lock));
2865 mp = CONN_SND_ALLOC(sizeof (struct T_ordrel_req), BPRI_LO);
2866 if (mp == NULL) {
2867 cm_entry->x_needrel = TRUE;
2868 mutex_exit(&connmgr_lock);
2869 RPCLOG(1, "connmgr_sndrel: cannot alloc mp for sending ordrel "
2870 "to queue %p\n", (void *)q);
2871 return;
2872 }
2873 mutex_exit(&connmgr_lock);
2874
2875 mp->b_datap->db_type = M_PROTO;
2876 torr = (struct T_ordrel_req *)(mp->b_rptr);
2877 torr->PRIM_type = T_ORDREL_REQ;
2878 mp->b_wptr = mp->b_rptr + sizeof (struct T_ordrel_req);
2879
2880 RPCLOG(8, "connmgr_sndrel: sending ordrel to queue %p\n", (void *)q);
2881 put(q, mp);
2882 }
2883
2884 /*
2885 * Sends an disconnect on the specified queue.
2886 * Entered with connmgr_lock. Exited without connmgr_lock
2887 */
2888 static void
2889 connmgr_snddis(struct cm_xprt *cm_entry)
2890 {
2891 struct T_discon_req *tdis;
2892 mblk_t *mp;
2893 queue_t *q = cm_entry->x_wq;
2894
2895 ASSERT(MUTEX_HELD(&connmgr_lock));
2896 mp = CONN_SND_ALLOC(sizeof (*tdis), BPRI_LO);
2897 if (mp == NULL) {
2898 cm_entry->x_needdis = TRUE;
2899 mutex_exit(&connmgr_lock);
2900 RPCLOG(1, "connmgr_snddis: cannot alloc mp for sending discon "
2901 "to queue %p\n", (void *)q);
2902 return;
2903 }
2904 mutex_exit(&connmgr_lock);
2905
2906 mp->b_datap->db_type = M_PROTO;
2907 tdis = (struct T_discon_req *)mp->b_rptr;
2908 tdis->PRIM_type = T_DISCON_REQ;
2909 mp->b_wptr = mp->b_rptr + sizeof (*tdis);
2910
2911 RPCLOG(8, "connmgr_snddis: sending discon to queue %p\n", (void *)q);
2912 put(q, mp);
2913 }
2914
2915 /*
2916 * Sets up the entry for receiving replies, and calls rpcmod's write put proc
2917 * (through put) to send the call.
2918 */
2919 static int
2920 clnt_dispatch_send(queue_t *q, mblk_t *mp, calllist_t *e, uint_t xid,
2921 uint_t queue_flag)
2922 {
2923 ASSERT(e != NULL);
2924
2925 e->call_status = RPC_TIMEDOUT; /* optimistic, eh? */
2926 e->call_reason = 0;
2927 e->call_wq = q;
2928 e->call_xid = xid;
2929 e->call_notified = FALSE;
2930
2931 if (!canput(q)) {
2932 e->call_status = RPC_CANTSEND;
2933 e->call_reason = ENOBUFS;
2934 return (RPC_CANTSEND);
2935 }
2936
2937 /*
2938 * If queue_flag is set then the calllist_t is already on the hash
2939 * queue. In this case just send the message and return.
2940 */
2941 if (queue_flag) {
2942 put(q, mp);
2943 return (RPC_SUCCESS);
2944
2945 }
2946
2947 /*
2948 * Set up calls for RPC requests (with XID != 0) on the hash
2949 * queue for fast lookups and place other calls (i.e.
2950 * connection management) on the linked list.
2951 */
2952 if (xid != 0) {
2953 RPCLOG(64, "clnt_dispatch_send: putting xid 0x%x on "
2954 "dispatch list\n", xid);
2955 e->call_hash = call_hash(xid, clnt_cots_hash_size);
2956 e->call_bucket = &cots_call_ht[e->call_hash];
2957 call_table_enter(e);
2958 } else {
2959 mutex_enter(&clnt_pending_lock);
2960 if (clnt_pending)
2961 clnt_pending->call_prev = e;
2962 e->call_next = clnt_pending;
2963 e->call_prev = NULL;
2964 clnt_pending = e;
2965 mutex_exit(&clnt_pending_lock);
2966 }
2967
2968 put(q, mp);
2969 return (RPC_SUCCESS);
2970 }
2971
2972 /*
2973 * Called by rpcmod to notify a client with a clnt_pending call that its reply
2974 * has arrived. If we can't find a client waiting for this reply, we log
2975 * the error and return.
2976 */
2977 bool_t
2978 clnt_dispatch_notify(mblk_t *mp, zoneid_t zoneid)
2979 {
2980 calllist_t *e = NULL;
2981 call_table_t *chtp;
2982 uint32_t xid;
2983 uint_t hash;
2984
2985 if ((IS_P2ALIGNED(mp->b_rptr, sizeof (uint32_t))) &&
2986 (mp->b_wptr - mp->b_rptr) >= sizeof (xid))
2987 xid = *((uint32_t *)mp->b_rptr);
2988 else {
2989 int i = 0;
2990 unsigned char *p = (unsigned char *)&xid;
2991 unsigned char *rptr;
2992 mblk_t *tmp = mp;
2993
2994 /*
2995 * Copy the xid, byte-by-byte into xid.
2996 */
2997 while (tmp) {
2998 rptr = tmp->b_rptr;
2999 while (rptr < tmp->b_wptr) {
3000 *p++ = *rptr++;
3001 if (++i >= sizeof (xid))
3002 goto done_xid_copy;
3003 }
3004 tmp = tmp->b_cont;
3005 }
3006
3007 /*
3008 * If we got here, we ran out of mblk space before the
3009 * xid could be copied.
3010 */
3011 ASSERT(tmp == NULL && i < sizeof (xid));
3012
3013 RPCLOG0(1,
3014 "clnt_dispatch_notify: message less than size of xid\n");
3015 return (FALSE);
3016
3017 }
3018 done_xid_copy:
3019
3020 hash = call_hash(xid, clnt_cots_hash_size);
3021 chtp = &cots_call_ht[hash];
3022 /* call_table_find returns with the hash bucket locked */
3023 call_table_find(chtp, xid, e);
3024
3025 if (e != NULL) {
3026 /*
3027 * Found thread waiting for this reply
3028 */
3029 mutex_enter(&e->call_lock);
3030
3031 /*
3032 * verify that the reply is coming in on
3033 * the same zone that it was sent from.
3034 */
3035 if (e->call_zoneid != zoneid) {
3036 mutex_exit(&e->call_lock);
3037 mutex_exit(&chtp->ct_lock);
3038 RPCLOG0(1, "clnt_dispatch_notify: incorrect zoneid\n");
3039 return (FALSE);
3040 }
3041
3042 if (e->call_reply)
3043 /*
3044 * This can happen under the following scenario:
3045 * clnt_cots_kcallit() times out on the response,
3046 * rfscall() repeats the CLNT_CALL() with
3047 * the same xid, clnt_cots_kcallit() sends the retry,
3048 * thereby putting the clnt handle on the pending list,
3049 * the first response arrives, signalling the thread
3050 * in clnt_cots_kcallit(). Before that thread is
3051 * dispatched, the second response arrives as well,
3052 * and clnt_dispatch_notify still finds the handle on
3053 * the pending list, with call_reply set. So free the
3054 * old reply now.
3055 *
3056 * It is also possible for a response intended for
3057 * an RPC call with a different xid to reside here.
3058 * This can happen if the thread that owned this
3059 * client handle prior to the current owner bailed
3060 * out and left its call record on the dispatch
3061 * queue. A window exists where the response can
3062 * arrive before the current owner dispatches its
3063 * RPC call.
3064 *
3065 * In any case, this is the very last point where we
3066 * can safely check the call_reply field before
3067 * placing the new response there.
3068 */
3069 freemsg(e->call_reply);
3070 e->call_reply = mp;
3071 e->call_status = RPC_SUCCESS;
3072 e->call_notified = TRUE;
3073 cv_signal(&e->call_cv);
3074 mutex_exit(&e->call_lock);
3075 mutex_exit(&chtp->ct_lock);
3076 return (TRUE);
3077 } else {
3078 zone_t *zone;
3079 struct rpcstat *rpcstat;
3080
3081 mutex_exit(&chtp->ct_lock);
3082 RPCLOG(65, "clnt_dispatch_notify: no caller for reply 0x%x\n",
3083 xid);
3084 /*
3085 * This is unfortunate, but we need to lookup the zone so we
3086 * can increment its "rcbadxids" counter.
3087 */
3088 zone = zone_find_by_id(zoneid);
3089 if (zone == NULL) {
3090 /*
3091 * The zone went away...
3092 */
3093 return (FALSE);
3094 }
3095 rpcstat = zone_getspecific(rpcstat_zone_key, zone);
3096 if (zone_status_get(zone) >= ZONE_IS_SHUTTING_DOWN) {
3097 /*
3098 * Not interested
3099 */
3100 zone_rele(zone);
3101 return (FALSE);
3102 }
3103 COTSRCSTAT_INCR(rpcstat->rpc_cots_client, rcbadxids);
3104 zone_rele(zone);
3105 }
3106 return (FALSE);
3107 }
3108
3109 /*
3110 * Called by rpcmod when a non-data indication arrives. The ones in which we
3111 * are interested are connection indications and options acks. We dispatch
3112 * based on the queue the indication came in on. If we are not interested in
3113 * what came in, we return false to rpcmod, who will then pass it upstream.
3114 */
3115 bool_t
3116 clnt_dispatch_notifyconn(queue_t *q, mblk_t *mp)
3117 {
3118 calllist_t *e;
3119 int type;
3120
3121 ASSERT((q->q_flag & QREADR) == 0);
3122
3123 type = ((union T_primitives *)mp->b_rptr)->type;
3124 RPCLOG(8, "clnt_dispatch_notifyconn: prim type: [%s]\n",
3125 rpc_tpiprim2name(type));
3126 mutex_enter(&clnt_pending_lock);
3127 for (e = clnt_pending; /* NO CONDITION */; e = e->call_next) {
3128 if (e == NULL) {
3129 mutex_exit(&clnt_pending_lock);
3130 RPCLOG(1, "clnt_dispatch_notifyconn: no one waiting "
3131 "for connection on queue 0x%p\n", (void *)q);
3132 return (FALSE);
3133 }
3134 if (e->call_wq == q)
3135 break;
3136 }
3137
3138 switch (type) {
3139 case T_CONN_CON:
3140 /*
3141 * The transport is now connected, send a T_INFO_REQ to get
3142 * the tidu size.
3143 */
3144 mutex_exit(&clnt_pending_lock);
3145 ASSERT(mp->b_datap->db_lim - mp->b_datap->db_base >=
3146 sizeof (struct T_info_req));
3147 mp->b_rptr = mp->b_datap->db_base;
3148 ((union T_primitives *)mp->b_rptr)->type = T_INFO_REQ;
3149 mp->b_wptr = mp->b_rptr + sizeof (struct T_info_req);
3150 mp->b_datap->db_type = M_PCPROTO;
3151 put(q, mp);
3152 return (TRUE);
3153 case T_INFO_ACK:
3154 case T_OPTMGMT_ACK:
3155 e->call_status = RPC_SUCCESS;
3156 e->call_reply = mp;
3157 e->call_notified = TRUE;
3158 cv_signal(&e->call_cv);
3159 break;
3160 case T_ERROR_ACK:
3161 e->call_status = RPC_CANTCONNECT;
3162 e->call_reply = mp;
3163 e->call_notified = TRUE;
3164 cv_signal(&e->call_cv);
3165 break;
3166 case T_OK_ACK:
3167 /*
3168 * Great, but we are really waiting for a T_CONN_CON
3169 */
3170 freemsg(mp);
3171 break;
3172 default:
3173 mutex_exit(&clnt_pending_lock);
3174 RPCLOG(1, "clnt_dispatch_notifyconn: bad type %d\n", type);
3175 return (FALSE);
3176 }
3177
3178 mutex_exit(&clnt_pending_lock);
3179 return (TRUE);
3180 }
3181
3182 /*
3183 * Called by rpcmod when the transport is (or should be) going away. Informs
3184 * all callers waiting for replies and marks the entry in the connection
3185 * manager's list as unconnected, and either closing (close handshake in
3186 * progress) or dead.
 *
 * Arguments:
 *	q		queue identifying the transport; asserted not to have
 *			QREADR set.  It is matched against x_wq of the
 *			connection manager entries and call_wq of the waiting
 *			calls.
 *	msg_type	TPI primitive that triggered the notification; selects
 *			how the connection manager entry is updated.
 *	reason		copied into call_reason of each call that is aborted,
 *			but only when msg_type is T_DISCON_IND.
 *
 * The function works in two phases:
 *   1. Under connmgr_lock, find the cm_xprt whose x_wq is q and update its
 *	state according to msg_type.  For T_ERROR_ACK and T_OK_ACK the
 *	function returns at the end of this phase (whether or not an entry
 *	was found) and no waiting call is touched.
 *   2. Walk every bucket of cots_call_ht and then the clnt_pending list,
 *	and for each call on q that has not been notified yet mark it
 *	notified with RPC_XPRTFAILED and signal its condition variable.
3187 */
3188 void
3189 clnt_dispatch_notifyall(queue_t *q, int32_t msg_type, int32_t reason)
3190 {
3191 calllist_t *e;
3192 call_table_t *ctp;
3193 struct cm_xprt *cm_entry;
3194 int have_connmgr_lock;
3195 int i;
3196
3197 ASSERT((q->q_flag & QREADR) == 0);
3198
3199 RPCLOG(1, "clnt_dispatch_notifyall on queue %p", (void *)q);
3200 RPCLOG(1, " received a notifcation prim type [%s]",
3201 rpc_tpiprim2name(msg_type));
3202 RPCLOG(1, " and reason %d\n", reason);
3203
3204 /*
3205 * Find the transport entry in the connection manager's list, close
3206 * the transport and delete the entry. In the case where rpcmod's
3207 * idle timer goes off, it sends us a T_ORDREL_REQ, indicating we
3208 * should gracefully close the connection.
3209 */
/*
 * have_connmgr_lock records whether this thread still owns connmgr_lock.
 * Every path below that gives the lock up clears it, so that the common
 * exit after the loop releases the lock only if it is still held.
 */
3210 have_connmgr_lock = 1;
3211 mutex_enter(&connmgr_lock);
3212 for (cm_entry = cm_hd; cm_entry; cm_entry = cm_entry->x_next) {
3213 ASSERT(cm_entry != cm_entry->x_next);
3214 if (cm_entry->x_wq == q) {
3215 ASSERT(MUTEX_HELD(&connmgr_lock));
3216 ASSERT(have_connmgr_lock == 1);
3217 switch (msg_type) {
3218 case T_ORDREL_REQ:
3219
/*
 * NOTE(review): on this path clnt_stop_idle is called while
 * connmgr_lock is still held (it is released after the loop);
 * the other paths drop the lock before calling it.
 */
3220 if (cm_entry->x_dead) {
3221 RPCLOG(1, "idle timeout on dead "
3222 "connection: %p\n",
3223 (void *)cm_entry);
3224 if (clnt_stop_idle != NULL)
3225 (*clnt_stop_idle)(q);
3226 break;
3227 }
3228
3229 /*
3230 * Only mark the connection as dead if it is
3231 * connected and idle.
3232 * An unconnected connection has probably
3233 * gone idle because the server is down,
3234 * and when it comes back up there will be
3235 * retries that need to use that connection.
3236 */
3237 if (cm_entry->x_connected ||
3238 cm_entry->x_doomed) {
3239 if (cm_entry->x_ordrel) {
3240 if (cm_entry->x_closing ==
3241 TRUE) {
3242 /*
3243 * The connection is
3244 * obviously wedged due
3245 * to a bug or problem
3246 * with the transport.
3247 * Mark it as dead.
3248 * Otherwise we can
3249 * leak connections.
3250 */
3251 cm_entry->x_dead = TRUE;
3252 mutex_exit(
3253 &connmgr_lock);
3254 have_connmgr_lock = 0;
3255 if (clnt_stop_idle !=
3256 NULL)
3257 (*clnt_stop_idle)(q);
3258 break;
3259 }
/*
 * First idle timeout on a transport with x_ordrel set: start the
 * close handshake.  NOTE(review): there is no mutex_exit() here, so
 * connmgr_sndrel() presumably drops connmgr_lock itself -- confirm
 * against its definition.
 */
3260 cm_entry->x_closing = TRUE;
3261 connmgr_sndrel(cm_entry);
3262 have_connmgr_lock = 0;
3263 } else {
3264 cm_entry->x_dead = TRUE;
3265 mutex_exit(&connmgr_lock);
3266 have_connmgr_lock = 0;
3267 if (clnt_stop_idle != NULL)
3268 (*clnt_stop_idle)(q);
3269 }
3270 } else {
3271 /*
3272 * We don't mark the connection
3273 * as dead, but we turn off the
3274 * idle timer.
3275 */
3276 mutex_exit(&connmgr_lock);
3277 have_connmgr_lock = 0;
3278 if (clnt_stop_idle != NULL)
3279 (*clnt_stop_idle)(q);
3280 RPCLOG(1, "clnt_dispatch_notifyall:"
3281 " ignoring timeout from rpcmod"
3282 " (q %p) because we are not "
3283 " connected\n", (void *)q);
3284 }
3285 break;
3286 case T_ORDREL_IND:
3287 /*
3288 * If this entry is marked closing, then we are
3289 * completing a close handshake, and the
3290 * connection is dead. Otherwise, the server is
3291 * trying to close. Since the server will not
3292 * be sending any more RPC replies, we abort
3293 * the connection, including flushing
3294 * any RPC requests that are in-transit.
3295 * In either case, mark the entry as dead so
3296 * that it can be closed by the connection
3297 * manager's garbage collector.
3298 */
3299 cm_entry->x_dead = TRUE;
3300 if (cm_entry->x_closing) {
3301 mutex_exit(&connmgr_lock);
3302 have_connmgr_lock = 0;
3303 if (clnt_stop_idle != NULL)
3304 (*clnt_stop_idle)(q);
3305 } else {
3306 /*
3307 * if we're getting a disconnect
3308 * before we've finished our
3309 * connect attempt, mark it for
3310 * later processing
3311 */
3312 if (cm_entry->x_thread)
3313 cm_entry->x_early_disc = TRUE;
3314 else
3315 cm_entry->x_connected = FALSE;
3316 cm_entry->x_waitdis = TRUE;
/*
 * NOTE(review): no mutex_exit() on this path; connmgr_snddis()
 * presumably drops connmgr_lock itself -- confirm.
 */
3317 connmgr_snddis(cm_entry);
3318 have_connmgr_lock = 0;
3319 }
3320 break;
3321
3322 case T_ERROR_ACK:
3323 case T_OK_ACK:
/*
 * Ack for a disconnect: clear x_waitdis and signal x_dis_cv.
 * Waiting calls are deliberately left alone, hence the direct
 * return.
 */
3324 cm_entry->x_waitdis = FALSE;
3325 cv_signal(&cm_entry->x_dis_cv);
3326 mutex_exit(&connmgr_lock);
3327 return;
3328
3329 case T_DISCON_REQ:
3330 if (cm_entry->x_thread)
3331 cm_entry->x_early_disc = TRUE;
3332 else
3333 cm_entry->x_connected = FALSE;
3334 cm_entry->x_waitdis = TRUE;
3335
/*
 * NOTE(review): as above, the lock is presumably released inside
 * connmgr_snddis() -- confirm.
 */
3336 connmgr_snddis(cm_entry);
3337 have_connmgr_lock = 0;
3338 break;
3339
3340 case T_DISCON_IND:
3341 default:
3342 /*
3343 * if we're getting a disconnect before
3344 * we've finished our connect attempt,
3345 * mark it for later processing
3346 */
3347 if (cm_entry->x_closing) {
3348 cm_entry->x_dead = TRUE;
3349 mutex_exit(&connmgr_lock);
3350 have_connmgr_lock = 0;
3351 if (clnt_stop_idle != NULL)
3352 (*clnt_stop_idle)(q);
3353 } else {
/*
 * connmgr_lock stays held on this path and is released by the
 * common exit after the loop.
 */
3354 if (cm_entry->x_thread) {
3355 cm_entry->x_early_disc = TRUE;
3356 } else {
3357 cm_entry->x_dead = TRUE;
3358 cm_entry->x_connected = FALSE;
3359 }
3360 }
3361 break;
3362 }
/* The matching entry has been handled; stop scanning the list. */
3363 break;
3364 }
3365 }
3366
3367 if (have_connmgr_lock)
3368 mutex_exit(&connmgr_lock);
3369
/*
 * Reached for a disconnect ack only when no entry matched q in the
 * loop above (a match returns from inside the switch).
 */
3370 if (msg_type == T_ERROR_ACK || msg_type == T_OK_ACK) {
3371 RPCLOG(1, "clnt_dispatch_notifyall: (wq %p) could not find "
3372 "connmgr entry for discon ack\n", (void *)q);
3373 return;
3374 }
3375
3376 /*
3377 * Then kick all the clnt_pending calls out of their wait. There
3378 * should be no clnt_pending calls in the case of rpcmod's idle
3379 * timer firing.
3380 */
/*
 * First the call hash table: each bucket is a circular list whose
 * head is the call_table_t itself, walked under that bucket's ct_lock.
 */
3381 for (i = 0; i < clnt_cots_hash_size; i++) {
3382 ctp = &cots_call_ht[i];
3383 mutex_enter(&ctp->ct_lock);
3384 for (e = ctp->ct_call_next;
3385 e != (calllist_t *)ctp;
3386 e = e->call_next) {
3387 if (e->call_wq == q && e->call_notified == FALSE) {
3388 RPCLOG(1,
3389 "clnt_dispatch_notifyall for queue %p ",
3390 (void *)q);
3391 RPCLOG(1, "aborting clnt_pending call %p\n",
3392 (void *)e);
3393
3394 if (msg_type == T_DISCON_IND)
3395 e->call_reason = reason;
3396 e->call_notified = TRUE;
3397 e->call_status = RPC_XPRTFAILED;
3398 cv_signal(&e->call_cv);
3399 }
3400 }
3401 mutex_exit(&ctp->ct_lock);
3402 }
3403
/* Then the NULL-terminated clnt_pending list, under clnt_pending_lock. */
3404 mutex_enter(&clnt_pending_lock);
3405 for (e = clnt_pending; e; e = e->call_next) {
3406 /*
3407 * Only signal those RPC handles that haven't been
3408 * signalled yet. Otherwise we can get a bogus call_reason.
3409 * This can happen if thread A is making a call over a
3410 * connection. If the server is killed, it will cause
3411 * reset, and reason will default to EIO as a result of
3412 * a T_ORDREL_IND. Thread B then attempts to recreate
3413 * the connection but gets a T_DISCON_IND. If we set the
3414 * call_reason code for all threads, then if thread A
3415 * hasn't been dispatched yet, it will get the wrong
3416 * reason. The bogus call_reason can make it harder to
3417 * discriminate between calls that fail because the
3418 * connection attempt failed versus those where the call
3419 * may have been executed on the server.
3420 */
3421 if (e->call_wq == q && e->call_notified == FALSE) {
3422 RPCLOG(1, "clnt_dispatch_notifyall for queue %p ",
3423 (void *)q);
3424 RPCLOG(1, " aborting clnt_pending call %p\n",
3425 (void *)e);
3426
3427 if (msg_type == T_DISCON_IND)
3428 e->call_reason = reason;
3429 e->call_notified = TRUE;
3430 /*
3431 * Let the caller timeout, else he will retry
3432 * immediately.
3433 */
3434 e->call_status = RPC_XPRTFAILED;
3435
3436 /*
3437 * We used to just signal those threads
3438 * waiting for a connection, (call_xid = 0).
3439 * That meant that threads waiting for a response
3440 * waited till their timeout expired. This
3441 * could be a long time if they've specified a
3442 * maximum timeout. (2^31 - 1). So we
3443 * Signal all threads now.
3444 */
3445 cv_signal(&e->call_cv);
3446 }
3447 }
3448 mutex_exit(&clnt_pending_lock);
3449 }
3450
3451
3452 /*ARGSUSED*/
3453 /*
3454 * after resuming a system that's been suspended for longer than the
3455 * NFS server's idle timeout (svc_idle_timeout for Solaris 2), rfscall()
3456 * generates "NFS server X not responding" and "NFS server X ok" messages;
3457 * here we reset inet connections to cause a re-connect and avoid those
3458 * NFS messages. see 4045054
3459 */
3460 boolean_t
3461 connmgr_cpr_reset(void *arg, int code)
3462 {
3463 struct cm_xprt *cxp;
3464
3465 if (code == CB_CODE_CPR_CHKPT)
3466 return (B_TRUE);
3467
3468 if (mutex_tryenter(&connmgr_lock) == 0)
3469 return (B_FALSE);
3470 for (cxp = cm_hd; cxp; cxp = cxp->x_next) {
3471 if ((cxp->x_family == AF_INET || cxp->x_family == AF_INET6) &&
3472 cxp->x_connected == TRUE) {
3473 if (cxp->x_thread)
3474 cxp->x_early_disc = TRUE;
3475 else
3476 cxp->x_connected = FALSE;
3477 cxp->x_needdis = TRUE;
3478 }
3479 }
3480 mutex_exit(&connmgr_lock);
3481 return (B_TRUE);
3482 }
3483
3484 void
3485 clnt_cots_stats_init(zoneid_t zoneid, struct rpc_cots_client **statsp)
3486 {
3487
3488 *statsp = (struct rpc_cots_client *)rpcstat_zone_init_common(zoneid,
3489 "unix", "rpc_cots_client", (const kstat_named_t *)&cots_rcstat_tmpl,
3490 sizeof (cots_rcstat_tmpl));
3491 }
3492
3493 void
3494 clnt_cots_stats_fini(zoneid_t zoneid, struct rpc_cots_client **statsp)
3495 {
3496 rpcstat_zone_fini_common(zoneid, "unix", "rpc_cots_client");
3497 kmem_free(*statsp, sizeof (cots_rcstat_tmpl));
3498 }
3499
3500 void
3501 clnt_cots_init(void)
3502 {
3503 mutex_init(&connmgr_lock, NULL, MUTEX_DEFAULT, NULL);
3504 mutex_init(&clnt_pending_lock, NULL, MUTEX_DEFAULT, NULL);
3505
3506 if (clnt_cots_hash_size < DEFAULT_MIN_HASH_SIZE)
3507 clnt_cots_hash_size = DEFAULT_MIN_HASH_SIZE;
3508
3509 cots_call_ht = call_table_init(clnt_cots_hash_size);
3510 zone_key_create(&zone_cots_key, NULL, NULL, clnt_zone_destroy);
3511 }
3512
3513 void
3514 clnt_cots_fini(void)
3515 {
3516 (void) zone_key_delete(zone_cots_key);
3517 }
3518
3519 /*
3520 * Wait for TPI ack, returns success only if expected ack is received
3521 * within timeout period.
3522 */
3523
3524 static int
3525 waitforack(calllist_t *e, t_scalar_t ack_prim, const struct timeval *waitp,
3526 bool_t nosignal)
3527 {
3528 union T_primitives *tpr;
3529 clock_t timout;
3530 int cv_stat = 1;
3531
3532 ASSERT(MUTEX_HELD(&clnt_pending_lock));
3533 while (e->call_reply == NULL) {
3534 if (waitp != NULL) {
3535 timout = waitp->tv_sec * drv_usectohz(MICROSEC) +
3536 drv_usectohz(waitp->tv_usec) + lbolt;
3537 if (nosignal)
3538 cv_stat = cv_timedwait(&e->call_cv,
3539 &clnt_pending_lock, timout);
3540 else
3541 cv_stat = cv_timedwait_sig(&e->call_cv,
3542 &clnt_pending_lock, timout);
3543 } else {
3544 if (nosignal)
3545 cv_wait(&e->call_cv, &clnt_pending_lock);
3546 else
3547 cv_stat = cv_wait_sig(&e->call_cv,
3548 &clnt_pending_lock);
3549 }
3550 if (cv_stat == -1)
3551 return (ETIME);
3552 if (cv_stat == 0)
3553 return (EINTR);
3554 /*
3555 * if we received an error from the server and we know a reply
3556 * is not going to be sent, do not wait for the full timeout,
3557 * return now.
3558 */
3559 if (e->call_status == RPC_XPRTFAILED)
3560 return (e->call_reason);
3561 }
3562 tpr = (union T_primitives *)e->call_reply->b_rptr;
3563 if (tpr->type == ack_prim)
3564 return (0); /* Success */
3565
3566 if (tpr->type == T_ERROR_ACK) {
3567 if (tpr->error_ack.TLI_error == TSYSERR)
3568 return (tpr->error_ack.UNIX_error);
3569 else
3570 return (t_tlitosyserr(tpr->error_ack.TLI_error));
3571 }
3572
3573 return (EPROTO); /* unknown or unexpected primitive */
3574 }