1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26 /* Copyright (c) 1990 Mentat Inc. */
27
28 #pragma ident "%Z%%M% %I% %E% SMI"
/*
 * Same identification string as the #pragma above, but kept in a named
 * global.  The %X% tokens look like SCCS ID keywords that are expanded
 * when the file is checked out -- TODO confirm.
 */
29 const char tcp_version[] = "%Z%%M% %I% %E% SMI";
30
31
32 #include <sys/types.h>
33 #include <sys/stream.h>
34 #include <sys/strsun.h>
35 #include <sys/strsubr.h>
36 #include <sys/stropts.h>
37 #include <sys/strlog.h>
38 #include <sys/strsun.h>
39 #define _SUN_TPI_VERSION 2
40 #include <sys/tihdr.h>
41 #include <sys/timod.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/suntpi.h>
45 #include <sys/xti_inet.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/sdt.h>
49 #include <sys/vtrace.h>
50 #include <sys/kmem.h>
51 #include <sys/ethernet.h>
52 #include <sys/cpuvar.h>
53 #include <sys/dlpi.h>
54 #include <sys/multidata.h>
55 #include <sys/multidata_impl.h>
56 #include <sys/pattr.h>
57 #include <sys/policy.h>
58 #include <sys/priv.h>
59 #include <sys/zone.h>
60 #include <sys/sunldi.h>
61
62 #include <sys/errno.h>
63 #include <sys/signal.h>
64 #include <sys/socket.h>
65 #include <sys/sockio.h>
66 #include <sys/isa_defs.h>
67 #include <sys/md5.h>
68 #include <sys/random.h>
69 #include <sys/sodirect.h>
70 #include <sys/uio.h>
71 #include <netinet/in.h>
72 #include <netinet/tcp.h>
73 #include <netinet/ip6.h>
74 #include <netinet/icmp6.h>
75 #include <net/if.h>
76 #include <net/route.h>
77 #include <inet/ipsec_impl.h>
78
79 #include <inet/common.h>
80 #include <inet/ip.h>
81 #include <inet/ip_impl.h>
82 #include <inet/ip6.h>
83 #include <inet/ip_ndp.h>
84 #include <inet/mi.h>
85 #include <inet/mib2.h>
86 #include <inet/nd.h>
87 #include <inet/optcom.h>
88 #include <inet/snmpcom.h>
89 #include <inet/kstatcom.h>
90 #include <inet/tcp.h>
91 #include <inet/tcp_impl.h>
92 #include <net/pfkeyv2.h>
93 #include <inet/ipsec_info.h>
94 #include <inet/ipdrop.h>
95 #include <inet/tcp_trace.h>
96
97 #include <inet/ipclassifier.h>
98 #include <inet/ip_ire.h>
99 #include <inet/ip_ftable.h>
100 #include <inet/ip_if.h>
101 #include <inet/ipp_common.h>
102 #include <inet/ip_netinfo.h>
103 #include <sys/squeue.h>
104 #include <inet/kssl/ksslapi.h>
105 #include <sys/tsol/label.h>
106 #include <sys/tsol/tnet.h>
107 #include <rpc/pmap_prot.h>
108
109 /*
110 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
111 *
112 * (Read the detailed design doc in PSARC case directory)
113 *
114 * The entire tcp state is contained in tcp_t and conn_t structure
115 * which are allocated in tandem using ipcl_conn_create() and passing
116 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
117 * the references on the tcp_t. The tcp_t structure is never compressed
118 * and packets always land on the correct TCP perimeter from the time
119 * eager is created till the time tcp_t dies (as such the old mentat
120 * TCP global queue is not used for detached state and no IPSEC checking
121 * is required). The global queue is still allocated to send out resets
122 * for connection which have no listeners and IP directly calls
123 * tcp_xmit_listeners_reset() which does any policy check.
124 *
125 * Protection and Synchronisation mechanism:
126 *
127 * The tcp data structure does not use any kind of lock for protecting
128 * its state but instead uses 'squeues' for mutual exclusion from various
129 * read and write side threads. To access a tcp member, the thread should
130 * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
131 * squeue_fill). Since the squeues allow a direct function call, caller
132 * can pass any tcp function having prototype of edesc_t as argument
133 * (different from traditional STREAMs model where packets come in only
134 * designated entry points). The list of functions that can be directly
135 * called via squeue are listed before the usual function prototype.
136 *
137 * Referencing:
138 *
139 * TCP is MT-Hot and we use a reference based scheme to make sure that the
140 * tcp structure doesn't disappear when its needed. When the application
141 * creates an outgoing connection or accepts an incoming connection, we
142 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
143 * The IP reference is just a symbolic reference since ip_tcpclose()
144 * looks at tcp structure after tcp_close_output() returns which could
145 * have dropped the last TCP reference. So as long as the connection is
146 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
147 * conn_t. The classifier puts its own reference when the connection is
148 * inserted in listen or connected hash. Anytime a thread needs to enter
149 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
150 * on write side or by doing a classify on read side and then puts a
151 * reference on the conn before doing squeue_enter/tryenter/fill. For
152 * read side, the classifier itself puts the reference under fanout lock
153 * to make sure that tcp can't disappear before it gets processed. The
154 * squeue will drop this reference automatically so the called function
155 * doesn't have to do a DEC_REF.
156 *
157 * Opening a new connection:
158 *
159 * The outgoing connection open is pretty simple. tcp_open() does the
160 * work in creating the conn/tcp structure and initializing it. The
161 * squeue assignment is done based on the CPU the application
162 * is running on. So for outbound connections, processing is always done
163 * on application CPU which might be different from the incoming CPU
164 * being interrupted by the NIC. An optimal way would be to figure out
165 * the NIC <-> CPU binding at listen time, and assign the outgoing
166 * connection to the squeue attached to the CPU that will be interrupted
167 * for incoming packets (we know the NIC based on the bind IP address).
168 * This might seem like a problem if more data is going out but the
169 * fact is that in most cases the transmit is ACK driven transmit where
170 * the outgoing data normally sits on TCP's xmit queue waiting to be
171 * transmitted.
172 *
173 * Accepting a connection:
174 *
175 * This is a more interesting case because of various races involved in
176 * establishing a eager in its own perimeter. Read the meta comment on
177 * top of tcp_conn_request(). But briefly, the squeue is picked by
178 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
179 *
180 * Closing a connection:
181 *
182 * The close is fairly straight forward. tcp_close() calls tcp_close_output()
183 * via squeue to do the close and mark the tcp as detached if the connection
184 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
185 * reference but tcp_close() always drops IP's reference. So if tcp was
186 * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
187 * and 1 because it is in classifier's connected hash. This is the condition
188 * we use to determine that its OK to clean up the tcp outside of squeue
189 * when time wait expires (check the ref under fanout and conn_lock and
190 * if it is 2, remove it from fanout hash and kill it).
191 *
192 * Although close just drops the necessary references and marks the
193 * tcp_detached state, tcp_close needs to know the tcp_detached has been
194 * set (under squeue) before letting the STREAM go away (because a
195 * inbound packet might attempt to go up the STREAM while the close
196 * has happened and tcp_detached is not set). So a special lock and
197 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
198 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
199 * tcp_detached.
200 *
201 * Special provisions and fast paths:
202 *
203 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
204 * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
205 * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
206 * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
207 * check to send packets directly to tcp_rput_data via squeue. Everyone
208 * else comes through tcp_input() on the read side.
209 *
210 * We also make special provisions for sockfs by marking tcp_issocket
211 * whenever we have only sockfs on top of TCP. This allows us to skip
212 * putting the tcp in acceptor hash since a sockfs listener can never
213 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
214 * since eager has already been allocated and the accept now happens
215 * on acceptor STREAM. There is a big blob of comment on top of
216 * tcp_conn_request explaining the new accept. When socket is POP'd,
217 * sockfs sends us an ioctl to mark the fact and we go back to old
218 * behaviour. Once tcp_issocket is unset, its never set for the
219 * life of that connection.
220 *
221 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
222 * two consolidation private KAPIs are used to enqueue M_DATA mblk_t's
223 * directly to the socket (sodirect) and start an asynchronous copyout
224 * to a user-land receive-side buffer (uioa) when a blocking socket read
225 * (e.g. read, recv, ...) is pending.
226 *
227 * This is accomplished when tcp_issocket is set and tcp_sodirect is not
228 * NULL so points to an sodirect_t and if marked enabled then we enqueue
229 * all mblk_t's directly to the socket.
230 *
231 * Further, if the sodirect_t sod_uioa and if marked enabled (due to a
232 * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous
233 * copyout will be started directly to the user-land uio buffer. Also, as we
234 * have a pending read, TCP's push logic can take into account the number of
235 * bytes to be received and only awake the blocked read()er when the uioa_t
236 * byte count has been satisfied.
237 *
238 * IPsec notes :
239 *
240 * Since a packet is always executed on the correct TCP perimeter
241 * all IPsec processing is deferred to IP including checking new
242 * connections and setting IPSEC policies for new connection. The
243 * only exception is tcp_xmit_listeners_reset() which is called
244 * directly from IP and needs to policy check to see if TH_RST
245 * can be sent out.
246 *
247 * PFHooks notes :
248 *
249 * For mdt case, one meta buffer contains multiple packets. Mblks for every
250 * packet are assembled and passed to the hooks. When packets are blocked,
251 * or boundary of any packet is changed, the mdt processing is stopped, and
252 * packets of the meta buffer are send to the IP path one by one.
253 */
254
255 /*
256 * Values for squeue switch:
257 * 1: squeue_enter_nodrain
258 * 2: squeue_enter
259 * 3: squeue_fill
260 */
261 int tcp_squeue_close = 2; /* Settable in /etc/system; 2 selects squeue_enter */
262 int tcp_squeue_wput = 2; /* same value encoding as tcp_squeue_close */
263
/*
 * Not initialized here; presumably filled in from the two switches above
 * when TCP is initialized -- TODO confirm against the init code.
 */
264 squeue_func_t tcp_squeue_close_proc;
265 squeue_func_t tcp_squeue_wput_proc;
266
267 /*
268 * Macros for sodirect:
269 *
270 * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
271 * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
272 * if it exists and is enabled, else to NULL. Note, in the current
273 * sodirect implementation the sod_lock must not be held across any
274 * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
275 * will result as sod_lock is the streamhead stdata.sd_lock.
276 *
277 * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
278 * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
279 * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
280 * being used when sodirect code paths should be.
281 */
282
/*
 * SOD_PTR_ENTER(tcp, sodp)
 *
 * Loads (tcp)->tcp_sodirect into the lvalue sodp.  If that pointer is
 * non-NULL, sod_lock is taken and sod_state is tested: when SOD_ENABLED is
 * clear the lock is dropped again and sodp is reset to NULL.  Consequently,
 * on exit either sodp is NULL and no lock is held, or sodp is non-NULL and
 * the caller owns (sodp)->sod_lock and must release it.
 *
 * The body is a single brace-enclosed compound statement (the same shape
 * as the other statement macros in this file) so that a use guarded by an
 * unbraced "if" guards the whole expansion rather than only the first
 * assignment.  Both arguments are evaluated more than once.
 */
#define	SOD_PTR_ENTER(tcp, sodp) {					\
	(sodp) = (tcp)->tcp_sodirect;					\
									\
	if ((sodp) != NULL) {						\
		mutex_enter((sodp)->sod_lock);				\
		if (!((sodp)->sod_state & SOD_ENABLED)) {		\
			mutex_exit((sodp)->sod_lock);			\
			(sodp) = NULL;					\
		}							\
	}								\
}
293
/*
 * SOD_NOT_ENABLED(tcp): evaluates to 1 unless tcp has a sodirect_t attached
 * whose sod_state has SOD_ENABLED set, in which case it evaluates to 0.
 * No lock is taken; tcp is evaluated more than once.
 */
#define	SOD_NOT_ENABLED(tcp)						\
	(!((tcp)->tcp_sodirect != NULL &&				\
	((tcp)->tcp_sodirect->sod_state & SOD_ENABLED) != 0))
297
298 /*
299 * This controls how tiny a write must be before we try to copy it
300 * into the mblk on the tail of the transmit queue. Not much
301 * speedup is observed for values larger than sixteen. Zero will
302 * disable the optimisation.
303 */
304 int tcp_tx_pull_len = 16;
305
306 /*
307 * TCP Statistics.
308 *
309 * How TCP statistics work.
310 *
311 * There are two types of statistics invoked by two macros.
312 *
313 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
314 * supposed to be used in non MT-hot paths of the code.
315 *
316 * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
317 * supposed to be used for DEBUG purposes and may be used on a hot path.
318 *
319 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
320 * (use "kstat tcp" to get them).
321 *
322 * There is also additional debugging facility that marks tcp_clean_death()
323 * instances and saves them in tcp_t structure. It is triggered by
324 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
325 * tcp_clean_death() calls that counts the number of times each tag was hit. It
326 * is triggered by TCP_CLD_COUNTERS define.
327 *
328 * How to add new counters.
329 *
330 * 1) Add a field in the tcp_stat structure describing your counter.
331 * 2) Add a line in the template in tcp_kstat2_init() with the name
332 * of the counter.
333 *
334 * IMPORTANT!! - make sure that both are in sync !!
335 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
336 *
337 * Please avoid using private counters which are not kstat-exported.
338 *
339 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
340 * in tcp_t structure.
341 *
342 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
343 */
344
345 #ifndef TCP_DEBUG_COUNTER /* a value supplied by the build wins */
346 #ifdef DEBUG
347 #define TCP_DEBUG_COUNTER 1 /* DEBUG build: TCP_DBGSTAT/TCP_G_DBGSTAT count */
348 #else
349 #define TCP_DEBUG_COUNTER 0 /* otherwise those macros do no counting */
350 #endif /* DEBUG */
351 #endif /* TCP_DEBUG_COUNTER */
352
353 #define TCP_CLD_COUNTERS 0 /* non-zero compiles in tcp_clean_death_stat[] */
354
355 #define TCP_TAG_CLEAN_DEATH 1 /* tag tcp_clean_death() instances in the tcp_t */
356 #define TCP_MAX_CLEAN_DEATH_TAG 32 /* number of tags; sizes tcp_clean_death_stat[] */
357
358 #ifdef lint
359 static int _lint_dummy_; /* only referenced by the lint variants of the stat macros */
360 #endif
361
/*
 * TCP_CLD_STAT(x): bump the counter for clean-death tag x when
 * TCP_CLD_COUNTERS is enabled; otherwise expand to nothing (or, under lint,
 * to an ASSERT on _lint_dummy_, presumably so the statement is not empty).
 * x is used as an array index without a range check.
 */
362 #if TCP_CLD_COUNTERS
363 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
364 #define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
365 #elif defined(lint)
366 #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
367 #else
368 #define TCP_CLD_STAT(x)
369 #endif
370
/*
 * TCP_DBGSTAT(tcps, x) atomically adds 1 to counter x in the statistics of
 * the stack instance tcps; TCP_G_DBGSTAT(x) does the same for the global
 * tcp_g_statistics.  Both count only when TCP_DEBUG_COUNTER is non-zero;
 * otherwise they expand to nothing (or to a lint-only ASSERT).
 */
371 #if TCP_DEBUG_COUNTER
372 #define TCP_DBGSTAT(tcps, x) \
373 atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
374 #define TCP_G_DBGSTAT(x) \
375 atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
376 #elif defined(lint)
377 #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
378 #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
379 #else
380 #define TCP_DBGSTAT(tcps, x)
381 #define TCP_G_DBGSTAT(x)
382 #endif
383
384 #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) /* plain, non-atomic increment */
385
386 tcp_g_stat_t tcp_g_statistics; /* global counters used by TCP_G_STAT and TCP_G_DBGSTAT */
387 kstat_t *tcp_g_kstat; /* presumably the kstat that exports tcp_g_statistics -- TODO confirm */
388
/*
 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
 * tcp write side.
 *
 * The send routine is reached through (connp)->conn_send, which is invoked
 * with the conn, the message, the queue and IP_WPUT.  q must be a write
 * queue (ASSERTed).  The per-stack tcp_ip_output debug counter is bumped
 * when debug counters are compiled in.
 *
 * Every use of connp is parenthesized so that an arbitrary pointer
 * expression can be passed.  connp is evaluated several times, and the
 * expansion declares a local named tcps, so arguments must not have side
 * effects or refer to a caller variable of that name.
 */
#define	CALL_IP_WPUT(connp, q, mp) {					\
	tcp_stack_t	*tcps;						\
									\
	tcps = (connp)->conn_netstack->netstack_tcp;			\
	ASSERT(((q)->q_flag & QREADR) == 0);				\
	TCP_DBGSTAT(tcps, tcp_ip_output);				\
	(connp)->conn_send((connp), (mp), (q), IP_WPUT);		\
}
401
/*
 * Macros for timestamp comparisons.
 *
 * Timestamps are compared modulo 2^32: a is "greater than or equal to" b
 * when the 32-bit difference a - b, reinterpreted as signed, is
 * non-negative.  The subtraction is done in uint32_t so that it wraps in a
 * well-defined way whatever the argument types are; subtracting two signed
 * 32-bit values directly could overflow.  Wider arguments contribute only
 * their low 32 bits, which is what the final int32_t cast kept before.
 */
#define	TSTMP_GEQ(a, b)	((int32_t)((uint32_t)(a) - (uint32_t)(b)) >= 0)
#define	TSTMP_LT(a, b)	((int32_t)((uint32_t)(a) - (uint32_t)(b)) < 0)
405
406 /*
407 * Parameters for TCP Initial Send Sequence number (ISS) generation. When
408 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
409 * by adding three components: a time component which grows by 1 every 4096
410 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
411 * a per-connection component which grows by 125000 for every new connection;
412 * and an "extra" component that grows by a random amount centered
413 * approximately on 64000. This causes the ISS generator to cycle every
414 * 4.89 hours if no TCP connections are made, and faster if connections are
415 * made.
416 *
417 * When tcp_strong_iss is set to 0, ISS is calculated by adding two
418 * components: a time component which grows by 250000 every second; and
419 * a per-connection component which grows by 125000 for every new connections.
420 *
421 * A third method, when tcp_strong_iss is set to 2, for generating ISS is
422 * prescribed by Steve Bellovin. This involves adding time, the 125000 per
423 * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
424 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
425 * password.
426 */
427 #define ISS_INCR 250000 /* per-second growth when tcp_strong_iss is 0; the 125000 per-connection step is half of it */
428 #define ISS_NSEC_SHT 12 /* 2^12 = 4096 ns per ISS tick; presumably a right shift applied to a nanosecond clock -- TODO confirm */
429
430 static sin_t sin_null; /* Zero address for quick clears (static storage, so it starts out all zeroes) */
431 static sin6_t sin6_null; /* Zero address for quick clears (IPv6 counterpart of sin_null) */
432
433 /*
434 * This implementation follows the 4.3BSD interpretation of the urgent
435 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
436 * incompatible changes in protocols like telnet and rlogin.
437 */
438 #define TCP_OLD_URP_INTERPRETATION 1 /* presumably 1 selects the 4.3BSD urgent-pointer behaviour described above -- TODO confirm at the use sites */
439
/*
 * TCP_IS_DETACHED_NONEAGER(tcp): 1 when tcp is detached (per
 * TCP_IS_DETACHED) and its tcp_hard_binding flag is clear, else 0.
 * tcp_hard_binding is not examined unless the tcp is detached.
 */
#define	TCP_IS_DETACHED_NONEAGER(tcp)	\
	(TCP_IS_DETACHED(tcp) ? !(tcp)->tcp_hard_binding : 0)
443
444 /*
445 * TCP reassembly macros. We hide starting and ending sequence numbers in
446 * b_next and b_prev of messages on the reassembly queue. The messages are
447 * chained using b_cont. These macros are used in tcp_reass() so we don't
448 * have to see the ugly casts and assignments.
449 */
/*
 * Accessors for the sequence numbers stashed in an mblk's link pointers:
 * b_next carries the starting sequence number and b_prev the ending one.
 * The SET forms store the value as a pointer-sized integer and evaluate to
 * the stored pointer; the read forms truncate it back to 32 bits.
 */
#define	TCP_REASS_SEQ(mp)	\
	((uint32_t)((uintptr_t)((mp)->b_next)))
#define	TCP_REASS_SET_SEQ(mp, u)	\
	((mp)->b_next = (mblk_t *)((uintptr_t)(u)))
#define	TCP_REASS_END(mp)	\
	((uint32_t)((uintptr_t)((mp)->b_prev)))
#define	TCP_REASS_SET_END(mp, u)	\
	((mp)->b_prev = (mblk_t *)((uintptr_t)(u)))
456
457 /*
458 * Implementation of TCP Timers.
459 * =============================
460 *
461 * INTERFACE:
462 *
463 * Two basic functions and one macro deal with tcp timers:
464 *
465 * timeout_id_t tcp_timeout(connp, func, time)
466 * clock_t tcp_timeout_cancel(connp, timeout_id)
467 * TCP_TIMER_RESTART(tcp, intvl)
468 *
469 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
470 * after 'time' ticks passed. The function called by timeout() must adhere to
471 * the same restrictions as a driver soft interrupt handler - it must not sleep
472 * or call other functions that might sleep. The value returned is the opaque
473 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
474 * cancel the request. The call to tcp_timeout() may fail in which case it
475 * returns zero. This is different from the timeout(9F) function which never
476 * fails.
477 *
478 * The call-back function 'func' always receives 'connp' as its single
479 * argument. It is always executed in the squeue corresponding to the tcp
480 * structure. The tcp structure is guaranteed to be present at the time the
481 * call-back is called.
482 *
483 * NOTE: The call-back function 'func' is never called if tcp is in
484 * the TCPS_CLOSED state.
485 *
486 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
487 * request. locks acquired by the call-back routine should not be held across
488 * the call to tcp_timeout_cancel() or a deadlock may result.
489 *
490 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
491 * Otherwise, it returns an integer value greater than or equal to 0. In
492 * particular, if the call-back function is already placed on the squeue, it can
493 * not be canceled.
494 *
495 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
496 * within squeue context corresponding to the tcp instance. Since the
497 * call-back is also called via the same squeue, there are no race
498 * conditions described in untimeout(9F) manual page since all calls are
499 * strictly serialized.
500 *
501 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
502 * stored in tcp_timer_tid and starts a new one using
503 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
504 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
505 * field.
506 *
507 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
508 * call-back may still be called, so it is possible tcp_timer() will be
509 * called several times. This should not be a problem since tcp_timer()
510 * should always check the tcp instance state.
511 *
512 *
513 * IMPLEMENTATION:
514 *
515 * TCP timers are implemented using three-stage process. The call to
516 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
517 * when the timer expires. The tcp_timer_callback() arranges the call of the
518 * tcp_timer_handler() function via squeue corresponding to the tcp
519 * instance. The tcp_timer_handler() calls actual requested timeout call-back
520 * and passes tcp instance as an argument to it. Information is passed between
521 * stages using the tcp_timer_t structure which contains the connp pointer, the
522 * tcp call-back to call and the timeout id returned by the timeout(9F).
523 *
524 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
525 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
526 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
527 * returns the pointer to this mblk.
528 *
529 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
530 * looks like a normal mblk without actual dblk attached to it.
531 *
532 * To optimize performance each tcp instance holds a small cache of timer
533 * mblocks. In the current implementation it caches up to two timer mblocks per
534 * tcp instance. The cache is preserved over tcp frees and is only freed when
535 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
536 * timer processing happens on a corresponding squeue, the cache manipulation
537 * does not require any locks. Experiments show that majority of timer mblocks
538 * allocations are satisfied from the tcp cache and do not involve kmem calls.
539 *
540 * The tcp_timeout() places a refhold on the connp instance which guarantees
541 * that it will be present at the time the call-back function fires. The
542 * tcp_timer_handler() drops the reference after calling the call-back, so the
543 * call-back function does not need to manipulate the references explicitly.
544 */
545
/*
 * Per-timer state carried between the stages of the timer implementation;
 * it lives at b_rptr of the pseudo mblk described above.
 */
546 typedef struct tcp_timer_s {
547 conn_t *connp; /* connection the timer belongs to */
548 void (*tcpt_proc)(void *); /* tcp call-back to invoke */
549 timeout_id_t tcpt_tid; /* id returned by timeout(9F) */
550 } tcp_timer_t;
551
552 static kmem_cache_t *tcp_timercache; /* presumably the cache of timer pseudo mblks ("tcp_timer_cache" above) -- TODO confirm */
553 kmem_cache_t *tcp_sack_info_cache; /* global; created outside the code visible here */
554 kmem_cache_t *tcp_iphc_cache; /* global; created outside the code visible here */
555
556 /*
557 * For scalability, we must not run a timer for every TCP connection
558 * in TIME_WAIT state. To see why, consider (for time wait interval of
559 * 4 minutes):
560 * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
561 *
562 * This list is ordered by time, so you need only delete from the head
563 * until you get to entries which aren't old enough to delete yet.
564 * The list consists of only the detached TIME_WAIT connections.
565 *
566 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
567 * becomes detached TIME_WAIT (either by changing the state and already
568 * being detached or the other way around). This means that the TIME_WAIT
569 * state can be extended (up to doubled) if the connection doesn't become
570 * detached for a long time.
571 *
572 * The list manipulations (including tcp_time_wait_next/prev)
573 * are protected by the tcp_time_wait_lock. The content of the
574 * detached TIME_WAIT connections is protected by the normal perimeters.
575 *
576 * This list is per squeue and squeues are shared across the tcp_stack_t's.
577 * Things on tcp_time_wait_head remain associated with the tcp_stack_t
578 * and conn_netstack.
579 * The tcp_t's that are added to tcp_free_list are disassociated and
580 * have NULL tcp_tcps and conn_netstack pointers.
581 */
582 typedef struct tcp_squeue_priv_s {
583 kmutex_t tcp_time_wait_lock; /* protects the time-wait list manipulations */
584 timeout_id_t tcp_time_wait_tid; /* presumably the pending collector timeout -- TODO confirm */
585 tcp_t *tcp_time_wait_head; /* oldest entry; deletion works from here */
586 tcp_t *tcp_time_wait_tail; /* presumably the newest entry (list is time ordered) */
587 tcp_t *tcp_free_list; /* tcp_t's disassociated from their tcp_stack_t */
588 uint_t tcp_free_list_cnt; /* presumably the number of entries on tcp_free_list */
589 } tcp_squeue_priv_t;
590
591 /*
592 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
593 * Running it every 5 seconds seems to give the best results.
594 */
595 #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) /* 5,000,000 usec = 5 s; evaluated (a function call) at every use */
596
597 /*
598 * To prevent memory hog, limit the number of entries in tcp_free_list
599 * to 1% of available memory / number of cpus
600 */
601 uint_t tcp_free_list_max_cnt = 0; /* 0 until set; presumably computed at init from the rule above -- TODO confirm */
602
/* Transmit and receive water marks, in bytes. */
#define	TCP_XMIT_LOWATER	(4 * 1024)
#define	TCP_XMIT_HIWATER	(48 * 1024)
#define	TCP_RECV_LOWATER	(2 * 1024)
#define	TCP_RECV_HIWATER	(48 * 1024)
607
/*
 * PAWS needs a timer for 24 days.  This is the number of ticks in 24 days.
 *
 * The leading operand is cast to clock_t so that the whole product is
 * computed in clock_t.  Multiplying in int first and casting afterwards
 * overflows once hz exceeds roughly 1035 (24 days is 2,073,600 seconds).
 * Where clock_t is only 32 bits wide the same limit still applies.
 */
#define	PAWS_TIMEOUT	((clock_t)24 * 24 * 60 * 60 * hz)

#define	TIDUSZ	4096	/* transport interface data unit size */
614
/*
 * Bind hash list size and hash function.  The size has to be a power of 2
 * because the hash simply masks the port, converted to host byte order,
 * with (size - 1).
 */
#define	TCP_BIND_FANOUT_SIZE	512
#define	TCP_BIND_HASH(lport)	\
	((TCP_BIND_FANOUT_SIZE - 1) & ntohs(lport))
/*
 * Size of listen and acceptor hash list.  It has to be a power of 2 because
 * the hash is a mask with (size - 1).
 *
 * TCP_ACCEPTOR_HASH(accid) maps an acceptor id to a bucket index.  On
 * _ILP32 builds the low 8 bits of the id are discarded before masking; on
 * other builds the id is masked directly.
 */
#define	TCP_FANOUT_SIZE		256

#ifdef _ILP32
#define	TCP_ACCEPTOR_HASH(accid)					\
	((TCP_FANOUT_SIZE - 1) & ((uint_t)(accid) >> 8))
#else
#define	TCP_ACCEPTOR_HASH(accid)					\
	((TCP_FANOUT_SIZE - 1) & (uint_t)(accid))
#endif	/* _ILP32 */
634
/*
 * IP address cache size and hash: the address, converted to host byte
 * order, is masked with (size - 1), so the size must stay a power of 2.
 */
#define	IP_ADDR_CACHE_SIZE	2048
#define	IP_ADDR_CACHE_HASH(faddr)	\
	((IP_ADDR_CACHE_SIZE - 1) & ntohl(faddr))
638
/*
 * Hash for HSPs uses all 32 bits, since both networks and hosts are in table.
 *
 * The four bytes of addr are folded together by XOR-ing addr with itself
 * shifted right by 8, 16 and 24 bits, and the result is reduced modulo the
 * table size.  Each use of addr is parenthesized so that an expression
 * argument (e.g. "a | b") is hashed as a whole; addr is evaluated four
 * times, so it must not have side effects.
 */
#define	TCP_HSP_HASH_SIZE 256

#define	TCP_HSP_HASH(addr)					\
	((((addr) >> 24) ^ ((addr) >> 16) ^			\
	    ((addr) >> 8) ^ (addr)) % TCP_HSP_HASH_SIZE)
645
646 /*
647 * TCP options struct returned from tcp_parse_options.
648 */
/*
 * NOTE(review): the field notes below are inferred from the field names and
 * the TCP_OPT_*_PRESENT flags; the parser itself is not visible here.
 */
649 typedef struct tcp_opt_s {
650 uint32_t tcp_opt_mss; /* presumably valid when TCP_OPT_MSS_PRESENT is returned */
651 uint32_t tcp_opt_wscale; /* presumably valid when TCP_OPT_WSCALE_PRESENT is returned */
652 uint32_t tcp_opt_ts_val; /* presumably the timestamp value (TCP_OPT_TSTAMP_PRESENT) */
653 uint32_t tcp_opt_ts_ecr; /* presumably the timestamp echo reply (TCP_OPT_TSTAMP_PRESENT) */
654 tcp_t *tcp; /* connection pointer; who sets it is not visible here */
655 } tcp_opt_t;
656
657 /*
658 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
659 */
660
/*
 * The byte sequence NOP, NOP, TSTAMP, 10 expressed as one 32-bit constant.
 * The two variants place the same four bytes in opposite significance
 * order, selected by _BIG_ENDIAN.
 */
#ifdef _BIG_ENDIAN
#define	TCPOPT_NOP_NOP_TSTAMP	\
	(10 | (TCPOPT_TSTAMP << 8) | (TCPOPT_NOP << 16) | (TCPOPT_NOP << 24))
#else
#define	TCPOPT_NOP_NOP_TSTAMP	\
	(TCPOPT_NOP | (TCPOPT_NOP << 8) | (TCPOPT_TSTAMP << 16) | (10 << 24))
#endif
668
/*
 * Flags returned from tcp_parse_options.  Each flag is a distinct bit, so
 * several can be OR-ed into one return value.
 */
#define	TCP_OPT_MSS_PRESENT	0x01
#define	TCP_OPT_WSCALE_PRESENT	0x02
#define	TCP_OPT_TSTAMP_PRESENT	0x04
#define	TCP_OPT_SACK_OK_PRESENT	0x08
#define	TCP_OPT_SACK_PRESENT	0x10
677
/*
 * TCP option lengths, in bytes.  The *_REAL_* values add padding to the bare
 * option length (1 byte for window scale, 2 for timestamp and SACK-permitted),
 * which brings each to a multiple of 4.  The padding is written here in
 * units of TCPOPT_NOP_LEN; that it consists of NOP bytes is shown in this
 * file only for the timestamp option (TCPOPT_NOP_NOP_TSTAMP) and is
 * presumed for the others.
 */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN + TCPOPT_NOP_LEN)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN + 2 * TCPOPT_NOP_LEN)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN + 2 * TCPOPT_NOP_LEN)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2
690
/* TCP cwnd burst factors. */
#define	TCP_CWND_INFINITE	0xFFFF
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Upper bound on the TCP initial cwin (start/restart). */
#define	TCP_MAX_INIT_CWND	8
698
/*
 * Initialize cwnd according to RFC 3390.  def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller.  If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd:
 *
 *	min(def_max_init_cwnd * mss,
 *	    min(4 * mss, max(2 * mss, 4380 rounded down to whole segments)))
 *
 * If the upper layer has set tcp_init_cwnd, just use it to calculate
 * the tcp_cwnd (tcp_init_cwnd * mss).  In both cases tcp_cwnd_cnt is
 * reset to 0.
 *
 * Every macro argument is parenthesized at each use so that expression
 * arguments (e.g. "&t" or "a + b") expand correctly.  Arguments are
 * evaluated several times and mss must be non-zero (it is a divisor).
 */
#define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
{									\
	if ((tcp)->tcp_init_cwnd == 0) {				\
		(tcp)->tcp_cwnd = MIN((def_max_init_cwnd) * (mss),	\
		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
	} else {							\
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
	}								\
	(tcp)->tcp_cwnd_cnt = 0;					\
}
718
719 /* TCP Timer control structure */
/* Pairs a routine with the tcp_t that is to be handed to it. */
720 typedef struct tcpt_s {
721 pfv_t tcpt_pfv; /* The routine we are to call */
722 tcp_t *tcpt_tcp; /* The parameter we are to pass in */
723 } tcpt_t;
724
725 /* Host Specific Parameter structure */
726 typedef struct tcp_hsp {
727 struct tcp_hsp *tcp_hsp_next; /* next entry; presumably a hash-bucket chain -- TODO confirm */
728 in6_addr_t tcp_hsp_addr_v6; /* address, stored in IPv6 form */
729 in6_addr_t tcp_hsp_subnet_v6; /* subnet, stored in IPv6 form */
730 uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */
731 int32_t tcp_hsp_sendspace; /* presumably a send buffer size override */
732 int32_t tcp_hsp_recvspace; /* presumably a receive buffer size override */
733 int32_t tcp_hsp_tstamp; /* presumably a timestamp-option setting */
734 } tcp_hsp_t;
/* IPv4 views of the two address fields above. */
735 #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6)
736 #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6)
737
738 /*
739 * Functions called directly via squeue having a prototype of edesc_t.
740 */
/*
 * All of these share the (void *arg, mblk_t *mp, void *arg2) signature.
 * The notes below repeat only what the comments earlier in this file say.
 */
741 void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); /* accept path; see the comment on top of its definition */
742 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
743 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
744 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
745 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
746 void tcp_input(void *arg, mblk_t *mp, void *arg2); /* read side, for everything not taking the tcp_rput_data fast path */
747 void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); /* fast-path target used directly by ip_tcp_input() */
748 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); /* called by tcp_close() via squeue to do the close */
749 void tcp_output(void *arg, mblk_t *mp, void *arg2);
750 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
751 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); /* squeue stage of the timer path; calls the requested call-back */
752 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
753
754
755 /* Prototype for TCP functions */
756 static void tcp_random_init(void);
757 int tcp_random(void);
758 static void tcp_accept(tcp_t *tcp, mblk_t *mp);
759 static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
760 tcp_t *eager);
761 static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
762 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
763 int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
764 boolean_t user_specified);
765 static void tcp_closei_local(tcp_t *tcp);
766 static void tcp_close_detached(tcp_t *tcp);
767 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
768 mblk_t *idmp, mblk_t **defermp);
769 static void tcp_connect(tcp_t *tcp, mblk_t *mp);
770 static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
771 in_port_t dstport, uint_t srcid);
772 static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
773 in_port_t dstport, uint32_t flowinfo, uint_t srcid,
774 uint32_t scope_id);
775 static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
776 static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
777 static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
778 static char *tcp_display(tcp_t *tcp, char *, char);
779 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
780 static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
781 static void tcp_eager_unlink(tcp_t *tcp);
782 static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
783 int unixerr);
784 static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
785 int tlierr, int unixerr);
786 static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
787 cred_t *cr);
788 static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
789 char *value, caddr_t cp, cred_t *cr);
790 static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
791 char *value, caddr_t cp, cred_t *cr);
792 static int tcp_tpistate(tcp_t *tcp);
793 static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
794 int caller_holds_lock);
795 static void tcp_bind_hash_remove(tcp_t *tcp);
796 static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
797 void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
798 static void tcp_acceptor_hash_remove(tcp_t *tcp);
799 static void tcp_capability_req(tcp_t *tcp, mblk_t *mp);
800 static void tcp_info_req(tcp_t *tcp, mblk_t *mp);
801 static void tcp_addr_req(tcp_t *tcp, mblk_t *mp);
802 static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
803 void tcp_g_q_setup(tcp_stack_t *);
804 void tcp_g_q_create(tcp_stack_t *);
805 void tcp_g_q_destroy(tcp_stack_t *);
806 static int tcp_header_init_ipv4(tcp_t *tcp);
807 static int tcp_header_init_ipv6(tcp_t *tcp);
808 int tcp_init(tcp_t *tcp, queue_t *q);
809 static int tcp_init_values(tcp_t *tcp);
810 static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
811 static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
812 t_scalar_t addr_length);
813 static void tcp_ip_ire_mark_advice(tcp_t *tcp);
814 static void tcp_ip_notify(tcp_t *tcp);
815 static mblk_t *tcp_ire_mp(mblk_t *mp);
816 static void tcp_iss_init(tcp_t *tcp);
817 static void tcp_keepalive_killer(void *arg);
818 static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
819 static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
820 static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
821 int *do_disconnectp, int *t_errorp, int *sys_errorp);
822 static boolean_t tcp_allow_connopt_set(int level, int name);
823 int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
824 int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
825 int tcp_opt_set(queue_t *q, uint_t optset_context, int level,
826 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
827 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
828 mblk_t *mblk);
829 static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
830 static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
831 uchar_t *ptr, uint_t len);
832 static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
833 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
834 tcp_stack_t *);
835 static int tcp_param_set(queue_t *q, mblk_t *mp, char *value,
836 caddr_t cp, cred_t *cr);
837 static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
838 caddr_t cp, cred_t *cr);
839 static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
840 static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
841 caddr_t cp, cred_t *cr);
842 static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
843 static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
844 static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
845 static void tcp_reinit(tcp_t *tcp);
846 static void tcp_reinit_values(tcp_t *tcp);
847 static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
848 tcp_t *thisstream, cred_t *cr);
849
850 static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp);
851 static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
852 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
853 static void tcp_ss_rexmit(tcp_t *tcp);
854 static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
855 static void tcp_process_options(tcp_t *, tcph_t *);
856 static void tcp_rput_common(tcp_t *tcp, mblk_t *mp);
857 static void tcp_rsrv(queue_t *q);
858 static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
859 static int tcp_snmp_state(tcp_t *tcp);
860 static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
861 cred_t *cr);
862 static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
863 cred_t *cr);
864 static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
865 cred_t *cr);
866 static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
867 cred_t *cr);
868 static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
869 cred_t *cr);
870 static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
871 caddr_t cp, cred_t *cr);
872 static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
873 caddr_t cp, cred_t *cr);
874 static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
875 cred_t *cr);
876 static void tcp_timer(void *arg);
877 static void tcp_timer_callback(void *);
878 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
879 boolean_t random);
880 static in_port_t tcp_get_next_priv_port(const tcp_t *);
881 static void tcp_wput_sock(queue_t *q, mblk_t *mp);
882 void tcp_wput_accept(queue_t *q, mblk_t *mp);
883 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
884 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
885 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
886 static int tcp_send(queue_t *q, tcp_t *tcp, const int mss,
887 const int tcp_hdr_len, const int tcp_tcp_hdr_len,
888 const int num_sack_blk, int *usable, uint_t *snxt,
889 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
890 const int mdt_thres);
891 static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
892 const int tcp_hdr_len, const int tcp_tcp_hdr_len,
893 const int num_sack_blk, int *usable, uint_t *snxt,
894 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
895 const int mdt_thres);
896 static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
897 int num_sack_blk);
898 static void tcp_wsrv(queue_t *q);
899 static int tcp_xmit_end(tcp_t *tcp);
900 static void tcp_ack_timer(void *arg);
901 static mblk_t *tcp_ack_mp(tcp_t *tcp);
902 static void tcp_xmit_early_reset(char *str, mblk_t *mp,
903 uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
904 zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
905 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
906 uint32_t ack, int ctl);
907 static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *);
908 static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *);
909 static int setmaxps(queue_t *q, int maxpsz);
910 static void tcp_set_rto(tcp_t *, time_t);
911 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
912 boolean_t, boolean_t);
913 static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
914 boolean_t ipsec_mctl);
915 static mblk_t *tcp_setsockopt_mp(int level, int cmd,
916 char *opt, int optlen);
917 static int tcp_build_hdrs(queue_t *, tcp_t *);
918 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
919 uint32_t seg_seq, uint32_t seg_ack, int seg_len,
920 tcph_t *tcph);
921 boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
922 boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *);
923 boolean_t tcp_reserved_port_del(in_port_t, in_port_t);
924 boolean_t tcp_reserved_port_check(in_port_t, tcp_stack_t *);
925 static tcp_t *tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *);
926 static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
927 static mblk_t *tcp_mdt_info_mp(mblk_t *);
928 static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
929 static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
930 const boolean_t, const uint32_t, const uint32_t,
931 const uint32_t, const uint32_t, tcp_stack_t *);
932 static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
933 const uint_t, const uint_t, boolean_t *);
934 static mblk_t *tcp_lso_info_mp(mblk_t *);
935 static void tcp_lso_update(tcp_t *, ill_lso_capab_t *);
936 static void tcp_send_data(tcp_t *, queue_t *, mblk_t *);
937 extern mblk_t *tcp_timermp_alloc(int);
938 extern void tcp_timermp_free(tcp_t *);
939 static void tcp_timer_free(tcp_t *tcp, mblk_t *mp);
940 static void tcp_stop_lingering(tcp_t *tcp);
941 static void tcp_close_linger_timeout(void *arg);
942 static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns);
943 static void tcp_stack_shutdown(netstackid_t stackid, void *arg);
944 static void tcp_stack_fini(netstackid_t stackid, void *arg);
945 static void *tcp_g_kstat_init(tcp_g_stat_t *);
946 static void tcp_g_kstat_fini(kstat_t *);
947 static void *tcp_kstat_init(netstackid_t, tcp_stack_t *);
948 static void tcp_kstat_fini(netstackid_t, kstat_t *);
949 static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *);
950 static void tcp_kstat2_fini(netstackid_t, kstat_t *);
951 static int tcp_kstat_update(kstat_t *kp, int rw);
952 void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
953 static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
954 tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
955 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
956 tcph_t *tcph, mblk_t *idmp);
957 static squeue_func_t tcp_squeue_switch(int);
958
959 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
960 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
961 static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
962 static int tcp_close(queue_t *, int);
963 static int tcpclose_accept(queue_t *);
964
965 static void tcp_squeue_add(squeue_t *);
966 static boolean_t tcp_zcopy_check(tcp_t *);
967 static void tcp_zcopy_notify(tcp_t *);
968 static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *);
969 static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
970 static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
971
972 extern void tcp_kssl_input(tcp_t *, mblk_t *);
973
974 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
975 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
976
977 /*
978 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
979 *
980 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
981 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
982 * (defined in tcp.h) needs to be filled in and passed into the kernel
983 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
984 * structure contains the four-tuple of a TCP connection and a range of TCP
985 * states (specified by ac_start and ac_end). The use of wildcard addresses
986 * and ports is allowed. Connections with a matching four tuple and a state
987 * within the specified range will be aborted. The valid states for the
988 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
989 * inclusive.
990 *
991 * An application which has its connection aborted by this ioctl will receive
992 * an error that is dependent on the connection state at the time of the abort.
993 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
994 * though a RST packet has been received. If the connection state is equal to
995 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
996 * and all resources associated with the connection will be freed.
997 */
998 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
999 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
1000 static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
1001 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
1002 static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
1003 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
1004 boolean_t, tcp_stack_t *);
1005
1006 static struct module_info tcp_rinfo = {
1007 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
1008 };
1009
1010 static struct module_info tcp_winfo = {
1011 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
1012 };
1013
1014 /*
1015 * Entry points for TCP as a device. The normal case which supports
1016 * the TCP functionality.
1017 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
1018 */
1019 struct qinit tcp_rinitv4 = {
1020 NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo
1021 };
1022
1023 struct qinit tcp_rinitv6 = {
1024 NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo
1025 };
1026
1027 struct qinit tcp_winit = {
1028 (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1029 };
1030
1031 /* Initial entry point for TCP in socket mode. */
1032 struct qinit tcp_sock_winit = {
1033 (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1034 };
1035
1036 /*
1037 * Entry points for TCP as a acceptor STREAM opened by sockfs when doing
1038 * an accept. Avoid allocating data structures since eager has already
1039 * been created.
1040 */
1041 struct qinit tcp_acceptor_rinit = {
1042 NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
1043 };
1044
1045 struct qinit tcp_acceptor_winit = {
1046 (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
1047 };
1048
1049 /*
1050 * Entry points for TCP loopback (read side only)
1051 * The open routine is only used for reopens, thus no need to
1052 * have a separate one for tcp_openv6.
1053 */
1054 struct qinit tcp_loopback_rinit = {
1055 (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0,
1056 &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
1057 };
1058
1059 /* For AF_INET aka /dev/tcp */
1060 struct streamtab tcpinfov4 = {
1061 &tcp_rinitv4, &tcp_winit
1062 };
1063
1064 /* For AF_INET6 aka /dev/tcp6 */
1065 struct streamtab tcpinfov6 = {
1066 &tcp_rinitv6, &tcp_winit
1067 };
1068
1069 /*
1070 * Have to ensure that tcp_g_q_close is not done by an
1071 * interrupt thread.
1072 */
1073 static taskq_t *tcp_taskq;
1074
/*
 * TCP has a private interface for other kernel modules to reserve a
 * port range for them to use.  Once reserved, TCP will not use any ports
 * in the range.  This interface relies on the TCP_EXCLBIND feature.  If
 * the semantics of TCP_EXCLBIND are changed, the implementation of this
 * interface has to be verified.
 *
 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges.  Each port
 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports.  A port
 * range is [port a, port b] inclusive.  And each port range is between
 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
 *
 * Note that the default anonymous port range starts from 32768.  There is
 * no port "collision" between that and the reserved port range.  If there
 * is a port collision (because the default smallest anonymous port is
 * lowered or some apps specifically bind to ports in the reserved port
 * range), the system may not be able to reserve a port range even if there
 * are enough unbound ports, as a reserved port range contains consecutive
 * ports.
 */
#define	TCP_RESERVED_PORTS_ARRAY_MAX_SIZE	5	/* number of ranges */
#define	TCP_RESERVED_PORTS_RANGE_MAX		1000	/* ports per range */
#define	TCP_SMALLEST_RESERVED_PORT		10240	/* inclusive */
#define	TCP_LARGEST_RESERVED_PORT		20480	/* inclusive */
1098
1099 /* Structure to represent those reserved port ranges. */
1100 typedef struct tcp_rport_s {
1101 in_port_t lo_port;
1102 in_port_t hi_port;
1103 tcp_t **temp_tcp_array;
1104 } tcp_rport_t;
1105
1106 /* Setable only in /etc/system. Move to ndd? */
1107 boolean_t tcp_icmp_source_quench = B_FALSE;
1108
/*
 * Round x up to the next multiple of sizeof (int32_t).
 * The following assumes TPI alignment requirements stay along 32 bit
 * boundaries.
 */
#define	ROUNDUP32(x)							\
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
1115
1116 /* Template for response to info request. */
1117 static struct T_info_ack tcp_g_t_info_ack = {
1118 T_INFO_ACK, /* PRIM_type */
1119 0, /* TSDU_size */
1120 T_INFINITE, /* ETSDU_size */
1121 T_INVALID, /* CDATA_size */
1122 T_INVALID, /* DDATA_size */
1123 sizeof (sin_t), /* ADDR_size */
1124 0, /* OPT_size - not initialized here */
1125 TIDUSZ, /* TIDU_size */
1126 T_COTS_ORD, /* SERV_type */
1127 TCPS_IDLE, /* CURRENT_state */
1128 (XPG4_1|EXPINLINE) /* PROVIDER_flag */
1129 };
1130
1131 static struct T_info_ack tcp_g_t_info_ack_v6 = {
1132 T_INFO_ACK, /* PRIM_type */
1133 0, /* TSDU_size */
1134 T_INFINITE, /* ETSDU_size */
1135 T_INVALID, /* CDATA_size */
1136 T_INVALID, /* DDATA_size */
1137 sizeof (sin6_t), /* ADDR_size */
1138 0, /* OPT_size - not initialized here */
1139 TIDUSZ, /* TIDU_size */
1140 T_COTS_ORD, /* SERV_type */
1141 TCPS_IDLE, /* CURRENT_state */
1142 (XPG4_1|EXPINLINE) /* PROVIDER_flag */
1143 };
1144
/* Time units, each built on the previous one; MS is the base unit. */
#define	MS		1L
#define	SECONDS		(1000 * MS)
#define	MINUTES		(60 * SECONDS)
#define	HOURS		(60 * MINUTES)
#define	DAYS		(24 * HOURS)

/* All bits set in a uint32_t. */
#define	PARAM_MAX	(~(uint32_t)0)

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)
1161
1162 /*
1163 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
1164 * layer header. It has to be a multiple of 4.
1165 */
1166 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
1167 #define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val
1168
1169 /*
1170 * All of these are alterable, within the min/max values given, at run time.
1171 * Note that the default value of "tcp_time_wait_interval" is four minutes,
1172 * per the TCP spec.
1173 */
1174 /* BEGIN CSTYLED */
1175 static tcpparam_t lcl_tcp_param_arr[] = {
1176 /*min max value name */
1177 { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"},
1178 { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" },
1179 { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" },
1180 { 1, 1024, 1, "tcp_conn_req_min" },
1181 { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" },
1182 { 128, (1<<30), 1024*1024, "tcp_cwnd_max" },
1183 { 0, 10, 0, "tcp_debug" },
1184 { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"},
1185 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"},
1186 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"},
1187 { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"},
1188 { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"},
1189 { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"},
1190 { 1, 255, 64, "tcp_ipv4_ttl"},
1191 { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"},
1192 { 0, 100, 10, "tcp_maxpsz_multiplier" },
1193 { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"},
1194 { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
1195 { 1, TCP_MSS_MAX, 108, "tcp_mss_min"},
1196 { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"},
1197 { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"},
1198 { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"},
1199 { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"},
1200 { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" },
1201 { 0, 16, 0, "tcp_snd_lowat_fraction" },
1202 { 0, 128000, 0, "tcp_sth_rcv_hiwat" },
1203 { 0, 128000, 0, "tcp_sth_rcv_lowat" },
1204 { 1, 10000, 3, "tcp_dupack_fast_retransmit" },
1205 { 0, 1, 0, "tcp_ignore_path_mtu" },
1206 { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"},
1207 { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"},
1208 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
1209 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
1210 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
1211 { 1, 65536, 4, "tcp_recv_hiwat_minmss"},
1212 { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"},
1213 { 0, TCP_MSS_MAX, 64, "tcp_co_min"},
1214 { 8192, (1<<30), 1024*1024, "tcp_max_buf"},
1215 /*
1216 * Question: What default value should I set for tcp_strong_iss?
1217 */
1218 { 0, 2, 1, "tcp_strong_iss"},
1219 { 0, 65536, 20, "tcp_rtt_updates"},
1220 { 0, 1, 1, "tcp_wscale_always"},
1221 { 0, 1, 0, "tcp_tstamp_always"},
1222 { 0, 1, 1, "tcp_tstamp_if_wscale"},
1223 { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"},
1224 { 0, 16, 2, "tcp_deferred_acks_max"},
1225 { 1, 16384, 4, "tcp_slow_start_after_idle"},
1226 { 1, 4, 4, "tcp_slow_start_initial"},
1227 { 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"},
1228 { 0, 2, 2, "tcp_sack_permitted"},
1229 { 0, 1, 0, "tcp_trace"},
1230 { 0, 1, 1, "tcp_compression_enabled"},
1231 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
1232 { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"},
1233 { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
1234 { 0, 1, 0, "tcp_rev_src_routes"},
1235 { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"},
1236 { 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"},
1237 { 0, 16, 8, "tcp_local_dacks_max"},
1238 { 0, 2, 1, "tcp_ecn_permitted"},
1239 { 0, 1, 1, "tcp_rst_sent_rate_enabled"},
1240 { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"},
1241 { 0, 100*MS, 50*MS, "tcp_push_timer_interval"},
1242 { 0, 1, 0, "tcp_use_smss_as_mss_opt"},
1243 { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"},
1244 };
1245 /* END CSTYLED */
1246
1247 /*
1248 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
1249 * each header fragment in the header buffer. Each parameter value has
1250 * to be a multiple of 4 (32-bit aligned).
1251 */
1252 static tcpparam_t lcl_tcp_mdt_head_param =
1253 { 32, 256, 32, "tcp_mdt_hdr_head_min" };
1254 static tcpparam_t lcl_tcp_mdt_tail_param =
1255 { 0, 256, 32, "tcp_mdt_hdr_tail_min" };
1256 #define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val
1257 #define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val
1258
1259 /*
1260 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
1261 * the maximum number of payload buffers associated per Multidata.
1262 */
1263 static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
1264 { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
1265 #define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val
1266
/*
 * Round value up to the nearest multiple of mss.  A value that is already
 * a multiple of mss is returned unchanged.
 * NOTE(review): the (value) - 1 term suggests callers pass value >= 1;
 * confirm before using it with zero.
 */
#define	MSS_ROUNDUP(value, mss)						\
	((((value) - 1) / (mss) + 1) * (mss))
1269
/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 *
 * tcp - connection whose tcp_ipversion selects the header layout.
 * iph - pointer to the IP header to update; it is cast to ipha_t * for
 *	 IPv4 and to ip6_t * otherwise.
 *
 * The body is wrapped in do { } while (0) so that "SET_ECT(tcp, iph);"
 * is exactly one statement and can be used safely as the body of an
 * unbraced if/else.  The caller supplies the terminating semicolon.
 */
#define	SET_ECT(tcp, iph)						\
	do {								\
		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
			/* We need to clear the code point first. */	\
			((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
			((ipha_t *)(iph))->ipha_type_of_service |=	\
			    IPH_ECN_ECT0;				\
		} else {						\
			/* Same clear-then-set sequence on ip6_vcf. */	\
			((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF);	\
			((ip6_t *)(iph))->ip6_vcf |=			\
			    htonl(IPH_ECN_ECT0 << 20);			\
		}							\
	} while (0)
1286
/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/* Canned ndd replies. */
#define	NDD_TOO_QUICK_MSG						\
	"ndd get info rate too high for non-privileged users, "	\
	"try again later.\n"
#define	NDD_OUT_OF_BUF_MSG	"<< Out of buffer >>\n"

/* True when the mblk's data block has STRUIO_ZC set in db_struioflag. */
#define	IS_VMLOANED_MBLK(mp)						\
	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
1303
1304
1305 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
1306 boolean_t tcp_mdt_chain = B_TRUE;
1307
1308 /*
1309 * MDT threshold in the form of effective send MSS multiplier; we take
1310 * the MDT path if the amount of unsent data exceeds the threshold value
1311 * (default threshold is 1*SMSS).
1312 */
1313 uint_t tcp_mdt_smss_threshold = 1;
1314
1315 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */
1316
1317 /*
1318 * Forces all connections to obey the value of the tcps_maxpsz_multiplier
1319 * tunable settable via NDD. Otherwise, the per-connection behavior is
1320 * determined dynamically during tcp_adapt_ire(), which is the default.
1321 */
1322 boolean_t tcp_static_maxpsz = B_FALSE;
1323
1324 /* Setable in /etc/system */
1325 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
1326 uint32_t tcp_random_anon_port = 1;
1327
/*
 * To reach to an eager in Q0 which can be dropped due to an incoming
 * new SYN request when Q0 is full, a new doubly linked list is
 * introduced.  This list allows to select an eager from Q0 in O(1) time.
 * This is needed to avoid spending too much time walking through the
 * long list of eagers in Q0 when tcp_drop_q0() is called.  Each member of
 * this new list has to be a member of Q0.
 * This list is headed by listener's tcp_t.  When the list is empty,
 * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
 * of listener's tcp_t point to listener's tcp_t itself.
 *
 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
 * in the list.  MAKE_UNDROPPABLE() takes the eager out of the list.
 * These macros do not affect the eager's membership to Q0.
 *
 * An eager whose tcp_eager_next_drop_q0 is NULL is treated as not being
 * on the list, so both macros are no-ops when applied twice in a row.
 *
 * Both macros are wrapped in do { } while (0) so that each expands to a
 * single statement; a bare "if" expansion would capture a following
 * "else" written by the caller.  The caller supplies the semicolon.
 */

/* Insert eager at the front of listener's drop list, right after the head. */
#define	MAKE_DROPPABLE(listener, eager)					\
	do {								\
		if ((eager)->tcp_eager_next_drop_q0 == NULL) {		\
			(listener)->tcp_eager_next_drop_q0->		\
			    tcp_eager_prev_drop_q0 = (eager);		\
			(eager)->tcp_eager_prev_drop_q0 = (listener);	\
			(eager)->tcp_eager_next_drop_q0 =		\
			    (listener)->tcp_eager_next_drop_q0;		\
			(listener)->tcp_eager_next_drop_q0 = (eager);	\
		}							\
	} while (0)

/* Unlink eager from the drop list and clear both of its link pointers. */
#define	MAKE_UNDROPPABLE(eager)						\
	do {								\
		if ((eager)->tcp_eager_next_drop_q0 != NULL) {		\
			(eager)->tcp_eager_next_drop_q0->		\
			    tcp_eager_prev_drop_q0 =			\
			    (eager)->tcp_eager_prev_drop_q0;		\
			(eager)->tcp_eager_prev_drop_q0->		\
			    tcp_eager_next_drop_q0 =			\
			    (eager)->tcp_eager_next_drop_q0;		\
			(eager)->tcp_eager_prev_drop_q0 = NULL;		\
			(eager)->tcp_eager_next_drop_q0 = NULL;		\
		}							\
	} while (0)
1364
1365 /*
1366 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
1367 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
1368 * data, TCP will not respond with an ACK. RFC 793 requires that
1369 * TCP responds with an ACK for such a bogus ACK. By not following
1370 * the RFC, we prevent TCP from getting into an ACK storm if somehow
1371 * an attacker successfully spoofs an acceptable segment to our
1372 * peer; or when our peer is "confused."
1373 */
1374 uint32_t tcp_drop_ack_unsent_cnt = 10;
1375
1376 /*
1377 * Hook functions to enable cluster networking
1378 * On non-clustered systems these vectors must always be NULL.
1379 */
1380
1381 void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family,
1382 uint8_t *laddrp, in_port_t lport) = NULL;
1383 void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family,
1384 uint8_t *laddrp, in_port_t lport) = NULL;
1385 void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family,
1386 uint8_t *laddrp, in_port_t lport,
1387 uint8_t *faddrp, in_port_t fport) = NULL;
1388 void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family,
1389 uint8_t *laddrp, in_port_t lport,
1390 uint8_t *faddrp, in_port_t fport) = NULL;
1391
1392 /*
1393 * The following are defined in ip.c
1394 */
1395 extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family,
1396 uint8_t *laddrp);
1397 extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
1398 uint8_t *laddrp, uint8_t *faddrp);
1399
/*
 * CL_INET_CONNECT(tcp) / CL_INET_DISCONNECT(tcp)
 *
 * Invoke the cluster networking hook, if one is installed, to register
 * or deregister an active connection.  Nothing happens when the hook
 * vector is NULL or when the connection's source address is unspecified
 * (zero for IPv4, IN6_IS_ADDR_UNSPECIFIED for IPv6).
 *
 * Note the asymmetry: CL_INET_CONNECT takes the source address from the
 * header template (tcp_ipha->ipha_src / tcp_ip6h->ip6_src), whereas
 * CL_INET_DISCONNECT takes it from tcp_ip_src / tcp_ip_src_v6.  The
 * destination address comes from the header template in both cases.
 *
 * Both macros are wrapped in do { } while (0) so that
 * "CL_INET_CONNECT(tcp);" is exactly one statement; a bare { } block
 * followed by the caller's semicolon breaks an unbraced if/else.
 * The caller supplies the terminating semicolon.
 */
#define	CL_INET_CONNECT(tcp)	do {					\
	if (cl_inet_connect != NULL) {					\
		/*							\
		 * Running in cluster mode - register active connection	\
		 * information						\
		 */							\
		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
			if ((tcp)->tcp_ipha->ipha_src != 0) {		\
				(*cl_inet_connect)(IPPROTO_TCP, AF_INET, \
				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)), \
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)), \
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		} else {						\
			if (!IN6_IS_ADDR_UNSPECIFIED(			\
			    &(tcp)->tcp_ip6h->ip6_src)) {		\
				(*cl_inet_connect)(IPPROTO_TCP, AF_INET6, \
				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)), \
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)), \
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		}							\
	}								\
} while (0)

#define	CL_INET_DISCONNECT(tcp)	do {					\
	if (cl_inet_disconnect != NULL) {				\
		/*							\
		 * Running in cluster mode - deregister active		\
		 * connection information				\
		 */							\
		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
			if ((tcp)->tcp_ip_src != 0) {			\
				(*cl_inet_disconnect)(IPPROTO_TCP,	\
				    AF_INET,				\
				    (uint8_t *)(&((tcp)->tcp_ip_src)),	\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ipha->ipha_dst)),	\
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		} else {						\
			if (!IN6_IS_ADDR_UNSPECIFIED(			\
			    &(tcp)->tcp_ip_src_v6)) {			\
				(*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6, \
				    (uint8_t *)(&((tcp)->tcp_ip_src_v6)), \
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ip6h->ip6_dst)),	\
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		}							\
	}								\
} while (0)
1456
1457 /*
1458 * Cluster networking hook for traversing current connection list.
1459 * This routine is used to extract the current list of live connections
 * which must continue to be dispatched to this node.
1461 */
1462 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
1463
1464 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
1465 void *arg, tcp_stack_t *tcps);
1466
/*
 * Fire the "send" IP DTrace probe via DTRACE_IP7.  The conn_t argument is
 * always passed as NULL and the final int argument as 0; the remaining
 * arguments are forwarded from the caller.
 *
 * The definition deliberately does not end in a semicolon: the caller
 * writes "DTRACE_IP_FASTPATH(...);" like any other statement, so no stray
 * empty statement is introduced by the expansion.
 */
#define	DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h)			\
	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,	\
	    iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha,		\
	    ip6_t *, ip6h, int, 0)
1471
1472 /*
1473 * Figure out the value of window scale opton. Note that the rwnd is
1474 * ASSUMED to be rounded up to the nearest MSS before the calculation.
1475 * We cannot find the scale value and then do a round up of tcp_rwnd
1476 * because the scale value may not be correct after that.
1477 *
1478 * Set the compiler flag to make this function inline.
1479 */
1480 static void
1481 tcp_set_ws_value(tcp_t *tcp)
1482 {
1483 int i;
1484 uint32_t rwnd = tcp->tcp_rwnd;
1485
1486 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
1487 i++, rwnd >>= 1)
1488 ;
1489 tcp->tcp_rcv_ws = i;
1490 }
1491
1492 /*
1493 * Remove a connection from the list of detached TIME_WAIT connections.
1494 * It returns B_FALSE if it can't remove the connection from the list
1495 * as the connection has already been removed from the list due to an
1496 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
1497 */
1498 static boolean_t
1499 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
1500 {
1501 boolean_t locked = B_FALSE;
1502
1503 if (tcp_time_wait == NULL) {
1504 tcp_time_wait = *((tcp_squeue_priv_t **)
1505 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
1506 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1507 locked = B_TRUE;
1508 } else {
1509 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
1510 }
1511
1512 if (tcp->tcp_time_wait_expire == 0) {
1513 ASSERT(tcp->tcp_time_wait_next == NULL);
1514 ASSERT(tcp->tcp_time_wait_prev == NULL);
1515 if (locked)
1516 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1517 return (B_FALSE);
1518 }
1519 ASSERT(TCP_IS_DETACHED(tcp));
1520 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1521
1522 if (tcp == tcp_time_wait->tcp_time_wait_head) {
1523 ASSERT(tcp->tcp_time_wait_prev == NULL);
1524 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
1525 if (tcp_time_wait->tcp_time_wait_head != NULL) {
1526 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
1527 NULL;
1528 } else {
1529 tcp_time_wait->tcp_time_wait_tail = NULL;
1530 }
1531 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
1532 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
1533 ASSERT(tcp->tcp_time_wait_next == NULL);
1534 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
1535 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1536 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
1537 } else {
1538 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
1539 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
1540 tcp->tcp_time_wait_prev->tcp_time_wait_next =
1541 tcp->tcp_time_wait_next;
1542 tcp->tcp_time_wait_next->tcp_time_wait_prev =
1543 tcp->tcp_time_wait_prev;
1544 }
1545 tcp->tcp_time_wait_next = NULL;
1546 tcp->tcp_time_wait_prev = NULL;
1547 tcp->tcp_time_wait_expire = 0;
1548
1549 if (locked)
1550 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1551 return (B_TRUE);
1552 }
1553
1554 /*
1555 * Add a connection to the list of detached TIME_WAIT connections
1556 * and set its time to expire.
1557 */
1558 static void
1559 tcp_time_wait_append(tcp_t *tcp)
1560 {
1561 tcp_stack_t *tcps = tcp->tcp_tcps;
1562 tcp_squeue_priv_t *tcp_time_wait =
1563 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
1564 SQPRIVATE_TCP));
1565
1566 tcp_timers_stop(tcp);
1567
1568 /* Freed above */
1569 ASSERT(tcp->tcp_timer_tid == 0);
1570 ASSERT(tcp->tcp_ack_tid == 0);
1571
1572 /* must have happened at the time of detaching the tcp */
1573 ASSERT(tcp->tcp_ptpahn == NULL);
1574 ASSERT(tcp->tcp_flow_stopped == 0);
1575 ASSERT(tcp->tcp_time_wait_next == NULL);
1576 ASSERT(tcp->tcp_time_wait_prev == NULL);
1577 ASSERT(tcp->tcp_time_wait_expire == NULL);
1578 ASSERT(tcp->tcp_listener == NULL);
1579
1580 tcp->tcp_time_wait_expire = ddi_get_lbolt();
1581 /*
1582 * The value computed below in tcp->tcp_time_wait_expire may
1583 * appear negative or wrap around. That is ok since our
1584 * interest is only in the difference between the current lbolt
1585 * value and tcp->tcp_time_wait_expire. But the value should not
1586 * be zero, since it means the tcp is not in the TIME_WAIT list.
1587 * The corresponding comparison in tcp_time_wait_collector() uses
1588 * modular arithmetic.
1589 */
1590 tcp->tcp_time_wait_expire +=
1591 drv_usectohz(tcps->tcps_time_wait_interval * 1000);
1592 if (tcp->tcp_time_wait_expire == 0)
1593 tcp->tcp_time_wait_expire = 1;
1594
1595 ASSERT(TCP_IS_DETACHED(tcp));
1596 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1597 ASSERT(tcp->tcp_time_wait_next == NULL);
1598 ASSERT(tcp->tcp_time_wait_prev == NULL);
1599 TCP_DBGSTAT(tcps, tcp_time_wait);
1600
1601 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1602 if (tcp_time_wait->tcp_time_wait_head == NULL) {
1603 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
1604 tcp_time_wait->tcp_time_wait_head = tcp;
1605 } else {
1606 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1607 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
1608 TCPS_TIME_WAIT);
1609 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
1610 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
1611 }
1612 tcp_time_wait->tcp_time_wait_tail = tcp;
1613 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1614 }
1615
1616 /* ARGSUSED */
1617 void
1618 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
1619 {
1620 conn_t *connp = (conn_t *)arg;
1621 tcp_t *tcp = connp->conn_tcp;
1622 tcp_stack_t *tcps = tcp->tcp_tcps;
1623
1624 ASSERT(tcp != NULL);
1625 if (tcp->tcp_state == TCPS_CLOSED) {
1626 return;
1627 }
1628
1629 ASSERT((tcp->tcp_family == AF_INET &&
1630 tcp->tcp_ipversion == IPV4_VERSION) ||
1631 (tcp->tcp_family == AF_INET6 &&
1632 (tcp->tcp_ipversion == IPV4_VERSION ||
1633 tcp->tcp_ipversion == IPV6_VERSION)));
1634 ASSERT(!tcp->tcp_listener);
1635
1636 TCP_STAT(tcps, tcp_time_wait_reap);
1637 ASSERT(TCP_IS_DETACHED(tcp));
1638
1639 /*
1640 * Because they have no upstream client to rebind or tcp_close()
1641 * them later, we axe the connection here and now.
1642 */
1643 tcp_close_detached(tcp);
1644 }
1645
1646 /*
1647 * Remove cached/latched IPsec references.
1648 */
1649 void
1650 tcp_ipsec_cleanup(tcp_t *tcp)
1651 {
1652 conn_t *connp = tcp->tcp_connp;
1653
1654 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1655
1656 if (connp->conn_latch != NULL) {
1657 IPLATCH_REFRELE(connp->conn_latch,
1658 connp->conn_netstack);
1659 connp->conn_latch = NULL;
1660 }
1661 if (connp->conn_policy != NULL) {
1662 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
1663 connp->conn_policy = NULL;
1664 }
1665 }
1666
1667 /*
1668 * Cleaup before placing on free list.
1669 * Disassociate from the netstack/tcp_stack_t since the freelist
1670 * is per squeue and not per netstack.
1671 */
1672 void
1673 tcp_cleanup(tcp_t *tcp)
1674 {
1675 mblk_t *mp;
1676 char *tcp_iphc;
1677 int tcp_iphc_len;
1678 int tcp_hdr_grown;
1679 tcp_sack_info_t *tcp_sack_info;
1680 conn_t *connp = tcp->tcp_connp;
1681 tcp_stack_t *tcps = tcp->tcp_tcps;
1682 netstack_t *ns = tcps->tcps_netstack;
1683
1684 tcp_bind_hash_remove(tcp);
1685
1686 /* Cleanup that which needs the netstack first */
1687 tcp_ipsec_cleanup(tcp);
1688
1689 tcp_free(tcp);
1690
1691 /* Release any SSL context */
1692 if (tcp->tcp_kssl_ent != NULL) {
1693 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
1694 tcp->tcp_kssl_ent = NULL;
1695 }
1696
1697 if (tcp->tcp_kssl_ctx != NULL) {
1698 kssl_release_ctx(tcp->tcp_kssl_ctx);
1699 tcp->tcp_kssl_ctx = NULL;
1700 }
1701 tcp->tcp_kssl_pending = B_FALSE;
1702
1703 conn_delete_ire(connp, NULL);
1704
1705 /*
1706 * Since we will bzero the entire structure, we need to
1707 * remove it and reinsert it in global hash list. We
1708 * know the walkers can't get to this conn because we
1709 * had set CONDEMNED flag earlier and checked reference
1710 * under conn_lock so walker won't pick it and when we
1711 * go the ipcl_globalhash_remove() below, no walker
1712 * can get to it.
1713 */
1714 ipcl_globalhash_remove(connp);
1715
1716 /*
1717 * Now it is safe to decrement the reference counts.
1718 * This might be the last reference on the netstack and TCPS
1719 * in which case it will cause the tcp_g_q_close and
1720 * the freeing of the IP Instance.
1721 */
1722 connp->conn_netstack = NULL;
1723 netstack_rele(ns);
1724 ASSERT(tcps != NULL);
1725 tcp->tcp_tcps = NULL;
1726 TCPS_REFRELE(tcps);
1727
1728 /* Save some state */
1729 mp = tcp->tcp_timercache;
1730
1731 tcp_sack_info = tcp->tcp_sack_info;
1732 tcp_iphc = tcp->tcp_iphc;
1733 tcp_iphc_len = tcp->tcp_iphc_len;
1734 tcp_hdr_grown = tcp->tcp_hdr_grown;
1735
1736 if (connp->conn_cred != NULL) {
1737 crfree(connp->conn_cred);
1738 connp->conn_cred = NULL;
1739 }
1740 if (connp->conn_peercred != NULL) {
1741 crfree(connp->conn_peercred);
1742 connp->conn_peercred = NULL;
1743 }
1744 ipcl_conn_cleanup(connp);
1745 connp->conn_flags = IPCL_TCPCONN;
1746 bzero(tcp, sizeof (tcp_t));
1747
1748 /* restore the state */
1749 tcp->tcp_timercache = mp;
1750
1751 tcp->tcp_sack_info = tcp_sack_info;
1752 tcp->tcp_iphc = tcp_iphc;
1753 tcp->tcp_iphc_len = tcp_iphc_len;
1754 tcp->tcp_hdr_grown = tcp_hdr_grown;
1755
1756 tcp->tcp_connp = connp;
1757
1758 ASSERT(connp->conn_tcp == tcp);
1759 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1760 connp->conn_state_flags = CONN_INCIPIENT;
1761 ASSERT(connp->conn_ulp == IPPROTO_TCP);
1762 ASSERT(connp->conn_ref == 1);
1763 }
1764
1765 /*
1766 * Blows away all tcps whose TIME_WAIT has expired. List traversal
1767 * is done forwards from the head.
1768 * This walks all stack instances since
1769 * tcp_time_wait remains global across all stacks.
1770 */
1771 /* ARGSUSED */
1772 void
1773 tcp_time_wait_collector(void *arg)
1774 {
1775 tcp_t *tcp;
1776 clock_t now;
1777 mblk_t *mp;
1778 conn_t *connp;
1779 kmutex_t *lock;
1780 boolean_t removed;
1781
1782 squeue_t *sqp = (squeue_t *)arg;
1783 tcp_squeue_priv_t *tcp_time_wait =
1784 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1785
1786 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1787 tcp_time_wait->tcp_time_wait_tid = 0;
1788
1789 if (tcp_time_wait->tcp_free_list != NULL &&
1790 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
1791 TCP_G_STAT(tcp_freelist_cleanup);
1792 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
1793 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1794 tcp->tcp_time_wait_next = NULL;
1795 tcp_time_wait->tcp_free_list_cnt--;
1796 ASSERT(tcp->tcp_tcps == NULL);
1797 CONN_DEC_REF(tcp->tcp_connp);
1798 }
1799 ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
1800 }
1801
1802 /*
1803 * In order to reap time waits reliably, we should use a
1804 * source of time that is not adjustable by the user -- hence
1805 * the call to ddi_get_lbolt().
1806 */
1807 now = ddi_get_lbolt();
1808 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
1809 /*
1810 * Compare times using modular arithmetic, since
1811 * lbolt can wrapover.
1812 */
1813 if ((now - tcp->tcp_time_wait_expire) < 0) {
1814 break;
1815 }
1816
1817 removed = tcp_time_wait_remove(tcp, tcp_time_wait);
1818 ASSERT(removed);
1819
1820 connp = tcp->tcp_connp;
1821 ASSERT(connp->conn_fanout != NULL);
1822 lock = &connp->conn_fanout->connf_lock;
1823 /*
1824 * This is essentially a TW reclaim fast path optimization for
1825 * performance where the timewait collector checks under the
1826 * fanout lock (so that no one else can get access to the
1827 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
1828 * the classifier hash list. If ref count is indeed 2, we can
1829 * just remove the conn under the fanout lock and avoid
1830 * cleaning up the conn under the squeue, provided that
1831 * clustering callbacks are not enabled. If clustering is
1832 * enabled, we need to make the clustering callback before
1833 * setting the CONDEMNED flag and after dropping all locks and
1834 * so we forego this optimization and fall back to the slow
1835 * path. Also please see the comments in tcp_closei_local
1836 * regarding the refcnt logic.
1837 *
1838 * Since we are holding the tcp_time_wait_lock, its better
1839 * not to block on the fanout_lock because other connections
1840 * can't add themselves to time_wait list. So we do a
1841 * tryenter instead of mutex_enter.
1842 */
1843 if (mutex_tryenter(lock)) {
1844 mutex_enter(&connp->conn_lock);
1845 if ((connp->conn_ref == 2) &&
1846 (cl_inet_disconnect == NULL)) {
1847 ipcl_hash_remove_locked(connp,
1848 connp->conn_fanout);
1849 /*
1850 * Set the CONDEMNED flag now itself so that
1851 * the refcnt cannot increase due to any
1852 * walker. But we have still not cleaned up
1853 * conn_ire_cache. This is still ok since
1854 * we are going to clean it up in tcp_cleanup
1855 * immediately and any interface unplumb
1856 * thread will wait till the ire is blown away
1857 */
1858 connp->conn_state_flags |= CONN_CONDEMNED;
1859 mutex_exit(lock);
1860 mutex_exit(&connp->conn_lock);
1861 if (tcp_time_wait->tcp_free_list_cnt <
1862 tcp_free_list_max_cnt) {
1863 /* Add to head of tcp_free_list */
1864 mutex_exit(
1865 &tcp_time_wait->tcp_time_wait_lock);
1866 tcp_cleanup(tcp);
1867 ASSERT(connp->conn_latch == NULL);
1868 ASSERT(connp->conn_policy == NULL);
1869 ASSERT(tcp->tcp_tcps == NULL);
1870 ASSERT(connp->conn_netstack == NULL);
1871
1872 mutex_enter(
1873 &tcp_time_wait->tcp_time_wait_lock);
1874 tcp->tcp_time_wait_next =
1875 tcp_time_wait->tcp_free_list;
1876 tcp_time_wait->tcp_free_list = tcp;
1877 tcp_time_wait->tcp_free_list_cnt++;
1878 continue;
1879 } else {
1880 /* Do not add to tcp_free_list */
1881 mutex_exit(
1882 &tcp_time_wait->tcp_time_wait_lock);
1883 tcp_bind_hash_remove(tcp);
1884 conn_delete_ire(tcp->tcp_connp, NULL);
1885 tcp_ipsec_cleanup(tcp);
1886 CONN_DEC_REF(tcp->tcp_connp);
1887 }
1888 } else {
1889 CONN_INC_REF_LOCKED(connp);
1890 mutex_exit(lock);
1891 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1892 mutex_exit(&connp->conn_lock);
1893 /*
1894 * We can reuse the closemp here since conn has
1895 * detached (otherwise we wouldn't even be in
1896 * time_wait list). tcp_closemp_used can safely
1897 * be changed without taking a lock as no other
1898 * thread can concurrently access it at this
1899 * point in the connection lifecycle.
1900 */
1901
1902 if (tcp->tcp_closemp.b_prev == NULL)
1903 tcp->tcp_closemp_used = B_TRUE;
1904 else
1905 cmn_err(CE_PANIC,
1906 "tcp_timewait_collector: "
1907 "concurrent use of tcp_closemp: "
1908 "connp %p tcp %p\n", (void *)connp,
1909 (void *)tcp);
1910
1911 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1912 mp = &tcp->tcp_closemp;
1913 squeue_fill(connp->conn_sqp, mp,
1914 tcp_timewait_output, connp,
1915 SQTAG_TCP_TIMEWAIT);
1916 }
1917 } else {
1918 mutex_enter(&connp->conn_lock);
1919 CONN_INC_REF_LOCKED(connp);
1920 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1921 mutex_exit(&connp->conn_lock);
1922 /*
1923 * We can reuse the closemp here since conn has
1924 * detached (otherwise we wouldn't even be in
1925 * time_wait list). tcp_closemp_used can safely
1926 * be changed without taking a lock as no other
1927 * thread can concurrently access it at this
1928 * point in the connection lifecycle.
1929 */
1930
1931 if (tcp->tcp_closemp.b_prev == NULL)
1932 tcp->tcp_closemp_used = B_TRUE;
1933 else
1934 cmn_err(CE_PANIC, "tcp_timewait_collector: "
1935 "concurrent use of tcp_closemp: "
1936 "connp %p tcp %p\n", (void *)connp,
1937 (void *)tcp);
1938
1939 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1940 mp = &tcp->tcp_closemp;
1941 squeue_fill(connp->conn_sqp, mp,
1942 tcp_timewait_output, connp, 0);
1943 }
1944 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1945 }
1946
1947 if (tcp_time_wait->tcp_free_list != NULL)
1948 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
1949
1950 tcp_time_wait->tcp_time_wait_tid =
1951 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY);
1952 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1953 }
1954 /*
1955 * Reply to a clients T_CONN_RES TPI message. This function
1956 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1957 * on the acceptor STREAM and processed in tcp_wput_accept().
1958 * Read the block comment on top of tcp_conn_request().
1959 */
1960 static void
1961 tcp_accept(tcp_t *listener, mblk_t *mp)
1962 {
1963 tcp_t *acceptor;
1964 tcp_t *eager;
1965 tcp_t *tcp;
1966 struct T_conn_res *tcr;
1967 t_uscalar_t acceptor_id;
1968 t_scalar_t seqnum;
1969 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */
1970 mblk_t *ok_mp;
1971 mblk_t *mp1;
1972 tcp_stack_t *tcps = listener->tcp_tcps;
1973
1974 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1975 tcp_err_ack(listener, mp, TPROTO, 0);
1976 return;
1977 }
1978 tcr = (struct T_conn_res *)mp->b_rptr;
1979
1980 /*
1981 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1982 * read side queue of the streams device underneath us i.e. the
1983 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1984 * look it up in the queue_hash. Under LP64 it sends down the
1985 * minor_t of the accepting endpoint.
1986 *
1987 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1988 * fanout hash lock is held.
1989 * This prevents any thread from entering the acceptor queue from
1990 * below (since it has not been hard bound yet i.e. any inbound
1991 * packets will arrive on the listener or default tcp queue and
1992 * go through tcp_lookup).
1993 * The CONN_INC_REF will prevent the acceptor from closing.
1994 *
1995 * XXX It is still possible for a tli application to send down data
1996 * on the accepting stream while another thread calls t_accept.
1997 * This should not be a problem for well-behaved applications since
1998 * the T_OK_ACK is sent after the queue swapping is completed.
1999 *
2000 * If the accepting fd is the same as the listening fd, avoid
2001 * queue hash lookup since that will return an eager listener in a
2002 * already established state.
2003 */
2004 acceptor_id = tcr->ACCEPTOR_id;
2005 mutex_enter(&listener->tcp_eager_lock);
2006 if (listener->tcp_acceptor_id == acceptor_id) {
2007 eager = listener->tcp_eager_next_q;
2008 /* only count how many T_CONN_INDs so don't count q0 */
2009 if ((listener->tcp_conn_req_cnt_q != 1) ||
2010 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
2011 mutex_exit(&listener->tcp_eager_lock);
2012 tcp_err_ack(listener, mp, TBADF, 0);
2013 return;
2014 }
2015 if (listener->tcp_conn_req_cnt_q0 != 0) {
2016 /* Throw away all the eagers on q0. */
2017 tcp_eager_cleanup(listener, 1);
2018 }
2019 if (listener->tcp_syn_defense) {
2020 listener->tcp_syn_defense = B_FALSE;
2021 if (listener->tcp_ip_addr_cache != NULL) {
2022 kmem_free(listener->tcp_ip_addr_cache,
2023 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2024 listener->tcp_ip_addr_cache = NULL;
2025 }
2026 }
2027 /*
2028 * Transfer tcp_conn_req_max to the eager so that when
2029 * a disconnect occurs we can revert the endpoint to the
2030 * listen state.
2031 */
2032 eager->tcp_conn_req_max = listener->tcp_conn_req_max;
2033 ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
2034 /*
2035 * Get a reference on the acceptor just like the
2036 * tcp_acceptor_hash_lookup below.
2037 */
2038 acceptor = listener;
2039 CONN_INC_REF(acceptor->tcp_connp);
2040 } else {
2041 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
2042 if (acceptor == NULL) {
2043 if (listener->tcp_debug) {
2044 (void) strlog(TCP_MOD_ID, 0, 1,
2045 SL_ERROR|SL_TRACE,
2046 "tcp_accept: did not find acceptor 0x%x\n",
2047 acceptor_id);
2048 }
2049 mutex_exit(&listener->tcp_eager_lock);
2050 tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
2051 return;
2052 }
2053 /*
2054 * Verify acceptor state. The acceptable states for an acceptor
2055 * include TCPS_IDLE and TCPS_BOUND.
2056 */
2057 switch (acceptor->tcp_state) {
2058 case TCPS_IDLE:
2059 /* FALLTHRU */
2060 case TCPS_BOUND:
2061 break;
2062 default:
2063 CONN_DEC_REF(acceptor->tcp_connp);
2064 mutex_exit(&listener->tcp_eager_lock);
2065 tcp_err_ack(listener, mp, TOUTSTATE, 0);
2066 return;
2067 }
2068 }
2069
2070 /* The listener must be in TCPS_LISTEN */
2071 if (listener->tcp_state != TCPS_LISTEN) {
2072 CONN_DEC_REF(acceptor->tcp_connp);
2073 mutex_exit(&listener->tcp_eager_lock);
2074 tcp_err_ack(listener, mp, TOUTSTATE, 0);
2075 return;
2076 }
2077
2078 /*
2079 * Rendezvous with an eager connection request packet hanging off
2080 * 'tcp' that has the 'seqnum' tag. We tagged the detached open
2081 * tcp structure when the connection packet arrived in
2082 * tcp_conn_request().
2083 */
2084 seqnum = tcr->SEQ_number;
2085 eager = listener;
2086 do {
2087 eager = eager->tcp_eager_next_q;
2088 if (eager == NULL) {
2089 CONN_DEC_REF(acceptor->tcp_connp);
2090 mutex_exit(&listener->tcp_eager_lock);
2091 tcp_err_ack(listener, mp, TBADSEQ, 0);
2092 return;
2093 }
2094 } while (eager->tcp_conn_req_seqnum != seqnum);
2095 mutex_exit(&listener->tcp_eager_lock);
2096
2097 /*
2098 * At this point, both acceptor and listener have 2 ref
2099 * that they begin with. Acceptor has one additional ref
2100 * we placed in lookup while listener has 3 additional
2101 * ref for being behind the squeue (tcp_accept() is
2102 * done on listener's squeue); being in classifier hash;
2103 * and eager's ref on listener.
2104 */
2105 ASSERT(listener->tcp_connp->conn_ref >= 5);
2106 ASSERT(acceptor->tcp_connp->conn_ref >= 3);
2107
2108 /*
2109 * The eager at this point is set in its own squeue and
2110 * could easily have been killed (tcp_accept_finish will
2111 * deal with that) because of a TH_RST so we can only
2112 * ASSERT for a single ref.
2113 */
2114 ASSERT(eager->tcp_connp->conn_ref >= 1);
2115
2116 /* Pre allocate the stroptions mblk also */
2117 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
2118 if (opt_mp == NULL) {
2119 CONN_DEC_REF(acceptor->tcp_connp);
2120 CONN_DEC_REF(eager->tcp_connp);
2121 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2122 return;
2123 }
2124 DB_TYPE(opt_mp) = M_SETOPTS;
2125 opt_mp->b_wptr += sizeof (struct stroptions);
2126
2127 /*
2128 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
2129 * from listener to acceptor. The message is chained on opt_mp
2130 * which will be sent onto eager's squeue.
2131 */
2132 if (listener->tcp_bound_if != 0) {
2133 /* allocate optmgmt req */
2134 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2135 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
2136 sizeof (int));
2137 if (mp1 != NULL)
2138 linkb(opt_mp, mp1);
2139 }
2140 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
2141 uint_t on = 1;
2142
2143 /* allocate optmgmt req */
2144 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2145 IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
2146 if (mp1 != NULL)
2147 linkb(opt_mp, mp1);
2148 }
2149
2150 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */
2151 if ((mp1 = copymsg(mp)) == NULL) {
2152 CONN_DEC_REF(acceptor->tcp_connp);
2153 CONN_DEC_REF(eager->tcp_connp);
2154 freemsg(opt_mp);
2155 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2156 return;
2157 }
2158
2159 tcr = (struct T_conn_res *)mp1->b_rptr;
2160
2161 /*
2162 * This is an expanded version of mi_tpi_ok_ack_alloc()
2163 * which allocates a larger mblk and appends the new
2164 * local address to the ok_ack. The address is copied by
2165 * soaccept() for getsockname().
2166 */
2167 {
2168 int extra;
2169
2170 extra = (eager->tcp_family == AF_INET) ?
2171 sizeof (sin_t) : sizeof (sin6_t);
2172
2173 /*
2174 * Try to re-use mp, if possible. Otherwise, allocate
2175 * an mblk and return it as ok_mp. In any case, mp
2176 * is no longer usable upon return.
2177 */
2178 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
2179 CONN_DEC_REF(acceptor->tcp_connp);
2180 CONN_DEC_REF(eager->tcp_connp);
2181 freemsg(opt_mp);
2182 /* Original mp has been freed by now, so use mp1 */
2183 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
2184 return;
2185 }
2186
2187 mp = NULL; /* We should never use mp after this point */
2188
2189 switch (extra) {
2190 case sizeof (sin_t): {
2191 sin_t *sin = (sin_t *)ok_mp->b_wptr;
2192
2193 ok_mp->b_wptr += extra;
2194 sin->sin_family = AF_INET;
2195 sin->sin_port = eager->tcp_lport;
2196 sin->sin_addr.s_addr =
2197 eager->tcp_ipha->ipha_src;
2198 break;
2199 }
2200 case sizeof (sin6_t): {
2201 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
2202
2203 ok_mp->b_wptr += extra;
2204 sin6->sin6_family = AF_INET6;
2205 sin6->sin6_port = eager->tcp_lport;
2206 if (eager->tcp_ipversion == IPV4_VERSION) {
2207 sin6->sin6_flowinfo = 0;
2208 IN6_IPADDR_TO_V4MAPPED(
2209 eager->tcp_ipha->ipha_src,
2210 &sin6->sin6_addr);
2211 } else {
2212 ASSERT(eager->tcp_ip6h != NULL);
2213 sin6->sin6_flowinfo =
2214 eager->tcp_ip6h->ip6_vcf &
2215 ~IPV6_VERS_AND_FLOW_MASK;
2216 sin6->sin6_addr =
2217 eager->tcp_ip6h->ip6_src;
2218 }
2219 sin6->sin6_scope_id = 0;
2220 sin6->__sin6_src_id = 0;
2221 break;
2222 }
2223 default:
2224 break;
2225 }
2226 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
2227 }
2228
2229 /*
2230 * If there are no options we know that the T_CONN_RES will
2231 * succeed. However, we can't send the T_OK_ACK upstream until
2232 * the tcp_accept_swap is done since it would be dangerous to
2233 * let the application start using the new fd prior to the swap.
2234 */
2235 tcp_accept_swap(listener, acceptor, eager);
2236
2237 /*
2238 * tcp_accept_swap unlinks eager from listener but does not drop
2239 * the eager's reference on the listener.
2240 */
2241 ASSERT(eager->tcp_listener == NULL);
2242 ASSERT(listener->tcp_connp->conn_ref >= 5);
2243
2244 /*
2245 * The eager is now associated with its own queue. Insert in
2246 * the hash so that the connection can be reused for a future
2247 * T_CONN_RES.
2248 */
2249 tcp_acceptor_hash_insert(acceptor_id, eager);
2250
2251 /*
2252 * We now do the processing of options with T_CONN_RES.
2253 * We delay till now since we wanted to have queue to pass to
2254 * option processing routines that points back to the right
2255 * instance structure which does not happen until after
2256 * tcp_accept_swap().
2257 *
2258 * Note:
2259 * The sanity of the logic here assumes that whatever options
2260 * are appropriate to inherit from listner=>eager are done
2261 * before this point, and whatever were to be overridden (or not)
2262 * in transfer logic from eager=>acceptor in tcp_accept_swap().
2263 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
2264 * before its ACCEPTOR_id comes down in T_CONN_RES ]
2265 * This may not be true at this point in time but can be fixed
2266 * independently. This option processing code starts with
2267 * the instantiated acceptor instance and the final queue at
2268 * this point.
2269 */
2270
2271 if (tcr->OPT_length != 0) {
2272 /* Options to process */
2273 int t_error = 0;
2274 int sys_error = 0;
2275 int do_disconnect = 0;
2276
2277 if (tcp_conprim_opt_process(eager, mp1,
2278 &do_disconnect, &t_error, &sys_error) < 0) {
2279 eager->tcp_accept_error = 1;
2280 if (do_disconnect) {
2281 /*
2282 * An option failed which does not allow
2283 * connection to be accepted.
2284 *
2285 * We allow T_CONN_RES to succeed and
2286 * put a T_DISCON_IND on the eager queue.
2287 */
2288 ASSERT(t_error == 0 && sys_error == 0);
2289 eager->tcp_send_discon_ind = 1;
2290 } else {
2291 ASSERT(t_error != 0);
2292 freemsg(ok_mp);
2293 /*
2294 * Original mp was either freed or set
2295 * to ok_mp above, so use mp1 instead.
2296 */
2297 tcp_err_ack(listener, mp1, t_error, sys_error);
2298 goto finish;
2299 }
2300 }
2301 /*
2302 * Most likely success in setting options (except if
2303 * eager->tcp_send_discon_ind set).
2304 * mp1 option buffer represented by OPT_length/offset
2305 * potentially modified and contains results of setting
2306 * options at this point
2307 */
2308 }
2309
2310 /* We no longer need mp1, since all options processing has passed */
2311 freemsg(mp1);
2312
2313 putnext(listener->tcp_rq, ok_mp);
2314
2315 mutex_enter(&listener->tcp_eager_lock);
2316 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
2317 tcp_t *tail;
2318 mblk_t *conn_ind;
2319
2320 /*
2321 * This path should not be executed if listener and
2322 * acceptor streams are the same.
2323 */
2324 ASSERT(listener != acceptor);
2325
2326 tcp = listener->tcp_eager_prev_q0;
2327 /*
2328 * listener->tcp_eager_prev_q0 points to the TAIL of the
2329 * deferred T_conn_ind queue. We need to get to the head of
2330 * the queue in order to send up T_conn_ind the same order as
2331 * how the 3WHS is completed.
2332 */
2333 while (tcp != listener) {
2334 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
2335 break;
2336 else
2337 tcp = tcp->tcp_eager_prev_q0;
2338 }
2339 ASSERT(tcp != listener);
2340 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
2341 ASSERT(conn_ind != NULL);
2342 tcp->tcp_conn.tcp_eager_conn_ind = NULL;
2343
2344 /* Move from q0 to q */
2345 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
2346 listener->tcp_conn_req_cnt_q0--;
2347 listener->tcp_conn_req_cnt_q++;
2348 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2349 tcp->tcp_eager_prev_q0;
2350 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2351 tcp->tcp_eager_next_q0;
2352 tcp->tcp_eager_prev_q0 = NULL;
2353 tcp->tcp_eager_next_q0 = NULL;
2354 tcp->tcp_conn_def_q0 = B_FALSE;
2355
2356 /* Make sure the tcp isn't in the list of droppables */
2357 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
2358 tcp->tcp_eager_prev_drop_q0 == NULL);
2359
2360 /*
2361 * Insert at end of the queue because sockfs sends
2362 * down T_CONN_RES in chronological order. Leaving
2363 * the older conn indications at front of the queue
2364 * helps reducing search time.
2365 */
2366 tail = listener->tcp_eager_last_q;
2367 if (tail != NULL)
2368 tail->tcp_eager_next_q = tcp;
2369 else
2370 listener->tcp_eager_next_q = tcp;
2371 listener->tcp_eager_last_q = tcp;
2372 tcp->tcp_eager_next_q = NULL;
2373 mutex_exit(&listener->tcp_eager_lock);
2374 putnext(tcp->tcp_rq, conn_ind);
2375 } else {
2376 mutex_exit(&listener->tcp_eager_lock);
2377 }
2378
2379 /*
2380 * Done with the acceptor - free it
2381 *
2382 * Note: from this point on, no access to listener should be made
2383 * as listener can be equal to acceptor.
2384 */
2385 finish:
2386 ASSERT(acceptor->tcp_detached);
2387 ASSERT(tcps->tcps_g_q != NULL);
2388 acceptor->tcp_rq = tcps->tcps_g_q;
2389 acceptor->tcp_wq = WR(tcps->tcps_g_q);
2390 (void) tcp_clean_death(acceptor, 0, 2);
2391 CONN_DEC_REF(acceptor->tcp_connp);
2392
2393 /*
2394 * In case we already received a FIN we have to make tcp_rput send
2395 * the ordrel_ind. This will also send up a window update if the window
2396 * has opened up.
2397 *
2398 * In the normal case of a successful connection acceptance
2399 * we give the O_T_BIND_REQ to the read side put procedure as an
2400 * indication that this was just accepted. This tells tcp_rput to
2401 * pass up any data queued in tcp_rcv_list.
2402 *
2403 * In the fringe case where options sent with T_CONN_RES failed and
2404 * we required, we would be indicating a T_DISCON_IND to blow
2405 * away this connection.
2406 */
2407
2408 /*
2409 * XXX: we currently have a problem if XTI application closes the
2410 * acceptor stream in between. This problem exists in on10-gate also
2411 * and is well know but nothing can be done short of major rewrite
2412 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
2413 * eager same squeue as listener (we can distinguish non socket
2414 * listeners at the time of handling a SYN in tcp_conn_request)
2415 * and do most of the work that tcp_accept_finish does here itself
2416 * and then get behind the acceptor squeue to access the acceptor
2417 * queue.
2418 */
2419 /*
2420 * We already have a ref on tcp so no need to do one before squeue_fill
2421 */
2422 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
2423 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
2424 }
2425
2426 /*
2427 * Swap information between the eager and acceptor for a TLI/XTI client.
2428 * The sockfs accept is done on the acceptor stream and control goes
2429 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
2430 * called. In either case, both the eager and listener are in their own
2431 * perimeter (squeue) and the code has to deal with potential race.
2432 *
2433 * See the block comment on top of tcp_accept() and tcp_wput_accept().
2434 */
2435 static void
2436 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
2437 {
2438 conn_t *econnp, *aconnp;
2439
2440 ASSERT(eager->tcp_rq == listener->tcp_rq);
2441 ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
2442 ASSERT(!eager->tcp_hard_bound);
2443 ASSERT(!TCP_IS_SOCKET(acceptor));
2444 ASSERT(!TCP_IS_SOCKET(eager));
2445 ASSERT(!TCP_IS_SOCKET(listener));
2446
2447 acceptor->tcp_detached = B_TRUE;
2448 /*
2449 * To permit stream re-use by TLI/XTI, the eager needs a copy of
2450 * the acceptor id.
2451 */
2452 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
2453
2454 /* remove eager from listen list... */
2455 mutex_enter(&listener->tcp_eager_lock);
2456 tcp_eager_unlink(eager);
2457 ASSERT(eager->tcp_eager_next_q == NULL &&
2458 eager->tcp_eager_last_q == NULL);
2459 ASSERT(eager->tcp_eager_next_q0 == NULL &&
2460 eager->tcp_eager_prev_q0 == NULL);
2461 mutex_exit(&listener->tcp_eager_lock);
2462 eager->tcp_rq = acceptor->tcp_rq;
2463 eager->tcp_wq = acceptor->tcp_wq;
2464
2465 econnp = eager->tcp_connp;
2466 aconnp = acceptor->tcp_connp;
2467
2468 eager->tcp_rq->q_ptr = econnp;
2469 eager->tcp_wq->q_ptr = econnp;
2470
2471 /*
2472 * In the TLI/XTI loopback case, we are inside the listener's squeue,
2473 * which might be a different squeue from our peer TCP instance.
2474 * For TCP Fusion, the peer expects that whenever tcp_detached is
2475 * clear, our TCP queues point to the acceptor's queues. Thus, use
2476 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
2477 * above reach global visibility prior to the clearing of tcp_detached.
2478 */
2479 membar_producer();
2480 eager->tcp_detached = B_FALSE;
2481
2482 ASSERT(eager->tcp_ack_tid == 0);
2483
2484 econnp->conn_dev = aconnp->conn_dev;
2485 econnp->conn_minor_arena = aconnp->conn_minor_arena;
2486 ASSERT(econnp->conn_minor_arena != NULL);
2487 if (eager->tcp_cred != NULL)
2488 crfree(eager->tcp_cred);
2489 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
2490 ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
2491 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
2492
2493 aconnp->conn_cred = NULL;
2494
2495 econnp->conn_zoneid = aconnp->conn_zoneid;
2496 econnp->conn_allzones = aconnp->conn_allzones;
2497
2498 econnp->conn_mac_exempt = aconnp->conn_mac_exempt;
2499 aconnp->conn_mac_exempt = B_FALSE;
2500
2501 ASSERT(aconnp->conn_peercred == NULL);
2502
2503 /* Do the IPC initialization */
2504 CONN_INC_REF(econnp);
2505
2506 econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
2507 econnp->conn_af_isv6 = aconnp->conn_af_isv6;
2508 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;
2509
2510 /* Done with old IPC. Drop its ref on its connp */
2511 CONN_DEC_REF(aconnp);
2512 }
2513
2514
2515 /*
2516 * Adapt to the information, such as rtt and rtt_sd, provided from the
2517 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
2518 *
2519 * Checks for multicast and broadcast destination address.
2520 * Returns zero on failure; non-zero if ok.
2521 *
2522 * Note that the MSS calculation here is based on the info given in
2523 * the IRE. We do not do any calculation based on TCP options. They
2524 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
2525 * knows which options to use.
2526 *
2527 * Note on how TCP gets its parameters for a connection.
2528 *
2529 * When a tcp_t structure is allocated, it gets all the default parameters.
2530 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
2531 * spipe, rpipe, ... from the route metrics. Route metric overrides the
2532 * default. But if there is an associated tcp_host_param, it will override
2533 * the metrics.
2534 *
2535 * An incoming SYN with a multicast or broadcast destination address, is dropped
2536 * in 1 of 2 places.
2537 *
2538 * 1. If the packet was received over the wire it is dropped in
2539 * ip_rput_process_broadcast()
2540 *
2541 * 2. If the packet was received through internal IP loopback, i.e. the packet
2542 * was generated and received on the same machine, it is dropped in
2543 * ip_wput_local()
2544 *
2545 * An incoming SYN with a multicast or broadcast source address is always
2546 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
2547 * reject an attempt to connect to a broadcast or multicast (destination)
2548 * address.
2549 */
2550 static int
2551 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
2552 {
2553 tcp_hsp_t *hsp;
2554 ire_t *ire;
2555 ire_t *sire = NULL;
2556 iulp_t *ire_uinfo = NULL;
2557 uint32_t mss_max;
2558 uint32_t mss;
2559 boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
2560 conn_t *connp = tcp->tcp_connp;
2561 boolean_t ire_cacheable = B_FALSE;
2562 zoneid_t zoneid = connp->conn_zoneid;
2563 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
2564 MATCH_IRE_SECATTR;
2565 ts_label_t *tsl = crgetlabel(CONN_CRED(connp));
2566 ill_t *ill = NULL;
2567 boolean_t incoming = (ire_mp == NULL);
2568 tcp_stack_t *tcps = tcp->tcp_tcps;
2569 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
2570
2571 ASSERT(connp->conn_ire_cache == NULL);
2572
2573 if (tcp->tcp_ipversion == IPV4_VERSION) {
2574
2575 if (CLASSD(tcp->tcp_connp->conn_rem)) {
2576 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2577 return (0);
2578 }
2579 /*
2580 * If IP_NEXTHOP is set, then look for an IRE_CACHE
2581 * for the destination with the nexthop as gateway.
2582 * ire_ctable_lookup() is used because this particular
2583 * ire, if it exists, will be marked private.
2584 * If that is not available, use the interface ire
2585 * for the nexthop.
2586 *
2587 * TSol: tcp_update_label will detect label mismatches based
2588 * only on the destination's label, but that would not
2589 * detect label mismatches based on the security attributes
2590 * of routes or next hop gateway. Hence we need to pass the
2591 * label to ire_ftable_lookup below in order to locate the
2592 * right prefix (and/or) ire cache. Similarly we also need
2593 * pass the label to the ire_cache_lookup below to locate
2594 * the right ire that also matches on the label.
2595 */
2596 if (tcp->tcp_connp->conn_nexthop_set) {
2597 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
2598 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
2599 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW,
2600 ipst);
2601 if (ire == NULL) {
2602 ire = ire_ftable_lookup(
2603 tcp->tcp_connp->conn_nexthop_v4,
2604 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
2605 tsl, match_flags, ipst);
2606 if (ire == NULL)
2607 return (0);
2608 } else {
2609 ire_uinfo = &ire->ire_uinfo;
2610 }
2611 } else {
2612 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
2613 zoneid, tsl, ipst);
2614 if (ire != NULL) {
2615 ire_cacheable = B_TRUE;
2616 ire_uinfo = (ire_mp != NULL) ?
2617 &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2618 &ire->ire_uinfo;
2619
2620 } else {
2621 if (ire_mp == NULL) {
2622 ire = ire_ftable_lookup(
2623 tcp->tcp_connp->conn_rem,
2624 0, 0, 0, NULL, &sire, zoneid, 0,
2625 tsl, (MATCH_IRE_RECURSIVE |
2626 MATCH_IRE_DEFAULT), ipst);
2627 if (ire == NULL)
2628 return (0);
2629 ire_uinfo = (sire != NULL) ?
2630 &sire->ire_uinfo :
2631 &ire->ire_uinfo;
2632 } else {
2633 ire = (ire_t *)ire_mp->b_rptr;
2634 ire_uinfo =
2635 &((ire_t *)
2636 ire_mp->b_rptr)->ire_uinfo;
2637 }
2638 }
2639 }
2640 ASSERT(ire != NULL);
2641
2642 if ((ire->ire_src_addr == INADDR_ANY) ||
2643 (ire->ire_type & IRE_BROADCAST)) {
2644 /*
2645 * ire->ire_mp is non null when ire_mp passed in is used
2646 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2647 */
2648 if (ire->ire_mp == NULL)
2649 ire_refrele(ire);
2650 if (sire != NULL)
2651 ire_refrele(sire);
2652 return (0);
2653 }
2654
2655 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
2656 ipaddr_t src_addr;
2657
2658 /*
2659 * ip_bind_connected() has stored the correct source
2660 * address in conn_src.
2661 */
2662 src_addr = tcp->tcp_connp->conn_src;
2663 tcp->tcp_ipha->ipha_src = src_addr;
2664 /*
2665 * Copy of the src addr. in tcp_t is needed
2666 * for the lookup funcs.
2667 */
2668 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
2669 }
2670 /*
2671 * Set the fragment bit so that IP will tell us if the MTU
2672 * should change. IP tells us the latest setting of
2673 * ip_path_mtu_discovery through ire_frag_flag.
2674 */
2675 if (ipst->ips_ip_path_mtu_discovery) {
2676 tcp->tcp_ipha->ipha_fragment_offset_and_flags =
2677 htons(IPH_DF);
2678 }
2679 /*
2680 * If ire_uinfo is NULL, this is the IRE_INTERFACE case
2681 * for IP_NEXTHOP. No cache ire has been found for the
2682 * destination and we are working with the nexthop's
2683 * interface ire. Since we need to forward all packets
2684 * to the nexthop first, we "blindly" set tcp_localnet
2685 * to false, eventhough the destination may also be
2686 * onlink.
2687 */
2688 if (ire_uinfo == NULL)
2689 tcp->tcp_localnet = 0;
2690 else
2691 tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
2692 } else {
2693 /*
2694 * For incoming connection ire_mp = NULL
2695 * For outgoing connection ire_mp != NULL
2696 * Technically we should check conn_incoming_ill
2697 * when ire_mp is NULL and conn_outgoing_ill when
2698 * ire_mp is non-NULL. But this is performance
2699 * critical path and for IPV*_BOUND_IF, outgoing
2700 * and incoming ill are always set to the same value.
2701 */
2702 ill_t *dst_ill = NULL;
2703 ipif_t *dst_ipif = NULL;
2704
2705 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
2706
2707 if (connp->conn_outgoing_ill != NULL) {
2708 /* Outgoing or incoming path */
2709 int err;
2710
2711 dst_ill = conn_get_held_ill(connp,
2712 &connp->conn_outgoing_ill, &err);
2713 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
2714 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
2715 return (0);
2716 }
2717 match_flags |= MATCH_IRE_ILL;
2718 dst_ipif = dst_ill->ill_ipif;
2719 }
2720 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
2721 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst);
2722
2723 if (ire != NULL) {
2724 ire_cacheable = B_TRUE;
2725 ire_uinfo = (ire_mp != NULL) ?
2726 &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2727 &ire->ire_uinfo;
2728 } else {
2729 if (ire_mp == NULL) {
2730 ire = ire_ftable_lookup_v6(
2731 &tcp->tcp_connp->conn_remv6,
2732 0, 0, 0, dst_ipif, &sire, zoneid,
2733 0, tsl, match_flags, ipst);
2734 if (ire == NULL) {
2735 if (dst_ill != NULL)
2736 ill_refrele(dst_ill);
2737 return (0);
2738 }
2739 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
2740 &ire->ire_uinfo;
2741 } else {
2742 ire = (ire_t *)ire_mp->b_rptr;
2743 ire_uinfo =
2744 &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
2745 }
2746 }
2747 if (dst_ill != NULL)
2748 ill_refrele(dst_ill);
2749
2750 ASSERT(ire != NULL);
2751 ASSERT(ire_uinfo != NULL);
2752
2753 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
2754 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
2755 /*
2756 * ire->ire_mp is non null when ire_mp passed in is used
2757 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2758 */
2759 if (ire->ire_mp == NULL)
2760 ire_refrele(ire);
2761 if (sire != NULL)
2762 ire_refrele(sire);
2763 return (0);
2764 }
2765
2766 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
2767 in6_addr_t src_addr;
2768
2769 /*
2770 * ip_bind_connected_v6() has stored the correct source
2771 * address per IPv6 addr. selection policy in
2772 * conn_src_v6.
2773 */
2774 src_addr = tcp->tcp_connp->conn_srcv6;
2775
2776 tcp->tcp_ip6h->ip6_src = src_addr;
2777 /*
2778 * Copy of the src addr. in tcp_t is needed
2779 * for the lookup funcs.
2780 */
2781 tcp->tcp_ip_src_v6 = src_addr;
2782 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src,
2783 &connp->conn_srcv6));
2784 }
2785 tcp->tcp_localnet =
2786 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
2787 }
2788
2789 /*
2790 * This allows applications to fail quickly when connections are made
2791 * to dead hosts. Hosts can be labeled dead by adding a reject route
2792 * with both the RTF_REJECT and RTF_PRIVATE flags set.
2793 */
2794 if ((ire->ire_flags & RTF_REJECT) &&
2795 (ire->ire_flags & RTF_PRIVATE))
2796 goto error;
2797
2798 /*
2799 * Make use of the cached rtt and rtt_sd values to calculate the
2800 * initial RTO. Note that they are already initialized in
2801 * tcp_init_values().
2802 * If ire_uinfo is NULL, i.e., we do not have a cache ire for
2803 * IP_NEXTHOP, but instead are using the interface ire for the
2804 * nexthop, then we do not use the ire_uinfo from that ire to
2805 * do any initializations.
2806 */
2807 if (ire_uinfo != NULL) {
2808 if (ire_uinfo->iulp_rtt != 0) {
2809 clock_t rto;
2810
2811 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
2812 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
2813 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2814 tcps->tcps_rexmit_interval_extra +
2815 (tcp->tcp_rtt_sa >> 5);
2816
2817 if (rto > tcps->tcps_rexmit_interval_max) {
2818 tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
2819 } else if (rto < tcps->tcps_rexmit_interval_min) {
2820 tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
2821 } else {
2822 tcp->tcp_rto = rto;
2823 }
2824 }
2825 if (ire_uinfo->iulp_ssthresh != 0)
2826 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
2827 else
2828 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2829 if (ire_uinfo->iulp_spipe > 0) {
2830 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
2831 tcps->tcps_max_buf);
2832 if (tcps->tcps_snd_lowat_fraction != 0)
2833 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2834 tcps->tcps_snd_lowat_fraction;
2835 (void) tcp_maxpsz_set(tcp, B_TRUE);
2836 }
2837 /*
2838 * Note that up till now, acceptor always inherits receive
2839 * window from the listener. But if there is a metrics
2840 * associated with a host, we should use that instead of
2841 * inheriting it from listener. Thus we need to pass this
2842 * info back to the caller.
2843 */
2844 if (ire_uinfo->iulp_rpipe > 0) {
2845 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe,
2846 tcps->tcps_max_buf);
2847 }
2848
2849 if (ire_uinfo->iulp_rtomax > 0) {
2850 tcp->tcp_second_timer_threshold =
2851 ire_uinfo->iulp_rtomax;
2852 }
2853
2854 /*
2855 * Use the metric option settings, iulp_tstamp_ok and
2856 * iulp_wscale_ok, only for active open. What this means
2857 * is that if the other side uses timestamp or window
2858 * scale option, TCP will also use those options. That
2859 * is for passive open. If the application sets a
2860 * large window, window scale is enabled regardless of
2861 * the value in iulp_wscale_ok. This is the behavior
2862 * since 2.6. So we keep it.
2863 * The only case left in passive open processing is the
2864 * check for SACK.
2865 * For ECN, it should probably be like SACK. But the
2866 * current value is binary, so we treat it like the other
2867 * cases. The metric only controls active open.For passive
2868 * open, the ndd param, tcp_ecn_permitted, controls the
2869 * behavior.
2870 */
2871 if (!tcp_detached) {
2872 /*
2873 * The if check means that the following can only
2874 * be turned on by the metrics only IRE, but not off.
2875 */
2876 if (ire_uinfo->iulp_tstamp_ok)
2877 tcp->tcp_snd_ts_ok = B_TRUE;
2878 if (ire_uinfo->iulp_wscale_ok)
2879 tcp->tcp_snd_ws_ok = B_TRUE;
2880 if (ire_uinfo->iulp_sack == 2)
2881 tcp->tcp_snd_sack_ok = B_TRUE;
2882 if (ire_uinfo->iulp_ecn_ok)
2883 tcp->tcp_ecn_ok = B_TRUE;
2884 } else {
2885 /*
2886 * Passive open.
2887 *
2888 * As above, the if check means that SACK can only be
2889 * turned on by the metric only IRE.
2890 */
2891 if (ire_uinfo->iulp_sack > 0) {
2892 tcp->tcp_snd_sack_ok = B_TRUE;
2893 }
2894 }
2895 }
2896
2897
2898 /*
2899 * XXX: Note that currently, ire_max_frag can be as small as 68
2900 * because of PMTUd. So tcp_mss may go to negative if combined
2901 * length of all those options exceeds 28 bytes. But because
2902 * of the tcp_mss_min check below, we may not have a problem if
2903 * tcp_mss_min is of a reasonable value. The default is 1 so
2904 * the negative problem still exists. And the check defeats PMTUd.
2905 * In fact, if PMTUd finds that the MSS should be smaller than
2906 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
2907 * value.
2908 *
2909 * We do not deal with that now. All those problems related to
2910 * PMTUd will be fixed later.
2911 */
2912 ASSERT(ire->ire_max_frag != 0);
2913 mss = tcp->tcp_if_mtu = ire->ire_max_frag;
2914 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) {
2915 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) {
2916 mss = MIN(mss, IPV6_MIN_MTU);
2917 }
2918 }
2919
2920 /* Sanity check for MSS value. */
2921 if (tcp->tcp_ipversion == IPV4_VERSION)
2922 mss_max = tcps->tcps_mss_max_ipv4;
2923 else
2924 mss_max = tcps->tcps_mss_max_ipv6;
2925
2926 if (tcp->tcp_ipversion == IPV6_VERSION &&
2927 (ire->ire_frag_flag & IPH_FRAG_HDR)) {
2928 /*
2929 * After receiving an ICMPv6 "packet too big" message with a
2930 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
2931 * will insert a 8-byte fragment header in every packet; we
2932 * reduce the MSS by that amount here.
2933 */
2934 mss -= sizeof (ip6_frag_t);
2935 }
2936
2937 if (tcp->tcp_ipsec_overhead == 0)
2938 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
2939
2940 mss -= tcp->tcp_ipsec_overhead;
2941
2942 if (mss < tcps->tcps_mss_min)
2943 mss = tcps->tcps_mss_min;
2944 if (mss > mss_max)
2945 mss = mss_max;
2946
2947 /* Note that this is the maximum MSS, excluding all options. */
2948 tcp->tcp_mss = mss;
2949
2950 /*
2951 * Initialize the ISS here now that we have the full connection ID.
2952 * The RFC 1948 method of initial sequence number generation requires
2953 * knowledge of the full connection ID before setting the ISS.
2954 */
2955
2956 tcp_iss_init(tcp);
2957
2958 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL))
2959 tcp->tcp_loopback = B_TRUE;
2960
2961 if (tcp->tcp_ipversion == IPV4_VERSION) {
2962 hsp = tcp_hsp_lookup(tcp->tcp_remote, tcps);
2963 } else {
2964 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6, tcps);
2965 }
2966
2967 if (hsp != NULL) {
2968 /* Only modify if we're going to make them bigger */
2969 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) {
2970 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace;
2971 if (tcps->tcps_snd_lowat_fraction != 0)
2972 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2973 tcps->tcps_snd_lowat_fraction;
2974 }
2975
2976 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) {
2977 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace;
2978 }
2979
2980 /* Copy timestamp flag only for active open */
2981 if (!tcp_detached)
2982 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp;
2983 }
2984
2985 if (sire != NULL)
2986 IRE_REFRELE(sire);
2987
2988 /*
2989 * If we got an IRE_CACHE and an ILL, go through their properties;
2990 * otherwise, this is deferred until later when we have an IRE_CACHE.
2991 */
2992 if (tcp->tcp_loopback ||
2993 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) {
2994 /*
2995 * For incoming, see if this tcp may be MDT-capable. For
2996 * outgoing, this process has been taken care of through
2997 * tcp_rput_other.
2998 */
2999 tcp_ire_ill_check(tcp, ire, ill, incoming);
3000 tcp->tcp_ire_ill_check_done = B_TRUE;
3001 }
3002
3003 mutex_enter(&connp->conn_lock);
3004 /*
3005 * Make sure that conn is not marked incipient
3006 * for incoming connections. A blind
3007 * removal of incipient flag is cheaper than
3008 * check and removal.
3009 */
3010 connp->conn_state_flags &= ~CONN_INCIPIENT;
3011
3012 /*
3013 * Must not cache forwarding table routes
3014 * or recache an IRE after the conn_t has
3015 * had conn_ire_cache cleared and is flagged
3016 * unusable, (see the CONN_CACHE_IRE() macro).
3017 */
3018 if (ire_cacheable && CONN_CACHE_IRE(connp)) {
3019 rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
3020 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
3021 connp->conn_ire_cache = ire;
3022 IRE_UNTRACE_REF(ire);
3023 rw_exit(&ire->ire_bucket->irb_lock);
3024 mutex_exit(&connp->conn_lock);
3025 return (1);
3026 }
3027 rw_exit(&ire->ire_bucket->irb_lock);
3028 }
3029 mutex_exit(&connp->conn_lock);
3030
3031 if (ire->ire_mp == NULL)
3032 ire_refrele(ire);
3033 return (1);
3034
3035 error:
3036 if (ire->ire_mp == NULL)
3037 ire_refrele(ire);
3038 if (sire != NULL)
3039 ire_refrele(sire);
3040 return (0);
3041 }
3042
3043 /*
3044 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a
3045 * O_T_BIND_REQ/T_BIND_REQ message.
3046 */
3047 static void
3048 tcp_bind(tcp_t *tcp, mblk_t *mp)
3049 {
3050 sin_t *sin;
3051 sin6_t *sin6;
3052 mblk_t *mp1;
3053 in_port_t requested_port;
3054 in_port_t allocated_port;
3055 struct T_bind_req *tbr;
3056 boolean_t bind_to_req_port_only;
3057 boolean_t backlog_update = B_FALSE;
3058 boolean_t user_specified;
3059 in6_addr_t v6addr;
3060 ipaddr_t v4addr;
3061 uint_t origipversion;
3062 int err;
3063 queue_t *q = tcp->tcp_wq;
3064 conn_t *connp = tcp->tcp_connp;
3065 mlp_type_t addrtype, mlptype;
3066 zone_t *zone;
3067 cred_t *cr;
3068 in_port_t mlp_port;
3069 tcp_stack_t *tcps = tcp->tcp_tcps;
3070
3071 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
3072 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
3073 if (tcp->tcp_debug) {
3074 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3075 "tcp_bind: bad req, len %u",
3076 (uint_t)(mp->b_wptr - mp->b_rptr));
3077 }
3078 tcp_err_ack(tcp, mp, TPROTO, 0);
3079 return;
3080 }
3081 /* Make sure the largest address fits */
3082 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
3083 if (mp1 == NULL) {
3084 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3085 return;
3086 }
3087 mp = mp1;
3088 tbr = (struct T_bind_req *)mp->b_rptr;
3089 if (tcp->tcp_state >= TCPS_BOUND) {
3090 if ((tcp->tcp_state == TCPS_BOUND ||
3091 tcp->tcp_state == TCPS_LISTEN) &&
3092 tcp->tcp_conn_req_max != tbr->CONIND_number &&
3093 tbr->CONIND_number > 0) {
3094 /*
3095 * Handle listen() increasing CONIND_number.
3096 * This is more "liberal" then what the TPI spec
3097 * requires but is needed to avoid a t_unbind
3098 * when handling listen() since the port number
3099 * might be "stolen" between the unbind and bind.
3100 */
3101 backlog_update = B_TRUE;
3102 goto do_bind;
3103 }
3104 if (tcp->tcp_debug) {
3105 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3106 "tcp_bind: bad state, %d", tcp->tcp_state);
3107 }
3108 tcp_err_ack(tcp, mp, TOUTSTATE, 0);
3109 return;
3110 }
3111 origipversion = tcp->tcp_ipversion;
3112
3113 switch (tbr->ADDR_length) {
3114 case 0: /* request for a generic port */
3115 tbr->ADDR_offset = sizeof (struct T_bind_req);
3116 if (tcp->tcp_family == AF_INET) {
3117 tbr->ADDR_length = sizeof (sin_t);
3118 sin = (sin_t *)&tbr[1];
3119 *sin = sin_null;
3120 sin->sin_family = AF_INET;
3121 mp->b_wptr = (uchar_t *)&sin[1];
3122 tcp->tcp_ipversion = IPV4_VERSION;
3123 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr);
3124 } else {
3125 ASSERT(tcp->tcp_family == AF_INET6);
3126 tbr->ADDR_length = sizeof (sin6_t);
3127 sin6 = (sin6_t *)&tbr[1];
3128 *sin6 = sin6_null;
3129 sin6->sin6_family = AF_INET6;
3130 mp->b_wptr = (uchar_t *)&sin6[1];
3131 tcp->tcp_ipversion = IPV6_VERSION;
3132 V6_SET_ZERO(v6addr);
3133 }
3134 requested_port = 0;
3135 break;
3136
3137 case sizeof (sin_t): /* Complete IPv4 address */
3138 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
3139 sizeof (sin_t));
3140 if (sin == NULL || !OK_32PTR((char *)sin)) {
3141 if (tcp->tcp_debug) {
3142 (void) strlog(TCP_MOD_ID, 0, 1,
3143 SL_ERROR|SL_TRACE,
3144 "tcp_bind: bad address parameter, "
3145 "offset %d, len %d",
3146 tbr->ADDR_offset, tbr->ADDR_length);
3147 }
3148 tcp_err_ack(tcp, mp, TPROTO, 0);
3149 return;
3150 }
3151 /*
3152 * With sockets sockfs will accept bogus sin_family in
3153 * bind() and replace it with the family used in the socket
3154 * call.
3155 */
3156 if (sin->sin_family != AF_INET ||
3157 tcp->tcp_family != AF_INET) {
3158 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3159 return;
3160 }
3161 requested_port = ntohs(sin->sin_port);
3162 tcp->tcp_ipversion = IPV4_VERSION;
3163 v4addr = sin->sin_addr.s_addr;
3164 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
3165 break;
3166
3167 case sizeof (sin6_t): /* Complete IPv6 address */
3168 sin6 = (sin6_t *)mi_offset_param(mp,
3169 tbr->ADDR_offset, sizeof (sin6_t));
3170 if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
3171 if (tcp->tcp_debug) {
3172 (void) strlog(TCP_MOD_ID, 0, 1,
3173 SL_ERROR|SL_TRACE,
3174 "tcp_bind: bad IPv6 address parameter, "
3175 "offset %d, len %d", tbr->ADDR_offset,
3176 tbr->ADDR_length);
3177 }
3178 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
3179 return;
3180 }
3181 if (sin6->sin6_family != AF_INET6 ||
3182 tcp->tcp_family != AF_INET6) {
3183 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3184 return;
3185 }
3186 requested_port = ntohs(sin6->sin6_port);
3187 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
3188 IPV4_VERSION : IPV6_VERSION;
3189 v6addr = sin6->sin6_addr;
3190 break;
3191
3192 default:
3193 if (tcp->tcp_debug) {
3194 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3195 "tcp_bind: bad address length, %d",
3196 tbr->ADDR_length);
3197 }
3198 tcp_err_ack(tcp, mp, TBADADDR, 0);
3199 return;
3200 }
3201 tcp->tcp_bound_source_v6 = v6addr;
3202
3203 /* Check for change in ipversion */
3204 if (origipversion != tcp->tcp_ipversion) {
3205 ASSERT(tcp->tcp_family == AF_INET6);
3206 err = tcp->tcp_ipversion == IPV6_VERSION ?
3207 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
3208 if (err) {
3209 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3210 return;
3211 }
3212 }
3213
3214 /*
3215 * Initialize family specific fields. Copy of the src addr.
3216 * in tcp_t is needed for the lookup funcs.
3217 */
3218 if (tcp->tcp_ipversion == IPV6_VERSION) {
3219 tcp->tcp_ip6h->ip6_src = v6addr;
3220 } else {
3221 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
3222 }
3223 tcp->tcp_ip_src_v6 = v6addr;
3224
3225 /*
3226 * For O_T_BIND_REQ:
3227 * Verify that the target port/addr is available, or choose
3228 * another.
3229 * For T_BIND_REQ:
3230 * Verify that the target port/addr is available or fail.
3231 * In both cases when it succeeds the tcp is inserted in the
3232 * bind hash table. This ensures that the operation is atomic
3233 * under the lock on the hash bucket.
3234 */
3235 bind_to_req_port_only = requested_port != 0 &&
3236 tbr->PRIM_type != O_T_BIND_REQ;
3237 /*
3238 * Get a valid port (within the anonymous range and should not
3239 * be a privileged one) to use if the user has not given a port.
3240 * If multiple threads are here, they may all start with
3241 * with the same initial port. But, it should be fine as long as
3242 * tcp_bindi will ensure that no two threads will be assigned
3243 * the same port.
3244 *
3245 * NOTE: XXX If a privileged process asks for an anonymous port, we
3246 * still check for ports only in the range > tcp_smallest_non_priv_port,
3247 * unless TCP_ANONPRIVBIND option is set.
3248 */
3249 mlptype = mlptSingle;
3250 mlp_port = requested_port;
3251 if (requested_port == 0) {
3252 requested_port = tcp->tcp_anon_priv_bind ?
3253 tcp_get_next_priv_port(tcp) :
3254 tcp_update_next_port(tcps->tcps_next_port_to_try,
3255 tcp, B_TRUE);
3256 if (requested_port == 0) {
3257 tcp_err_ack(tcp, mp, TNOADDR, 0);
3258 return;
3259 }
3260 user_specified = B_FALSE;
3261
3262 /*
3263 * If the user went through one of the RPC interfaces to create
3264 * this socket and RPC is MLP in this zone, then give him an
3265 * anonymous MLP.
3266 */
3267 cr = DB_CREDDEF(mp, tcp->tcp_cred);
3268 if (connp->conn_anon_mlp && is_system_labeled()) {
3269 zone = crgetzone(cr);
3270 addrtype = tsol_mlp_addr_type(zone->zone_id,
3271 IPV6_VERSION, &v6addr,
3272 tcps->tcps_netstack->netstack_ip);
3273 if (addrtype == mlptSingle) {
3274 tcp_err_ack(tcp, mp, TNOADDR, 0);
3275 return;
3276 }
3277 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3278 PMAPPORT, addrtype);
3279 mlp_port = PMAPPORT;
3280 }
3281 } else {
3282 int i;
3283 boolean_t priv = B_FALSE;
3284
3285 /*
3286 * If the requested_port is in the well-known privileged range,
3287 * verify that the stream was opened by a privileged user.
3288 * Note: No locks are held when inspecting tcp_g_*epriv_ports
3289 * but instead the code relies on:
3290 * - the fact that the address of the array and its size never
3291 * changes
3292 * - the atomic assignment of the elements of the array
3293 */
3294 cr = DB_CREDDEF(mp, tcp->tcp_cred);
3295 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
3296 priv = B_TRUE;
3297 } else {
3298 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
3299 if (requested_port ==
3300 tcps->tcps_g_epriv_ports[i]) {
3301 priv = B_TRUE;
3302 break;
3303 }
3304 }
3305 }
3306 if (priv) {
3307 if (secpolicy_net_privaddr(cr, requested_port,
3308 IPPROTO_TCP) != 0) {
3309 if (tcp->tcp_debug) {
3310 (void) strlog(TCP_MOD_ID, 0, 1,
3311 SL_ERROR|SL_TRACE,
3312 "tcp_bind: no priv for port %d",
3313 requested_port);
3314 }
3315 tcp_err_ack(tcp, mp, TACCES, 0);
3316 return;
3317 }
3318 }
3319 user_specified = B_TRUE;
3320
3321 if (is_system_labeled()) {
3322 zone = crgetzone(cr);
3323 addrtype = tsol_mlp_addr_type(zone->zone_id,
3324 IPV6_VERSION, &v6addr,
3325 tcps->tcps_netstack->netstack_ip);
3326 if (addrtype == mlptSingle) {
3327 tcp_err_ack(tcp, mp, TNOADDR, 0);
3328 return;
3329 }
3330 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3331 requested_port, addrtype);
3332 }
3333 }
3334
3335 if (mlptype != mlptSingle) {
3336 if (secpolicy_net_bindmlp(cr) != 0) {
3337 if (tcp->tcp_debug) {
3338 (void) strlog(TCP_MOD_ID, 0, 1,
3339 SL_ERROR|SL_TRACE,
3340 "tcp_bind: no priv for multilevel port %d",
3341 requested_port);
3342 }
3343 tcp_err_ack(tcp, mp, TACCES, 0);
3344 return;
3345 }
3346
3347 /*
3348 * If we're specifically binding a shared IP address and the
3349 * port is MLP on shared addresses, then check to see if this
3350 * zone actually owns the MLP. Reject if not.
3351 */
3352 if (mlptype == mlptShared && addrtype == mlptShared) {
3353 /*
3354 * No need to handle exclusive-stack zones since
3355 * ALL_ZONES only applies to the shared stack.
3356 */
3357 zoneid_t mlpzone;
3358
3359 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
3360 htons(mlp_port));
3361 if (connp->conn_zoneid != mlpzone) {
3362 if (tcp->tcp_debug) {
3363 (void) strlog(TCP_MOD_ID, 0, 1,
3364 SL_ERROR|SL_TRACE,
3365 "tcp_bind: attempt to bind port "
3366 "%d on shared addr in zone %d "
3367 "(should be %d)",
3368 mlp_port, connp->conn_zoneid,
3369 mlpzone);
3370 }
3371 tcp_err_ack(tcp, mp, TACCES, 0);
3372 return;
3373 }
3374 }
3375
3376 if (!user_specified) {
3377 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3378 requested_port, B_TRUE);
3379 if (err != 0) {
3380 if (tcp->tcp_debug) {
3381 (void) strlog(TCP_MOD_ID, 0, 1,
3382 SL_ERROR|SL_TRACE,
3383 "tcp_bind: cannot establish anon "
3384 "MLP for port %d",
3385 requested_port);
3386 }
3387 tcp_err_ack(tcp, mp, TSYSERR, err);
3388 return;
3389 }
3390 connp->conn_anon_port = B_TRUE;
3391 }
3392 connp->conn_mlp_type = mlptype;
3393 }
3394
3395 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
3396 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
3397
3398 if (allocated_port == 0) {
3399 connp->conn_mlp_type = mlptSingle;
3400 if (connp->conn_anon_port) {
3401 connp->conn_anon_port = B_FALSE;
3402 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3403 requested_port, B_FALSE);
3404 }
3405 if (bind_to_req_port_only) {
3406 if (tcp->tcp_debug) {
3407 (void) strlog(TCP_MOD_ID, 0, 1,
3408 SL_ERROR|SL_TRACE,
3409 "tcp_bind: requested addr busy");
3410 }
3411 tcp_err_ack(tcp, mp, TADDRBUSY, 0);
3412 } else {
3413 /* If we are out of ports, fail the bind. */
3414 if (tcp->tcp_debug) {
3415 (void) strlog(TCP_MOD_ID, 0, 1,
3416 SL_ERROR|SL_TRACE,
3417 "tcp_bind: out of ports?");
3418 }
3419 tcp_err_ack(tcp, mp, TNOADDR, 0);
3420 }
3421 return;
3422 }
3423 ASSERT(tcp->tcp_state == TCPS_BOUND);
3424 do_bind:
3425 if (!backlog_update) {
3426 if (tcp->tcp_family == AF_INET)
3427 sin->sin_port = htons(allocated_port);
3428 else
3429 sin6->sin6_port = htons(allocated_port);
3430 }
3431 if (tcp->tcp_family == AF_INET) {
3432 if (tbr->CONIND_number != 0) {
3433 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3434 sizeof (sin_t));
3435 } else {
3436 /* Just verify the local IP address */
3437 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN);
3438 }
3439 } else {
3440 if (tbr->CONIND_number != 0) {
3441 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3442 sizeof (sin6_t));
3443 } else {
3444 /* Just verify the local IP address */
3445 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3446 IPV6_ADDR_LEN);
3447 }
3448 }
3449 if (mp1 == NULL) {
3450 if (connp->conn_anon_port) {
3451 connp->conn_anon_port = B_FALSE;
3452 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3453 requested_port, B_FALSE);
3454 }
3455 connp->conn_mlp_type = mlptSingle;
3456 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3457 return;
3458 }
3459
3460 tbr->PRIM_type = T_BIND_ACK;
3461 mp->b_datap->db_type = M_PCPROTO;
3462
3463 /* Chain in the reply mp for tcp_rput() */
3464 mp1->b_cont = mp;
3465 mp = mp1;
3466
3467 tcp->tcp_conn_req_max = tbr->CONIND_number;
3468 if (tcp->tcp_conn_req_max) {
3469 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
3470 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473 /*
3474 * If this is a listener, do not reset the eager list
3475 * and other stuffs. Note that we don't check if the
3476 * existing eager list meets the new tcp_conn_req_max
3477 * requirement.
3478 */
3479 if (tcp->tcp_state != TCPS_LISTEN) {
3480 tcp->tcp_state = TCPS_LISTEN;
3481 /* Initialize the chain. Don't need the eager_lock */
3482 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3483 tcp->tcp_eager_next_drop_q0 = tcp;
3484 tcp->tcp_eager_prev_drop_q0 = tcp;
3485 tcp->tcp_second_ctimer_threshold =
3486 tcps->tcps_ip_abort_linterval;
3487 }
3488 }
3489
3490 /*
3491 * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3492 * processing continues in tcp_rput_other().
3493 *
3494 * We need to make sure that the conn_recv is set to a non-null
3495 * value before we insert the conn into the classifier table.
3496 * This is to avoid a race with an incoming packet which does an
3497 * ipcl_classify().
3498 */
3499 connp->conn_recv = tcp_conn_request;
3500 if (tcp->tcp_family == AF_INET6) {
3501 ASSERT(tcp->tcp_connp->conn_af_isv6);
3502 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp);
3503 } else {
3504 ASSERT(!tcp->tcp_connp->conn_af_isv6);
3505 mp = ip_bind_v4(q, mp, tcp->tcp_connp);
3506 }
3507 /*
3508 * If the bind cannot complete immediately
3509 * IP will arrange to call tcp_rput_other
3510 * when the bind completes.
3511 */
3512 if (mp != NULL) {
3513 tcp_rput_other(tcp, mp);
3514 } else {
3515 /*
3516 * Bind will be resumed later. Need to ensure
3517 * that conn doesn't disappear when that happens.
3518 * This will be decremented in ip_resume_tcp_bind().
3519 */
3520 CONN_INC_REF(tcp->tcp_connp);
3521 }
3522 }
3523
3524
3525 /*
3526 * If the "bind_to_req_port_only" parameter is set, if the requested port
3527 * number is available, return it, If not return 0
3528 *
3529 * If "bind_to_req_port_only" parameter is not set and
3530 * If the requested port number is available, return it. If not, return
3531 * the first anonymous port we happen across. If no anonymous ports are
3532 * available, return 0. addr is the requested local address, if any.
3533 *
3534 * In either case, when succeeding update the tcp_t to record the port number
3535 * and insert it in the bind hash table.
3536 *
3537 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
3538 * without setting SO_REUSEADDR. This is needed so that they
3539 * can be viewed as two independent transport protocols.
3540 */
3541 static in_port_t
3542 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
3543 int reuseaddr, boolean_t quick_connect,
3544 boolean_t bind_to_req_port_only, boolean_t user_specified)
3545 {
3546 /* number of times we have run around the loop */
3547 int count = 0;
3548 /* maximum number of times to run around the loop */
3549 int loopmax;
3550 conn_t *connp = tcp->tcp_connp;
3551 zoneid_t zoneid = connp->conn_zoneid;
3552 tcp_stack_t *tcps = tcp->tcp_tcps;
3553
3554 /*
3555 * Lookup for free addresses is done in a loop and "loopmax"
3556 * influences how long we spin in the loop
3557 */
3558 if (bind_to_req_port_only) {
3559 /*
3560 * If the requested port is busy, don't bother to look
3561 * for a new one. Setting loop maximum count to 1 has
3562 * that effect.
3563 */
3564 loopmax = 1;
3565 } else {
3566 /*
3567 * If the requested port is busy, look for a free one
3568 * in the anonymous port range.
3569 * Set loopmax appropriately so that one does not look
3570 * forever in the case all of the anonymous ports are in use.
3571 */
3572 if (tcp->tcp_anon_priv_bind) {
3573 /*
3574 * loopmax =
3575 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
3576 */
3577 loopmax = IPPORT_RESERVED -
3578 tcps->tcps_min_anonpriv_port;
3579 } else {
3580 loopmax = (tcps->tcps_largest_anon_port -
3581 tcps->tcps_smallest_anon_port + 1);
3582 }
3583 }
3584 do {
3585 uint16_t lport;
3586 tf_t *tbf;
3587 tcp_t *ltcp;
3588 conn_t *lconnp;
3589
3590 lport = htons(port);
3591
3592 /*
3593 * Ensure that the tcp_t is not currently in the bind hash.
3594 * Hold the lock on the hash bucket to ensure that
3595 * the duplicate check plus the insertion is an atomic
3596 * operation.
3597 *
3598 * This function does an inline lookup on the bind hash list
3599 * Make sure that we access only members of tcp_t
3600 * and that we don't look at tcp_tcp, since we are not
3601 * doing a CONN_INC_REF.
3602 */
3603 tcp_bind_hash_remove(tcp);
3604 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
3605 mutex_enter(&tbf->tf_lock);
3606 for (ltcp = tbf->tf_tcp; ltcp != NULL;
3607 ltcp = ltcp->tcp_bind_hash) {
3608 boolean_t not_socket;
3609 boolean_t exclbind;
3610
3611 if (lport != ltcp->tcp_lport)
3612 continue;
3613
3614 lconnp = ltcp->tcp_connp;
3615
3616 /*
3617 * On a labeled system, we must treat bindings to ports
3618 * on shared IP addresses by sockets with MAC exemption
3619 * privilege as being in all zones, as there's
3620 * otherwise no way to identify the right receiver.
3621 */
3622 if (!(IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) ||
3623 IPCL_ZONE_MATCH(connp,
3624 ltcp->tcp_connp->conn_zoneid)) &&
3625 !lconnp->conn_mac_exempt &&
3626 !connp->conn_mac_exempt)
3627 continue;
3628
3629 /*
3630 * If TCP_EXCLBIND is set for either the bound or
3631 * binding endpoint, the semantics of bind
3632 * is changed according to the following.
3633 *
3634 * spec = specified address (v4 or v6)
3635 * unspec = unspecified address (v4 or v6)
3636 * A = specified addresses are different for endpoints
3637 *
3638 * bound bind to allowed
3639 * -------------------------------------
3640 * unspec unspec no
3641 * unspec spec no
3642 * spec unspec no
3643 * spec spec yes if A
3644 *
3645 * For labeled systems, SO_MAC_EXEMPT behaves the same
3646 * as TCP_EXCLBIND, except that zoneid is ignored.
3647 *
3648 * Note:
3649 *
3650 * 1. Because of TLI semantics, an endpoint can go
3651 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
3652 * TCPS_BOUND, depending on whether it is originally
3653 * a listener or not. That is why we need to check
3654 * for states greater than or equal to TCPS_BOUND
3655 * here.
3656 *
3657 * 2. Ideally, we should only check for state equals
3658 * to TCPS_LISTEN. And the following check should be
3659 * added.
3660 *
3661 * if (ltcp->tcp_state == TCPS_LISTEN ||
3662 * !reuseaddr || !ltcp->tcp_reuseaddr) {
3663 * ...
3664 * }
3665 *
3666 * The semantics will be changed to this. If the
3667 * endpoint on the list is in state not equal to
3668 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
3669 * set, let the bind succeed.
3670 *
3671 * Because of (1), we cannot do that for TLI
3672 * endpoints. But we can do that for socket endpoints.
3673 * If in future, we can change this going back
3674 * semantics, we can use the above check for TLI also.
3675 */
3676 not_socket = !(TCP_IS_SOCKET(ltcp) &&
3677 TCP_IS_SOCKET(tcp));
3678 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind;
3679
3680 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt ||
3681 (exclbind && (not_socket ||
3682 ltcp->tcp_state <= TCPS_ESTABLISHED))) {
3683 if (V6_OR_V4_INADDR_ANY(
3684 ltcp->tcp_bound_source_v6) ||
3685 V6_OR_V4_INADDR_ANY(*laddr) ||
3686 IN6_ARE_ADDR_EQUAL(laddr,
3687 <cp->tcp_bound_source_v6)) {
3688 break;
3689 }
3690 continue;
3691 }
3692
3693 /*
3694 * Check ipversion to allow IPv4 and IPv6 sockets to
3695 * have disjoint port number spaces, if *_EXCLBIND
3696 * is not set and only if the application binds to a
3697 * specific port. We use the same autoassigned port
3698 * number space for IPv4 and IPv6 sockets.
3699 */
3700 if (tcp->tcp_ipversion != ltcp->tcp_ipversion &&
3701 bind_to_req_port_only)
3702 continue;
3703
3704 /*
3705 * Ideally, we should make sure that the source
3706 * address, remote address, and remote port in the
3707 * four tuple for this tcp-connection is unique.
3708 * However, trying to find out the local source
3709 * address would require too much code duplication
3710 * with IP, since IP needs needs to have that code
3711 * to support userland TCP implementations.
3712 */
3713 if (quick_connect &&
3714 (ltcp->tcp_state > TCPS_LISTEN) &&
3715 ((tcp->tcp_fport != ltcp->tcp_fport) ||
3716 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
3717 <cp->tcp_remote_v6)))
3718 continue;
3719
3720 if (!reuseaddr) {
3721 /*
3722 * No socket option SO_REUSEADDR.
3723 * If existing port is bound to
3724 * a non-wildcard IP address
3725 * and the requesting stream is
3726 * bound to a distinct
3727 * different IP addresses
3728 * (non-wildcard, also), keep
3729 * going.
3730 */
3731 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
3732 !V6_OR_V4_INADDR_ANY(
3733 ltcp->tcp_bound_source_v6) &&
3734 !IN6_ARE_ADDR_EQUAL(laddr,
3735 <cp->tcp_bound_source_v6))
3736 continue;
3737 if (ltcp->tcp_state >= TCPS_BOUND) {
3738 /*
3739 * This port is being used and
3740 * its state is >= TCPS_BOUND,
3741 * so we can't bind to it.
3742 */
3743 break;
3744 }
3745 } else {
3746 /*
3747 * socket option SO_REUSEADDR is set on the
3748 * binding tcp_t.
3749 *
3750 * If two streams are bound to
3751 * same IP address or both addr
3752 * and bound source are wildcards
3753 * (INADDR_ANY), we want to stop
3754 * searching.
3755 * We have found a match of IP source
3756 * address and source port, which is
3757 * refused regardless of the
3758 * SO_REUSEADDR setting, so we break.
3759 */
3760 if (IN6_ARE_ADDR_EQUAL(laddr,
3761 <cp->tcp_bound_source_v6) &&
3762 (ltcp->tcp_state == TCPS_LISTEN ||
3763 ltcp->tcp_state == TCPS_BOUND))
3764 break;
3765 }
3766 }
3767 if (ltcp != NULL) {
3768 /* The port number is busy */
3769 mutex_exit(&tbf->tf_lock);
3770 } else {
3771 /*
3772 * This port is ours. Insert in fanout and mark as
3773 * bound to prevent others from getting the port
3774 * number.
3775 */
3776 tcp->tcp_state = TCPS_BOUND;
3777 tcp->tcp_lport = htons(port);
3778 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3779
3780 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3781 tcp->tcp_lport)] == tbf);
3782 tcp_bind_hash_insert(tbf, tcp, 1);
3783
3784 mutex_exit(&tbf->tf_lock);
3785
3786 /*
3787 * We don't want tcp_next_port_to_try to "inherit"
3788 * a port number supplied by the user in a bind.
3789 */
3790 if (user_specified)
3791 return (port);
3792
3793 /*
3794 * This is the only place where tcp_next_port_to_try
3795 * is updated. After the update, it may or may not
3796 * be in the valid range.
3797 */
3798 if (!tcp->tcp_anon_priv_bind)
3799 tcps->tcps_next_port_to_try = port + 1;
3800 return (port);
3801 }
3802
3803 if (tcp->tcp_anon_priv_bind) {
3804 port = tcp_get_next_priv_port(tcp);
3805 } else {
3806 if (count == 0 && user_specified) {
3807 /*
3808 * We may have to return an anonymous port. So
3809 * get one to start with.
3810 */
3811 port =
3812 tcp_update_next_port(
3813 tcps->tcps_next_port_to_try,
3814 tcp, B_TRUE);
3815 user_specified = B_FALSE;
3816 } else {
3817 port = tcp_update_next_port(port + 1, tcp,
3818 B_FALSE);
3819 }
3820 }
3821 if (port == 0)
3822 break;
3823
3824 /*
3825 * Don't let this loop run forever in the case where
3826 * all of the anonymous ports are in use.
3827 */
3828 } while (++count < loopmax);
3829 return (0);
3830 }
3831
3832 /*
3833 * tcp_clean_death / tcp_close_detached must not be called more than once
3834 * on a tcp. Thus every function that potentially calls tcp_clean_death
3835 * must check for the tcp state before calling tcp_clean_death.
3836 * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper,
3837 * tcp_timer_handler, all check for the tcp state.
3838 */
3839 /* ARGSUSED */
3840 void
3841 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2)
3842 {
3843 tcp_t *tcp = ((conn_t *)arg)->conn_tcp;
3844
3845 freemsg(mp);
3846 if (tcp->tcp_state > TCPS_BOUND)
3847 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
3848 ETIMEDOUT, 5);
3849 }
3850
3851 /*
3852 * We are dying for some reason. Try to do it gracefully. (May be called
3853 * as writer.)
3854 *
3855 * Return -1 if the structure was not cleaned up (if the cleanup had to be
3856 * done by a service procedure).
3857 * TBD - Should the return value distinguish between the tcp_t being
3858 * freed and it being reinitialized?
3859 */
3860 static int
3861 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
3862 {
3863 mblk_t *mp;
3864 queue_t *q;
3865 tcp_stack_t *tcps = tcp->tcp_tcps;
3866 sodirect_t *sodp;
3867
3868 TCP_CLD_STAT(tag);
3869
3870 #if TCP_TAG_CLEAN_DEATH
3871 tcp->tcp_cleandeathtag = tag;
3872 #endif
3873
3874 if (tcp->tcp_fused)
3875 tcp_unfuse(tcp);
3876
3877 if (tcp->tcp_linger_tid != 0 &&
3878 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
3879 tcp_stop_lingering(tcp);
3880 }
3881
3882 ASSERT(tcp != NULL);
3883 ASSERT((tcp->tcp_family == AF_INET &&
3884 tcp->tcp_ipversion == IPV4_VERSION) ||
3885 (tcp->tcp_family == AF_INET6 &&
3886 (tcp->tcp_ipversion == IPV4_VERSION ||
3887 tcp->tcp_ipversion == IPV6_VERSION)));
3888
3889 if (TCP_IS_DETACHED(tcp)) {
3890 if (tcp->tcp_hard_binding) {
3891 /*
3892 * Its an eager that we are dealing with. We close the
3893 * eager but in case a conn_ind has already gone to the
3894 * listener, let tcp_accept_finish() send a discon_ind
3895 * to the listener and drop the last reference. If the
3896 * listener doesn't even know about the eager i.e. the
3897 * conn_ind hasn't gone up, blow away the eager and drop
3898 * the last reference as well. If the conn_ind has gone
3899 * up, state should be BOUND. tcp_accept_finish
3900 * will figure out that the connection has received a
3901 * RST and will send a DISCON_IND to the application.
3902 */
3903 tcp_closei_local(tcp);
3904 if (!tcp->tcp_tconnind_started) {
3905 CONN_DEC_REF(tcp->tcp_connp);
3906 } else {
3907 tcp->tcp_state = TCPS_BOUND;
3908 }
3909 } else {
3910 tcp_close_detached(tcp);
3911 }
3912 return (0);
3913 }
3914
3915 TCP_STAT(tcps, tcp_clean_death_nondetached);
3916
3917 /*
3918 * If T_ORDREL_IND has not been sent yet (done when service routine
3919 * is run) postpone cleaning up the endpoint until service routine
3920 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3921 * client_errno since tcp_close uses the client_errno field.
3922 */
3923 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3924 if (err != 0)
3925 tcp->tcp_client_errno = err;
3926
3927 tcp->tcp_deferred_clean_death = B_TRUE;
3928 return (-1);
3929 }
3930
3931 /* If sodirect, not anymore */
3932 SOD_PTR_ENTER(tcp, sodp);
3933 if (sodp != NULL) {
3934 tcp->tcp_sodirect = NULL;
3935 mutex_exit(sodp->sod_lock);
3936 }
3937
3938 q = tcp->tcp_rq;
3939
3940 /* Trash all inbound data */
3941 flushq(q, FLUSHALL);
3942
3943 /*
3944 * If we are at least part way open and there is error
3945 * (err==0 implies no error)
3946 * notify our client by a T_DISCON_IND.
3947 */
3948 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
3949 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
3950 !TCP_IS_SOCKET(tcp)) {
3951 /*
3952 * Send M_FLUSH according to TPI. Because sockets will
3953 * (and must) ignore FLUSHR we do that only for TPI
3954 * endpoints and sockets in STREAMS mode.
3955 */
3956 (void) putnextctl1(q, M_FLUSH, FLUSHR);
3957 }
3958 if (tcp->tcp_debug) {
3959 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
3960 "tcp_clean_death: discon err %d", err);
3961 }
3962 mp = mi_tpi_discon_ind(NULL, err, 0);
3963 if (mp != NULL) {
3964 putnext(q, mp);
3965 } else {
3966 if (tcp->tcp_debug) {
3967 (void) strlog(TCP_MOD_ID, 0, 1,
3968 SL_ERROR|SL_TRACE,
3969 "tcp_clean_death, sending M_ERROR");
3970 }
3971 (void) putnextctl1(q, M_ERROR, EPROTO);
3972 }
3973 if (tcp->tcp_state <= TCPS_SYN_RCVD) {
3974 /* SYN_SENT or SYN_RCVD */
3975 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
3976 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
3977 /* ESTABLISHED or CLOSE_WAIT */
3978 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
3979 }
3980 }
3981
3982 tcp_reinit(tcp);
3983 return (-1);
3984 }
3985
3986 /*
3987 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
3988 * to expire, stop the wait and finish the close.
3989 */
3990 static void
3991 tcp_stop_lingering(tcp_t *tcp)
3992 {
3993 clock_t delta = 0;
3994 tcp_stack_t *tcps = tcp->tcp_tcps;
3995
3996 tcp->tcp_linger_tid = 0;
3997 if (tcp->tcp_state > TCPS_LISTEN) {
3998 tcp_acceptor_hash_remove(tcp);
3999 mutex_enter(&tcp->tcp_non_sq_lock);
4000 if (tcp->tcp_flow_stopped) {
4001 tcp_clrqfull(tcp);
4002 }
4003 mutex_exit(&tcp->tcp_non_sq_lock);
4004
4005 if (tcp->tcp_timer_tid != 0) {
4006 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4007 tcp->tcp_timer_tid = 0;
4008 }
4009 /*
4010 * Need to cancel those timers which will not be used when
4011 * TCP is detached. This has to be done before the tcp_wq
4012 * is set to the global queue.
4013 */
4014 tcp_timers_stop(tcp);
4015
4016
4017 tcp->tcp_detached = B_TRUE;
4018 ASSERT(tcps->tcps_g_q != NULL);
4019 tcp->tcp_rq = tcps->tcps_g_q;
4020 tcp->tcp_wq = WR(tcps->tcps_g_q);
4021
4022 if (tcp->tcp_state == TCPS_TIME_WAIT) {
4023 tcp_time_wait_append(tcp);
4024 TCP_DBGSTAT(tcps, tcp_detach_time_wait);
4025 goto finish;
4026 }
4027
4028 /*
4029 * If delta is zero the timer event wasn't executed and was
4030 * successfully canceled. In this case we need to restart it
4031 * with the minimal delta possible.
4032 */
4033 if (delta >= 0) {
4034 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4035 delta ? delta : 1);
4036 }
4037 } else {
4038 tcp_closei_local(tcp);
4039 CONN_DEC_REF(tcp->tcp_connp);
4040 }
4041 finish:
4042 /* Signal closing thread that it can complete close */
4043 mutex_enter(&tcp->tcp_closelock);
4044 tcp->tcp_detached = B_TRUE;
4045 ASSERT(tcps->tcps_g_q != NULL);
4046 tcp->tcp_rq = tcps->tcps_g_q;
4047 tcp->tcp_wq = WR(tcps->tcps_g_q);
4048 tcp->tcp_closed = 1;
4049 cv_signal(&tcp->tcp_closecv);
4050 mutex_exit(&tcp->tcp_closelock);
4051 }
4052
4053 /*
4054 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
4055 * expires.
4056 */
4057 static void
4058 tcp_close_linger_timeout(void *arg)
4059 {
4060 conn_t *connp = (conn_t *)arg;
4061 tcp_t *tcp = connp->conn_tcp;
4062
4063 tcp->tcp_client_errno = ETIMEDOUT;
4064 tcp_stop_lingering(tcp);
4065 }
4066
4067 static int
4068 tcp_close(queue_t *q, int flags)
4069 {
4070 conn_t *connp = Q_TO_CONN(q);
4071 tcp_t *tcp = connp->conn_tcp;
4072 mblk_t *mp = &tcp->tcp_closemp;
4073 boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
4074 mblk_t *bp;
4075
4076 ASSERT(WR(q)->q_next == NULL);
4077 ASSERT(connp->conn_ref >= 2);
4078
4079 /*
4080 * We are being closed as /dev/tcp or /dev/tcp6.
4081 *
4082 * Mark the conn as closing. ill_pending_mp_add will not
4083 * add any mp to the pending mp list, after this conn has
4084 * started closing. Same for sq_pending_mp_add
4085 */
4086 mutex_enter(&connp->conn_lock);
4087 connp->conn_state_flags |= CONN_CLOSING;
4088 if (connp->conn_oper_pending_ill != NULL)
4089 conn_ioctl_cleanup_reqd = B_TRUE;
4090 CONN_INC_REF_LOCKED(connp);
4091 mutex_exit(&connp->conn_lock);
4092 tcp->tcp_closeflags = (uint8_t)flags;
4093 ASSERT(connp->conn_ref >= 3);
4094
4095 /*
4096 * tcp_closemp_used is used below without any protection of a lock
4097 * as we don't expect any one else to use it concurrently at this
4098 * point otherwise it would be a major defect.
4099 */
4100
4101 if (mp->b_prev == NULL)
4102 tcp->tcp_closemp_used = B_TRUE;
4103 else
4104 cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
4105 "connp %p tcp %p\n", (void *)connp, (void *)tcp);
4106
4107 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
4108
4109 (*tcp_squeue_close_proc)(connp->conn_sqp, mp,
4110 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
4111
4112 mutex_enter(&tcp->tcp_closelock);
4113 while (!tcp->tcp_closed) {
4114 if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
4115 /*
4116 * The cv_wait_sig() was interrupted. We now do the
4117 * following:
4118 *
4119 * 1) If the endpoint was lingering, we allow this
4120 * to be interrupted by cancelling the linger timeout
4121 * and closing normally.
4122 *
4123 * 2) Revert to calling cv_wait()
4124 *
4125 * We revert to using cv_wait() to avoid an
4126 * infinite loop which can occur if the calling
4127 * thread is higher priority than the squeue worker
4128 * thread and is bound to the same cpu.
4129 */
4130 if (tcp->tcp_linger && tcp->tcp_lingertime > 0) {
4131 mutex_exit(&tcp->tcp_closelock);
4132 /* Entering squeue, bump ref count. */
4133 CONN_INC_REF(connp);
4134 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
4135 squeue_enter(connp->conn_sqp, bp,
4136 tcp_linger_interrupted, connp,
4137 SQTAG_IP_TCP_CLOSE);
4138 mutex_enter(&tcp->tcp_closelock);
4139 }
4140 break;
4141 }
4142 }
4143 while (!tcp->tcp_closed)
4144 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
4145 mutex_exit(&tcp->tcp_closelock);
4146
4147 /*
4148 * In the case of listener streams that have eagers in the q or q0
4149 * we wait for the eagers to drop their reference to us. tcp_rq and
4150 * tcp_wq of the eagers point to our queues. By waiting for the
4151 * refcnt to drop to 1, we are sure that the eagers have cleaned
4152 * up their queue pointers and also dropped their references to us.
4153 */
4154 if (tcp->tcp_wait_for_eagers) {
4155 mutex_enter(&connp->conn_lock);
4156 while (connp->conn_ref != 1) {
4157 cv_wait(&connp->conn_cv, &connp->conn_lock);
4158 }
4159 mutex_exit(&connp->conn_lock);
4160 }
4161 /*
4162 * ioctl cleanup. The mp is queued in the
4163 * ill_pending_mp or in the sq_pending_mp.
4164 */
4165 if (conn_ioctl_cleanup_reqd)
4166 conn_ioctl_cleanup(connp);
4167
4168 qprocsoff(q);
4169 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4170
4171 tcp->tcp_cpid = -1;
4172
4173 /*
4174 * Drop IP's reference on the conn. This is the last reference
4175 * on the connp if the state was less than established. If the
4176 * connection has gone into timewait state, then we will have
4177 * one ref for the TCP and one more ref (total of two) for the
4178 * classifier connected hash list (a timewait connections stays
4179 * in connected hash till closed).
4180 *
4181 * We can't assert the references because there might be other
4182 * transient reference places because of some walkers or queued
4183 * packets in squeue for the timewait state.
4184 */
4185 CONN_DEC_REF(connp);
4186 q->q_ptr = WR(q)->q_ptr = NULL;
4187 return (0);
4188 }
4189
4190 static int
4191 tcpclose_accept(queue_t *q)
4192 {
4193 vmem_t *minor_arena;
4194 dev_t conn_dev;
4195
4196 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
4197
4198 /*
4199 * We had opened an acceptor STREAM for sockfs which is
4200 * now being closed due to some error.
4201 */
4202 qprocsoff(q);
4203
4204 minor_arena = (vmem_t *)WR(q)->q_ptr;
4205 conn_dev = (dev_t)RD(q)->q_ptr;
4206 ASSERT(minor_arena != NULL);
4207 ASSERT(conn_dev != 0);
4208 inet_minor_free(minor_arena, conn_dev);
4209 q->q_ptr = WR(q)->q_ptr = NULL;
4210 return (0);
4211 }
4212
4213 /*
4214 * Called by tcp_close() routine via squeue when lingering is
4215 * interrupted by a signal.
4216 */
4217
4218 /* ARGSUSED */
4219 static void
4220 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2)
4221 {
4222 conn_t *connp = (conn_t *)arg;
4223 tcp_t *tcp = connp->conn_tcp;
4224
4225 freeb(mp);
4226 if (tcp->tcp_linger_tid != 0 &&
4227 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
4228 tcp_stop_lingering(tcp);
4229 tcp->tcp_client_errno = EINTR;
4230 }
4231 }
4232
4233 /*
4234 * Called by streams close routine via squeues when our client blows off her
4235 * descriptor, we take this to mean: "close the stream state NOW, close the tcp
4236 * connection politely" When SO_LINGER is set (with a non-zero linger time and
4237 * it is not a nonblocking socket) then this routine sleeps until the FIN is
4238 * acked.
4239 *
4240 * NOTE: tcp_close potentially returns error when lingering.
4241 * However, the stream head currently does not pass these errors
4242 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
4243 * errors to the application (from tsleep()) and not errors
4244 * like ECONNRESET caused by receiving a reset packet.
4245 */
4246
4247 /* ARGSUSED */
4248 static void
4249 tcp_close_output(void *arg, mblk_t *mp, void *arg2)
4250 {
4251 char *msg;
4252 conn_t *connp = (conn_t *)arg;
4253 tcp_t *tcp = connp->conn_tcp;
4254 clock_t delta = 0;
4255 tcp_stack_t *tcps = tcp->tcp_tcps;
4256
4257 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
4258 (connp->conn_fanout == NULL && connp->conn_ref >= 3));
4259
4260 /* Cancel any pending timeout */
4261 if (tcp->tcp_ordrelid != 0) {
4262 if (tcp->tcp_timeout) {
4263 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid);
4264 }
4265 tcp->tcp_ordrelid = 0;
4266 tcp->tcp_timeout = B_FALSE;
4267 }
4268
4269 mutex_enter(&tcp->tcp_eager_lock);
4270 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
4271 /* Cleanup for listener */
4272 tcp_eager_cleanup(tcp, 0);
4273 tcp->tcp_wait_for_eagers = 1;
4274 }
4275 mutex_exit(&tcp->tcp_eager_lock);
4276
4277 connp->conn_mdt_ok = B_FALSE;
4278 tcp->tcp_mdt = B_FALSE;
4279
4280 connp->conn_lso_ok = B_FALSE;
4281 tcp->tcp_lso = B_FALSE;
4282
4283 msg = NULL;
4284 switch (tcp->tcp_state) {
4285 case TCPS_CLOSED:
4286 case TCPS_IDLE:
4287 case TCPS_BOUND:
4288 case TCPS_LISTEN:
4289 break;
4290 case TCPS_SYN_SENT:
4291 msg = "tcp_close, during connect";
4292 break;
4293 case TCPS_SYN_RCVD:
4294 /*
4295 * Close during the connect 3-way handshake
4296 * but here there may or may not be pending data
4297 * already on queue. Process almost same as in
4298 * the ESTABLISHED state.
4299 */
4300 /* FALLTHRU */
4301 default:
4302 if (tcp->tcp_sodirect != NULL) {
4303 /* Ok, no more sodirect */
4304 tcp->tcp_sodirect = NULL;
4305 }
4306
4307 if (tcp->tcp_fused)
4308 tcp_unfuse(tcp);
4309
4310 /*
4311 * If SO_LINGER has set a zero linger time, abort the
4312 * connection with a reset.
4313 */
4314 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
4315 msg = "tcp_close, zero lingertime";
4316 break;
4317 }
4318
4319 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding);
4320 /*
4321 * Abort connection if there is unread data queued.
4322 */
4323 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
4324 msg = "tcp_close, unread data";
4325 break;
4326 }
4327 /*
4328 * tcp_hard_bound is now cleared thus all packets go through
4329 * tcp_lookup. This fact is used by tcp_detach below.
4330 *
4331 * We have done a qwait() above which could have possibly
4332 * drained more messages in turn causing transition to a
4333 * different state. Check whether we have to do the rest
4334 * of the processing or not.
4335 */
4336 if (tcp->tcp_state <= TCPS_LISTEN)
4337 break;
4338
4339 /*
4340 * Transmit the FIN before detaching the tcp_t.
4341 * After tcp_detach returns this queue/perimeter
4342 * no longer owns the tcp_t thus others can modify it.
4343 */
4344 (void) tcp_xmit_end(tcp);
4345
4346 /*
4347 * If lingering on close then wait until the fin is acked,
4348 * the SO_LINGER time passes, or a reset is sent/received.
4349 */
4350 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
4351 !(tcp->tcp_fin_acked) &&
4352 tcp->tcp_state >= TCPS_ESTABLISHED) {
4353 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
4354 tcp->tcp_client_errno = EWOULDBLOCK;
4355 } else if (tcp->tcp_client_errno == 0) {
4356
4357 ASSERT(tcp->tcp_linger_tid == 0);
4358
4359 tcp->tcp_linger_tid = TCP_TIMER(tcp,
4360 tcp_close_linger_timeout,
4361 tcp->tcp_lingertime * hz);
4362
4363 /* tcp_close_linger_timeout will finish close */
4364 if (tcp->tcp_linger_tid == 0)
4365 tcp->tcp_client_errno = ENOSR;
4366 else
4367 return;
4368 }
4369
4370 /*
4371 * Check if we need to detach or just close
4372 * the instance.
4373 */
4374 if (tcp->tcp_state <= TCPS_LISTEN)
4375 break;
4376 }
4377
4378 /*
4379 * Make sure that no other thread will access the tcp_rq of
4380 * this instance (through lookups etc.) as tcp_rq will go
4381 * away shortly.
4382 */
4383 tcp_acceptor_hash_remove(tcp);
4384
4385 mutex_enter(&tcp->tcp_non_sq_lock);
4386 if (tcp->tcp_flow_stopped) {
4387 tcp_clrqfull(tcp);
4388 }
4389 mutex_exit(&tcp->tcp_non_sq_lock);
4390
4391 if (tcp->tcp_timer_tid != 0) {
4392 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4393 tcp->tcp_timer_tid = 0;
4394 }
4395 /*
4396 * Need to cancel those timers which will not be used when
4397 * TCP is detached. This has to be done before the tcp_wq
4398 * is set to the global queue.
4399 */
4400 tcp_timers_stop(tcp);
4401
4402 tcp->tcp_detached = B_TRUE;
4403 if (tcp->tcp_state == TCPS_TIME_WAIT) {
4404 tcp_time_wait_append(tcp);
4405 TCP_DBGSTAT(tcps, tcp_detach_time_wait);
4406 ASSERT(connp->conn_ref >= 3);
4407 goto finish;
4408 }
4409
4410 /*
4411 * If delta is zero the timer event wasn't executed and was
4412 * successfully canceled. In this case we need to restart it
4413 * with the minimal delta possible.
4414 */
4415 if (delta >= 0)
4416 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4417 delta ? delta : 1);
4418
4419 ASSERT(connp->conn_ref >= 3);
4420 goto finish;
4421 }
4422
4423 /* Detach did not complete. Still need to remove q from stream. */
4424 if (msg) {
4425 if (tcp->tcp_state == TCPS_ESTABLISHED ||
4426 tcp->tcp_state == TCPS_CLOSE_WAIT)
4427 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
4428 if (tcp->tcp_state == TCPS_SYN_SENT ||
4429 tcp->tcp_state == TCPS_SYN_RCVD)
4430 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
4431 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST);
4432 }
4433
4434 tcp_closei_local(tcp);
4435 CONN_DEC_REF(connp);
4436 ASSERT(connp->conn_ref >= 2);
4437
4438 finish:
4439 /*
4440 * Although packets are always processed on the correct
4441 * tcp's perimeter and access is serialized via squeue's,
4442 * IP still needs a queue when sending packets in time_wait
4443 * state so use WR(tcps_g_q) till ip_output() can be
4444 * changed to deal with just connp. For read side, we
4445 * could have set tcp_rq to NULL but there are some cases
4446 * in tcp_rput_data() from early days of this code which
4447 * do a putnext without checking if tcp is closed. Those
4448 * need to be identified before both tcp_rq and tcp_wq
4449 * can be set to NULL and tcps_g_q can disappear forever.
4450 */
4451 mutex_enter(&tcp->tcp_closelock);
4452 /*
4453 * Don't change the queues in the case of a listener that has
4454 * eagers in its q or q0. It could surprise the eagers.
4455 * Instead wait for the eagers outside the squeue.
4456 */
4457 if (!tcp->tcp_wait_for_eagers) {
4458 tcp->tcp_detached = B_TRUE;
4459 /*
4460 * When default queue is closing we set tcps_g_q to NULL
4461 * after the close is done.
4462 */
4463 ASSERT(tcps->tcps_g_q != NULL);
4464 tcp->tcp_rq = tcps->tcps_g_q;
4465 tcp->tcp_wq = WR(tcps->tcps_g_q);
4466 }
4467
4468 /* Signal tcp_close() to finish closing. */
4469 tcp->tcp_closed = 1;
4470 cv_signal(&tcp->tcp_closecv);
4471 mutex_exit(&tcp->tcp_closelock);
4472 }
4473
4474
4475 /*
4476 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp.
4477 * Some stream heads get upset if they see these later on as anything but NULL.
4478 */
4479 static void
4480 tcp_close_mpp(mblk_t **mpp)
4481 {
4482 mblk_t *mp;
4483
4484 if ((mp = *mpp) != NULL) {
4485 do {
4486 mp->b_next = NULL;
4487 mp->b_prev = NULL;
4488 } while ((mp = mp->b_cont) != NULL);
4489
4490 mp = *mpp;
4491 *mpp = NULL;
4492 freemsg(mp);
4493 }
4494 }
4495
4496 /* Do detached close. */
4497 static void
4498 tcp_close_detached(tcp_t *tcp)
4499 {
4500 if (tcp->tcp_fused)
4501 tcp_unfuse(tcp);
4502
4503 /*
4504 * Clustering code serializes TCP disconnect callbacks and
4505 * cluster tcp list walks by blocking a TCP disconnect callback
4506 * if a cluster tcp list walk is in progress. This ensures
4507 * accurate accounting of TCPs in the cluster code even though
4508 * the TCP list walk itself is not atomic.
4509 */
4510 tcp_closei_local(tcp);
4511 CONN_DEC_REF(tcp->tcp_connp);
4512 }
4513
4514 /*
4515 * Stop all TCP timers, and free the timer mblks if requested.
4516 */
4517 void
4518 tcp_timers_stop(tcp_t *tcp)
4519 {
4520 if (tcp->tcp_timer_tid != 0) {
4521 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4522 tcp->tcp_timer_tid = 0;
4523 }
4524 if (tcp->tcp_ka_tid != 0) {
4525 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
4526 tcp->tcp_ka_tid = 0;
4527 }
4528 if (tcp->tcp_ack_tid != 0) {
4529 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4530 tcp->tcp_ack_tid = 0;
4531 }
4532 if (tcp->tcp_push_tid != 0) {
4533 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
4534 tcp->tcp_push_tid = 0;
4535 }
4536 }
4537
4538 /*
4539 * The tcp_t is going away. Remove it from all lists and set it
4540 * to TCPS_CLOSED. The freeing up of memory is deferred until
4541 * tcp_inactive. This is needed since a thread in tcp_rput might have
4542 * done a CONN_INC_REF on this structure before it was removed from the
4543 * hashes.
4544 */
4545 static void
4546 tcp_closei_local(tcp_t *tcp)
4547 {
4548 ire_t *ire;
4549 conn_t *connp = tcp->tcp_connp;
4550 tcp_stack_t *tcps = tcp->tcp_tcps;
4551
4552 if (!TCP_IS_SOCKET(tcp))
4553 tcp_acceptor_hash_remove(tcp);
4554
4555 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
4556 tcp->tcp_ibsegs = 0;
4557 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
4558 tcp->tcp_obsegs = 0;
4559
4560 /*
4561 * If we are an eager connection hanging off a listener that
4562 * hasn't formally accepted the connection yet, get off his
4563 * list and blow off any data that we have accumulated.
4564 */
4565 if (tcp->tcp_listener != NULL) {
4566 tcp_t *listener = tcp->tcp_listener;
4567 mutex_enter(&listener->tcp_eager_lock);
4568 /*
4569 * tcp_tconnind_started == B_TRUE means that the
4570 * conn_ind has already gone to listener. At
4571 * this point, eager will be closed but we
4572 * leave it in listeners eager list so that
4573 * if listener decides to close without doing
4574 * accept, we can clean this up. In tcp_wput_accept
4575 * we take care of the case of accept on closed
4576 * eager.
4577 */
4578 if (!tcp->tcp_tconnind_started) {
4579 tcp_eager_unlink(tcp);
4580 mutex_exit(&listener->tcp_eager_lock);
4581 /*
4582 * We don't want to have any pointers to the
4583 * listener queue, after we have released our
4584 * reference on the listener
4585 */
4586 ASSERT(tcps->tcps_g_q != NULL);
4587 tcp->tcp_rq = tcps->tcps_g_q;
4588 tcp->tcp_wq = WR(tcps->tcps_g_q);
4589 CONN_DEC_REF(listener->tcp_connp);
4590 } else {
4591 mutex_exit(&listener->tcp_eager_lock);
4592 }
4593 }
4594
4595 /* Stop all the timers */
4596 tcp_timers_stop(tcp);
4597
4598 if (tcp->tcp_state == TCPS_LISTEN) {
4599 if (tcp->tcp_ip_addr_cache) {
4600 kmem_free((void *)tcp->tcp_ip_addr_cache,
4601 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
4602 tcp->tcp_ip_addr_cache = NULL;
4603 }
4604 }
4605 mutex_enter(&tcp->tcp_non_sq_lock);
4606 if (tcp->tcp_flow_stopped)
4607 tcp_clrqfull(tcp);
4608 mutex_exit(&tcp->tcp_non_sq_lock);
4609
4610 tcp_bind_hash_remove(tcp);
4611 /*
4612 * If the tcp_time_wait_collector (which runs outside the squeue)
4613 * is trying to remove this tcp from the time wait list, we will
4614 * block in tcp_time_wait_remove while trying to acquire the
4615 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also
4616 * requires the ipcl_hash_remove to be ordered after the
4617 * tcp_time_wait_remove for the refcnt checks to work correctly.
4618 */
4619 if (tcp->tcp_state == TCPS_TIME_WAIT)
4620 (void) tcp_time_wait_remove(tcp, NULL);
4621 CL_INET_DISCONNECT(tcp);
4622 ipcl_hash_remove(connp);
4623
4624 /*
4625 * Delete the cached ire in conn_ire_cache and also mark
4626 * the conn as CONDEMNED
4627 */
4628 mutex_enter(&connp->conn_lock);
4629 connp->conn_state_flags |= CONN_CONDEMNED;
4630 ire = connp->conn_ire_cache;
4631 connp->conn_ire_cache = NULL;
4632 mutex_exit(&connp->conn_lock);
4633 if (ire != NULL)
4634 IRE_REFRELE_NOTR(ire);
4635
4636 /* Need to cleanup any pending ioctls */
4637 ASSERT(tcp->tcp_time_wait_next == NULL);
4638 ASSERT(tcp->tcp_time_wait_prev == NULL);
4639 ASSERT(tcp->tcp_time_wait_expire == 0);
4640 tcp->tcp_state = TCPS_CLOSED;
4641
4642 /* Release any SSL context */
4643 if (tcp->tcp_kssl_ent != NULL) {
4644 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4645 tcp->tcp_kssl_ent = NULL;
4646 }
4647 if (tcp->tcp_kssl_ctx != NULL) {
4648 kssl_release_ctx(tcp->tcp_kssl_ctx);
4649 tcp->tcp_kssl_ctx = NULL;
4650 }
4651 tcp->tcp_kssl_pending = B_FALSE;
4652
4653 tcp_ipsec_cleanup(tcp);
4654 }
4655
4656 /*
4657 * tcp is dying (called from ipcl_conn_destroy and error cases).
4658 * Free the tcp_t in either case.
4659 */
4660 void
4661 tcp_free(tcp_t *tcp)
4662 {
4663 mblk_t *mp;
4664 ip6_pkt_t *ipp;
4665
4666 ASSERT(tcp != NULL);
4667 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
4668
4669 tcp->tcp_rq = NULL;
4670 tcp->tcp_wq = NULL;
4671
4672 tcp_close_mpp(&tcp->tcp_xmit_head);
4673 tcp_close_mpp(&tcp->tcp_reass_head);
4674 if (tcp->tcp_rcv_list != NULL) {
4675 /* Free b_next chain */
4676 tcp_close_mpp(&tcp->tcp_rcv_list);
4677 }
4678 if ((mp = tcp->tcp_urp_mp) != NULL) {
4679 freemsg(mp);
4680 }
4681 if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
4682 freemsg(mp);
4683 }
4684
4685 if (tcp->tcp_fused_sigurg_mp != NULL) {
4686 freeb(tcp->tcp_fused_sigurg_mp);
4687 tcp->tcp_fused_sigurg_mp = NULL;
4688 }
4689
4690 if (tcp->tcp_sack_info != NULL) {
4691 if (tcp->tcp_notsack_list != NULL) {
4692 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
4693 }
4694 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
4695 }
4696
4697 if (tcp->tcp_hopopts != NULL) {
4698 mi_free(tcp->tcp_hopopts);
4699 tcp->tcp_hopopts = NULL;
4700 tcp->tcp_hopoptslen = 0;
4701 }
4702 ASSERT(tcp->tcp_hopoptslen == 0);
4703 if (tcp->tcp_dstopts != NULL) {
4704 mi_free(tcp->tcp_dstopts);
4705 tcp->tcp_dstopts = NULL;
4706 tcp->tcp_dstoptslen = 0;
4707 }
4708 ASSERT(tcp->tcp_dstoptslen == 0);
4709 if (tcp->tcp_rtdstopts != NULL) {
4710 mi_free(tcp->tcp_rtdstopts);
4711 tcp->tcp_rtdstopts = NULL;
4712 tcp->tcp_rtdstoptslen = 0;
4713 }
4714 ASSERT(tcp->tcp_rtdstoptslen == 0);
4715 if (tcp->tcp_rthdr != NULL) {
4716 mi_free(tcp->tcp_rthdr);
4717 tcp->tcp_rthdr = NULL;
4718 tcp->tcp_rthdrlen = 0;
4719 }
4720 ASSERT(tcp->tcp_rthdrlen == 0);
4721
4722 ipp = &tcp->tcp_sticky_ipp;
4723 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
4724 IPPF_RTHDR))
4725 ip6_pkt_free(ipp);
4726
4727 /*
4728 * Free memory associated with the tcp/ip header template.
4729 */
4730
4731 if (tcp->tcp_iphc != NULL)
4732 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
4733
4734 /*
4735 * Following is really a blowing away a union.
4736 * It happens to have exactly two members of identical size
4737 * the following code is enough.
4738 */
4739 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
4740
4741 if (tcp->tcp_tracebuf != NULL) {
4742 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t));
4743 tcp->tcp_tracebuf = NULL;
4744 }
4745 }
4746
4747
4748 /*
4749 * Put a connection confirmation message upstream built from the
4750 * address information within 'iph' and 'tcph'. Report our success or failure.
4751 */
4752 static boolean_t
4753 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
4754 mblk_t **defermp)
4755 {
4756 sin_t sin;
4757 sin6_t sin6;
4758 mblk_t *mp;
4759 char *optp = NULL;
4760 int optlen = 0;
4761 cred_t *cr;
4762
4763 if (defermp != NULL)
4764 *defermp = NULL;
4765
4766 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
4767 /*
4768 * Return in T_CONN_CON results of option negotiation through
4769 * the T_CONN_REQ. Note: If there is an real end-to-end option
4770 * negotiation, then what is received from remote end needs
4771 * to be taken into account but there is no such thing (yet?)
4772 * in our TCP/IP.
4773 * Note: We do not use mi_offset_param() here as
4774 * tcp_opts_conn_req contents do not directly come from
4775 * an application and are either generated in kernel or
4776 * from user input that was already verified.
4777 */
4778 mp = tcp->tcp_conn.tcp_opts_conn_req;
4779 optp = (char *)(mp->b_rptr +
4780 ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
4781 optlen = (int)
4782 ((struct T_conn_req *)mp->b_rptr)->OPT_length;
4783 }
4784
4785 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
4786 ipha_t *ipha = (ipha_t *)iphdr;
4787
4788 /* packet is IPv4 */
4789 if (tcp->tcp_family == AF_INET) {
4790 sin = sin_null;
4791 sin.sin_addr.s_addr = ipha->ipha_src;
4792 sin.sin_port = *(uint16_t *)tcph->th_lport;
4793 sin.sin_family = AF_INET;
4794 mp = mi_tpi_conn_con(NULL, (char *)&sin,
4795 (int)sizeof (sin_t), optp, optlen);
4796 } else {
4797 sin6 = sin6_null;
4798 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4799 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4800 sin6.sin6_family = AF_INET6;
4801 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4802 (int)sizeof (sin6_t), optp, optlen);
4803
4804 }
4805 } else {
4806 ip6_t *ip6h = (ip6_t *)iphdr;
4807
4808 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
4809 ASSERT(tcp->tcp_family == AF_INET6);
4810 sin6 = sin6_null;
4811 sin6.sin6_addr = ip6h->ip6_src;
4812 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4813 sin6.sin6_family = AF_INET6;
4814 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4815 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4816 (int)sizeof (sin6_t), optp, optlen);
4817 }
4818
4819 if (!mp)
4820 return (B_FALSE);
4821
4822 if ((cr = DB_CRED(idmp)) != NULL) {
4823 mblk_setcred(mp, cr);
4824 DB_CPID(mp) = DB_CPID(idmp);
4825 }
4826
4827 if (defermp == NULL)
4828 putnext(tcp->tcp_rq, mp);
4829 else
4830 *defermp = mp;
4831
4832 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
4833 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
4834 return (B_TRUE);
4835 }
4836
4837 /*
4838 * Defense for the SYN attack -
4839 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
4840 * one from the list of droppable eagers. This list is a subset of q0.
4841 * see comments before the definition of MAKE_DROPPABLE().
4842 * 2. Don't drop a SYN request before its first timeout. This gives every
4843 * request at least til the first timeout to complete its 3-way handshake.
4844 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
4845 * requests currently on the queue that has timed out. This will be used
4846 * as an indicator of whether an attack is under way, so that appropriate
4847 * actions can be taken. (It's incremented in tcp_timer() and decremented
4848 * either when eager goes into ESTABLISHED, or gets freed up.)
4849 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
4850 * # of timeout drops back to <= q0len/32 => SYN alert off
4851 */
4852 static boolean_t
4853 tcp_drop_q0(tcp_t *tcp)
4854 {
4855 tcp_t *eager;
4856 mblk_t *mp;
4857 tcp_stack_t *tcps = tcp->tcp_tcps;
4858
4859 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
4860 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
4861
4862 /* Pick oldest eager from the list of droppable eagers */
4863 eager = tcp->tcp_eager_prev_drop_q0;
4864
4865 /* If list is empty. return B_FALSE */
4866 if (eager == tcp) {
4867 return (B_FALSE);
4868 }
4869
4870 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
4871 if ((mp = allocb(0, BPRI_HI)) == NULL)
4872 return (B_FALSE);
4873
4874 /*
4875 * Take this eager out from the list of droppable eagers since we are
4876 * going to drop it.
4877 */
4878 MAKE_UNDROPPABLE(eager);
4879
4880 if (tcp->tcp_debug) {
4881 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
4882 "tcp_drop_q0: listen half-open queue (max=%d) overflow"
4883 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
4884 tcp->tcp_conn_req_cnt_q0,
4885 tcp_display(tcp, NULL, DISP_PORT_ONLY));
4886 }
4887
4888 BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop);
4889
4890 /* Put a reference on the conn as we are enqueueing it in the sqeue */
4891 CONN_INC_REF(eager->tcp_connp);
4892
4893 /* Mark the IRE created for this SYN request temporary */
4894 tcp_ip_ire_mark_advice(eager);
4895 squeue_fill(eager->tcp_connp->conn_sqp, mp,
4896 tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0);
4897
4898 return (B_TRUE);
4899 }
4900
4901 int
4902 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
4903 tcph_t *tcph, uint_t ipvers, mblk_t *idmp)
4904 {
4905 tcp_t *ltcp = lconnp->conn_tcp;
4906 tcp_t *tcp = connp->conn_tcp;
4907 mblk_t *tpi_mp;
4908 ipha_t *ipha;
4909 ip6_t *ip6h;
4910 sin6_t sin6;
4911 in6_addr_t v6dst;
4912 int err;
4913 int ifindex = 0;
4914 cred_t *cr;
4915 tcp_stack_t *tcps = tcp->tcp_tcps;
4916
4917 if (ipvers == IPV4_VERSION) {
4918 ipha = (ipha_t *)mp->b_rptr;
4919
4920 connp->conn_send = ip_output;
4921 connp->conn_recv = tcp_input;
4922
4923 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
4924 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
4925
4926 sin6 = sin6_null;
4927 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4928 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
4929 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4930 sin6.sin6_family = AF_INET6;
4931 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst,
4932 lconnp->conn_zoneid, tcps->tcps_netstack);
4933 if (tcp->tcp_recvdstaddr) {
4934 sin6_t sin6d;
4935
4936 sin6d = sin6_null;
4937 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst,
4938 &sin6d.sin6_addr);
4939 sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4940 sin6d.sin6_family = AF_INET;
4941 tpi_mp = mi_tpi_extconn_ind(NULL,
4942 (char *)&sin6d, sizeof (sin6_t),
4943 (char *)&tcp,
4944 (t_scalar_t)sizeof (intptr_t),
4945 (char *)&sin6d, sizeof (sin6_t),
4946 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4947 } else {
4948 tpi_mp = mi_tpi_conn_ind(NULL,
4949 (char *)&sin6, sizeof (sin6_t),
4950 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4951 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4952 }
4953 } else {
4954 ip6h = (ip6_t *)mp->b_rptr;
4955
4956 connp->conn_send = ip_output_v6;
4957 connp->conn_recv = tcp_input;
4958
4959 connp->conn_srcv6 = ip6h->ip6_dst;
4960 connp->conn_remv6 = ip6h->ip6_src;
4961
4962 /* db_cksumstuff is set at ip_fanout_tcp_v6 */
4963 ifindex = (int)DB_CKSUMSTUFF(mp);
4964 DB_CKSUMSTUFF(mp) = 0;
4965
4966 sin6 = sin6_null;
4967 sin6.sin6_addr = ip6h->ip6_src;
4968 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4969 sin6.sin6_family = AF_INET6;
4970 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4971 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
4972 lconnp->conn_zoneid, tcps->tcps_netstack);
4973
4974 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
4975 /* Pass up the scope_id of remote addr */
4976 sin6.sin6_scope_id = ifindex;
4977 } else {
4978 sin6.sin6_scope_id = 0;
4979 }
4980 if (tcp->tcp_recvdstaddr) {
4981 sin6_t sin6d;
4982
4983 sin6d = sin6_null;
4984 sin6.sin6_addr = ip6h->ip6_dst;
4985 sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4986 sin6d.sin6_family = AF_INET;
4987 tpi_mp = mi_tpi_extconn_ind(NULL,
4988 (char *)&sin6d, sizeof (sin6_t),
4989 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4990 (char *)&sin6d, sizeof (sin6_t),
4991 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4992 } else {
4993 tpi_mp = mi_tpi_conn_ind(NULL,
4994 (char *)&sin6, sizeof (sin6_t),
4995 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4996 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4997 }
4998 }
4999
5000 if (tpi_mp == NULL)
5001 return (ENOMEM);
5002
5003 connp->conn_fport = *(uint16_t *)tcph->th_lport;
5004 connp->conn_lport = *(uint16_t *)tcph->th_fport;
5005 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER);
5006 connp->conn_fully_bound = B_FALSE;
5007
5008 if (tcps->tcps_trace)
5009 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
5010
5011 /* Inherit information from the "parent" */
5012 tcp->tcp_ipversion = ltcp->tcp_ipversion;
5013 tcp->tcp_family = ltcp->tcp_family;
5014 tcp->tcp_wq = ltcp->tcp_wq;
5015 tcp->tcp_rq = ltcp->tcp_rq;
5016 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
5017 tcp->tcp_detached = B_TRUE;
5018 if ((err = tcp_init_values(tcp)) != 0) {
5019 freemsg(tpi_mp);
5020 return (err);
5021 }
5022
5023 if (ipvers == IPV4_VERSION) {
5024 if ((err = tcp_header_init_ipv4(tcp)) != 0) {
5025 freemsg(tpi_mp);
5026 return (err);
5027 }
5028 ASSERT(tcp->tcp_ipha != NULL);
5029 } else {
5030 /* ifindex must be already set */
5031 ASSERT(ifindex != 0);
5032
5033 if (ltcp->tcp_bound_if != 0) {
5034 /*
5035 * Set newtcp's bound_if equal to
5036 * listener's value. If ifindex is
5037 * not the same as ltcp->tcp_bound_if,
5038 * it must be a packet for the ipmp group
5039 * of interfaces
5040 */
5041 tcp->tcp_bound_if = ltcp->tcp_bound_if;
5042 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
5043 tcp->tcp_bound_if = ifindex;
5044 }
5045
5046 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
5047 tcp->tcp_recvifindex = 0;
5048 tcp->tcp_recvhops = 0xffffffffU;
5049 ASSERT(tcp->tcp_ip6h != NULL);
5050 }
5051
5052 tcp->tcp_lport = ltcp->tcp_lport;
5053
5054 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) {
5055 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) {
5056 /*
5057 * Listener had options of some sort; eager inherits.
5058 * Free up the eager template and allocate one
5059 * of the right size.
5060 */
5061 if (tcp->tcp_hdr_grown) {
5062 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
5063 } else {
5064 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
5065 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
5066 }
5067 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len,
5068 KM_NOSLEEP);
5069 if (tcp->tcp_iphc == NULL) {
5070 tcp->tcp_iphc_len = 0;
5071 freemsg(tpi_mp);
5072 return (ENOMEM);
5073 }
5074 tcp->tcp_iphc_len = ltcp->tcp_iphc_len;
5075 tcp->tcp_hdr_grown = B_TRUE;
5076 }
5077 tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5078 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5079 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5080 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops;
5081 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf;
5082
5083 /*
5084 * Copy the IP+TCP header template from listener to eager
5085 */
5086 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5087 if (tcp->tcp_ipversion == IPV6_VERSION) {
5088 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt ==
5089 IPPROTO_RAW) {
5090 tcp->tcp_ip6h =
5091 (ip6_t *)(tcp->tcp_iphc +
5092 sizeof (ip6i_t));
5093 } else {
5094 tcp->tcp_ip6h =
5095 (ip6_t *)(tcp->tcp_iphc);
5096 }
5097 tcp->tcp_ipha = NULL;
5098 } else {
5099 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5100 tcp->tcp_ip6h = NULL;
5101 }
5102 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5103 tcp->tcp_ip_hdr_len);
5104 } else {
5105 /*
5106 * only valid case when ipversion of listener and
5107 * eager differ is when listener is IPv6 and
5108 * eager is IPv4.
5109 * Eager header template has been initialized to the
5110 * maximum v4 header sizes, which includes space for
5111 * TCP and IP options.
5112 */
5113 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) &&
5114 (tcp->tcp_ipversion == IPV4_VERSION));
5115 ASSERT(tcp->tcp_iphc_len >=
5116 TCP_MAX_COMBINED_HEADER_LENGTH);
5117 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5118 /* copy IP header fields individually */
5119 tcp->tcp_ipha->ipha_ttl =
5120 ltcp->tcp_ip6h->ip6_hops;
5121 bcopy(ltcp->tcp_tcph->th_lport,
5122 tcp->tcp_tcph->th_lport, sizeof (ushort_t));
5123 }
5124
5125 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5126 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport,
5127 sizeof (in_port_t));
5128
5129 if (ltcp->tcp_lport == 0) {
5130 tcp->tcp_lport = *(in_port_t *)tcph->th_fport;
5131 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport,
5132 sizeof (in_port_t));
5133 }
5134
5135 if (tcp->tcp_ipversion == IPV4_VERSION) {
5136 ASSERT(ipha != NULL);
5137 tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5138 tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5139
5140 /* Source routing option copyover (reverse it) */
5141 if (tcps->tcps_rev_src_routes)
5142 tcp_opt_reverse(tcp, ipha);
5143 } else {
5144 ASSERT(ip6h != NULL);
5145 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src;
5146 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst;
5147 }
5148
5149 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5150 ASSERT(!tcp->tcp_tconnind_started);
5151 /*
5152 * If the SYN contains a credential, it's a loopback packet; attach
5153 * the credential to the TPI message.
5154 */
5155 if ((cr = DB_CRED(idmp)) != NULL) {
5156 mblk_setcred(tpi_mp, cr);
5157 DB_CPID(tpi_mp) = DB_CPID(idmp);
5158 }
5159 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5160
5161 /* Inherit the listener's SSL protection state */
5162
5163 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5164 kssl_hold_ent(tcp->tcp_kssl_ent);
5165 tcp->tcp_kssl_pending = B_TRUE;
5166 }
5167
5168 return (0);
5169 }
5170
5171
5172 int
5173 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
5174 tcph_t *tcph, mblk_t *idmp)
5175 {
5176 tcp_t *ltcp = lconnp->conn_tcp;
5177 tcp_t *tcp = connp->conn_tcp;
5178 sin_t sin;
5179 mblk_t *tpi_mp = NULL;
5180 int err;
5181 cred_t *cr;
5182 tcp_stack_t *tcps = tcp->tcp_tcps;
5183
5184 sin = sin_null;
5185 sin.sin_addr.s_addr = ipha->ipha_src;
5186 sin.sin_port = *(uint16_t *)tcph->th_lport;
5187 sin.sin_family = AF_INET;
5188 if (ltcp->tcp_recvdstaddr) {
5189 sin_t sind;
5190
5191 sind = sin_null;
5192 sind.sin_addr.s_addr = ipha->ipha_dst;
5193 sind.sin_port = *(uint16_t *)tcph->th_fport;
5194 sind.sin_family = AF_INET;
5195 tpi_mp = mi_tpi_extconn_ind(NULL,
5196 (char *)&sind, sizeof (sin_t), (char *)&tcp,
5197 (t_scalar_t)sizeof (intptr_t), (char *)&sind,
5198 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5199 } else {
5200 tpi_mp = mi_tpi_conn_ind(NULL,
5201 (char *)&sin, sizeof (sin_t),
5202 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
5203 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5204 }
5205
5206 if (tpi_mp == NULL) {
5207 return (ENOMEM);
5208 }
5209
5210 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER);
5211 connp->conn_send = ip_output;
5212 connp->conn_recv = tcp_input;
5213 connp->conn_fully_bound = B_FALSE;
5214
5215 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
5216 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
5217 connp->conn_fport = *(uint16_t *)tcph->th_lport;
5218 connp->conn_lport = *(uint16_t *)tcph->th_fport;
5219
5220 if (tcps->tcps_trace) {
5221 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
5222 }
5223
5224 /* Inherit information from the "parent" */
5225 tcp->tcp_ipversion = ltcp->tcp_ipversion;
5226 tcp->tcp_family = ltcp->tcp_family;
5227 tcp->tcp_wq = ltcp->tcp_wq;
5228 tcp->tcp_rq = ltcp->tcp_rq;
5229 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
5230 tcp->tcp_detached = B_TRUE;
5231 if ((err = tcp_init_values(tcp)) != 0) {
5232 freemsg(tpi_mp);
5233 return (err);
5234 }
5235
5236 /*
5237 * Let's make sure that eager tcp template has enough space to
5238 * copy IPv4 listener's tcp template. Since the conn_t structure is
5239 * preserved and tcp_iphc_len is also preserved, an eager conn_t may
5240 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or
5241 * more (in case of re-allocation of conn_t with tcp-IPv6 template with
5242 * extension headers or with ip6i_t struct). Note that bcopy() below
5243 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_
5244 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener.
5245 */
5246 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
5247 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH);
5248
5249 tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5250 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5251 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5252 tcp->tcp_ttl = ltcp->tcp_ttl;
5253 tcp->tcp_tos = ltcp->tcp_tos;
5254
5255 /* Copy the IP+TCP header template from listener to eager */
5256 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5257 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5258 tcp->tcp_ip6h = NULL;
5259 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5260 tcp->tcp_ip_hdr_len);
5261
5262 /* Initialize the IP addresses and Ports */
5263 tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5264 tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5265 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5266 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t));
5267
5268 /* Source routing option copyover (reverse it) */
5269 if (tcps->tcps_rev_src_routes)
5270 tcp_opt_reverse(tcp, ipha);
5271
5272 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5273 ASSERT(!tcp->tcp_tconnind_started);
5274
5275 /*
5276 * If the SYN contains a credential, it's a loopback packet; attach
5277 * the credential to the TPI message.
5278 */
5279 if ((cr = DB_CRED(idmp)) != NULL) {
5280 mblk_setcred(tpi_mp, cr);
5281 DB_CPID(tpi_mp) = DB_CPID(idmp);
5282 }
5283 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5284
5285 /* Inherit the listener's SSL protection state */
5286 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5287 kssl_hold_ent(tcp->tcp_kssl_ent);
5288 tcp->tcp_kssl_pending = B_TRUE;
5289 }
5290
5291 return (0);
5292 }
5293
5294 /*
5295 * sets up conn for ipsec.
5296 * if the first mblk is M_CTL it is consumed and mpp is updated.
5297 * in case of error mpp is freed.
5298 */
5299 conn_t *
5300 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
5301 {
5302 conn_t *connp = tcp->tcp_connp;
5303 conn_t *econnp;
5304 squeue_t *new_sqp;
5305 mblk_t *first_mp = *mpp;
5306 mblk_t *mp = *mpp;
5307 boolean_t mctl_present = B_FALSE;
5308 uint_t ipvers;
5309
5310 econnp = tcp_get_conn(sqp, tcp->tcp_tcps);
5311 if (econnp == NULL) {
5312 freemsg(first_mp);
5313 return (NULL);
5314 }
5315 if (DB_TYPE(mp) == M_CTL) {
5316 if (mp->b_cont == NULL ||
5317 mp->b_cont->b_datap->db_type != M_DATA) {
5318 freemsg(first_mp);
5319 return (NULL);
5320 }
5321 mp = mp->b_cont;
5322 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) {
5323 freemsg(first_mp);
5324 return (NULL);
5325 }
5326
5327 mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
5328 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5329 mctl_present = B_TRUE;
5330 } else {
5331 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY);
5332 mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5333 }
5334
5335 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5336 DB_CKSUMSTART(mp) = 0;
5337
5338 ASSERT(OK_32PTR(mp->b_rptr));
5339 ipvers = IPH_HDR_VERSION(mp->b_rptr);
5340 if (ipvers == IPV4_VERSION) {
5341 uint16_t *up;
5342 uint32_t ports;
5343 ipha_t *ipha;
5344
5345 ipha = (ipha_t *)mp->b_rptr;
5346 up = (uint16_t *)((uchar_t *)ipha +
5347 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET);
5348 ports = *(uint32_t *)up;
5349 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP,
5350 ipha->ipha_dst, ipha->ipha_src, ports);
5351 } else {
5352 uint16_t *up;
5353 uint32_t ports;
5354 uint16_t ip_hdr_len;
5355 uint8_t *nexthdrp;
5356 ip6_t *ip6h;
5357 tcph_t *tcph;
5358
5359 ip6h = (ip6_t *)mp->b_rptr;
5360 if (ip6h->ip6_nxt == IPPROTO_TCP) {
5361 ip_hdr_len = IPV6_HDR_LEN;
5362 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len,
5363 &nexthdrp) || *nexthdrp != IPPROTO_TCP) {
5364 CONN_DEC_REF(econnp);
5365 freemsg(first_mp);
5366 return (NULL);
5367 }
5368 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5369 up = (uint16_t *)tcph->th_lport;
5370 ports = *(uint32_t *)up;
5371 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP,
5372 ip6h->ip6_dst, ip6h->ip6_src, ports);
5373 }
5374
5375 /*
5376 * The caller already ensured that there is a sqp present.
5377 */
5378 econnp->conn_sqp = new_sqp;
5379
5380 if (connp->conn_policy != NULL) {
5381 ipsec_in_t *ii;
5382 ii = (ipsec_in_t *)(first_mp->b_rptr);
5383 ASSERT(ii->ipsec_in_policy == NULL);
5384 IPPH_REFHOLD(connp->conn_policy);
5385 ii->ipsec_in_policy = connp->conn_policy;
5386
5387 first_mp->b_datap->db_type = IPSEC_POLICY_SET;
5388 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) {
5389 CONN_DEC_REF(econnp);
5390 freemsg(first_mp);
5391 return (NULL);
5392 }
5393 }
5394
5395 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) {
5396 CONN_DEC_REF(econnp);
5397 freemsg(first_mp);
5398 return (NULL);
5399 }
5400
5401 /*
5402 * If we know we have some policy, pass the "IPSEC"
5403 * options size TCP uses this adjust the MSS.
5404 */
5405 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp);
5406 if (mctl_present) {
5407 freeb(first_mp);
5408 *mpp = mp;
5409 }
5410
5411 return (econnp);
5412 }
5413
5414 /*
5415 * tcp_get_conn/tcp_free_conn
5416 *
5417 * tcp_get_conn is used to get a clean tcp connection structure.
5418 * It tries to reuse the connections put on the freelist by the
5419 * time_wait_collector failing which it goes to kmem_cache. This
5420 * way has two benefits compared to just allocating from and
5421 * freeing to kmem_cache.
5422 * 1) The time_wait_collector can free (which includes the cleanup)
5423 * outside the squeue. So when the interrupt comes, we have a clean
5424 * connection sitting in the freelist. Obviously, this buys us
5425 * performance.
5426 *
5427 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request
5428 * has multiple disadvantages - tying up the squeue during alloc, and the
5429 * fact that IPSec policy initialization has to happen here which
5430 * requires us sending a M_CTL and checking for it i.e. real ugliness.
5431 * But allocating the conn/tcp in IP land is also not the best since
5432 * we can't check the 'q' and 'q0' which are protected by squeue and
5433 * blindly allocate memory which might have to be freed here if we are
5434 * not allowed to accept the connection. By using the freelist and
5435 * putting the conn/tcp back in freelist, we don't pay a penalty for
5436 * allocating memory without checking 'q/q0' and freeing it if we can't
5437 * accept the connection.
5438 *
5439 * Care should be taken to put the conn back in the same squeue's freelist
5440 * from which it was allocated. Best results are obtained if conn is
5441 * allocated from listener's squeue and freed to the same. Time wait
5442 * collector will free up the freelist is the connection ends up sitting
5443 * there for too long.
5444 */
5445 void *
5446 tcp_get_conn(void *arg, tcp_stack_t *tcps)
5447 {
5448 tcp_t *tcp = NULL;
5449 conn_t *connp = NULL;
5450 squeue_t *sqp = (squeue_t *)arg;
5451 tcp_squeue_priv_t *tcp_time_wait;
5452 netstack_t *ns;
5453
5454 tcp_time_wait =
5455 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
5456
5457 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
5458 tcp = tcp_time_wait->tcp_free_list;
5459 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
5460 if (tcp != NULL) {
5461 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
5462 tcp_time_wait->tcp_free_list_cnt--;
5463 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5464 tcp->tcp_time_wait_next = NULL;
5465 connp = tcp->tcp_connp;
5466 connp->conn_flags |= IPCL_REUSED;
5467
5468 ASSERT(tcp->tcp_tcps == NULL);
5469 ASSERT(connp->conn_netstack == NULL);
5470 ns = tcps->tcps_netstack;
5471 netstack_hold(ns);
5472 connp->conn_netstack = ns;
5473 tcp->tcp_tcps = tcps;
5474 TCPS_REFHOLD(tcps);
5475 ipcl_globalhash_insert(connp);
5476 return ((void *)connp);
5477 }
5478 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5479 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
5480 tcps->tcps_netstack)) == NULL)
5481 return (NULL);
5482 tcp = connp->conn_tcp;
5483 tcp->tcp_tcps = tcps;
5484 TCPS_REFHOLD(tcps);
5485 return ((void *)connp);
5486 }
5487
5488 /*
5489 * Update the cached label for the given tcp_t. This should be called once per
5490 * connection, and before any packets are sent or tcp_process_options is
5491 * invoked. Returns B_FALSE if the correct label could not be constructed.
5492 */
5493 static boolean_t
5494 tcp_update_label(tcp_t *tcp, const cred_t *cr)
5495 {
5496 conn_t *connp = tcp->tcp_connp;
5497
5498 if (tcp->tcp_ipversion == IPV4_VERSION) {
5499 uchar_t optbuf[IP_MAX_OPT_LENGTH];
5500 int added;
5501
5502 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf,
5503 connp->conn_mac_exempt,
5504 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
5505 return (B_FALSE);
5506
5507 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len);
5508 if (added == -1)
5509 return (B_FALSE);
5510 tcp->tcp_hdr_len += added;
5511 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added);
5512 tcp->tcp_ip_hdr_len += added;
5513 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) {
5514 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3;
5515 added = tsol_prepend_option(optbuf, tcp->tcp_ipha,
5516 tcp->tcp_hdr_len);
5517 if (added == -1)
5518 return (B_FALSE);
5519 tcp->tcp_hdr_len += added;
5520 tcp->tcp_tcph = (tcph_t *)
5521 ((uchar_t *)tcp->tcp_tcph + added);
5522 tcp->tcp_ip_hdr_len += added;
5523 }
5524 } else {
5525 uchar_t optbuf[TSOL_MAX_IPV6_OPTION];
5526
5527 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf,
5528 connp->conn_mac_exempt,
5529 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
5530 return (B_FALSE);
5531 if (tsol_update_sticky(&tcp->tcp_sticky_ipp,
5532 &tcp->tcp_label_len, optbuf) != 0)
5533 return (B_FALSE);
5534 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0)
5535 return (B_FALSE);
5536 }
5537
5538 connp->conn_ulp_labeled = 1;
5539
5540 return (B_TRUE);
5541 }
5542
5543 /* BEGIN CSTYLED */
5544 /*
5545 *
5546 * The sockfs ACCEPT path:
5547 * =======================
5548 *
5549 * The eager is now established in its own perimeter as soon as SYN is
5550 * received in tcp_conn_request(). When sockfs receives conn_ind, it
5551 * completes the accept processing on the acceptor STREAM. The sending
5552 * of conn_ind part is common for both sockfs listener and a TLI/XTI
5553 * listener but a TLI/XTI listener completes the accept processing
5554 * on the listener perimeter.
5555 *
5556 * Common control flow for 3 way handshake:
5557 * ----------------------------------------
5558 *
5559 * incoming SYN (listener perimeter) -> tcp_rput_data()
5560 * -> tcp_conn_request()
5561 *
5562 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data()
5563 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind()
5564 *
5565 * Sockfs ACCEPT Path:
5566 * -------------------
5567 *
5568 * open acceptor stream (tcp_open allocates tcp_wput_accept()
5569 * as STREAM entry point)
5570 *
5571 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept()
5572 *
 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager
 * association (we are not behind eager's squeue but sockfs is protecting us
 * and no one knows about this stream yet). The STREAMS entry point q->q_info
 * is changed to point at tcp_wput().
5577 *
5578 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to
5579 * listener (done on listener's perimeter).
5580 *
5581 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish
5582 * accept.
5583 *
5584 * TLI/XTI client ACCEPT path:
5585 * ---------------------------
5586 *
5587 * soaccept() sends T_CONN_RES on the listener STREAM.
5588 *
5589 * tcp_accept() -> tcp_accept_swap() complete the processing and send
5590 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()).
5591 *
5592 * Locks:
5593 * ======
5594 *
 * listener->tcp_eager_lock protects listener->tcp_eager_next_q0 and
 * listener->tcp_eager_next_q.
5597 *
5598 * Referencing:
5599 * ============
5600 *
5601 * 1) We start out in tcp_conn_request by eager placing a ref on
5602 * listener and listener adding eager to listeners->tcp_eager_next_q0.
5603 *
5604 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
5605 * doing so we place a ref on the eager. This ref is finally dropped at the
5606 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
5607 * reference is dropped by the squeue framework.
5608 *
5609 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
5610 *
 * The reference must be released by the same entity that added the reference.
 * In the above scheme, the eager is the entity that adds and releases the
5613 * references. Note that tcp_accept_finish executes in the squeue of the eager
5614 * (albeit after it is attached to the acceptor stream). Though 1. executes
5615 * in the listener's squeue, the eager is nascent at this point and the
5616 * reference can be considered to have been added on behalf of the eager.
5617 *
5618 * Eager getting a Reset or listener closing:
5619 * ==========================================
5620 *
5621 * Once the listener and eager are linked, the listener never does the unlink.
5622 * If the listener needs to close, tcp_eager_cleanup() is called which queues
5623 * a message on all eager perimeter. The eager then does the unlink, clears
5624 * any pointers to the listener's queue and drops the reference to the
5625 * listener. The listener waits in tcp_close outside the squeue until its
5626 * refcount has dropped to 1. This ensures that the listener has waited for
5627 * all eagers to clear their association with the listener.
5628 *
5629 * Similarly, if eager decides to go away, it can unlink itself and close.
5630 * When the T_CONN_RES comes down, we check if eager has closed. Note that
5631 * the reference to eager is still valid because of the extra ref we put
5632 * in tcp_send_conn_ind.
5633 *
5634 * Listener can always locate the eager under the protection
5635 * of the listener->tcp_eager_lock, and then do a refhold
5636 * on the eager during the accept processing.
5637 *
5638 * The acceptor stream accesses the eager in the accept processing
5639 * based on the ref placed on eager before sending T_conn_ind.
5640 * The only entity that can negate this refhold is a listener close
5641 * which is mutually exclusive with an active acceptor stream.
5642 *
5643 * Eager's reference on the listener
5644 * ===================================
5645 *
5646 * If the accept happens (even on a closed eager) the eager drops its
5647 * reference on the listener at the start of tcp_accept_finish. If the
5648 * eager is killed due to an incoming RST before the T_conn_ind is sent up,
5649 * the reference is dropped in tcp_closei_local. If the listener closes,
5650 * the reference is dropped in tcp_eager_kill. In all cases the reference
5651 * is dropped while executing in the eager's context (squeue).
5652 */
5653 /* END CSTYLED */
5654
/*
 * Process the SYN packet, mp, directed at the listener 'tcp'.
 *
 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
 * tcp_rput_data will not see any SYN packets.
 *
 * arg is the listener's conn_t, mp is the SYN itself and arg2 is handed
 * unchanged to tcp_get_conn()/tcp_get_ipsec_conn() when the eager's conn_t
 * is obtained.
 *
 * On success the eager is linked at the head of the listener's q0 list,
 * inserted into the classifier, a SYN-ACK is sent for it and mp is freed.
 *
 * Failure exits:
 *   error2 - nothing to undo beyond freeing mp.
 *   error3 - additionally drops the reference held on econnp.
 *   error  - the eager is already linked to the listener; tcp_eager_kill is
 *            queued on the eager's squeue and mp is re-classified in case a
 *            matching connection already exists.
 */
/* ARGSUSED */
void
tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
{
	tcph_t		*tcph;
	uint32_t	seg_seq;
	tcp_t		*eager;
	uint_t		ipvers;
	ipha_t		*ipha;
	ip6_t		*ip6h;
	int		err;
	conn_t		*econnp = NULL;
	squeue_t	*new_sqp;
	mblk_t		*mp1;
	uint_t		ip_hdr_len;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;
	cred_t		*credp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	ip_stack_t	*ipst;

	/* Anything that is no longer a listener simply drops the SYN. */
	if (tcp->tcp_state != TCPS_LISTEN)
		goto error2;

	ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0);

	/* The queue counters are examined under tcp_eager_lock. */
	mutex_enter(&tcp->tcp_eager_lock);
	if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
		mutex_exit(&tcp->tcp_eager_lock);
		TCP_STAT(tcps, tcp_listendrop);
		BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
		if (tcp->tcp_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
			    "tcp_conn_request: listen backlog (max=%d) "
			    "overflow (%d pending) on %s",
			    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
		}
		goto error2;
	}

	if (tcp->tcp_conn_req_cnt_q0 >=
	    tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
		/*
		 * Q0 is full. Drop a pending half-open req from the queue
		 * to make room for the new SYN req. Also mark the time we
		 * drop a SYN.
		 *
		 * A more aggressive defense against SYN attack will
		 * be to set the "tcp_syn_defense" flag now.
		 */
		TCP_STAT(tcps, tcp_listendropq0);
		tcp->tcp_last_rcv_lbolt = lbolt64;
		if (!tcp_drop_q0(tcp)) {
			/* Nothing could be evicted; drop this SYN instead. */
			mutex_exit(&tcp->tcp_eager_lock);
			BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
			if (tcp->tcp_debug) {
				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
				    "tcp_conn_request: listen half-open queue "
				    "(max=%d) full (%d pending) on %s",
				    tcps->tcps_conn_req_max_q0,
				    tcp->tcp_conn_req_cnt_q0,
				    tcp_display(tcp, NULL,
				    DISP_PORT_ONLY));
			}
			goto error2;
		}
	}
	mutex_exit(&tcp->tcp_eager_lock);

	/*
	 * IP adds STRUIO_EAGER and ensures that the received packet is
	 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6
	 * link local address. If IPSec is enabled, db_struioflag has
	 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER);
	 * otherwise an error case if neither of them is set.
	 */
	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
		/*
		 * The eager's squeue pointer travels in the checksum-start
		 * field of the dblk; fetch it and clear the field and flag.
		 */
		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
		DB_CKSUMSTART(mp) = 0;
		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
		econnp = (conn_t *)tcp_get_conn(arg2, tcps);
		if (econnp == NULL)
			goto error2;
		ASSERT(econnp->conn_netstack == connp->conn_netstack);
		econnp->conn_sqp = new_sqp;
	} else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) {
		/*
		 * mp is updated in tcp_get_ipsec_conn().
		 */
		econnp = tcp_get_ipsec_conn(tcp, arg2, &mp);
		if (econnp == NULL) {
			/*
			 * mp freed by tcp_get_ipsec_conn.
			 */
			return;
		}
		ASSERT(econnp->conn_netstack == connp->conn_netstack);
	} else {
		goto error2;
	}

	ASSERT(DB_TYPE(mp) == M_DATA);

	/* Locate the TCP header behind the IPv4 or IPv6 header. */
	ipvers = IPH_HDR_VERSION(mp->b_rptr);
	ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION);
	ASSERT(OK_32PTR(mp->b_rptr));
	if (ipvers == IPV4_VERSION) {
		ipha = (ipha_t *)mp->b_rptr;
		ip_hdr_len = IPH_HDR_LENGTH(ipha);
		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	} else {
		ip6h = (ip6_t *)mp->b_rptr;
		ip_hdr_len = ip_hdr_length_v6(mp, ip6h);
		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	}

	/*
	 * The creation routine is chosen by the listener's address family,
	 * not by the packet's IP version; the v6 routine is told the
	 * packet's version explicitly.
	 */
	if (tcp->tcp_family == AF_INET) {
		ASSERT(ipvers == IPV4_VERSION);
		err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp);
	} else {
		err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp);
	}

	if (err)
		goto error3;

	eager = econnp->conn_tcp;

	/* Inherit various TCP parameters from the listener */
	eager->tcp_naglim = tcp->tcp_naglim;
	eager->tcp_first_timer_threshold =
	    tcp->tcp_first_timer_threshold;
	eager->tcp_second_timer_threshold =
	    tcp->tcp_second_timer_threshold;

	eager->tcp_first_ctimer_threshold =
	    tcp->tcp_first_ctimer_threshold;
	eager->tcp_second_ctimer_threshold =
	    tcp->tcp_second_ctimer_threshold;

	/*
	 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics.
	 * If it does not, the eager's receive window will be set to the
	 * listener's receive window later in this function.
	 */
	eager->tcp_rwnd = 0;

	/*
	 * Inherit listener's tcp_init_cwnd.  Need to do this before
	 * calling tcp_process_options() where tcp_mss_set() is called
	 * to set the initial cwnd.
	 */
	eager->tcp_init_cwnd = tcp->tcp_init_cwnd;

	/*
	 * Zones: tcp_adapt_ire() and tcp_send_data() both need the
	 * zone id before the accept is completed in tcp_wput_accept().
	 */
	econnp->conn_zoneid = connp->conn_zoneid;
	econnp->conn_allzones = connp->conn_allzones;

	/* Copy nexthop information from listener to eager */
	if (connp->conn_nexthop_set) {
		econnp->conn_nexthop_set = connp->conn_nexthop_set;
		econnp->conn_nexthop_v4 = connp->conn_nexthop_v4;
	}

	/*
	 * TSOL: tsol_input_proc() needs the eager's cred before the
	 * eager is accepted
	 */
	econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred;
	crhold(credp);

	/*
	 * If the caller has the process-wide flag set, then default to MAC
	 * exempt mode.  This allows read-down to unlabeled hosts.
	 */
	if (getpflags(NET_MAC_AWARE, credp) != 0)
		econnp->conn_mac_exempt = B_TRUE;

	if (is_system_labeled()) {
		cred_t *cr;

		/*
		 * For a listener whose MLP type is not mlptSingle, label the
		 * eager from the credential attached to the SYN (remembered
		 * as conn_peercred), falling back to the listener-derived
		 * cred when the packet carries none.
		 */
		if (connp->conn_mlp_type != mlptSingle) {
			cr = econnp->conn_peercred = DB_CRED(mp);
			if (cr != NULL)
				crhold(cr);
			else
				cr = econnp->conn_cred;
			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
			    econnp, cred_t *, cr)
		} else {
			cr = econnp->conn_cred;
			DTRACE_PROBE2(syn_accept, conn_t *,
			    econnp, cred_t *, cr)
		}

		if (!tcp_update_label(eager, cr)) {
			DTRACE_PROBE3(
			    tx__ip__log__error__connrequest__tcp,
			    char *, "eager connp(1) label on SYN mp(2) failed",
			    conn_t *, econnp, mblk_t *, mp);
			goto error3;
		}
	}

	eager->tcp_hard_binding = B_TRUE;

	tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
	    TCP_BIND_HASH(eager->tcp_lport)], eager, 0);

	CL_INET_CONNECT(eager);

	/*
	 * No need to check for multicast destination since ip will only pass
	 * up multicasts to those that have expressed interest
	 * TODO: what about rejecting broadcasts?
	 * Also check that source is not a multicast or broadcast address.
	 */
	eager->tcp_state = TCPS_SYN_RCVD;


	/*
	 * There should be no ire in the mp as we are being called after
	 * receiving the SYN.
	 */
	ASSERT(tcp_ire_mp(mp) == NULL);

	/*
	 * Adapt our mss, ttl, ... according to information provided in IRE.
	 */

	if (tcp_adapt_ire(eager, NULL) == 0) {
		/* Undo the bind_hash_insert */
		tcp_bind_hash_remove(eager);
		goto error3;
	}

	/* Process all TCP options. */
	tcp_process_options(eager, tcph);

	/* Is the other end ECN capable? */
	if (tcps->tcps_ecn_permitted >= 1 &&
	    (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
		eager->tcp_ecn_ok = B_TRUE;
	}

	/*
	 * listener->tcp_rq->q_hiwat should be the default window size or a
	 * window size changed via SO_RCVBUF option.  First round up the
	 * eager's tcp_rwnd to the nearest MSS.  Then find out the window
	 * scale option value if needed.  Call tcp_rwnd_set() to finish the
	 * setting.
	 *
	 * Note if there is a rpipe metric associated with the remote host,
	 * we should not inherit receive window size from listener.
	 */
	eager->tcp_rwnd = MSS_ROUNDUP(
	    (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat :
	    eager->tcp_rwnd), eager->tcp_mss);
	if (eager->tcp_snd_ws_ok)
		tcp_set_ws_value(eager);
	/*
	 * Note that this is the only place tcp_rwnd_set() is called for
	 * accepting a connection.  We need to call it here instead of
	 * after the 3-way handshake because we need to tell the other
	 * side our rwnd in the SYN-ACK segment.
	 */
	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);

	/*
	 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ
	 * via soaccept()->soinheritoptions() which essentially applies
	 * all the listener options to the new STREAM. The options that we
	 * need to take care of are:
	 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST,
	 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER,
	 * SO_SNDBUF, SO_RCVBUF.
	 *
	 * SO_RCVBUF:	tcp_rwnd_set() above takes care of it.
	 * SO_SNDBUF:	Set the tcp_xmit_hiwater for the eager. When
	 *		tcp_maxpsz_set() gets called later from
	 *		tcp_accept_finish(), the option takes effect.
	 *
	 */
	/* Set the TCP options */
	eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater;
	eager->tcp_dgram_errind = tcp->tcp_dgram_errind;
	eager->tcp_oobinline = tcp->tcp_oobinline;
	eager->tcp_reuseaddr = tcp->tcp_reuseaddr;
	eager->tcp_broadcast = tcp->tcp_broadcast;
	eager->tcp_useloopback = tcp->tcp_useloopback;
	eager->tcp_dontroute = tcp->tcp_dontroute;
	eager->tcp_linger = tcp->tcp_linger;
	eager->tcp_lingertime = tcp->tcp_lingertime;
	if (tcp->tcp_ka_enabled)
		eager->tcp_ka_enabled = 1;

	/* Set the IP options */
	econnp->conn_broadcast = connp->conn_broadcast;
	econnp->conn_loopback = connp->conn_loopback;
	econnp->conn_dontroute = connp->conn_dontroute;
	econnp->conn_reuseaddr = connp->conn_reuseaddr;

	/* Put a ref on the listener for the eager. */
	CONN_INC_REF(connp);
	/* Link the eager at the head of the listener's q0 list. */
	mutex_enter(&tcp->tcp_eager_lock);
	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
	eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
	tcp->tcp_eager_next_q0 = eager;
	eager->tcp_eager_prev_q0 = tcp;

	/* Set tcp_listener before adding it to tcp_conn_fanout */
	eager->tcp_listener = tcp;
	eager->tcp_saved_listener = tcp;

	/*
	 * Tag this detached tcp vector for later retrieval
	 * by our listener client in tcp_accept().
	 */
	eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum;
	tcp->tcp_conn_req_cnt_q0++;
	if (++tcp->tcp_conn_req_seqnum == -1) {
		/*
		 * -1 is "special" and defined in TPI as something
		 * that should never be used in T_CONN_IND
		 */
		++tcp->tcp_conn_req_seqnum;
	}
	mutex_exit(&tcp->tcp_eager_lock);

	if (tcp->tcp_syn_defense) {
		/* Don't drop the SYN that comes from a good IP source */
		ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache);
		if (addr_cache != NULL && eager->tcp_remote ==
		    addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
			eager->tcp_dontdrop = B_TRUE;
		}
	}

	/*
	 * We need to insert the eager in its own perimeter but as soon
	 * as we do that, we expose the eager to the classifier and
	 * should not touch any field outside the eager's perimeter.
	 * So do all the work necessary before inserting the eager
	 * in its own perimeter. Be optimistic that ipcl_conn_insert()
	 * will succeed but undo everything if it fails.
	 */
	seg_seq = ABE32_TO_U32(tcph->th_seq);
	eager->tcp_irs = seg_seq;
	eager->tcp_rack = seg_seq;
	eager->tcp_rnxt = seg_seq + 1;
	U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
	BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
	eager->tcp_state = TCPS_SYN_RCVD;
	/* Build the SYN-ACK now; it is sent only after the insert below. */
	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
	if (mp1 == NULL) {
		/*
		 * Increment the ref count as we are going to
		 * enqueue an mp (tcp_closemp) in the squeue at 'error'.
		 */
		CONN_INC_REF(econnp);
		goto error;
	}
	DB_CPID(mp1) = tcp->tcp_cpid;
	eager->tcp_cpid = tcp->tcp_cpid;
	eager->tcp_open_time = lbolt64;

	/*
	 * We need to start the rto timer. In normal case, we start
	 * the timer after sending the packet on the wire (or at
	 * least believing that packet was sent by waiting for
	 * CALL_IP_WPUT() to return). Since this is the first packet
	 * being sent on the wire for the eager, our initial tcp_rto
	 * is at least tcp_rexmit_interval_min which is a fairly
	 * large value to allow the algorithm to adjust slowly to large
	 * fluctuations of RTT during first few transmissions.
	 *
	 * Starting the timer first and then sending the packet in this
	 * case shouldn't make much difference since tcp_rexmit_interval_min
	 * is of the order of several 100ms and starting the timer
	 * first and then sending the packet will result in difference
	 * of few micro seconds.
	 *
	 * Without this optimization, we are forced to hold the fanout
	 * lock across the ipcl_bind_insert() and sending the packet
	 * so that we don't race against an incoming packet (maybe RST)
	 * for this eager.
	 *
	 * It is necessary to acquire an extra reference on the eager
	 * at this point and hold it until after tcp_send_data() to
	 * ensure against an eager close race.
	 */

	CONN_INC_REF(eager->tcp_connp);

	TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT);
	TCP_TIMER_RESTART(eager, eager->tcp_rto);


	/*
	 * Insert the eager in its own perimeter now. We are ready to deal
	 * with any packets on eager.
	 */
	if (eager->tcp_ipversion == IPV4_VERSION) {
		if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) {
			goto error;
		}
	} else {
		if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) {
			goto error;
		}
	}

	/* mark conn as fully-bound */
	econnp->conn_fully_bound = B_TRUE;

	/* Send the SYN-ACK */
	tcp_send_data(eager, eager->tcp_wq, mp1);
	/* Drop the extra reference taken before the timer was started. */
	CONN_DEC_REF(eager->tcp_connp);
	freemsg(mp);

	return;
error:
	/*
	 * Both jumps to this label arrive with one extra reference on the
	 * eager's conn: taken just before the jump when mp1 == NULL, or by
	 * the CONN_INC_REF that precedes the classifier insert.  mp1 may be
	 * NULL here.
	 */
	freemsg(mp1);
	eager->tcp_closemp_used = B_TRUE;
	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
	squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill,
	    econnp, SQTAG_TCP_CONN_REQ_2);

	/*
	 * If a connection already exists, send the mp to that connection so
	 * that it can be appropriately dealt with.
	 */
	ipst = tcps->tcps_netstack->netstack_ip;

	/* econnp is reused from here on for the classifier's result. */
	if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) {
		if (!IPCL_IS_CONNECTED(econnp)) {
			/*
			 * Something bad happened. ipcl_conn_insert()
			 * failed because a connection already existed
			 * in connected hash but we can't find it
			 * anymore (someone blew it away). Just
			 * free this message and hopefully remote
			 * will retransmit at which time the SYN can be
			 * treated as a new connection or dealt with
			 * a TH_RST if a connection already exists.
			 */
			CONN_DEC_REF(econnp);
			freemsg(mp);
		} else {
			squeue_fill(econnp->conn_sqp, mp, tcp_input,
			    econnp, SQTAG_TCP_CONN_REQ_1);
		}
	} else {
		/* Nobody wants this packet */
		freemsg(mp);
	}
	return;
error3:
	/* The eager was never linked to the listener; just drop its ref. */
	CONN_DEC_REF(econnp);
error2:
	freemsg(mp);
}
6128
6129 /*
6130 * In an ideal case of vertical partition in NUMA architecture, its
6131 * beneficial to have the listener and all the incoming connections
6132 * tied to the same squeue. The other constraint is that incoming
6133 * connections should be tied to the squeue attached to interrupted
6134 * CPU for obvious locality reason so this leaves the listener to
6135 * be tied to the same squeue. Our only problem is that when listener
6136 * is binding, the CPU that will get interrupted by the NIC whose
6137 * IP address the listener is binding to is not even known. So
6138 * the code below allows us to change that binding at the time the
6139 * CPU is interrupted by virtue of incoming connection's squeue.
6140 *
6141 * This is usefull only in case of a listener bound to a specific IP
6142 * address. For other kind of listeners, they get bound the
6143 * very first time and there is no attempt to rebind them.
6144 */
6145 void
6146 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
6147 {
6148 conn_t *connp = (conn_t *)arg;
6149 squeue_t *sqp = (squeue_t *)arg2;
6150 squeue_t *new_sqp;
6151 uint32_t conn_flags;
6152
6153 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
6154 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
6155 } else {
6156 goto done;
6157 }
6158
6159 if (connp->conn_fanout == NULL)
6160 goto done;
6161
6162 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
6163 mutex_enter(&connp->conn_fanout->connf_lock);
6164 mutex_enter(&connp->conn_lock);
6165 /*
6166 * No one from read or write side can access us now
6167 * except for already queued packets on this squeue.
6168 * But since we haven't changed the squeue yet, they
6169 * can't execute. If they are processed after we have
6170 * changed the squeue, they are sent back to the
6171 * correct squeue down below.
6172 * But a listner close can race with processing of
6173 * incoming SYN. If incoming SYN processing changes
6174 * the squeue then the listener close which is waiting
6175 * to enter the squeue would operate on the wrong
6176 * squeue. Hence we don't change the squeue here unless
6177 * the refcount is exactly the minimum refcount. The
6178 * minimum refcount of 4 is counted as - 1 each for
6179 * TCP and IP, 1 for being in the classifier hash, and
6180 * 1 for the mblk being processed.
6181 */
6182
6183 if (connp->conn_ref != 4 ||
6184 connp->conn_tcp->tcp_state != TCPS_LISTEN) {
6185 mutex_exit(&connp->conn_lock);
6186 mutex_exit(&connp->conn_fanout->connf_lock);
6187 goto done;
6188 }
6189 if (connp->conn_sqp != new_sqp) {
6190 while (connp->conn_sqp != new_sqp)
6191 (void) casptr(&connp->conn_sqp, sqp, new_sqp);
6192 }
6193
6194 do {
6195 conn_flags = connp->conn_flags;
6196 conn_flags |= IPCL_FULLY_BOUND;
6197 (void) cas32(&connp->conn_flags, connp->conn_flags,
6198 conn_flags);
6199 } while (!(connp->conn_flags & IPCL_FULLY_BOUND));
6200
6201 mutex_exit(&connp->conn_fanout->connf_lock);
6202 mutex_exit(&connp->conn_lock);
6203 }
6204
6205 done:
6206 if (connp->conn_sqp != sqp) {
6207 CONN_INC_REF(connp);
6208 squeue_fill(connp->conn_sqp, mp,
6209 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND);
6210 } else {
6211 tcp_conn_request(connp, mp, sqp);
6212 }
6213 }
6214
6215 /*
6216 * Successful connect request processing begins when our client passes
6217 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes
6218 * our T_OK_ACK reply message upstream. The control flow looks like this:
6219 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP
6220 * upstream <- tcp_rput() <- IP
6221 * After various error checks are completed, tcp_connect() lays
6222 * the target address and port into the composite header template,
6223 * preallocates the T_OK_ACK reply message, construct a full 12 byte bind
6224 * request followed by an IRE request, and passes the three mblk message
6225 * down to IP looking like this:
6226 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client
6227 * Processing continues in tcp_rput() when we receive the following message:
6228 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client
6229 * After consuming the first two mblks, tcp_rput() calls tcp_timer(),
6230 * to fire off the connection request, and then passes the T_OK_ACK mblk
6231 * upstream that we filled in below. There are, of course, numerous
6232 * error conditions along the way which truncate the processing described
6233 * above.
6234 */
6235 static void
6236 tcp_connect(tcp_t *tcp, mblk_t *mp)
6237 {
6238 sin_t *sin;
6239 sin6_t *sin6;
6240 queue_t *q = tcp->tcp_wq;
6241 struct T_conn_req *tcr;
6242 ipaddr_t *dstaddrp;
6243 in_port_t dstport;
6244 uint_t srcid;
6245
6246 tcr = (struct T_conn_req *)mp->b_rptr;
6247
6248 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
6249 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
6250 tcp_err_ack(tcp, mp, TPROTO, 0);
6251 return;
6252 }
6253
6254 /*
6255 * Determine packet type based on type of address passed in
6256 * the request should contain an IPv4 or IPv6 address.
6257 * Make sure that address family matches the type of
6258 * family of the the address passed down
6259 */
6260 switch (tcr->DEST_length) {
6261 default:
6262 tcp_err_ack(tcp, mp, TBADADDR, 0);
6263 return;
6264
6265 case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
6266 /*
6267 * XXX: The check for valid DEST_length was not there
6268 * in earlier releases and some buggy
6269 * TLI apps (e.g Sybase) got away with not feeding
6270 * in sin_zero part of address.
6271 * We allow that bug to keep those buggy apps humming.
6272 * Test suites require the check on DEST_length.
6273 * We construct a new mblk with valid DEST_length
6274 * free the original so the rest of the code does
6275 * not have to keep track of this special shorter
6276 * length address case.
6277 */
6278 mblk_t *nmp;
6279 struct T_conn_req *ntcr;
6280 sin_t *nsin;
6281
6282 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
6283 tcr->OPT_length, BPRI_HI);
6284 if (nmp == NULL) {
6285 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
6286 return;
6287 }
6288 ntcr = (struct T_conn_req *)nmp->b_rptr;
6289 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
6290 ntcr->PRIM_type = T_CONN_REQ;
6291 ntcr->DEST_length = sizeof (sin_t);
6292 ntcr->DEST_offset = sizeof (struct T_conn_req);
6293
6294 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
6295 *nsin = sin_null;
6296 /* Get pointer to shorter address to copy from original mp */
6297 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6298 tcr->DEST_length); /* extract DEST_length worth of sin_t */
6299 if (sin == NULL || !OK_32PTR((char *)sin)) {
6300 freemsg(nmp);
6301 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6302 return;
6303 }
6304 nsin->sin_family = sin->sin_family;
6305 nsin->sin_port = sin->sin_port;
6306 nsin->sin_addr = sin->sin_addr;
6307 /* Note:nsin->sin_zero zero-fill with sin_null assign above */
6308 nmp->b_wptr = (uchar_t *)&nsin[1];
6309 if (tcr->OPT_length != 0) {
6310 ntcr->OPT_length = tcr->OPT_length;
6311 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
6312 bcopy((uchar_t *)tcr + tcr->OPT_offset,
6313 (uchar_t *)ntcr + ntcr->OPT_offset,
6314 tcr->OPT_length);
6315 nmp->b_wptr += tcr->OPT_length;
6316 }
6317 freemsg(mp); /* original mp freed */
6318 mp = nmp; /* re-initialize original variables */
6319 tcr = ntcr;
6320 }
6321 /* FALLTHRU */
6322
6323 case sizeof (sin_t):
6324 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6325 sizeof (sin_t));
6326 if (sin == NULL || !OK_32PTR((char *)sin)) {
6327 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6328 return;
6329 }
6330 if (tcp->tcp_family != AF_INET ||
6331 sin->sin_family != AF_INET) {
6332 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6333 return;
6334 }
6335 if (sin->sin_port == 0) {
6336 tcp_err_ack(tcp, mp, TBADADDR, 0);
6337 return;
6338 }
6339 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
6340 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6341 return;
6342 }
6343
6344 break;
6345
6346 case sizeof (sin6_t):
6347 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
6348 sizeof (sin6_t));
6349 if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
6350 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6351 return;
6352 }
6353 if (tcp->tcp_family != AF_INET6 ||
6354 sin6->sin6_family != AF_INET6) {
6355 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6356 return;
6357 }
6358 if (sin6->sin6_port == 0) {
6359 tcp_err_ack(tcp, mp, TBADADDR, 0);
6360 return;
6361 }
6362 break;
6363 }
6364 /*
6365 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
6366 * should key on their sequence number and cut them loose.
6367 */
6368
6369 /*
6370 * If options passed in, feed it for verification and handling
6371 */
6372 if (tcr->OPT_length != 0) {
6373 mblk_t *ok_mp;
6374 mblk_t *discon_mp;
6375 mblk_t *conn_opts_mp;
6376 int t_error, sys_error, do_disconnect;
6377
6378 conn_opts_mp = NULL;
6379
6380 if (tcp_conprim_opt_process(tcp, mp,
6381 &do_disconnect, &t_error, &sys_error) < 0) {
6382 if (do_disconnect) {
6383 ASSERT(t_error == 0 && sys_error == 0);
6384 discon_mp = mi_tpi_discon_ind(NULL,
6385 ECONNREFUSED, 0);
6386 if (!discon_mp) {
6387 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6388 TSYSERR, ENOMEM);
6389 return;
6390 }
6391 ok_mp = mi_tpi_ok_ack_alloc(mp);
6392 if (!ok_mp) {
6393 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6394 TSYSERR, ENOMEM);
6395 return;
6396 }
6397 qreply(q, ok_mp);
6398 qreply(q, discon_mp); /* no flush! */
6399 } else {
6400 ASSERT(t_error != 0);
6401 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
6402 sys_error);
6403 }
6404 return;
6405 }
6406 /*
6407 * Success in setting options, the mp option buffer represented
6408 * by OPT_length/offset has been potentially modified and
6409 * contains results of option processing. We copy it in
6410 * another mp to save it for potentially influencing returning
6411 * it in T_CONN_CONN.
6412 */
6413 if (tcr->OPT_length != 0) { /* there are resulting options */
6414 conn_opts_mp = copyb(mp);
6415 if (!conn_opts_mp) {
6416 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6417 TSYSERR, ENOMEM);
6418 return;
6419 }
6420 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
6421 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
6422 /*
6423 * Note:
6424 * These resulting option negotiation can include any
6425 * end-to-end negotiation options but there no such
6426 * thing (yet?) in our TCP/IP.
6427 */
6428 }
6429 }
6430
6431 /*
6432 * If we're connecting to an IPv4-mapped IPv6 address, we need to
6433 * make sure that the template IP header in the tcp structure is an
6434 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We
6435 * need to this before we call tcp_bindi() so that the port lookup
6436 * code will look for ports in the correct port space (IPv4 and
6437 * IPv6 have separate port spaces).
6438 */
6439 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
6440 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6441 int err = 0;
6442
6443 err = tcp_header_init_ipv4(tcp);
6444 if (err != 0) {
6445 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6446 goto connect_failed;
6447 }
6448 if (tcp->tcp_lport != 0)
6449 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
6450 }
6451
6452 if (tcp->tcp_issocket) {
6453 /*
6454 * TCP is _D_SODIRECT and sockfs is directly above so save
6455 * the shared sonode sodirect_t pointer (if any) to enable
6456 * TCP sodirect.
6457 */
6458 tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq);
6459 }
6460
6461 switch (tcp->tcp_state) {
6462 case TCPS_IDLE:
6463 /*
6464 * We support quick connect, refer to comments in
6465 * tcp_connect_*()
6466 */
6467 /* FALLTHRU */
6468 case TCPS_BOUND:
6469 case TCPS_LISTEN:
6470 if (tcp->tcp_family == AF_INET6) {
6471 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6472 tcp_connect_ipv6(tcp, mp,
6473 &sin6->sin6_addr,
6474 sin6->sin6_port, sin6->sin6_flowinfo,
6475 sin6->__sin6_src_id, sin6->sin6_scope_id);
6476 return;
6477 }
6478 /*
6479 * Destination adress is mapped IPv6 address.
6480 * Source bound address should be unspecified or
6481 * IPv6 mapped address as well.
6482 */
6483 if (!IN6_IS_ADDR_UNSPECIFIED(
6484 &tcp->tcp_bound_source_v6) &&
6485 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
6486 mp = mi_tpi_err_ack_alloc(mp, TSYSERR,
6487 EADDRNOTAVAIL);
6488 break;
6489 }
6490 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
6491 dstport = sin6->sin6_port;
6492 srcid = sin6->__sin6_src_id;
6493 } else {
6494 dstaddrp = &sin->sin_addr.s_addr;
6495 dstport = sin->sin_port;
6496 srcid = 0;
6497 }
6498
6499 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid);
6500 return;
6501 default:
6502 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0);
6503 break;
6504 }
6505 /*
6506 * Note: Code below is the "failure" case
6507 */
6508 /* return error ack and blow away saved option results if any */
6509 connect_failed:
6510 if (mp != NULL)
6511 putnext(tcp->tcp_rq, mp);
6512 else {
6513 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6514 TSYSERR, ENOMEM);
6515 }
6516 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6517 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6518 }
6519
6520 /*
6521 * Handle connect to IPv4 destinations, including connections for AF_INET6
6522 * sockets connecting to IPv4 mapped IPv6 destinations.
6523 */
6524 static void
6525 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
6526 uint_t srcid)
6527 {
6528 tcph_t *tcph;
6529 mblk_t *mp1;
6530 ipaddr_t dstaddr = *dstaddrp;
6531 int32_t oldstate;
6532 uint16_t lport;
6533 tcp_stack_t *tcps = tcp->tcp_tcps;
6534
6535 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
6536
6537 /* Check for attempt to connect to INADDR_ANY */
6538 if (dstaddr == INADDR_ANY) {
6539 /*
6540 * SunOS 4.x and 4.3 BSD allow an application
6541 * to connect a TCP socket to INADDR_ANY.
6542 * When they do this, the kernel picks the
6543 * address of one interface and uses it
6544 * instead. The kernel usually ends up
6545 * picking the address of the loopback
6546 * interface. This is an undocumented feature.
6547 * However, we provide the same thing here
6548 * in order to have source and binary
6549 * compatibility with SunOS 4.x.
6550 * Update the T_CONN_REQ (sin/sin6) since it is used to
6551 * generate the T_CONN_CON.
6552 */
6553 dstaddr = htonl(INADDR_LOOPBACK);
6554 *dstaddrp = dstaddr;
6555 }
6556
6557 /* Handle __sin6_src_id if socket not bound to an IP address */
6558 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) {
6559 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6,
6560 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
6561 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6,
6562 tcp->tcp_ipha->ipha_src);
6563 }
6564
6565 /*
6566 * Don't let an endpoint connect to itself. Note that
6567 * the test here does not catch the case where the
6568 * source IP addr was left unspecified by the user. In
6569 * this case, the source addr is set in tcp_adapt_ire()
6570 * using the reply to the T_BIND message that we send
6571 * down to IP here and the check is repeated in tcp_rput_other.
6572 */
6573 if (dstaddr == tcp->tcp_ipha->ipha_src &&
6574 dstport == tcp->tcp_lport) {
6575 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6576 goto failed;
6577 }
6578
6579 tcp->tcp_ipha->ipha_dst = dstaddr;
6580 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6);
6581
6582 /*
6583 * Massage a source route if any putting the first hop
6584 * in iph_dst. Compute a starting value for the checksum which
6585 * takes into account that the original iph_dst should be
6586 * included in the checksum but that ip will include the
6587 * first hop in the source route in the tcp checksum.
6588 */
6589 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack);
6590 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6591 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
6592 (tcp->tcp_ipha->ipha_dst & 0xffff));
6593 if ((int)tcp->tcp_sum < 0)
6594 tcp->tcp_sum--;
6595 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6596 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6597 (tcp->tcp_sum >> 16));
6598 tcph = tcp->tcp_tcph;
6599 *(uint16_t *)tcph->th_fport = dstport;
6600 tcp->tcp_fport = dstport;
6601
6602 oldstate = tcp->tcp_state;
6603 /*
6604 * At this point the remote destination address and remote port fields
6605 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6606 * have to see which state tcp was in so we can take apropriate action.
6607 */
6608 if (oldstate == TCPS_IDLE) {
6609 /*
6610 * We support a quick connect capability here, allowing
6611 * clients to transition directly from IDLE to SYN_SENT
6612 * tcp_bindi will pick an unused port, insert the connection
6613 * in the bind hash and transition to BOUND state.
6614 */
6615 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6616 tcp, B_TRUE);
6617 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6618 B_FALSE, B_FALSE);
6619 if (lport == 0) {
6620 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6621 goto failed;
6622 }
6623 }
6624 tcp->tcp_state = TCPS_SYN_SENT;
6625
6626 /*
6627 * TODO: allow data with connect requests
6628 * by unlinking M_DATA trailers here and
6629 * linking them in behind the T_OK_ACK mblk.
6630 * The tcp_rput() bind ack handler would then
6631 * feed them to tcp_wput_data() rather than call
6632 * tcp_timer().
6633 */
6634 mp = mi_tpi_ok_ack_alloc(mp);
6635 if (!mp) {
6636 tcp->tcp_state = oldstate;
6637 goto failed;
6638 }
6639 if (tcp->tcp_family == AF_INET) {
6640 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6641 sizeof (ipa_conn_t));
6642 } else {
6643 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6644 sizeof (ipa6_conn_t));
6645 }
6646 if (mp1) {
6647 /*
6648 * We need to make sure that the conn_recv is set to a non-null
6649 * value before we insert the conn_t into the classifier table.
6650 * This is to avoid a race with an incoming packet which does
6651 * an ipcl_classify().
6652 */
6653 tcp->tcp_connp->conn_recv = tcp_input;
6654
6655 /* Hang onto the T_OK_ACK for later. */
6656 linkb(mp1, mp);
6657 mblk_setcred(mp1, tcp->tcp_cred);
6658 if (tcp->tcp_family == AF_INET)
6659 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6660 else {
6661 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6662 &tcp->tcp_sticky_ipp);
6663 }
6664 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6665 tcp->tcp_active_open = 1;
6666 /*
6667 * If the bind cannot complete immediately
6668 * IP will arrange to call tcp_rput_other
6669 * when the bind completes.
6670 */
6671 if (mp1 != NULL)
6672 tcp_rput_other(tcp, mp1);
6673 return;
6674 }
6675 /* Error case */
6676 tcp->tcp_state = oldstate;
6677 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6678
6679 failed:
6680 /* return error ack and blow away saved option results if any */
6681 if (mp != NULL)
6682 putnext(tcp->tcp_rq, mp);
6683 else {
6684 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6685 TSYSERR, ENOMEM);
6686 }
6687 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6688 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6689
6690 }
6691
6692 /*
6693 * Handle connect to IPv6 destinations.
6694 */
6695 static void
6696 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
6697 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
6698 {
6699 tcph_t *tcph;
6700 mblk_t *mp1;
6701 ip6_rthdr_t *rth;
6702 int32_t oldstate;
6703 uint16_t lport;
6704 tcp_stack_t *tcps = tcp->tcp_tcps;
6705
6706 ASSERT(tcp->tcp_family == AF_INET6);
6707
6708 /*
6709 * If we're here, it means that the destination address is a native
6710 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A
6711 * reason why it might not be IPv6 is if the socket was bound to an
6712 * IPv4-mapped IPv6 address.
6713 */
6714 if (tcp->tcp_ipversion != IPV6_VERSION) {
6715 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6716 goto failed;
6717 }
6718
6719 /*
6720 * Interpret a zero destination to mean loopback.
6721 * Update the T_CONN_REQ (sin/sin6) since it is used to
6722 * generate the T_CONN_CON.
6723 */
6724 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) {
6725 *dstaddrp = ipv6_loopback;
6726 }
6727
6728 /* Handle __sin6_src_id if socket not bound to an IP address */
6729 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
6730 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src,
6731 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
6732 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src;
6733 }
6734
6735 /*
6736 * Take care of the scope_id now and add ip6i_t
6737 * if ip6i_t is not already allocated through TCP
6738 * sticky options. At this point tcp_ip6h does not
6739 * have dst info, thus use dstaddrp.
6740 */
6741 if (scope_id != 0 &&
6742 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
6743 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
6744 ip6i_t *ip6i;
6745
6746 ipp->ipp_ifindex = scope_id;
6747 ip6i = (ip6i_t *)tcp->tcp_iphc;
6748
6749 if ((ipp->ipp_fields & IPPF_HAS_IP6I) &&
6750 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) {
6751 /* Already allocated */
6752 ip6i->ip6i_flags |= IP6I_IFINDEX;
6753 ip6i->ip6i_ifindex = ipp->ipp_ifindex;
6754 ipp->ipp_fields |= IPPF_SCOPE_ID;
6755 } else {
6756 int reterr;
6757
6758 ipp->ipp_fields |= IPPF_SCOPE_ID;
6759 if (ipp->ipp_fields & IPPF_HAS_IP6I)
6760 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n"));
6761 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp);
6762 if (reterr != 0)
6763 goto failed;
6764 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n"));
6765 }
6766 }
6767
6768 /*
6769 * Don't let an endpoint connect to itself. Note that
6770 * the test here does not catch the case where the
6771 * source IP addr was left unspecified by the user. In
6772 * this case, the source addr is set in tcp_adapt_ire()
6773 * using the reply to the T_BIND message that we send
6774 * down to IP here and the check is repeated in tcp_rput_other.
6775 */
6776 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) &&
6777 (dstport == tcp->tcp_lport)) {
6778 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6779 goto failed;
6780 }
6781
6782 tcp->tcp_ip6h->ip6_dst = *dstaddrp;
6783 tcp->tcp_remote_v6 = *dstaddrp;
6784 tcp->tcp_ip6h->ip6_vcf =
6785 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
6786 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
6787
6788
6789 /*
6790 * Massage a routing header (if present) putting the first hop
6791 * in ip6_dst. Compute a starting value for the checksum which
6792 * takes into account that the original ip6_dst should be
6793 * included in the checksum but that ip will include the
6794 * first hop in the source route in the tcp checksum.
6795 */
6796 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph);
6797 if (rth != NULL) {
6798 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth,
6799 tcps->tcps_netstack);
6800 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6801 (tcp->tcp_sum >> 16));
6802 } else {
6803 tcp->tcp_sum = 0;
6804 }
6805
6806 tcph = tcp->tcp_tcph;
6807 *(uint16_t *)tcph->th_fport = dstport;
6808 tcp->tcp_fport = dstport;
6809
6810 oldstate = tcp->tcp_state;
6811 /*
6812 * At this point the remote destination address and remote port fields
6813 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6814 * have to see which state tcp was in so we can take apropriate action.
6815 */
6816 if (oldstate == TCPS_IDLE) {
6817 /*
6818 * We support a quick connect capability here, allowing
6819 * clients to transition directly from IDLE to SYN_SENT
6820 * tcp_bindi will pick an unused port, insert the connection
6821 * in the bind hash and transition to BOUND state.
6822 */
6823 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6824 tcp, B_TRUE);
6825 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6826 B_FALSE, B_FALSE);
6827 if (lport == 0) {
6828 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6829 goto failed;
6830 }
6831 }
6832 tcp->tcp_state = TCPS_SYN_SENT;
6833 /*
6834 * TODO: allow data with connect requests
6835 * by unlinking M_DATA trailers here and
6836 * linking them in behind the T_OK_ACK mblk.
6837 * The tcp_rput() bind ack handler would then
6838 * feed them to tcp_wput_data() rather than call
6839 * tcp_timer().
6840 */
6841 mp = mi_tpi_ok_ack_alloc(mp);
6842 if (!mp) {
6843 tcp->tcp_state = oldstate;
6844 goto failed;
6845 }
6846 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6847 if (mp1) {
6848 /*
6849 * We need to make sure that the conn_recv is set to a non-null
6850 * value before we insert the conn_t into the classifier table.
6851 * This is to avoid a race with an incoming packet which does
6852 * an ipcl_classify().
6853 */
6854 tcp->tcp_connp->conn_recv = tcp_input;
6855
6856 /* Hang onto the T_OK_ACK for later. */
6857 linkb(mp1, mp);
6858 mblk_setcred(mp1, tcp->tcp_cred);
6859 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6860 &tcp->tcp_sticky_ipp);
6861 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6862 tcp->tcp_active_open = 1;
6863 /* ip_bind_v6() may return ACK or ERROR */
6864 if (mp1 != NULL)
6865 tcp_rput_other(tcp, mp1);
6866 return;
6867 }
6868 /* Error case */
6869 tcp->tcp_state = oldstate;
6870 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6871
6872 failed:
6873 /* return error ack and blow away saved option results if any */
6874 if (mp != NULL)
6875 putnext(tcp->tcp_rq, mp);
6876 else {
6877 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6878 TSYSERR, ENOMEM);
6879 }
6880 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6881 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6882 }
6883
6884 /*
6885 * We need a stream q for detached closing tcp connections
6886 * to use. Our client hereby indicates that this q is the
6887 * one to use.
6888 */
6889 static void
6890 tcp_def_q_set(tcp_t *tcp, mblk_t *mp)
6891 {
6892 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
6893 queue_t *q = tcp->tcp_wq;
6894 tcp_stack_t *tcps = tcp->tcp_tcps;
6895
6896 #ifdef NS_DEBUG
6897 (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n",
6898 tcps->tcps_netstack->netstack_stackid);
6899 #endif
6900 mp->b_datap->db_type = M_IOCACK;
6901 iocp->ioc_count = 0;
6902 mutex_enter(&tcps->tcps_g_q_lock);
6903 if (tcps->tcps_g_q != NULL) {
6904 mutex_exit(&tcps->tcps_g_q_lock);
6905 iocp->ioc_error = EALREADY;
6906 } else {
6907 mblk_t *mp1;
6908
6909 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0);
6910 if (mp1 == NULL) {
6911 mutex_exit(&tcps->tcps_g_q_lock);
6912 iocp->ioc_error = ENOMEM;
6913 } else {
6914 tcps->tcps_g_q = tcp->tcp_rq;
6915 mutex_exit(&tcps->tcps_g_q_lock);
6916 iocp->ioc_error = 0;
6917 iocp->ioc_rval = 0;
6918 /*
6919 * We are passing tcp_sticky_ipp as NULL
6920 * as it is not useful for tcp_default queue
6921 *
6922 * Set conn_recv just in case.
6923 */
6924 tcp->tcp_connp->conn_recv = tcp_conn_request;
6925
6926 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL);
6927 if (mp1 != NULL)
6928 tcp_rput_other(tcp, mp1);
6929 }
6930 }
6931 qreply(q, mp);
6932 }
6933
6934 /*
6935 * Our client hereby directs us to reject the connection request
6936 * that tcp_conn_request() marked with 'seqnum'. Rejection consists
6937 * of sending the appropriate RST, not an ICMP error.
6938 */
6939 static void
6940 tcp_disconnect(tcp_t *tcp, mblk_t *mp)
6941 {
6942 tcp_t *ltcp = NULL;
6943 t_scalar_t seqnum;
6944 conn_t *connp;
6945 tcp_stack_t *tcps = tcp->tcp_tcps;
6946
6947 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
6948 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
6949 tcp_err_ack(tcp, mp, TPROTO, 0);
6950 return;
6951 }
6952
6953 /*
6954 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
6955 * when the stream is in BOUND state. Do not send a reset,
6956 * since the destination IP address is not valid, and it can
6957 * be the initialized value of all zeros (broadcast address).
6958 *
6959 * If TCP has sent down a bind request to IP and has not
6960 * received the reply, reject the request. Otherwise, TCP
6961 * will be confused.
6962 */
6963 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
6964 if (tcp->tcp_debug) {
6965 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
6966 "tcp_disconnect: bad state, %d", tcp->tcp_state);
6967 }
6968 tcp_err_ack(tcp, mp, TOUTSTATE, 0);
6969 return;
6970 }
6971
6972 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
6973
6974 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
6975
6976 /*
6977 * According to TPI, for non-listeners, ignore seqnum
6978 * and disconnect.
6979 * Following interpretation of -1 seqnum is historical
6980 * and implied TPI ? (TPI only states that for T_CONN_IND,
6981 * a valid seqnum should not be -1).
6982 *
6983 * -1 means disconnect everything
6984 * regardless even on a listener.
6985 */
6986
6987 int old_state = tcp->tcp_state;
6988 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
6989
6990 /*
6991 * The connection can't be on the tcp_time_wait_head list
6992 * since it is not detached.
6993 */
6994 ASSERT(tcp->tcp_time_wait_next == NULL);
6995 ASSERT(tcp->tcp_time_wait_prev == NULL);
6996 ASSERT(tcp->tcp_time_wait_expire == 0);
6997 ltcp = NULL;
6998 /*
6999 * If it used to be a listener, check to make sure no one else
7000 * has taken the port before switching back to LISTEN state.
7001 */
7002 if (tcp->tcp_ipversion == IPV4_VERSION) {
7003 connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7004 tcp->tcp_ipha->ipha_src,
7005 tcp->tcp_connp->conn_zoneid, ipst);
7006 if (connp != NULL)
7007 ltcp = connp->conn_tcp;
7008 } else {
7009 /* Allow tcp_bound_if listeners? */
7010 connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7011 &tcp->tcp_ip6h->ip6_src, 0,
7012 tcp->tcp_connp->conn_zoneid, ipst);
7013 if (connp != NULL)
7014 ltcp = connp->conn_tcp;
7015 }
7016 if (tcp->tcp_conn_req_max && ltcp == NULL) {
7017 tcp->tcp_state = TCPS_LISTEN;
7018 } else if (old_state > TCPS_BOUND) {
7019 tcp->tcp_conn_req_max = 0;
7020 tcp->tcp_state = TCPS_BOUND;
7021 }
7022 if (ltcp != NULL)
7023 CONN_DEC_REF(ltcp->tcp_connp);
7024 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7025 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7026 } else if (old_state == TCPS_ESTABLISHED ||
7027 old_state == TCPS_CLOSE_WAIT) {
7028 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7029 }
7030
7031 if (tcp->tcp_fused)
7032 tcp_unfuse(tcp);
7033
7034 mutex_enter(&tcp->tcp_eager_lock);
7035 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7036 (tcp->tcp_conn_req_cnt_q != 0)) {
7037 tcp_eager_cleanup(tcp, 0);
7038 }
7039 mutex_exit(&tcp->tcp_eager_lock);
7040
7041 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
7042 tcp->tcp_rnxt, TH_RST | TH_ACK);
7043
7044 tcp_reinit(tcp);
7045
7046 if (old_state >= TCPS_ESTABLISHED) {
7047 /* Send M_FLUSH according to TPI */
7048 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
7049 }
7050 mp = mi_tpi_ok_ack_alloc(mp);
7051 if (mp)
7052 putnext(tcp->tcp_rq, mp);
7053 return;
7054 } else if (!tcp_eager_blowoff(tcp, seqnum)) {
7055 tcp_err_ack(tcp, mp, TBADSEQ, 0);
7056 return;
7057 }
7058 if (tcp->tcp_state >= TCPS_ESTABLISHED) {
7059 /* Send M_FLUSH according to TPI */
7060 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
7061 }
7062 mp = mi_tpi_ok_ack_alloc(mp);
7063 if (mp)
7064 putnext(tcp->tcp_rq, mp);
7065 }
7066
7067 /*
7068 * Diagnostic routine used to return a string associated with the tcp state.
7069 * Note that if the caller does not supply a buffer, it will use an internal
7070 * static string. This means that if multiple threads call this function at
7071 * the same time, output can be corrupted... Note also that this function
7072 * does not check the size of the supplied buffer. The caller has to make
7073 * sure that it is big enough.
7074 */
7075 static char *
7076 tcp_display(tcp_t *tcp, char *sup_buf, char format)
7077 {
7078 char buf1[30];
7079 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80];
7080 char *buf;
7081 char *cp;
7082 in6_addr_t local, remote;
7083 char local_addrbuf[INET6_ADDRSTRLEN];
7084 char remote_addrbuf[INET6_ADDRSTRLEN];
7085
7086 if (sup_buf != NULL)
7087 buf = sup_buf;
7088 else
7089 buf = priv_buf;
7090
7091 if (tcp == NULL)
7092 return ("NULL_TCP");
7093 switch (tcp->tcp_state) {
7094 case TCPS_CLOSED:
7095 cp = "TCP_CLOSED";
7096 break;
7097 case TCPS_IDLE:
7098 cp = "TCP_IDLE";
7099 break;
7100 case TCPS_BOUND:
7101 cp = "TCP_BOUND";
7102 break;
7103 case TCPS_LISTEN:
7104 cp = "TCP_LISTEN";
7105 break;
7106 case TCPS_SYN_SENT:
7107 cp = "TCP_SYN_SENT";
7108 break;
7109 case TCPS_SYN_RCVD:
7110 cp = "TCP_SYN_RCVD";
7111 break;
7112 case TCPS_ESTABLISHED:
7113 cp = "TCP_ESTABLISHED";
7114 break;
7115 case TCPS_CLOSE_WAIT:
7116 cp = "TCP_CLOSE_WAIT";
7117 break;
7118 case TCPS_FIN_WAIT_1:
7119 cp = "TCP_FIN_WAIT_1";
7120 break;
7121 case TCPS_CLOSING:
7122 cp = "TCP_CLOSING";
7123 break;
7124 case TCPS_LAST_ACK:
7125 cp = "TCP_LAST_ACK";
7126 break;
7127 case TCPS_FIN_WAIT_2:
7128 cp = "TCP_FIN_WAIT_2";
7129 break;
7130 case TCPS_TIME_WAIT:
7131 cp = "TCP_TIME_WAIT";
7132 break;
7133 default:
7134 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
7135 cp = buf1;
7136 break;
7137 }
7138 switch (format) {
7139 case DISP_ADDR_AND_PORT:
7140 if (tcp->tcp_ipversion == IPV4_VERSION) {
7141 /*
7142 * Note that we use the remote address in the tcp_b
7143 * structure. This means that it will print out
7144 * the real destination address, not the next hop's
7145 * address if source routing is used.
7146 */
7147 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local);
7148 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote);
7149
7150 } else {
7151 local = tcp->tcp_ip_src_v6;
7152 remote = tcp->tcp_remote_v6;
7153 }
7154 (void) inet_ntop(AF_INET6, &local, local_addrbuf,
7155 sizeof (local_addrbuf));
7156 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
7157 sizeof (remote_addrbuf));
7158 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
7159 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
7160 ntohs(tcp->tcp_fport), cp);
7161 break;
7162 case DISP_PORT_ONLY:
7163 default:
7164 (void) mi_sprintf(buf, "[%u, %u] %s",
7165 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
7166 break;
7167 }
7168
7169 return (buf);
7170 }
7171
7172 /*
7173 * Called via squeue to get on to eager's perimeter. It sends a
7174 * TH_RST if eager is in the fanout table. The listener wants the
7175 * eager to disappear either by means of tcp_eager_blowoff() or
7176 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
7177 * called (via squeue) if the eager cannot be inserted in the
7178 * fanout table in tcp_conn_request().
7179 */
7180 /* ARGSUSED */
7181 void
7182 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
7183 {
7184 conn_t *econnp = (conn_t *)arg;
7185 tcp_t *eager = econnp->conn_tcp;
7186 tcp_t *listener = eager->tcp_listener;
7187 tcp_stack_t *tcps = eager->tcp_tcps;
7188
7189 /*
7190 * We could be called because listener is closing. Since
7191 * the eager is using listener's queue's, its not safe.
7192 * Better use the default queue just to send the TH_RST
7193 * out.
7194 */
7195 ASSERT(tcps->tcps_g_q != NULL);
7196 eager->tcp_rq = tcps->tcps_g_q;
7197 eager->tcp_wq = WR(tcps->tcps_g_q);
7198
7199 /*
7200 * An eager's conn_fanout will be NULL if it's a duplicate
7201 * for an existing 4-tuples in the conn fanout table.
7202 * We don't want to send an RST out in such case.
7203 */
7204 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
7205 tcp_xmit_ctl("tcp_eager_kill, can't wait",
7206 eager, eager->tcp_snxt, 0, TH_RST);
7207 }
7208
7209 /* We are here because listener wants this eager gone */
7210 if (listener != NULL) {
7211 mutex_enter(&listener->tcp_eager_lock);
7212 tcp_eager_unlink(eager);
7213 if (eager->tcp_tconnind_started) {
7214 /*
7215 * The eager has sent a conn_ind up to the
7216 * listener but listener decides to close
7217 * instead. We need to drop the extra ref
7218 * placed on eager in tcp_rput_data() before
7219 * sending the conn_ind to listener.
7220 */
7221 CONN_DEC_REF(econnp);
7222 }
7223 mutex_exit(&listener->tcp_eager_lock);
7224 CONN_DEC_REF(listener->tcp_connp);
7225 }
7226
7227 if (eager->tcp_state > TCPS_BOUND)
7228 tcp_close_detached(eager);
7229 }
7230
7231 /*
7232 * Reset any eager connection hanging off this listener marked
7233 * with 'seqnum' and then reclaim it's resources.
7234 */
7235 static boolean_t
7236 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
7237 {
7238 tcp_t *eager;
7239 mblk_t *mp;
7240 tcp_stack_t *tcps = listener->tcp_tcps;
7241
7242 TCP_STAT(tcps, tcp_eager_blowoff_calls);
7243 eager = listener;
7244 mutex_enter(&listener->tcp_eager_lock);
7245 do {
7246 eager = eager->tcp_eager_next_q;
7247 if (eager == NULL) {
7248 mutex_exit(&listener->tcp_eager_lock);
7249 return (B_FALSE);
7250 }
7251 } while (eager->tcp_conn_req_seqnum != seqnum);
7252
7253 if (eager->tcp_closemp_used) {
7254 mutex_exit(&listener->tcp_eager_lock);
7255 return (B_TRUE);
7256 }
7257 eager->tcp_closemp_used = B_TRUE;
7258 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7259 CONN_INC_REF(eager->tcp_connp);
7260 mutex_exit(&listener->tcp_eager_lock);
7261 mp = &eager->tcp_closemp;
7262 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
7263 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF);
7264 return (B_TRUE);
7265 }
7266
7267 /*
7268 * Reset any eager connection hanging off this listener
7269 * and then reclaim it's resources.
7270 */
7271 static void
7272 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
7273 {
7274 tcp_t *eager;
7275 mblk_t *mp;
7276 tcp_stack_t *tcps = listener->tcp_tcps;
7277
7278 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7279
7280 if (!q0_only) {
7281 /* First cleanup q */
7282 TCP_STAT(tcps, tcp_eager_blowoff_q);
7283 eager = listener->tcp_eager_next_q;
7284 while (eager != NULL) {
7285 if (!eager->tcp_closemp_used) {
7286 eager->tcp_closemp_used = B_TRUE;
7287 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7288 CONN_INC_REF(eager->tcp_connp);
7289 mp = &eager->tcp_closemp;
7290 squeue_fill(eager->tcp_connp->conn_sqp, mp,
7291 tcp_eager_kill, eager->tcp_connp,
7292 SQTAG_TCP_EAGER_CLEANUP);
7293 }
7294 eager = eager->tcp_eager_next_q;
7295 }
7296 }
7297 /* Then cleanup q0 */
7298 TCP_STAT(tcps, tcp_eager_blowoff_q0);
7299 eager = listener->tcp_eager_next_q0;
7300 while (eager != listener) {
7301 if (!eager->tcp_closemp_used) {
7302 eager->tcp_closemp_used = B_TRUE;
7303 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7304 CONN_INC_REF(eager->tcp_connp);
7305 mp = &eager->tcp_closemp;
7306 squeue_fill(eager->tcp_connp->conn_sqp, mp,
7307 tcp_eager_kill, eager->tcp_connp,
7308 SQTAG_TCP_EAGER_CLEANUP_Q0);
7309 }
7310 eager = eager->tcp_eager_next_q0;
7311 }
7312 }
7313
7314 /*
7315 * If we are an eager connection hanging off a listener that hasn't
7316 * formally accepted the connection yet, get off his list and blow off
7317 * any data that we have accumulated.
7318 */
7319 static void
7320 tcp_eager_unlink(tcp_t *tcp)
7321 {
7322 tcp_t *listener = tcp->tcp_listener;
7323
7324 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7325 ASSERT(listener != NULL);
7326 if (tcp->tcp_eager_next_q0 != NULL) {
7327 ASSERT(tcp->tcp_eager_prev_q0 != NULL);
7328
7329 /* Remove the eager tcp from q0 */
7330 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
7331 tcp->tcp_eager_prev_q0;
7332 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
7333 tcp->tcp_eager_next_q0;
7334 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
7335 listener->tcp_conn_req_cnt_q0--;
7336
7337 tcp->tcp_eager_next_q0 = NULL;
7338 tcp->tcp_eager_prev_q0 = NULL;
7339
7340 /*
7341 * Take the eager out, if it is in the list of droppable
7342 * eagers.
7343 */
7344 MAKE_UNDROPPABLE(tcp);
7345
7346 if (tcp->tcp_syn_rcvd_timeout != 0) {
7347 /* we have timed out before */
7348 ASSERT(listener->tcp_syn_rcvd_timeout > 0);
7349 listener->tcp_syn_rcvd_timeout--;
7350 }
7351 } else {
7352 tcp_t **tcpp = &listener->tcp_eager_next_q;
7353 tcp_t *prev = NULL;
7354
7355 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
7356 if (tcpp[0] == tcp) {
7357 if (listener->tcp_eager_last_q == tcp) {
7358 /*
7359 * If we are unlinking the last
7360 * element on the list, adjust
7361 * tail pointer. Set tail pointer
7362 * to nil when list is empty.
7363 */
7364 ASSERT(tcp->tcp_eager_next_q == NULL);
7365 if (listener->tcp_eager_last_q ==
7366 listener->tcp_eager_next_q) {
7367 listener->tcp_eager_last_q =
7368 NULL;
7369 } else {
7370 /*
7371 * We won't get here if there
7372 * is only one eager in the
7373 * list.
7374 */
7375 ASSERT(prev != NULL);
7376 listener->tcp_eager_last_q =
7377 prev;
7378 }
7379 }
7380 tcpp[0] = tcp->tcp_eager_next_q;
7381 tcp->tcp_eager_next_q = NULL;
7382 tcp->tcp_eager_last_q = NULL;
7383 ASSERT(listener->tcp_conn_req_cnt_q > 0);
7384 listener->tcp_conn_req_cnt_q--;
7385 break;
7386 }
7387 prev = tcpp[0];
7388 }
7389 }
7390 tcp->tcp_listener = NULL;
7391 }
7392
7393 /* Shorthand to generate and send TPI error acks to our client */
7394 static void
7395 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
7396 {
7397 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
7398 putnext(tcp->tcp_rq, mp);
7399 }
7400
7401 /* Shorthand to generate and send TPI error acks to our client */
7402 static void
7403 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
7404 int t_error, int sys_error)
7405 {
7406 struct T_error_ack *teackp;
7407
7408 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
7409 M_PCPROTO, T_ERROR_ACK)) != NULL) {
7410 teackp = (struct T_error_ack *)mp->b_rptr;
7411 teackp->ERROR_prim = primitive;
7412 teackp->TLI_error = t_error;
7413 teackp->UNIX_error = sys_error;
7414 putnext(tcp->tcp_rq, mp);
7415 }
7416 }
7417
7418 /*
7419 * Note: No locks are held when inspecting tcp_g_*epriv_ports
7420 * but instead the code relies on:
7421 * - the fact that the address of the array and its size never changes
7422 * - the atomic assignment of the elements of the array
7423 */
7424 /* ARGSUSED */
7425 static int
7426 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
7427 {
7428 int i;
7429 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7430
7431 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7432 if (tcps->tcps_g_epriv_ports[i] != 0)
7433 (void) mi_mpprintf(mp, "%d ",
7434 tcps->tcps_g_epriv_ports[i]);
7435 }
7436 return (0);
7437 }
7438
7439 /*
7440 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7441 * threads from changing it at the same time.
7442 */
7443 /* ARGSUSED */
7444 static int
7445 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7446 cred_t *cr)
7447 {
7448 long new_value;
7449 int i;
7450 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7451
7452 /*
7453 * Fail the request if the new value does not lie within the
7454 * port number limits.
7455 */
7456 if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
7457 new_value <= 0 || new_value >= 65536) {
7458 return (EINVAL);
7459 }
7460
7461 mutex_enter(&tcps->tcps_epriv_port_lock);
7462 /* Check if the value is already in the list */
7463 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7464 if (new_value == tcps->tcps_g_epriv_ports[i]) {
7465 mutex_exit(&tcps->tcps_epriv_port_lock);
7466 return (EEXIST);
7467 }
7468 }
7469 /* Find an empty slot */
7470 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7471 if (tcps->tcps_g_epriv_ports[i] == 0)
7472 break;
7473 }
7474 if (i == tcps->tcps_g_num_epriv_ports) {
7475 mutex_exit(&tcps->tcps_epriv_port_lock);
7476 return (EOVERFLOW);
7477 }
7478 /* Set the new value */
7479 tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value;
7480 mutex_exit(&tcps->tcps_epriv_port_lock);
7481 return (0);
7482 }
7483
7484 /*
7485 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7486 * threads from changing it at the same time.
7487 */
7488 /* ARGSUSED */
7489 static int
7490 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7491 cred_t *cr)
7492 {
7493 long new_value;
7494 int i;
7495 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7496
7497 /*
7498 * Fail the request if the new value does not lie within the
7499 * port number limits.
7500 */
7501 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
7502 new_value >= 65536) {
7503 return (EINVAL);
7504 }
7505
7506 mutex_enter(&tcps->tcps_epriv_port_lock);
7507 /* Check that the value is already in the list */
7508 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7509 if (tcps->tcps_g_epriv_ports[i] == new_value)
7510 break;
7511 }
7512 if (i == tcps->tcps_g_num_epriv_ports) {
7513 mutex_exit(&tcps->tcps_epriv_port_lock);
7514 return (ESRCH);
7515 }
7516 /* Clear the value */
7517 tcps->tcps_g_epriv_ports[i] = 0;
7518 mutex_exit(&tcps->tcps_epriv_port_lock);
7519 return (0);
7520 }
7521
7522 /* Return the TPI/TLI equivalent of our current tcp_state */
7523 static int
7524 tcp_tpistate(tcp_t *tcp)
7525 {
7526 switch (tcp->tcp_state) {
7527 case TCPS_IDLE:
7528 return (TS_UNBND);
7529 case TCPS_LISTEN:
7530 /*
7531 * Return whether there are outstanding T_CONN_IND waiting
7532 * for the matching T_CONN_RES. Therefore don't count q0.
7533 */
7534 if (tcp->tcp_conn_req_cnt_q > 0)
7535 return (TS_WRES_CIND);
7536 else
7537 return (TS_IDLE);
7538 case TCPS_BOUND:
7539 return (TS_IDLE);
7540 case TCPS_SYN_SENT:
7541 return (TS_WCON_CREQ);
7542 case TCPS_SYN_RCVD:
7543 /*
7544 * Note: assumption: this has to the active open SYN_RCVD.
7545 * The passive instance is detached in SYN_RCVD stage of
7546 * incoming connection processing so we cannot get request
7547 * for T_info_ack on it.
7548 */
7549 return (TS_WACK_CRES);
7550 case TCPS_ESTABLISHED:
7551 return (TS_DATA_XFER);
7552 case TCPS_CLOSE_WAIT:
7553 return (TS_WREQ_ORDREL);
7554 case TCPS_FIN_WAIT_1:
7555 return (TS_WIND_ORDREL);
7556 case TCPS_FIN_WAIT_2:
7557 return (TS_WIND_ORDREL);
7558
7559 case TCPS_CLOSING:
7560 case TCPS_LAST_ACK:
7561 case TCPS_TIME_WAIT:
7562 case TCPS_CLOSED:
7563 /*
7564 * Following TS_WACK_DREQ7 is a rendition of "not
7565 * yet TS_IDLE" TPI state. There is no best match to any
7566 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
7567 * choose a value chosen that will map to TLI/XTI level
7568 * state of TSTATECHNG (state is process of changing) which
7569 * captures what this dummy state represents.
7570 */
7571 return (TS_WACK_DREQ7);
7572 default:
7573 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
7574 tcp->tcp_state, tcp_display(tcp, NULL,
7575 DISP_PORT_ONLY));
7576 return (TS_UNBND);
7577 }
7578 }
7579
7580 static void
7581 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
7582 {
7583 tcp_stack_t *tcps = tcp->tcp_tcps;
7584
7585 if (tcp->tcp_family == AF_INET6)
7586 *tia = tcp_g_t_info_ack_v6;
7587 else
7588 *tia = tcp_g_t_info_ack;
7589 tia->CURRENT_state = tcp_tpistate(tcp);
7590 tia->OPT_size = tcp_max_optsize;
7591 if (tcp->tcp_mss == 0) {
7592 /* Not yet set - tcp_open does not set mss */
7593 if (tcp->tcp_ipversion == IPV4_VERSION)
7594 tia->TIDU_size = tcps->tcps_mss_def_ipv4;
7595 else
7596 tia->TIDU_size = tcps->tcps_mss_def_ipv6;
7597 } else {
7598 tia->TIDU_size = tcp->tcp_mss;
7599 }
7600 /* TODO: Default ETSDU is 1. Is that correct for tcp? */
7601 }
7602
7603 /*
7604 * This routine responds to T_CAPABILITY_REQ messages. It is called by
7605 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from
7606 * tcp_g_t_info_ack. The current state of the stream is copied from
7607 * tcp_state.
7608 */
7609 static void
7610 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
7611 {
7612 t_uscalar_t cap_bits1;
7613 struct T_capability_ack *tcap;
7614
7615 if (MBLKL(mp) < sizeof (struct T_capability_req)) {
7616 freemsg(mp);
7617 return;
7618 }
7619
7620 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
7621
7622 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
7623 mp->b_datap->db_type, T_CAPABILITY_ACK);
7624 if (mp == NULL)
7625 return;
7626
7627 tcap = (struct T_capability_ack *)mp->b_rptr;
7628 tcap->CAP_bits1 = 0;
7629
7630 if (cap_bits1 & TC1_INFO) {
7631 tcp_copy_info(&tcap->INFO_ack, tcp);
7632 tcap->CAP_bits1 |= TC1_INFO;
7633 }
7634
7635 if (cap_bits1 & TC1_ACCEPTOR_ID) {
7636 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
7637 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
7638 }
7639
7640 putnext(tcp->tcp_rq, mp);
7641 }
7642
7643 /*
7644 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput.
7645 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
7646 * The current state of the stream is copied from tcp_state.
7647 */
7648 static void
7649 tcp_info_req(tcp_t *tcp, mblk_t *mp)
7650 {
7651 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
7652 T_INFO_ACK);
7653 if (!mp) {
7654 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7655 return;
7656 }
7657 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
7658 putnext(tcp->tcp_rq, mp);
7659 }
7660
7661 /* Respond to the TPI addr request */
7662 static void
7663 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
7664 {
7665 sin_t *sin;
7666 mblk_t *ackmp;
7667 struct T_addr_ack *taa;
7668
7669 /* Make it large enough for worst case */
7670 ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
7671 2 * sizeof (sin6_t), 1);
7672 if (ackmp == NULL) {
7673 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7674 return;
7675 }
7676
7677 if (tcp->tcp_ipversion == IPV6_VERSION) {
7678 tcp_addr_req_ipv6(tcp, ackmp);
7679 return;
7680 }
7681 taa = (struct T_addr_ack *)ackmp->b_rptr;
7682
7683 bzero(taa, sizeof (struct T_addr_ack));
7684 ackmp->b_wptr = (uchar_t *)&taa[1];
7685
7686 taa->PRIM_type = T_ADDR_ACK;
7687 ackmp->b_datap->db_type = M_PCPROTO;
7688
7689 /*
7690 * Note: Following code assumes 32 bit alignment of basic
7691 * data structures like sin_t and struct T_addr_ack.
7692 */
7693 if (tcp->tcp_state >= TCPS_BOUND) {
7694 /*
7695 * Fill in local address
7696 */
7697 taa->LOCADDR_length = sizeof (sin_t);
7698 taa->LOCADDR_offset = sizeof (*taa);
7699
7700 sin = (sin_t *)&taa[1];
7701
7702 /* Fill zeroes and then intialize non-zero fields */
7703 *sin = sin_null;
7704
7705 sin->sin_family = AF_INET;
7706
7707 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
7708 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport;
7709
7710 ackmp->b_wptr = (uchar_t *)&sin[1];
7711
7712 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7713 /*
7714 * Fill in Remote address
7715 */
7716 taa->REMADDR_length = sizeof (sin_t);
7717 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7718 taa->LOCADDR_length);
7719
7720 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7721 *sin = sin_null;
7722 sin->sin_family = AF_INET;
7723 sin->sin_addr.s_addr = tcp->tcp_remote;
7724 sin->sin_port = tcp->tcp_fport;
7725
7726 ackmp->b_wptr = (uchar_t *)&sin[1];
7727 }
7728 }
7729 putnext(tcp->tcp_rq, ackmp);
7730 }
7731
7732 /* Assumes that tcp_addr_req gets enough space and alignment */
7733 static void
7734 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp)
7735 {
7736 sin6_t *sin6;
7737 struct T_addr_ack *taa;
7738
7739 ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
7740 ASSERT(OK_32PTR(ackmp->b_rptr));
7741 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) +
7742 2 * sizeof (sin6_t));
7743
7744 taa = (struct T_addr_ack *)ackmp->b_rptr;
7745
7746 bzero(taa, sizeof (struct T_addr_ack));
7747 ackmp->b_wptr = (uchar_t *)&taa[1];
7748
7749 taa->PRIM_type = T_ADDR_ACK;
7750 ackmp->b_datap->db_type = M_PCPROTO;
7751
7752 /*
7753 * Note: Following code assumes 32 bit alignment of basic
7754 * data structures like sin6_t and struct T_addr_ack.
7755 */
7756 if (tcp->tcp_state >= TCPS_BOUND) {
7757 /*
7758 * Fill in local address
7759 */
7760 taa->LOCADDR_length = sizeof (sin6_t);
7761 taa->LOCADDR_offset = sizeof (*taa);
7762
7763 sin6 = (sin6_t *)&taa[1];
7764 *sin6 = sin6_null;
7765
7766 sin6->sin6_family = AF_INET6;
7767 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src;
7768 sin6->sin6_port = tcp->tcp_lport;
7769
7770 ackmp->b_wptr = (uchar_t *)&sin6[1];
7771
7772 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7773 /*
7774 * Fill in Remote address
7775 */
7776 taa->REMADDR_length = sizeof (sin6_t);
7777 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7778 taa->LOCADDR_length);
7779
7780 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7781 *sin6 = sin6_null;
7782 sin6->sin6_family = AF_INET6;
7783 sin6->sin6_flowinfo =
7784 tcp->tcp_ip6h->ip6_vcf &
7785 ~IPV6_VERS_AND_FLOW_MASK;
7786 sin6->sin6_addr = tcp->tcp_remote_v6;
7787 sin6->sin6_port = tcp->tcp_fport;
7788
7789 ackmp->b_wptr = (uchar_t *)&sin6[1];
7790 }
7791 }
7792 putnext(tcp->tcp_rq, ackmp);
7793 }
7794
/*
 * Handle reinitialization of a tcp structure.
 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
 *
 * In outline: stop timers, fold the per-connection segment counters into
 * the stack MIB, free all queued transmit/reassembly/receive/urgent data,
 * release kssl references, reset the state vector with
 * tcp_reinit_values(), remove the connection from the IP classifier, and
 * finally put the endpoint back into its bound or listening state with
 * default values from tcp_init_values().
 *
 * Note that the code below only ever selects TCPS_LISTEN (when
 * tcp_conn_req_max is non-zero) or TCPS_BOUND; it never sets TCPS_IDLE.
 *
 * Must not be called for a detached tcp_t (tcp_listener must be NULL).
 */
static void
tcp_reinit(tcp_t *tcp)
{
	mblk_t	*mp;
	int	err;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	TCP_STAT(tcps, tcp_reinit_calls);

	/* tcp_reinit should never be called for detached tcp_t's */
	ASSERT(tcp->tcp_listener == NULL);
	/* An AF_INET endpoint must be IPv4; an AF_INET6 one may be either. */
	ASSERT((tcp->tcp_family == AF_INET &&
	    tcp->tcp_ipversion == IPV4_VERSION) ||
	    (tcp->tcp_family == AF_INET6 &&
	    (tcp->tcp_ipversion == IPV4_VERSION ||
	    tcp->tcp_ipversion == IPV6_VERSION)));

	/* Cancel outstanding timers */
	tcp_timers_stop(tcp);

	/*
	 * Reset everything in the state vector, after updating global
	 * MIB data from instance counters.
	 */
	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
	tcp->tcp_ibsegs = 0;
	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
	tcp->tcp_obsegs = 0;

	/* Discard everything queued for transmission. */
	tcp_close_mpp(&tcp->tcp_xmit_head);
	if (tcp->tcp_snd_zcopy_aware)
		tcp_zcopy_notify(tcp);
	tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
	tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
	/*
	 * With the transmit list emptied, lift flow control if it was
	 * asserted and the unsent byte count is at or below the low-water
	 * mark.  The check and the clear are done under tcp_non_sq_lock.
	 */
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped &&
	    TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
		tcp_clrqfull(tcp);
	}
	mutex_exit(&tcp->tcp_non_sq_lock);
	/* Discard the reassembly queue. */
	tcp_close_mpp(&tcp->tcp_reass_head);
	tcp->tcp_reass_tail = NULL;
	if (tcp->tcp_rcv_list != NULL) {
		/* Free b_next chain */
		tcp_close_mpp(&tcp->tcp_rcv_list);
		tcp->tcp_rcv_last_head = NULL;
		tcp->tcp_rcv_last_tail = NULL;
		tcp->tcp_rcv_cnt = 0;
	}
	/*
	 * tcp_rcv_last_tail is cleared unconditionally here (redundantly when
	 * the branch above ran); tcp_rcv_last_head and tcp_rcv_cnt are only
	 * reset inside that branch.
	 */
	tcp->tcp_rcv_last_tail = NULL;

	/* Free any pending urgent-data messages. */
	if ((mp = tcp->tcp_urp_mp) != NULL) {
		freemsg(mp);
		tcp->tcp_urp_mp = NULL;
	}
	if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
		freemsg(mp);
		tcp->tcp_urp_mark_mp = NULL;
	}
	if (tcp->tcp_fused_sigurg_mp != NULL) {
		freeb(tcp->tcp_fused_sigurg_mp);
		tcp->tcp_fused_sigurg_mp = NULL;
	}

	/*
	 * Following is a union with two members which are
	 * identical types and size so the following cleanup
	 * is enough.
	 */
	tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);

	CL_INET_DISCONNECT(tcp);

	/*
	 * The connection can't be on the tcp_time_wait_head list
	 * since it is not detached.
	 */
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);

	if (tcp->tcp_kssl_pending) {
		tcp->tcp_kssl_pending = B_FALSE;

		/*
		 * Don't reset if initialized by bind.
		 * Note that tcp_kssl_ent is released here but the pointer
		 * itself is left in place.
		 */
		if (tcp->tcp_kssl_ent != NULL) {
			kssl_release_ent(tcp->tcp_kssl_ent, NULL,
			    KSSL_NO_PROXY);
		}
	}
	if (tcp->tcp_kssl_ctx != NULL) {
		kssl_release_ctx(tcp->tcp_kssl_ctx);
		tcp->tcp_kssl_ctx = NULL;
	}

	/*
	 * Reset/preserve other values
	 */
	tcp_reinit_values(tcp);
	/* Drop the connection from the classifier and release IP state. */
	ipcl_hash_remove(tcp->tcp_connp);
	conn_delete_ire(tcp->tcp_connp, NULL);
	tcp_ipsec_cleanup(tcp);

	if (tcp->tcp_conn_req_max != 0) {
		/*
		 * This is the case when a TLI program uses the same
		 * transport end point to accept a connection.  This
		 * makes the TCP both a listener and acceptor.  When
		 * this connection is closed, we need to set the state
		 * back to TCPS_LISTEN.  Make sure that the eager list
		 * is reinitialized.
		 *
		 * Note that this stream is still bound to the four
		 * tuples of the previous connection in IP.  If a new
		 * SYN with different foreign address comes in, IP will
		 * not find it and will send it to the global queue.  In
		 * the global queue, TCP will do a tcp_lookup_listener()
		 * to find this stream.  This works because this stream
		 * is only removed from connected hash.
		 *
		 */
		tcp->tcp_state = TCPS_LISTEN;
		/* Empty q0 lists are represented by self-referencing links. */
		tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
		tcp->tcp_eager_next_drop_q0 = tcp;
		tcp->tcp_eager_prev_drop_q0 = tcp;
		tcp->tcp_connp->conn_recv = tcp_conn_request;
		/* The return value of the bind insert is deliberately ignored. */
		if (tcp->tcp_family == AF_INET6) {
			ASSERT(tcp->tcp_connp->conn_af_isv6);
			(void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
			    &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
		} else {
			ASSERT(!tcp->tcp_connp->conn_af_isv6);
			(void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
			    tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
		}
	} else {
		tcp->tcp_state = TCPS_BOUND;
	}

	/*
	 * Initialize to default values
	 * Can't fail since enough header template space already allocated
	 * at open().
	 */
	err = tcp_init_values(tcp);
	ASSERT(err == 0);
	/* Restore state in tcp_tcph */
	bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
	/* Restore the bound source address in the header template. */
	if (tcp->tcp_ipversion == IPV4_VERSION)
		tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
	else
		tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
	/*
	 * Copy of the src addr. in tcp_t is needed in tcp_t
	 * since the lookup funcs can only lookup on tcp_t
	 */
	tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;

	ASSERT(tcp->tcp_ptpbhn != NULL);
	/* Receive window and MSS go back to the per-stack defaults. */
	tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat;
	tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
	tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ?
	    tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
}
7963
7964 /*
7965 * Force values to zero that need be zero.
7966 * Do not touch values asociated with the BOUND or LISTEN state
7967 * since the connection will end up in that state after the reinit.
7968 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
7969 * structure!
7970 */
7971 static void
7972 tcp_reinit_values(tcp)
7973 tcp_t *tcp;
7974 {
7975 tcp_stack_t *tcps = tcp->tcp_tcps;
7976
7977 #ifndef lint
7978 #define DONTCARE(x)
7979 #define PRESERVE(x)
7980 #else
7981 #define DONTCARE(x) ((x) = (x))
7982 #define PRESERVE(x) ((x) = (x))
7983 #endif /* lint */
7984
7985 PRESERVE(tcp->tcp_bind_hash);
7986 PRESERVE(tcp->tcp_ptpbhn);
7987 PRESERVE(tcp->tcp_acceptor_hash);
7988 PRESERVE(tcp->tcp_ptpahn);
7989
7990 /* Should be ASSERT NULL on these with new code! */
7991 ASSERT(tcp->tcp_time_wait_next == NULL);
7992 ASSERT(tcp->tcp_time_wait_prev == NULL);
7993 ASSERT(tcp->tcp_time_wait_expire == 0);
7994 PRESERVE(tcp->tcp_state);
7995 PRESERVE(tcp->tcp_rq);
7996 PRESERVE(tcp->tcp_wq);
7997
7998 ASSERT(tcp->tcp_xmit_head == NULL);
7999 ASSERT(tcp->tcp_xmit_last == NULL);
8000 ASSERT(tcp->tcp_unsent == 0);
8001 ASSERT(tcp->tcp_xmit_tail == NULL);
8002 ASSERT(tcp->tcp_xmit_tail_unsent == 0);
8003
8004 tcp->tcp_snxt = 0; /* Displayed in mib */
8005 tcp->tcp_suna = 0; /* Displayed in mib */
8006 tcp->tcp_swnd = 0;
8007 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */
8008
8009 ASSERT(tcp->tcp_ibsegs == 0);
8010 ASSERT(tcp->tcp_obsegs == 0);
8011
8012 if (tcp->tcp_iphc != NULL) {
8013 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8014 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
8015 }
8016
8017 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */
8018 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */
8019 DONTCARE(tcp->tcp_ipha);
8020 DONTCARE(tcp->tcp_ip6h);
8021 DONTCARE(tcp->tcp_ip_hdr_len);
8022 DONTCARE(tcp->tcp_tcph);
8023 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */
8024 tcp->tcp_valid_bits = 0;
8025
8026 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */
8027 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */
8028 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */
8029 tcp->tcp_last_rcv_lbolt = 0;
8030
8031 tcp->tcp_init_cwnd = 0;
8032
8033 tcp->tcp_urp_last_valid = 0;
8034 tcp->tcp_hard_binding = 0;
8035 tcp->tcp_hard_bound = 0;
8036 PRESERVE(tcp->tcp_cred);
8037 PRESERVE(tcp->tcp_cpid);
8038 PRESERVE(tcp->tcp_open_time);
8039 PRESERVE(tcp->tcp_exclbind);
8040
8041 tcp->tcp_fin_acked = 0;
8042 tcp->tcp_fin_rcvd = 0;
8043 tcp->tcp_fin_sent = 0;
8044 tcp->tcp_ordrel_done = 0;
8045
8046 tcp->tcp_debug = 0;
8047 tcp->tcp_dontroute = 0;
8048 tcp->tcp_broadcast = 0;
8049
8050 tcp->tcp_useloopback = 0;
8051 tcp->tcp_reuseaddr = 0;
8052 tcp->tcp_oobinline = 0;
8053 tcp->tcp_dgram_errind = 0;
8054
8055 tcp->tcp_detached = 0;
8056 tcp->tcp_bind_pending = 0;
8057 tcp->tcp_unbind_pending = 0;
8058 tcp->tcp_deferred_clean_death = 0;
8059
8060 tcp->tcp_snd_ws_ok = B_FALSE;
8061 tcp->tcp_snd_ts_ok = B_FALSE;
8062 tcp->tcp_linger = 0;
8063 tcp->tcp_ka_enabled = 0;
8064 tcp->tcp_zero_win_probe = 0;
8065
8066 tcp->tcp_loopback = 0;
8067 tcp->tcp_localnet = 0;
8068 tcp->tcp_syn_defense = 0;
8069 tcp->tcp_set_timer = 0;
8070
8071 tcp->tcp_active_open = 0;
8072 ASSERT(tcp->tcp_timeout == B_FALSE);
8073 tcp->tcp_rexmit = B_FALSE;
8074 tcp->tcp_xmit_zc_clean = B_FALSE;
8075
8076 tcp->tcp_snd_sack_ok = B_FALSE;
8077 PRESERVE(tcp->tcp_recvdstaddr);
8078 tcp->tcp_hwcksum = B_FALSE;
8079
8080 tcp->tcp_ire_ill_check_done = B_FALSE;
8081 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */
8082
8083 tcp->tcp_mdt = B_FALSE;
8084 tcp->tcp_mdt_hdr_head = 0;
8085 tcp->tcp_mdt_hdr_tail = 0;
8086
8087 tcp->tcp_conn_def_q0 = 0;
8088 tcp->tcp_ip_forward_progress = B_FALSE;
8089 tcp->tcp_anon_priv_bind = 0;
8090 tcp->tcp_ecn_ok = B_FALSE;
8091
8092 tcp->tcp_cwr = B_FALSE;
8093 tcp->tcp_ecn_echo_on = B_FALSE;
8094
8095 if (tcp->tcp_sack_info != NULL) {
8096 if (tcp->tcp_notsack_list != NULL) {
8097 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
8098 }
8099 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
8100 tcp->tcp_sack_info = NULL;
8101 }
8102
8103 tcp->tcp_rcv_ws = 0;
8104 tcp->tcp_snd_ws = 0;
8105 tcp->tcp_ts_recent = 0;
8106 tcp->tcp_rnxt = 0; /* Displayed in mib */
8107 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */
8108 tcp->tcp_if_mtu = 0;
8109
8110 ASSERT(tcp->tcp_reass_head == NULL);
8111 ASSERT(tcp->tcp_reass_tail == NULL);
8112
8113 tcp->tcp_cwnd_cnt = 0;
8114
8115 ASSERT(tcp->tcp_rcv_list == NULL);
8116 ASSERT(tcp->tcp_rcv_last_head == NULL);
8117 ASSERT(tcp->tcp_rcv_last_tail == NULL);
8118 ASSERT(tcp->tcp_rcv_cnt == 0);
8119
8120 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */
8121 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */
8122 tcp->tcp_csuna = 0;
8123
8124 tcp->tcp_rto = 0; /* Displayed in MIB */
8125 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
8126 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
8127 tcp->tcp_rtt_update = 0;
8128
8129 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
8130 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
8131
8132 tcp->tcp_rack = 0; /* Displayed in mib */
8133 tcp->tcp_rack_cnt = 0;
8134 tcp->tcp_rack_cur_max = 0;
8135 tcp->tcp_rack_abs_max = 0;
8136
8137 tcp->tcp_max_swnd = 0;
8138
8139 ASSERT(tcp->tcp_listener == NULL);
8140
8141 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */
8142
8143 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */
8144 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */
8145 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */
8146 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */
8147
8148 ASSERT(tcp->tcp_conn_req_cnt_q == 0);
8149 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
8150 PRESERVE(tcp->tcp_conn_req_max);
8151 PRESERVE(tcp->tcp_conn_req_seqnum);
8152
8153 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */
8154 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
8155 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
8156 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
8157 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
8158
8159 tcp->tcp_lingertime = 0;
8160
8161 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */
8162 ASSERT(tcp->tcp_urp_mp == NULL);
8163 ASSERT(tcp->tcp_urp_mark_mp == NULL);
8164 ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
8165
8166 ASSERT(tcp->tcp_eager_next_q == NULL);
8167 ASSERT(tcp->tcp_eager_last_q == NULL);
8168 ASSERT((tcp->tcp_eager_next_q0 == NULL &&
8169 tcp->tcp_eager_prev_q0 == NULL) ||
8170 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
8171 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
8172
8173 ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
8174 tcp->tcp_eager_prev_drop_q0 == NULL) ||
8175 tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
8176
8177 tcp->tcp_client_errno = 0;
8178
8179 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */
8180
8181 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */
8182
8183 PRESERVE(tcp->tcp_bound_source_v6);
8184 tcp->tcp_last_sent_len = 0;
8185 tcp->tcp_dupack_cnt = 0;
8186
8187 tcp->tcp_fport = 0; /* Displayed in MIB */
8188 PRESERVE(tcp->tcp_lport);
8189
8190 PRESERVE(tcp->tcp_acceptor_lockp);
8191
8192 ASSERT(tcp->tcp_ordrelid == 0);
8193 PRESERVE(tcp->tcp_acceptor_id);
8194 DONTCARE(tcp->tcp_ipsec_overhead);
8195
8196 /*
8197 * If tcp_tracing flag is ON (i.e. We have a trace buffer
8198 * in tcp structure and now tracing), Re-initialize all
8199 * members of tcp_traceinfo.
8200 */
8201 if (tcp->tcp_tracebuf != NULL) {
8202 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t));
8203 }
8204
8205 PRESERVE(tcp->tcp_family);
8206 if (tcp->tcp_family == AF_INET6) {
8207 tcp->tcp_ipversion = IPV6_VERSION;
8208 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
8209 } else {
8210 tcp->tcp_ipversion = IPV4_VERSION;
8211 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
8212 }
8213
8214 tcp->tcp_bound_if = 0;
8215 tcp->tcp_ipv6_recvancillary = 0;
8216 tcp->tcp_recvifindex = 0;
8217 tcp->tcp_recvhops = 0;
8218 tcp->tcp_closed = 0;
8219 tcp->tcp_cleandeathtag = 0;
8220 if (tcp->tcp_hopopts != NULL) {
8221 mi_free(tcp->tcp_hopopts);
8222 tcp->tcp_hopopts = NULL;
8223 tcp->tcp_hopoptslen = 0;
8224 }
8225 ASSERT(tcp->tcp_hopoptslen == 0);
8226 if (tcp->tcp_dstopts != NULL) {
8227 mi_free(tcp->tcp_dstopts);
8228 tcp->tcp_dstopts = NULL;
8229 tcp->tcp_dstoptslen = 0;
8230 }
8231 ASSERT(tcp->tcp_dstoptslen == 0);
8232 if (tcp->tcp_rtdstopts != NULL) {
8233 mi_free(tcp->tcp_rtdstopts);
8234 tcp->tcp_rtdstopts = NULL;
8235 tcp->tcp_rtdstoptslen = 0;
8236 }
8237 ASSERT(tcp->tcp_rtdstoptslen == 0);
8238 if (tcp->tcp_rthdr != NULL) {
8239 mi_free(tcp->tcp_rthdr);
8240 tcp->tcp_rthdr = NULL;
8241 tcp->tcp_rthdrlen = 0;
8242 }
8243 ASSERT(tcp->tcp_rthdrlen == 0);
8244 PRESERVE(tcp->tcp_drop_opt_ack_cnt);
8245
8246 /* Reset fusion-related fields */
8247 tcp->tcp_fused = B_FALSE;
8248 tcp->tcp_unfusable = B_FALSE;
8249 tcp->tcp_fused_sigurg = B_FALSE;
8250 tcp->tcp_direct_sockfs = B_FALSE;
8251 tcp->tcp_fuse_syncstr_stopped = B_FALSE;
8252 tcp->tcp_fuse_syncstr_plugged = B_FALSE;
8253 tcp->tcp_loopback_peer = NULL;
8254 tcp->tcp_fuse_rcv_hiwater = 0;
8255 tcp->tcp_fuse_rcv_unread_hiwater = 0;
8256 tcp->tcp_fuse_rcv_unread_cnt = 0;
8257
8258 tcp->tcp_lso = B_FALSE;
8259
8260 tcp->tcp_in_ack_unsent = 0;
8261 tcp->tcp_cork = B_FALSE;
8262 tcp->tcp_tconnind_started = B_FALSE;
8263
8264 PRESERVE(tcp->tcp_squeue_bytes);
8265
8266 ASSERT(tcp->tcp_kssl_ctx == NULL);
8267 ASSERT(!tcp->tcp_kssl_pending);
8268 PRESERVE(tcp->tcp_kssl_ent);
8269
8270 /* Sodirect */
8271 tcp->tcp_sodirect = NULL;
8272
8273 tcp->tcp_closemp_used = B_FALSE;
8274
8275 #ifdef DEBUG
8276 DONTCARE(tcp->tcmp_stk[0]);
8277 #endif
8278
8279
8280 #undef DONTCARE
8281 #undef PRESERVE
8282 }
8283
8284 /*
8285 * Allocate necessary resources and initialize state vector.
8286 * Guaranteed not to fail so that when an error is returned,
8287 * the caller doesn't need to do any additional cleanup.
8288 */
8289 int
8290 tcp_init(tcp_t *tcp, queue_t *q)
8291 {
8292 int err;
8293
8294 tcp->tcp_rq = q;
8295 tcp->tcp_wq = WR(q);
8296 tcp->tcp_state = TCPS_IDLE;
8297 if ((err = tcp_init_values(tcp)) != 0)
8298 tcp_timers_stop(tcp);
8299 return (err);
8300 }
8301
8302 static int
8303 tcp_init_values(tcp_t *tcp)
8304 {
8305 int err;
8306 tcp_stack_t *tcps = tcp->tcp_tcps;
8307
8308 ASSERT((tcp->tcp_family == AF_INET &&
8309 tcp->tcp_ipversion == IPV4_VERSION) ||
8310 (tcp->tcp_family == AF_INET6 &&
8311 (tcp->tcp_ipversion == IPV4_VERSION ||
8312 tcp->tcp_ipversion == IPV6_VERSION)));
8313
8314 /*
8315 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
8316 * will be close to tcp_rexmit_interval_initial. By doing this, we
8317 * allow the algorithm to adjust slowly to large fluctuations of RTT
8318 * during first few transmissions of a connection as seen in slow
8319 * links.
8320 */
8321 tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
8322 tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
8323 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
8324 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
8325 tcps->tcps_conn_grace_period;
8326 if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
8327 tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
8328 tcp->tcp_timer_backoff = 0;
8329 tcp->tcp_ms_we_have_waited = 0;
8330 tcp->tcp_last_recv_time = lbolt;
8331 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
8332 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
8333 tcp->tcp_snd_burst = TCP_CWND_INFINITE;
8334
8335 tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier;
8336
8337 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
8338 tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
8339 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
8340 /*
8341 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
8342 * passive open.
8343 */
8344 tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
8345
8346 tcp->tcp_naglim = tcps->tcps_naglim_def;
8347
8348 /* NOTE: ISS is now set in tcp_adapt_ire(). */
8349
8350 tcp->tcp_mdt_hdr_head = 0;
8351 tcp->tcp_mdt_hdr_tail = 0;
8352
8353 /* Reset fusion-related fields */
8354 tcp->tcp_fused = B_FALSE;
8355 tcp->tcp_unfusable = B_FALSE;
8356 tcp->tcp_fused_sigurg = B_FALSE;
8357 tcp->tcp_direct_sockfs = B_FALSE;
8358 tcp->tcp_fuse_syncstr_stopped = B_FALSE;
8359 tcp->tcp_fuse_syncstr_plugged = B_FALSE;
8360 tcp->tcp_loopback_peer = NULL;
8361 tcp->tcp_fuse_rcv_hiwater = 0;
8362 tcp->tcp_fuse_rcv_unread_hiwater = 0;
8363 tcp->tcp_fuse_rcv_unread_cnt = 0;
8364
8365 /* Sodirect */
8366 tcp->tcp_sodirect = NULL;
8367
8368 /* Initialize the header template */
8369 if (tcp->tcp_ipversion == IPV4_VERSION) {
8370 err = tcp_header_init_ipv4(tcp);
8371 } else {
8372 err = tcp_header_init_ipv6(tcp);
8373 }
8374 if (err)
8375 return (err);
8376
8377 /*
8378 * Init the window scale to the max so tcp_rwnd_set() won't pare
8379 * down tcp_rwnd. tcp_adapt_ire() will set the right value later.
8380 */
8381 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
8382 tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat;
8383 tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat;
8384
8385 tcp->tcp_cork = B_FALSE;
8386 /*
8387 * Init the tcp_debug option. This value determines whether TCP
8388 * calls strlog() to print out debug messages. Doing this
8389 * initialization here means that this value is not inherited thru
8390 * tcp_reinit().
8391 */
8392 tcp->tcp_debug = tcps->tcps_dbg;
8393
8394 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
8395 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
8396
8397 return (0);
8398 }
8399
8400 /*
8401 * Initialize the IPv4 header. Loses any record of any IP options.
8402 */
8403 static int
8404 tcp_header_init_ipv4(tcp_t *tcp)
8405 {
8406 tcph_t *tcph;
8407 uint32_t sum;
8408 conn_t *connp;
8409 tcp_stack_t *tcps = tcp->tcp_tcps;
8410
8411 /*
8412 * This is a simple initialization. If there's
8413 * already a template, it should never be too small,
8414 * so reuse it. Otherwise, allocate space for the new one.
8415 */
8416 if (tcp->tcp_iphc == NULL) {
8417 ASSERT(tcp->tcp_iphc_len == 0);
8418 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8419 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8420 if (tcp->tcp_iphc == NULL) {
8421 tcp->tcp_iphc_len = 0;
8422 return (ENOMEM);
8423 }
8424 }
8425
8426 /* options are gone; may need a new label */
8427 connp = tcp->tcp_connp;
8428 connp->conn_mlp_type = mlptSingle;
8429 connp->conn_ulp_labeled = !is_system_labeled();
8430 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8431 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
8432 tcp->tcp_ip6h = NULL;
8433 tcp->tcp_ipversion = IPV4_VERSION;
8434 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t);
8435 tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8436 tcp->tcp_ip_hdr_len = sizeof (ipha_t);
8437 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t));
8438 tcp->tcp_ipha->ipha_version_and_hdr_length
8439 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS;
8440 tcp->tcp_ipha->ipha_ident = 0;
8441
8442 tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
8443 tcp->tcp_tos = 0;
8444 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
8445 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
8446 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP;
8447
8448 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t));
8449 tcp->tcp_tcph = tcph;
8450 tcph->th_offset_and_rsrvd[0] = (5 << 4);
8451 /*
8452 * IP wants our header length in the checksum field to
8453 * allow it to perform a single pseudo-header+checksum
8454 * calculation on behalf of TCP.
8455 * Include the adjustment for a source route once IP_OPTIONS is set.
8456 */
8457 sum = sizeof (tcph_t) + tcp->tcp_sum;
8458 sum = (sum >> 16) + (sum & 0xFFFF);
8459 U16_TO_ABE16(sum, tcph->th_sum);
8460 return (0);
8461 }
8462
8463 /*
8464 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers.
8465 */
8466 static int
8467 tcp_header_init_ipv6(tcp_t *tcp)
8468 {
8469 tcph_t *tcph;
8470 uint32_t sum;
8471 conn_t *connp;
8472 tcp_stack_t *tcps = tcp->tcp_tcps;
8473
8474 /*
8475 * This is a simple initialization. If there's
8476 * already a template, it should never be too small,
8477 * so reuse it. Otherwise, allocate space for the new one.
8478 * Ensure that there is enough space to "downgrade" the tcp_t
8479 * to an IPv4 tcp_t. This requires having space for a full load
8480 * of IPv4 options, as well as a full load of TCP options
8481 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space
8482 * than a v6 header and a TCP header with a full load of TCP options
8483 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes).
8484 * We want to avoid reallocation in the "downgraded" case when
8485 * processing outbound IPv4 options.
8486 */
8487 if (tcp->tcp_iphc == NULL) {
8488 ASSERT(tcp->tcp_iphc_len == 0);
8489 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8490 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8491 if (tcp->tcp_iphc == NULL) {
8492 tcp->tcp_iphc_len = 0;
8493 return (ENOMEM);
8494 }
8495 }
8496
8497 /* options are gone; may need a new label */
8498 connp = tcp->tcp_connp;
8499 connp->conn_mlp_type = mlptSingle;
8500 connp->conn_ulp_labeled = !is_system_labeled();
8501
8502 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8503 tcp->tcp_ipversion = IPV6_VERSION;
8504 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t);
8505 tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8506 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN;
8507 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
8508 tcp->tcp_ipha = NULL;
8509
8510 /* Initialize the header template */
8511
8512 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
8513 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t));
8514 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP;
8515 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit;
8516
8517 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN);
8518 tcp->tcp_tcph = tcph;
8519 tcph->th_offset_and_rsrvd[0] = (5 << 4);
8520 /*
8521 * IP wants our header length in the checksum field to
8522 * allow it to perform a single psuedo-header+checksum
8523 * calculation on behalf of TCP.
8524 * Include the adjustment for a source route when IPV6_RTHDR is set.
8525 */
8526 sum = sizeof (tcph_t) + tcp->tcp_sum;
8527 sum = (sum >> 16) + (sum & 0xFFFF);
8528 U16_TO_ABE16(sum, tcph->th_sum);
8529 return (0);
8530 }
8531
8532 /* At minimum we need 8 bytes in the TCP header for the lookup */
8533 #define ICMP_MIN_TCP_HDR 8
8534
8535 /*
8536 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages
8537 * passed up by IP. The message is always received on the correct tcp_t.
8538 * Assumes that IP has pulled up everything up to and including the ICMP header.
8539 */
8540 void
8541 tcp_icmp_error(tcp_t *tcp, mblk_t *mp)
8542 {
8543 icmph_t *icmph;
8544 ipha_t *ipha;
8545 int iph_hdr_length;
8546 tcph_t *tcph;
8547 boolean_t ipsec_mctl = B_FALSE;
8548 boolean_t secure;
8549 mblk_t *first_mp = mp;
8550 uint32_t new_mss;
8551 uint32_t ratio;
8552 size_t mp_size = MBLKL(mp);
8553 uint32_t seg_seq;
8554 tcp_stack_t *tcps = tcp->tcp_tcps;
8555
8556 /* Assume IP provides aligned packets - otherwise toss */
8557 if (!OK_32PTR(mp->b_rptr)) {
8558 freemsg(mp);
8559 return;
8560 }
8561
8562 /*
8563 * Since ICMP errors are normal data marked with M_CTL when sent
8564 * to TCP or UDP, we have to look for a IPSEC_IN value to identify
8565 * packets starting with an ipsec_info_t, see ipsec_info.h.
8566 */
8567 if ((mp_size == sizeof (ipsec_info_t)) &&
8568 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) {
8569 ASSERT(mp->b_cont != NULL);
8570 mp = mp->b_cont;
8571 /* IP should have done this */
8572 ASSERT(OK_32PTR(mp->b_rptr));
8573 mp_size = MBLKL(mp);
8574 ipsec_mctl = B_TRUE;
8575 }
8576
8577 /*
8578 * Verify that we have a complete outer IP header. If not, drop it.
8579 */
8580 if (mp_size < sizeof (ipha_t)) {
8581 noticmpv4:
8582 freemsg(first_mp);
8583 return;
8584 }
8585
8586 ipha = (ipha_t *)mp->b_rptr;
8587 /*
8588 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
8589 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
8590 */
8591 switch (IPH_HDR_VERSION(ipha)) {
8592 case IPV6_VERSION:
8593 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl);
8594 return;
8595 case IPV4_VERSION:
8596 break;
8597 default:
8598 goto noticmpv4;
8599 }
8600
8601 /* Skip past the outer IP and ICMP headers */
8602 iph_hdr_length = IPH_HDR_LENGTH(ipha);
8603 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
8604 /*
8605 * If we don't have the correct outer IP header length or if the ULP
8606 * is not IPPROTO_ICMP or if we don't have a complete inner IP header
8607 * send it upstream.
8608 */
8609 if (iph_hdr_length < sizeof (ipha_t) ||
8610 ipha->ipha_protocol != IPPROTO_ICMP ||
8611 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
8612 goto noticmpv4;
8613 }
8614 ipha = (ipha_t *)&icmph[1];
8615
8616 /* Skip past the inner IP and find the ULP header */
8617 iph_hdr_length = IPH_HDR_LENGTH(ipha);
8618 tcph = (tcph_t *)((char *)ipha + iph_hdr_length);
8619 /*
8620 * If we don't have the correct inner IP header length or if the ULP
8621 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
8622 * bytes of TCP header, drop it.
8623 */
8624 if (iph_hdr_length < sizeof (ipha_t) ||
8625 ipha->ipha_protocol != IPPROTO_TCP ||
8626 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) {
8627 goto noticmpv4;
8628 }
8629
8630 if (TCP_IS_DETACHED_NONEAGER(tcp)) {
8631 if (ipsec_mctl) {
8632 secure = ipsec_in_is_secure(first_mp);
8633 } else {
8634 secure = B_FALSE;
8635 }
8636 if (secure) {
8637 /*
8638 * If we are willing to accept this in clear
8639 * we don't have to verify policy.
8640 */
8641 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) {
8642 if (!tcp_check_policy(tcp, first_mp,
8643 ipha, NULL, secure, ipsec_mctl)) {
8644 /*
8645 * tcp_check_policy called
8646 * ip_drop_packet() on failure.
8647 */
8648 return;
8649 }
8650 }
8651 }
8652 } else if (ipsec_mctl) {
8653 /*
8654 * This is a hard_bound connection. IP has already
8655 * verified policy. We don't have to do it again.
8656 */
8657 freeb(first_mp);
8658 first_mp = mp;
8659 ipsec_mctl = B_FALSE;
8660 }
8661
8662 seg_seq = ABE32_TO_U32(tcph->th_seq);
8663 /*
8664 * TCP SHOULD check that the TCP sequence number contained in
8665 * payload of the ICMP error message is within the range
8666 * SND.UNA <= SEG.SEQ < SND.NXT.
8667 */
8668 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
8669 /*
8670 * If the ICMP message is bogus, should we kill the
8671 * connection, or should we just drop the bogus ICMP
8672 * message? It would probably make more sense to just
8673 * drop the message so that if this one managed to get
8674 * in, the real connection should not suffer.
8675 */
8676 goto noticmpv4;
8677 }
8678
8679 switch (icmph->icmph_type) {
8680 case ICMP_DEST_UNREACHABLE:
8681 switch (icmph->icmph_code) {
8682 case ICMP_FRAGMENTATION_NEEDED:
8683 /*
8684 * Reduce the MSS based on the new MTU. This will
8685 * eliminate any fragmentation locally.
8686 * N.B. There may well be some funny side-effects on
8687 * the local send policy and the remote receive policy.
8688 * Pending further research, we provide
8689 * tcp_ignore_path_mtu just in case this proves
8690 * disastrous somewhere.
8691 *
8692 * After updating the MSS, retransmit part of the
8693 * dropped segment using the new mss by calling
8694 * tcp_wput_data(). Need to adjust all those
8695 * params to make sure tcp_wput_data() work properly.
8696 */
8697 if (tcps->tcps_ignore_path_mtu)
8698 break;
8699
8700 /*
8701 * Decrease the MSS by time stamp options
8702 * IP options and IPSEC options. tcp_hdr_len
8703 * includes time stamp option and IP option
8704 * length.
8705 */
8706
8707 new_mss = ntohs(icmph->icmph_du_mtu) -
8708 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead;
8709
8710 /*
8711 * Only update the MSS if the new one is
8712 * smaller than the previous one. This is
8713 * to avoid problems when getting multiple
8714 * ICMP errors for the same MTU.
8715 */
8716 if (new_mss >= tcp->tcp_mss)
8717 break;
8718
8719 /*
8720 * Stop doing PMTU if new_mss is less than 68
8721 * or less than tcp_mss_min.
8722 * The value 68 comes from rfc 1191.
8723 */
8724 if (new_mss < MAX(68, tcps->tcps_mss_min))
8725 tcp->tcp_ipha->ipha_fragment_offset_and_flags =
8726 0;
8727
8728 ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8729 ASSERT(ratio >= 1);
8730 tcp_mss_set(tcp, new_mss, B_TRUE);
8731
8732 /*
8733 * Make sure we have something to
8734 * send.
8735 */
8736 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8737 (tcp->tcp_xmit_head != NULL)) {
8738 /*
8739 * Shrink tcp_cwnd in
8740 * proportion to the old MSS/new MSS.
8741 */
8742 tcp->tcp_cwnd = ratio * tcp->tcp_mss;
8743 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
8744 (tcp->tcp_unsent == 0)) {
8745 tcp->tcp_rexmit_max = tcp->tcp_fss;
8746 } else {
8747 tcp->tcp_rexmit_max = tcp->tcp_snxt;
8748 }
8749 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
8750 tcp->tcp_rexmit = B_TRUE;
8751 tcp->tcp_dupack_cnt = 0;
8752 tcp->tcp_snd_burst = TCP_CWND_SS;
8753 tcp_ss_rexmit(tcp);
8754 }
8755 break;
8756 case ICMP_PORT_UNREACHABLE:
8757 case ICMP_PROTOCOL_UNREACHABLE:
8758 switch (tcp->tcp_state) {
8759 case TCPS_SYN_SENT:
8760 case TCPS_SYN_RCVD:
8761 /*
8762 * ICMP can snipe away incipient
8763 * TCP connections as long as
8764 * seq number is same as initial
8765 * send seq number.
8766 */
8767 if (seg_seq == tcp->tcp_iss) {
8768 (void) tcp_clean_death(tcp,
8769 ECONNREFUSED, 6);
8770 }
8771 break;
8772 }
8773 break;
8774 case ICMP_HOST_UNREACHABLE:
8775 case ICMP_NET_UNREACHABLE:
8776 /* Record the error in case we finally time out. */
8777 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
8778 tcp->tcp_client_errno = EHOSTUNREACH;
8779 else
8780 tcp->tcp_client_errno = ENETUNREACH;
8781 if (tcp->tcp_state == TCPS_SYN_RCVD) {
8782 if (tcp->tcp_listener != NULL &&
8783 tcp->tcp_listener->tcp_syn_defense) {
8784 /*
8785 * Ditch the half-open connection if we
8786 * suspect a SYN attack is under way.
8787 */
8788 tcp_ip_ire_mark_advice(tcp);
8789 (void) tcp_clean_death(tcp,
8790 tcp->tcp_client_errno, 7);
8791 }
8792 }
8793 break;
8794 default:
8795 break;
8796 }
8797 break;
8798 case ICMP_SOURCE_QUENCH: {
8799 /*
8800 * use a global boolean to control
8801 * whether TCP should respond to ICMP_SOURCE_QUENCH.
8802 * The default is false.
8803 */
8804 if (tcp_icmp_source_quench) {
8805 /*
8806 * Reduce the sending rate as if we got a
8807 * retransmit timeout
8808 */
8809 uint32_t npkt;
8810
8811 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
8812 tcp->tcp_mss;
8813 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
8814 tcp->tcp_cwnd = tcp->tcp_mss;
8815 tcp->tcp_cwnd_cnt = 0;
8816 }
8817 break;
8818 }
8819 }
8820 freemsg(first_mp);
8821 }
8822
8823 /*
8824 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6
8825 * error messages passed up by IP.
8826 * Assumes that IP has pulled up all the extension headers as well
8827 * as the ICMPv6 header.
8828 */
8829 static void
8830 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl)
8831 {
8832 icmp6_t *icmp6;
8833 ip6_t *ip6h;
8834 uint16_t iph_hdr_length;
8835 tcpha_t *tcpha;
8836 uint8_t *nexthdrp;
8837 uint32_t new_mss;
8838 uint32_t ratio;
8839 boolean_t secure;
8840 mblk_t *first_mp = mp;
8841 size_t mp_size;
8842 uint32_t seg_seq;
8843 tcp_stack_t *tcps = tcp->tcp_tcps;
8844
8845 /*
8846 * The caller has determined if this is an IPSEC_IN packet and
8847 * set ipsec_mctl appropriately (see tcp_icmp_error).
8848 */
8849 if (ipsec_mctl)
8850 mp = mp->b_cont;
8851
8852 mp_size = MBLKL(mp);
8853
8854 /*
8855 * Verify that we have a complete IP header. If not, send it upstream.
8856 */
8857 if (mp_size < sizeof (ip6_t)) {
8858 noticmpv6:
8859 freemsg(first_mp);
8860 return;
8861 }
8862
8863 /*
8864 * Verify this is an ICMPV6 packet, else send it upstream.
8865 */
8866 ip6h = (ip6_t *)mp->b_rptr;
8867 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8868 iph_hdr_length = IPV6_HDR_LEN;
8869 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
8870 &nexthdrp) ||
8871 *nexthdrp != IPPROTO_ICMPV6) {
8872 goto noticmpv6;
8873 }
8874 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
8875 ip6h = (ip6_t *)&icmp6[1];
8876 /*
8877 * Verify if we have a complete ICMP and inner IP header.
8878 */
8879 if ((uchar_t *)&ip6h[1] > mp->b_wptr)
8880 goto noticmpv6;
8881
8882 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
8883 goto noticmpv6;
8884 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
8885 /*
8886 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
8887 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the
8888 * packet.
8889 */
8890 if ((*nexthdrp != IPPROTO_TCP) ||
8891 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
8892 goto noticmpv6;
8893 }
8894
8895 /*
8896 * ICMP errors come on the right queue or come on
8897 * listener/global queue for detached connections and
8898 * get switched to the right queue. If it comes on the
8899 * right queue, policy check has already been done by IP
8900 * and thus free the first_mp without verifying the policy.
8901 * If it has come for a non-hard bound connection, we need
8902 * to verify policy as IP may not have done it.
8903 */
8904 if (!tcp->tcp_hard_bound) {
8905 if (ipsec_mctl) {
8906 secure = ipsec_in_is_secure(first_mp);
8907 } else {
8908 secure = B_FALSE;
8909 }
8910 if (secure) {
8911 /*
8912 * If we are willing to accept this in clear
8913 * we don't have to verify policy.
8914 */
8915 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) {
8916 if (!tcp_check_policy(tcp, first_mp,
8917 NULL, ip6h, secure, ipsec_mctl)) {
8918 /*
8919 * tcp_check_policy called
8920 * ip_drop_packet() on failure.
8921 */
8922 return;
8923 }
8924 }
8925 }
8926 } else if (ipsec_mctl) {
8927 /*
8928 * This is a hard_bound connection. IP has already
8929 * verified policy. We don't have to do it again.
8930 */
8931 freeb(first_mp);
8932 first_mp = mp;
8933 ipsec_mctl = B_FALSE;
8934 }
8935
8936 seg_seq = ntohl(tcpha->tha_seq);
8937 /*
8938 * TCP SHOULD check that the TCP sequence number contained in
8939 * payload of the ICMP error message is within the range
8940 * SND.UNA <= SEG.SEQ < SND.NXT.
8941 */
8942 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
8943 /*
8944 * If the ICMP message is bogus, should we kill the
8945 * connection, or should we just drop the bogus ICMP
8946 * message? It would probably make more sense to just
8947 * drop the message so that if this one managed to get
8948 * in, the real connection should not suffer.
8949 */
8950 goto noticmpv6;
8951 }
8952
8953 switch (icmp6->icmp6_type) {
8954 case ICMP6_PACKET_TOO_BIG:
8955 /*
8956 * Reduce the MSS based on the new MTU. This will
8957 * eliminate any fragmentation locally.
8958 * N.B. There may well be some funny side-effects on
8959 * the local send policy and the remote receive policy.
8960 * Pending further research, we provide
8961 * tcp_ignore_path_mtu just in case this proves
8962 * disastrous somewhere.
8963 *
8964 * After updating the MSS, retransmit part of the
8965 * dropped segment using the new mss by calling
8966 * tcp_wput_data(). Need to adjust all those
8967 * params to make sure tcp_wput_data() work properly.
8968 */
8969 if (tcps->tcps_ignore_path_mtu)
8970 break;
8971
8972 /*
8973 * Decrease the MSS by time stamp options
8974 * IP options and IPSEC options. tcp_hdr_len
8975 * includes time stamp option and IP option
8976 * length.
8977 */
8978 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len -
8979 tcp->tcp_ipsec_overhead;
8980
8981 /*
8982 * Only update the MSS if the new one is
8983 * smaller than the previous one. This is
8984 * to avoid problems when getting multiple
8985 * ICMP errors for the same MTU.
8986 */
8987 if (new_mss >= tcp->tcp_mss)
8988 break;
8989
8990 ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8991 ASSERT(ratio >= 1);
8992 tcp_mss_set(tcp, new_mss, B_TRUE);
8993
8994 /*
8995 * Make sure we have something to
8996 * send.
8997 */
8998 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8999 (tcp->tcp_xmit_head != NULL)) {
9000 /*
9001 * Shrink tcp_cwnd in
9002 * proportion to the old MSS/new MSS.
9003 */
9004 tcp->tcp_cwnd = ratio * tcp->tcp_mss;
9005 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
9006 (tcp->tcp_unsent == 0)) {
9007 tcp->tcp_rexmit_max = tcp->tcp_fss;
9008 } else {
9009 tcp->tcp_rexmit_max = tcp->tcp_snxt;
9010 }
9011 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
9012 tcp->tcp_rexmit = B_TRUE;
9013 tcp->tcp_dupack_cnt = 0;
9014 tcp->tcp_snd_burst = TCP_CWND_SS;
9015 tcp_ss_rexmit(tcp);
9016 }
9017 break;
9018
9019 case ICMP6_DST_UNREACH:
9020 switch (icmp6->icmp6_code) {
9021 case ICMP6_DST_UNREACH_NOPORT:
9022 if (((tcp->tcp_state == TCPS_SYN_SENT) ||
9023 (tcp->tcp_state == TCPS_SYN_RCVD)) &&
9024 (seg_seq == tcp->tcp_iss)) {
9025 (void) tcp_clean_death(tcp,
9026 ECONNREFUSED, 8);
9027 }
9028 break;
9029
9030 case ICMP6_DST_UNREACH_ADMIN:
9031 case ICMP6_DST_UNREACH_NOROUTE:
9032 case ICMP6_DST_UNREACH_BEYONDSCOPE:
9033 case ICMP6_DST_UNREACH_ADDR:
9034 /* Record the error in case we finally time out. */
9035 tcp->tcp_client_errno = EHOSTUNREACH;
9036 if (((tcp->tcp_state == TCPS_SYN_SENT) ||
9037 (tcp->tcp_state == TCPS_SYN_RCVD)) &&
9038 (seg_seq == tcp->tcp_iss)) {
9039 if (tcp->tcp_listener != NULL &&
9040 tcp->tcp_listener->tcp_syn_defense) {
9041 /*
9042 * Ditch the half-open connection if we
9043 * suspect a SYN attack is under way.
9044 */
9045 tcp_ip_ire_mark_advice(tcp);
9046 (void) tcp_clean_death(tcp,
9047 tcp->tcp_client_errno, 9);
9048 }
9049 }
9050
9051
9052 break;
9053 default:
9054 break;
9055 }
9056 break;
9057
9058 case ICMP6_PARAM_PROB:
9059 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
9060 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
9061 (uchar_t *)ip6h + icmp6->icmp6_pptr ==
9062 (uchar_t *)nexthdrp) {
9063 if (tcp->tcp_state == TCPS_SYN_SENT ||
9064 tcp->tcp_state == TCPS_SYN_RCVD) {
9065 (void) tcp_clean_death(tcp,
9066 ECONNREFUSED, 10);
9067 }
9068 break;
9069 }
9070 break;
9071
9072 case ICMP6_TIME_EXCEEDED:
9073 default:
9074 break;
9075 }
9076 freemsg(first_mp);
9077 }
9078
9079 /*
9080 * IP recognizes seven kinds of bind requests:
9081 *
9082 * - A zero-length address binds only to the protocol number.
9083 *
9084 * - A 4-byte address is treated as a request to
9085 * validate that the address is a valid local IPv4
9086 * address, appropriate for an application to bind to.
9087 * IP does the verification, but does not make any note
9088 * of the address at this time.
9089 *
9090 * - A 16-byte address contains is treated as a request
9091 * to validate a local IPv6 address, as the 4-byte
9092 * address case above.
9093 *
9094 * - A 16-byte sockaddr_in to validate the local IPv4 address and also
9095 * use it for the inbound fanout of packets.
9096 *
9097 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
9098 * use it for the inbound fanout of packets.
9099 *
9100 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
9101 * information consisting of local and remote addresses
9102 * and ports. In this case, the addresses are both
9103 * validated as appropriate for this operation, and, if
9104 * so, the information is retained for use in the
9105 * inbound fanout.
9106 *
9107 * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
9108 * fanout information, like the 12-byte case above.
9109 *
9110 * IP will also fill in the IRE request mblk with information
9111 * regarding our peer. In all cases, we notify IP of our protocol
9112 * type by appending a single protocol byte to the bind request.
9113 */
9114 static mblk_t *
9115 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length)
9116 {
9117 char *cp;
9118 mblk_t *mp;
9119 struct T_bind_req *tbr;
9120 ipa_conn_t *ac;
9121 ipa6_conn_t *ac6;
9122 sin_t *sin;
9123 sin6_t *sin6;
9124
9125 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
9126 ASSERT((tcp->tcp_family == AF_INET &&
9127 tcp->tcp_ipversion == IPV4_VERSION) ||
9128 (tcp->tcp_family == AF_INET6 &&
9129 (tcp->tcp_ipversion == IPV4_VERSION ||
9130 tcp->tcp_ipversion == IPV6_VERSION)));
9131
9132 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
9133 if (!mp)
9134 return (mp);
9135 mp->b_datap->db_type = M_PROTO;
9136 tbr = (struct T_bind_req *)mp->b_rptr;
9137 tbr->PRIM_type = bind_prim;
9138 tbr->ADDR_offset = sizeof (*tbr);
9139 tbr->CONIND_number = 0;
9140 tbr->ADDR_length = addr_length;
9141 cp = (char *)&tbr[1];
9142 switch (addr_length) {
9143 case sizeof (ipa_conn_t):
9144 ASSERT(tcp->tcp_family == AF_INET);
9145 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
9146
9147 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
9148 if (mp->b_cont == NULL) {
9149 freemsg(mp);
9150 return (NULL);
9151 }
9152 mp->b_cont->b_wptr += sizeof (ire_t);
9153 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
9154
9155 /* cp known to be 32 bit aligned */
9156 ac = (ipa_conn_t *)cp;
9157 ac->ac_laddr = tcp->tcp_ipha->ipha_src;
9158 ac->ac_faddr = tcp->tcp_remote;
9159 ac->ac_fport = tcp->tcp_fport;
9160 ac->ac_lport = tcp->tcp_lport;
9161 tcp->tcp_hard_binding = 1;
9162 break;
9163
9164 case sizeof (ipa6_conn_t):
9165 ASSERT(tcp->tcp_family == AF_INET6);
9166
9167 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
9168 if (mp->b_cont == NULL) {
9169 freemsg(mp);
9170 return (NULL);
9171 }
9172 mp->b_cont->b_wptr += sizeof (ire_t);
9173 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
9174
9175 /* cp known to be 32 bit aligned */
9176 ac6 = (ipa6_conn_t *)cp;
9177 if (tcp->tcp_ipversion == IPV4_VERSION) {
9178 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
9179 &ac6->ac6_laddr);
9180 } else {
9181 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src;
9182 }
9183 ac6->ac6_faddr = tcp->tcp_remote_v6;
9184 ac6->ac6_fport = tcp->tcp_fport;
9185 ac6->ac6_lport = tcp->tcp_lport;
9186 tcp->tcp_hard_binding = 1;
9187 break;
9188
9189 case sizeof (sin_t):
9190 /*
9191 * NOTE: IPV6_ADDR_LEN also has same size.
9192 * Use family to discriminate.
9193 */
9194 if (tcp->tcp_family == AF_INET) {
9195 sin = (sin_t *)cp;
9196
9197 *sin = sin_null;
9198 sin->sin_family = AF_INET;
9199 sin->sin_addr.s_addr = tcp->tcp_bound_source;
9200 sin->sin_port = tcp->tcp_lport;
9201 break;
9202 } else {
9203 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6;
9204 }
9205 break;
9206
9207 case sizeof (sin6_t):
9208 ASSERT(tcp->tcp_family == AF_INET6);
9209 sin6 = (sin6_t *)cp;
9210
9211 *sin6 = sin6_null;
9212 sin6->sin6_family = AF_INET6;
9213 sin6->sin6_addr = tcp->tcp_bound_source_v6;
9214 sin6->sin6_port = tcp->tcp_lport;
9215 break;
9216
9217 case IP_ADDR_LEN:
9218 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
9219 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src;
9220 break;
9221
9222 }
9223 /* Add protocol number to end */
9224 cp[addr_length] = (char)IPPROTO_TCP;
9225 mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
9226 return (mp);
9227 }
9228
9229 /*
9230 * Notify IP that we are having trouble with this connection. IP should
9231 * blow the IRE away and start over.
9232 */
9233 static void
9234 tcp_ip_notify(tcp_t *tcp)
9235 {
9236 struct iocblk *iocp;
9237 ipid_t *ipid;
9238 mblk_t *mp;
9239
9240 /* IPv6 has NUD thus notification to delete the IRE is not needed */
9241 if (tcp->tcp_ipversion == IPV6_VERSION)
9242 return;
9243
9244 mp = mkiocb(IP_IOCTL);
9245 if (mp == NULL)
9246 return;
9247
9248 iocp = (struct iocblk *)mp->b_rptr;
9249 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst);
9250
9251 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI);
9252 if (!mp->b_cont) {
9253 freeb(mp);
9254 return;
9255 }
9256
9257 ipid = (ipid_t *)mp->b_cont->b_rptr;
9258 mp->b_cont->b_wptr += iocp->ioc_count;
9259 bzero(ipid, sizeof (*ipid));
9260 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY;
9261 ipid->ipid_ire_type = IRE_CACHE;
9262 ipid->ipid_addr_offset = sizeof (ipid_t);
9263 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst);
9264 /*
9265 * Note: in the case of source routing we want to blow away the
9266 * route to the first source route hop.
9267 */
9268 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1],
9269 sizeof (tcp->tcp_ipha->ipha_dst));
9270
9271 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
9272 }
9273
9274 /* Unlink and return any mblk that looks like it contains an ire */
9275 static mblk_t *
9276 tcp_ire_mp(mblk_t *mp)
9277 {
9278 mblk_t *prev_mp;
9279
9280 for (;;) {
9281 prev_mp = mp;
9282 mp = mp->b_cont;
9283 if (mp == NULL)
9284 break;
9285 switch (DB_TYPE(mp)) {
9286 case IRE_DB_TYPE:
9287 case IRE_DB_REQ_TYPE:
9288 if (prev_mp != NULL)
9289 prev_mp->b_cont = mp->b_cont;
9290 mp->b_cont = NULL;
9291 return (mp);
9292 default:
9293 break;
9294 }
9295 }
9296 return (mp);
9297 }
9298
9299 /*
9300 * Timer callback routine for keepalive probe. We do a fake resend of
9301 * last ACKed byte. Then set a timer using RTO. When the timer expires,
9302 * check to see if we have heard anything from the other end for the last
9303 * RTO period. If we have, set the timer to expire for another
9304 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
9305 * RTO << 1 and check again when it expires. Keep exponentially increasing
9306 * the timeout if we have not heard from the other side. If for more than
9307 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
9308 * kill the connection unless the keepalive abort threshold is 0. In
9309 * that case, we will probe "forever."
9310 */
9311 static void
9312 tcp_keepalive_killer(void *arg)
9313 {
9314 mblk_t *mp;
9315 conn_t *connp = (conn_t *)arg;
9316 tcp_t *tcp = connp->conn_tcp;
9317 int32_t firetime;
9318 int32_t idletime;
9319 int32_t ka_intrvl;
9320 tcp_stack_t *tcps = tcp->tcp_tcps;
9321
9322 tcp->tcp_ka_tid = 0;
9323
9324 if (tcp->tcp_fused)
9325 return;
9326
9327 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive);
9328 ka_intrvl = tcp->tcp_ka_interval;
9329
9330 /*
9331 * Keepalive probe should only be sent if the application has not
9332 * done a close on the connection.
9333 */
9334 if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
9335 return;
9336 }
9337 /* Timer fired too early, restart it. */
9338 if (tcp->tcp_state < TCPS_ESTABLISHED) {
9339 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9340 MSEC_TO_TICK(ka_intrvl));
9341 return;
9342 }
9343
9344 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time);
9345 /*
9346 * If we have not heard from the other side for a long
9347 * time, kill the connection unless the keepalive abort
9348 * threshold is 0. In that case, we will probe "forever."
9349 */
9350 if (tcp->tcp_ka_abort_thres != 0 &&
9351 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
9352 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop);
9353 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
9354 tcp->tcp_client_errno : ETIMEDOUT, 11);
9355 return;
9356 }
9357
9358 if (tcp->tcp_snxt == tcp->tcp_suna &&
9359 idletime >= ka_intrvl) {
9360 /* Fake resend of last ACKed byte. */
9361 mblk_t *mp1 = allocb(1, BPRI_LO);
9362
9363 if (mp1 != NULL) {
9364 *mp1->b_wptr++ = '\0';
9365 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
9366 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
9367 freeb(mp1);
9368 /*
9369 * if allocation failed, fall through to start the
9370 * timer back.
9371 */
9372 if (mp != NULL) {
9373 TCP_RECORD_TRACE(tcp, mp,
9374 TCP_TRACE_SEND_PKT);
9375 tcp_send_data(tcp, tcp->tcp_wq, mp);
9376 BUMP_MIB(&tcps->tcps_mib,
9377 tcpTimKeepaliveProbe);
9378 if (tcp->tcp_ka_last_intrvl != 0) {
9379 int max;
9380 /*
9381 * We should probe again at least
9382 * in ka_intrvl, but not more than
9383 * tcp_rexmit_interval_max.
9384 */
9385 max = tcps->tcps_rexmit_interval_max;
9386 firetime = MIN(ka_intrvl - 1,
9387 tcp->tcp_ka_last_intrvl << 1);
9388 if (firetime > max)
9389 firetime = max;
9390 } else {
9391 firetime = tcp->tcp_rto;
9392 }
9393 tcp->tcp_ka_tid = TCP_TIMER(tcp,
9394 tcp_keepalive_killer,
9395 MSEC_TO_TICK(firetime));
9396 tcp->tcp_ka_last_intrvl = firetime;
9397 return;
9398 }
9399 }
9400 } else {
9401 tcp->tcp_ka_last_intrvl = 0;
9402 }
9403
9404 /* firetime can be negative if (mp1 == NULL || mp == NULL) */
9405 if ((firetime = ka_intrvl - idletime) < 0) {
9406 firetime = ka_intrvl;
9407 }
9408 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9409 MSEC_TO_TICK(firetime));
9410 }
9411
9412 int
9413 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
9414 {
9415 queue_t *q = tcp->tcp_rq;
9416 int32_t mss = tcp->tcp_mss;
9417 int maxpsz;
9418
9419 if (TCP_IS_DETACHED(tcp))
9420 return (mss);
9421
9422 if (tcp->tcp_fused) {
9423 maxpsz = tcp_fuse_maxpsz_set(tcp);
9424 mss = INFPSZ;
9425 } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) {
9426 /*
9427 * Set the sd_qn_maxpsz according to the socket send buffer
9428 * size, and sd_maxblk to INFPSZ (-1). This will essentially
9429 * instruct the stream head to copyin user data into contiguous
9430 * kernel-allocated buffers without breaking it up into smaller
9431 * chunks. We round up the buffer size to the nearest SMSS.
9432 */
9433 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss);
9434 if (tcp->tcp_kssl_ctx == NULL)
9435 mss = INFPSZ;
9436 else
9437 mss = SSL3_MAX_RECORD_LEN;
9438 } else {
9439 /*
9440 * Set sd_qn_maxpsz to approx half the (receivers) buffer
9441 * (and a multiple of the mss). This instructs the stream
9442 * head to break down larger than SMSS writes into SMSS-
9443 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
9444 */
9445 maxpsz = tcp->tcp_maxpsz * mss;
9446 if (maxpsz > tcp->tcp_xmit_hiwater/2) {
9447 maxpsz = tcp->tcp_xmit_hiwater/2;
9448 /* Round up to nearest mss */
9449 maxpsz = MSS_ROUNDUP(maxpsz, mss);
9450 }
9451 }
9452 (void) setmaxps(q, maxpsz);
9453 tcp->tcp_wq->q_maxpsz = maxpsz;
9454
9455 if (set_maxblk)
9456 (void) mi_set_sth_maxblk(q, mss);
9457
9458 return (mss);
9459 }
9460
9461 /*
9462 * Extract option values from a tcp header. We put any found values into the
9463 * tcpopt struct and return a bitmask saying which options were found.
9464 */
9465 static int
9466 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
9467 {
9468 uchar_t *endp;
9469 int len;
9470 uint32_t mss;
9471 uchar_t *up = (uchar_t *)tcph;
9472 int found = 0;
9473 int32_t sack_len;
9474 tcp_seq sack_begin, sack_end;
9475 tcp_t *tcp;
9476
9477 endp = up + TCP_HDR_LENGTH(tcph);
9478 up += TCP_MIN_HEADER_LENGTH;
9479 while (up < endp) {
9480 len = endp - up;
9481 switch (*up) {
9482 case TCPOPT_EOL:
9483 break;
9484
9485 case TCPOPT_NOP:
9486 up++;
9487 continue;
9488
9489 case TCPOPT_MAXSEG:
9490 if (len < TCPOPT_MAXSEG_LEN ||
9491 up[1] != TCPOPT_MAXSEG_LEN)
9492 break;
9493
9494 mss = BE16_TO_U16(up+2);
9495 /* Caller must handle tcp_mss_min and tcp_mss_max_* */
9496 tcpopt->tcp_opt_mss = mss;
9497 found |= TCP_OPT_MSS_PRESENT;
9498
9499 up += TCPOPT_MAXSEG_LEN;
9500 continue;
9501
9502 case TCPOPT_WSCALE:
9503 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
9504 break;
9505
9506 if (up[2] > TCP_MAX_WINSHIFT)
9507 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
9508 else
9509 tcpopt->tcp_opt_wscale = up[2];
9510 found |= TCP_OPT_WSCALE_PRESENT;
9511
9512 up += TCPOPT_WS_LEN;
9513 continue;
9514
9515 case TCPOPT_SACK_PERMITTED:
9516 if (len < TCPOPT_SACK_OK_LEN ||
9517 up[1] != TCPOPT_SACK_OK_LEN)
9518 break;
9519 found |= TCP_OPT_SACK_OK_PRESENT;
9520 up += TCPOPT_SACK_OK_LEN;
9521 continue;
9522
9523 case TCPOPT_SACK:
9524 if (len <= 2 || up[1] <= 2 || len < up[1])
9525 break;
9526
9527 /* If TCP is not interested in SACK blks... */
9528 if ((tcp = tcpopt->tcp) == NULL) {
9529 up += up[1];
9530 continue;
9531 }
9532 sack_len = up[1] - TCPOPT_HEADER_LEN;
9533 up += TCPOPT_HEADER_LEN;
9534
9535 /*
9536 * If the list is empty, allocate one and assume
9537 * nothing is sack'ed.
9538 */
9539 ASSERT(tcp->tcp_sack_info != NULL);
9540 if (tcp->tcp_notsack_list == NULL) {
9541 tcp_notsack_update(&(tcp->tcp_notsack_list),
9542 tcp->tcp_suna, tcp->tcp_snxt,
9543 &(tcp->tcp_num_notsack_blk),
9544 &(tcp->tcp_cnt_notsack_list));
9545
9546 /*
9547 * Make sure tcp_notsack_list is not NULL.
9548 * This happens when kmem_alloc(KM_NOSLEEP)
9549 * returns NULL.
9550 */
9551 if (tcp->tcp_notsack_list == NULL) {
9552 up += sack_len;
9553 continue;
9554 }
9555 tcp->tcp_fack = tcp->tcp_suna;
9556 }
9557
9558 while (sack_len > 0) {
9559 if (up + 8 > endp) {
9560 up = endp;
9561 break;
9562 }
9563 sack_begin = BE32_TO_U32(up);
9564 up += 4;
9565 sack_end = BE32_TO_U32(up);
9566 up += 4;
9567 sack_len -= 8;
9568 /*
9569 * Bounds checking. Make sure the SACK
9570 * info is within tcp_suna and tcp_snxt.
9571 * If this SACK blk is out of bound, ignore
9572 * it but continue to parse the following
9573 * blks.
9574 */
9575 if (SEQ_LEQ(sack_end, sack_begin) ||
9576 SEQ_LT(sack_begin, tcp->tcp_suna) ||
9577 SEQ_GT(sack_end, tcp->tcp_snxt)) {
9578 continue;
9579 }
9580 tcp_notsack_insert(&(tcp->tcp_notsack_list),
9581 sack_begin, sack_end,
9582 &(tcp->tcp_num_notsack_blk),
9583 &(tcp->tcp_cnt_notsack_list));
9584 if (SEQ_GT(sack_end, tcp->tcp_fack)) {
9585 tcp->tcp_fack = sack_end;
9586 }
9587 }
9588 found |= TCP_OPT_SACK_PRESENT;
9589 continue;
9590
9591 case TCPOPT_TSTAMP:
9592 if (len < TCPOPT_TSTAMP_LEN ||
9593 up[1] != TCPOPT_TSTAMP_LEN)
9594 break;
9595
9596 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
9597 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
9598
9599 found |= TCP_OPT_TSTAMP_PRESENT;
9600
9601 up += TCPOPT_TSTAMP_LEN;
9602 continue;
9603
9604 default:
9605 if (len <= 1 || len < (int)up[1] || up[1] == 0)
9606 break;
9607 up += up[1];
9608 continue;
9609 }
9610 break;
9611 }
9612 return (found);
9613 }
9614
9615 /*
9616 * Set the mss associated with a particular tcp based on its current value,
9617 * and a new one passed in. Observe minimums and maximums, and reset
9618 * other state variables that we want to view as multiples of mss.
9619 *
9620 * This function is called mainly because values like tcp_mss, tcp_cwnd,
9621 * highwater marks etc. need to be initialized or adjusted.
9622 * 1) From tcp_process_options() when the other side's SYN/SYN-ACK
9623 * packet arrives.
9624 * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or
9625 * ICMP6_PACKET_TOO_BIG arrives.
9626 * 3) From tcp_paws_check() if the other side stops sending the timestamp,
9627 * to increase the MSS to use the extra bytes available.
9628 *
9629 * Callers except tcp_paws_check() ensure that they only reduce mss.
9630 */
9631 static void
9632 tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss)
9633 {
9634 uint32_t mss_max;
9635 tcp_stack_t *tcps = tcp->tcp_tcps;
9636
9637 if (tcp->tcp_ipversion == IPV4_VERSION)
9638 mss_max = tcps->tcps_mss_max_ipv4;
9639 else
9640 mss_max = tcps->tcps_mss_max_ipv6;
9641
9642 if (mss < tcps->tcps_mss_min)
9643 mss = tcps->tcps_mss_min;
9644 if (mss > mss_max)
9645 mss = mss_max;
9646 /*
9647 * Unless naglim has been set by our client to
9648 * a non-mss value, force naglim to track mss.
9649 * This can help to aggregate small writes.
9650 */
9651 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
9652 tcp->tcp_naglim = mss;
9653 /*
9654 * TCP should be able to buffer at least 4 MSS data for obvious
9655 * performance reason.
9656 */
9657 if ((mss << 2) > tcp->tcp_xmit_hiwater)
9658 tcp->tcp_xmit_hiwater = mss << 2;
9659
9660 if (do_ss) {
9661 /*
9662 * Either the tcp_cwnd is as yet uninitialized, or mss is
9663 * changing due to a reduction in MTU, presumably as a
9664 * result of a new path component, reset cwnd to its
9665 * "initial" value, as a multiple of the new mss.
9666 */
9667 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial);
9668 } else {
9669 /*
9670 * Called by tcp_paws_check(), the mss increased
9671 * marginally to allow use of space previously taken
9672 * by the timestamp option. It would be inappropriate
9673 * to apply slow start or tcp_init_cwnd values to
9674 * tcp_cwnd, simply adjust to a multiple of the new mss.
9675 */
9676 tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
9677 tcp->tcp_cwnd_cnt = 0;
9678 }
9679 tcp->tcp_mss = mss;
9680 (void) tcp_maxpsz_set(tcp, B_TRUE);
9681 }
9682
9683 /* For /dev/tcp aka AF_INET open */
9684 static int
9685 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
9686 {
9687 return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
9688 }
9689
9690 /* For /dev/tcp6 aka AF_INET6 open */
9691 static int
9692 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
9693 {
9694 return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
9695 }
9696
9697 static int
9698 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
9699 boolean_t isv6)
9700 {
9701 tcp_t *tcp = NULL;
9702 conn_t *connp;
9703 int err;
9704 vmem_t *minor_arena = NULL;
9705 dev_t conn_dev;
9706 zoneid_t zoneid;
9707 tcp_stack_t *tcps = NULL;
9708
9709 if (q->q_ptr != NULL)
9710 return (0);
9711
9712 if (sflag == MODOPEN)
9713 return (EINVAL);
9714
9715 if (!(flag & SO_ACCEPTOR)) {
9716 /*
9717 * Special case for install: miniroot needs to be able to
9718 * access files via NFS as though it were always in the
9719 * global zone.
9720 */
9721 if (credp == kcred && nfs_global_client_only != 0) {
9722 zoneid = GLOBAL_ZONEID;
9723 tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
9724 netstack_tcp;
9725 ASSERT(tcps != NULL);
9726 } else {
9727 netstack_t *ns;
9728
9729 ns = netstack_find_by_cred(credp);
9730 ASSERT(ns != NULL);
9731 tcps = ns->netstack_tcp;
9732 ASSERT(tcps != NULL);
9733
9734 /*
9735 * For exclusive stacks we set the zoneid to zero
9736 * to make TCP operate as if in the global zone.
9737 */
9738 if (tcps->tcps_netstack->netstack_stackid !=
9739 GLOBAL_NETSTACKID)
9740 zoneid = GLOBAL_ZONEID;
9741 else
9742 zoneid = crgetzoneid(credp);
9743 }
9744 /*
9745 * For stackid zero this is done from strplumb.c, but
9746 * non-zero stackids are handled here.
9747 */
9748 if (tcps->tcps_g_q == NULL &&
9749 tcps->tcps_netstack->netstack_stackid !=
9750 GLOBAL_NETSTACKID) {
9751 tcp_g_q_setup(tcps);
9752 }
9753 }
9754
9755 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
9756 ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
9757 minor_arena = ip_minor_arena_la;
9758 } else {
9759 /*
9760 * Either minor numbers in the large arena were exhausted
9761 * or a non socket application is doing the open.
9762 * Try to allocate from the small arena.
9763 */
9764 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
9765 if (tcps != NULL)
9766 netstack_rele(tcps->tcps_netstack);
9767 return (EBUSY);
9768 }
9769 minor_arena = ip_minor_arena_sa;
9770 }
9771 ASSERT(minor_arena != NULL);
9772
9773 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
9774
9775 if (flag & SO_ACCEPTOR) {
9776 /* No netstack_find_by_cred, hence no netstack_rele needed */
9777 ASSERT(tcps == NULL);
9778 q->q_qinfo = &tcp_acceptor_rinit;
9779 /*
9780 * the conn_dev and minor_arena will be subsequently used by
9781 * tcp_wput_accept() and tcpclose_accept() to figure out the
9782 * minor device number for this connection from the q_ptr.
9783 */
9784 RD(q)->q_ptr = (void *)conn_dev;
9785 WR(q)->q_qinfo = &tcp_acceptor_winit;
9786 WR(q)->q_ptr = (void *)minor_arena;
9787 qprocson(q);
9788 return (0);
9789 }
9790
9791 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps);
9792 /*
9793 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
9794 * so we drop it by one.
9795 */
9796 netstack_rele(tcps->tcps_netstack);
9797 if (connp == NULL) {
9798 inet_minor_free(minor_arena, conn_dev);
9799 q->q_ptr = NULL;
9800 return (ENOSR);
9801 }
9802 connp->conn_sqp = IP_SQUEUE_GET(lbolt);
9803 tcp = connp->conn_tcp;
9804
9805 q->q_ptr = WR(q)->q_ptr = connp;
9806 if (isv6) {
9807 connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6);
9808 connp->conn_send = ip_output_v6;
9809 connp->conn_af_isv6 = B_TRUE;
9810 connp->conn_pkt_isv6 = B_TRUE;
9811 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
9812 tcp->tcp_ipversion = IPV6_VERSION;
9813 tcp->tcp_family = AF_INET6;
9814 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
9815 } else {
9816 connp->conn_flags |= IPCL_TCP4;
9817 connp->conn_send = ip_output;
9818 connp->conn_af_isv6 = B_FALSE;
9819 connp->conn_pkt_isv6 = B_FALSE;
9820 tcp->tcp_ipversion = IPV4_VERSION;
9821 tcp->tcp_family = AF_INET;
9822 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
9823 }
9824
9825 /*
9826 * TCP keeps a copy of cred for cache locality reasons but
9827 * we put a reference only once. If connp->conn_cred
9828 * becomes invalid, tcp_cred should also be set to NULL.
9829 */
9830 tcp->tcp_cred = connp->conn_cred = credp;
9831 crhold(connp->conn_cred);
9832 tcp->tcp_cpid = curproc->p_pid;
9833 tcp->tcp_open_time = lbolt64;
9834 connp->conn_zoneid = zoneid;
9835 connp->conn_mlp_type = mlptSingle;
9836 connp->conn_ulp_labeled = !is_system_labeled();
9837 ASSERT(connp->conn_netstack == tcps->tcps_netstack);
9838 ASSERT(tcp->tcp_tcps == tcps);
9839
9840 /*
9841 * If the caller has the process-wide flag set, then default to MAC
9842 * exempt mode. This allows read-down to unlabeled hosts.
9843 */
9844 if (getpflags(NET_MAC_AWARE, credp) != 0)
9845 connp->conn_mac_exempt = B_TRUE;
9846
9847 connp->conn_dev = conn_dev;
9848 connp->conn_minor_arena = minor_arena;
9849
9850 ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
9851 ASSERT(WR(q)->q_qinfo == &tcp_winit);
9852
9853 if (flag & SO_SOCKSTR) {
9854 /*
9855 * No need to insert a socket in tcp acceptor hash.
9856 * If it was a socket acceptor stream, we dealt with
9857 * it above. A socket listener can never accept a
9858 * connection and doesn't need acceptor_id.
9859 */
9860 connp->conn_flags |= IPCL_SOCKET;
9861 tcp->tcp_issocket = 1;
9862 WR(q)->q_qinfo = &tcp_sock_winit;
9863 } else {
9864 #ifdef _ILP32
9865 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
9866 #else
9867 tcp->tcp_acceptor_id = conn_dev;
9868 #endif /* _ILP32 */
9869 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
9870 }
9871
9872 if (tcps->tcps_trace)
9873 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP);
9874
9875 err = tcp_init(tcp, q);
9876 if (err != 0) {
9877 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
9878 tcp_acceptor_hash_remove(tcp);
9879 CONN_DEC_REF(connp);
9880 q->q_ptr = WR(q)->q_ptr = NULL;
9881 return (err);
9882 }
9883
9884 RD(q)->q_hiwat = tcps->tcps_recv_hiwat;
9885 tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
9886
9887 /* Non-zero default values */
9888 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
9889 /*
9890 * Put the ref for TCP. Ref for IP was already put
9891 * by ipcl_conn_create. Also Make the conn_t globally
9892 * visible to walkers
9893 */
9894 mutex_enter(&connp->conn_lock);
9895 CONN_INC_REF_LOCKED(connp);
9896 ASSERT(connp->conn_ref == 2);
9897 connp->conn_state_flags &= ~CONN_INCIPIENT;
9898 mutex_exit(&connp->conn_lock);
9899
9900 qprocson(q);
9901 return (0);
9902 }
9903
9904 /*
9905 * Some TCP options can be "set" by requesting them in the option
9906 * buffer. This is needed for XTI feature test though we do not
9907 * allow it in general. We interpret that this mechanism is more
9908 * applicable to OSI protocols and need not be allowed in general.
9909 * This routine filters out options for which it is not allowed (most)
9910 * and lets through those (few) for which it is. [ The XTI interface
9911 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
9912 * ever implemented will have to be allowed here ].
9913 */
9914 static boolean_t
9915 tcp_allow_connopt_set(int level, int name)
9916 {
9917
9918 switch (level) {
9919 case IPPROTO_TCP:
9920 switch (name) {
9921 case TCP_NODELAY:
9922 return (B_TRUE);
9923 default:
9924 return (B_FALSE);
9925 }
9926 /*NOTREACHED*/
9927 default:
9928 return (B_FALSE);
9929 }
9930 /*NOTREACHED*/
9931 }
9932
9933 /*
9934 * This routine gets default values of certain options whose default
9935 * values are maintained by protocol specific code
9936 */
9937 /* ARGSUSED */
9938 int
9939 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
9940 {
9941 int32_t *i1 = (int32_t *)ptr;
9942 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
9943
9944 switch (level) {
9945 case IPPROTO_TCP:
9946 switch (name) {
9947 case TCP_NOTIFY_THRESHOLD:
9948 *i1 = tcps->tcps_ip_notify_interval;
9949 break;
9950 case TCP_ABORT_THRESHOLD:
9951 *i1 = tcps->tcps_ip_abort_interval;
9952 break;
9953 case TCP_CONN_NOTIFY_THRESHOLD:
9954 *i1 = tcps->tcps_ip_notify_cinterval;
9955 break;
9956 case TCP_CONN_ABORT_THRESHOLD:
9957 *i1 = tcps->tcps_ip_abort_cinterval;
9958 break;
9959 default:
9960 return (-1);
9961 }
9962 break;
9963 case IPPROTO_IP:
9964 switch (name) {
9965 case IP_TTL:
9966 *i1 = tcps->tcps_ipv4_ttl;
9967 break;
9968 default:
9969 return (-1);
9970 }
9971 break;
9972 case IPPROTO_IPV6:
9973 switch (name) {
9974 case IPV6_UNICAST_HOPS:
9975 *i1 = tcps->tcps_ipv6_hoplimit;
9976 break;
9977 default:
9978 return (-1);
9979 }
9980 break;
9981 default:
9982 return (-1);
9983 }
9984 return (sizeof (int));
9985 }
9986
9987
9988 /*
9989 * TCP routine to get the values of options.
9990 */
9991 int
9992 tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
9993 {
9994 int *i1 = (int *)ptr;
9995 conn_t *connp = Q_TO_CONN(q);
9996 tcp_t *tcp = connp->conn_tcp;
9997 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
9998
9999 switch (level) {
10000 case SOL_SOCKET:
10001 switch (name) {
10002 case SO_LINGER: {
10003 struct linger *lgr = (struct linger *)ptr;
10004
10005 lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0;
10006 lgr->l_linger = tcp->tcp_lingertime;
10007 }
10008 return (sizeof (struct linger));
10009 case SO_DEBUG:
10010 *i1 = tcp->tcp_debug ? SO_DEBUG : 0;
10011 break;
10012 case SO_KEEPALIVE:
10013 *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0;
10014 break;
10015 case SO_DONTROUTE:
10016 *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0;
10017 break;
10018 case SO_USELOOPBACK:
10019 *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0;
10020 break;
10021 case SO_BROADCAST:
10022 *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0;
10023 break;
10024 case SO_REUSEADDR:
10025 *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0;
10026 break;
10027 case SO_OOBINLINE:
10028 *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0;
10029 break;
10030 case SO_DGRAM_ERRIND:
10031 *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0;
10032 break;
10033 case SO_TYPE:
10034 *i1 = SOCK_STREAM;
10035 break;
10036 case SO_SNDBUF:
10037 *i1 = tcp->tcp_xmit_hiwater;
10038 break;
10039 case SO_RCVBUF:
10040 *i1 = RD(q)->q_hiwat;
10041 break;
10042 case SO_SND_COPYAVOID:
10043 *i1 = tcp->tcp_snd_zcopy_on ?
10044 SO_SND_COPYAVOID : 0;
10045 break;
10046 case SO_ALLZONES:
10047 *i1 = connp->conn_allzones ? 1 : 0;
10048 break;
10049 case SO_ANON_MLP:
10050 *i1 = connp->conn_anon_mlp;
10051 break;
10052 case SO_MAC_EXEMPT:
10053 *i1 = connp->conn_mac_exempt;
10054 break;
10055 case SO_EXCLBIND:
10056 *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0;
10057 break;
10058 case SO_PROTOTYPE:
10059 *i1 = IPPROTO_TCP;
10060 break;
10061 case SO_DOMAIN:
10062 *i1 = tcp->tcp_family;
10063 break;
10064 default:
10065 return (-1);
10066 }
10067 break;
10068 case IPPROTO_TCP:
10069 switch (name) {
10070 case TCP_NODELAY:
10071 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
10072 break;
10073 case TCP_MAXSEG:
10074 *i1 = tcp->tcp_mss;
10075 break;
10076 case TCP_NOTIFY_THRESHOLD:
10077 *i1 = (int)tcp->tcp_first_timer_threshold;
10078 break;
10079 case TCP_ABORT_THRESHOLD:
10080 *i1 = tcp->tcp_second_timer_threshold;
10081 break;
10082 case TCP_CONN_NOTIFY_THRESHOLD:
10083 *i1 = tcp->tcp_first_ctimer_threshold;
10084 break;
10085 case TCP_CONN_ABORT_THRESHOLD:
10086 *i1 = tcp->tcp_second_ctimer_threshold;
10087 break;
10088 case TCP_RECVDSTADDR:
10089 *i1 = tcp->tcp_recvdstaddr;
10090 break;
10091 case TCP_ANONPRIVBIND:
10092 *i1 = tcp->tcp_anon_priv_bind;
10093 break;
10094 case TCP_EXCLBIND:
10095 *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0;
10096 break;
10097 case TCP_INIT_CWND:
10098 *i1 = tcp->tcp_init_cwnd;
10099 break;
10100 case TCP_KEEPALIVE_THRESHOLD:
10101 *i1 = tcp->tcp_ka_interval;
10102 break;
10103 case TCP_KEEPALIVE_ABORT_THRESHOLD:
10104 *i1 = tcp->tcp_ka_abort_thres;
10105 break;
10106 case TCP_CORK:
10107 *i1 = tcp->tcp_cork;
10108 break;
10109 default:
10110 return (-1);
10111 }
10112 break;
10113 case IPPROTO_IP:
10114 if (tcp->tcp_family != AF_INET)
10115 return (-1);
10116 switch (name) {
10117 case IP_OPTIONS:
10118 case T_IP_OPTIONS: {
10119 /*
10120 * This is compatible with BSD in that in only return
10121 * the reverse source route with the final destination
10122 * as the last entry. The first 4 bytes of the option
10123 * will contain the final destination.
10124 */
10125 int opt_len;
10126
10127 opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha;
10128 opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH;
10129 ASSERT(opt_len >= 0);
10130 /* Caller ensures enough space */
10131 if (opt_len > 0) {
10132 /*
10133 * TODO: Do we have to handle getsockopt on an
10134 * initiator as well?
10135 */
10136 return (ip_opt_get_user(tcp->tcp_ipha, ptr));
10137 }
10138 return (0);
10139 }
10140 case IP_TOS:
10141 case T_IP_TOS:
10142 *i1 = (int)tcp->tcp_ipha->ipha_type_of_service;
10143 break;
10144 case IP_TTL:
10145 *i1 = (int)tcp->tcp_ipha->ipha_ttl;
10146 break;
10147 case IP_NEXTHOP:
10148 /* Handled at IP level */
10149 return (-EINVAL);
10150 default:
10151 return (-1);
10152 }
10153 break;
10154 case IPPROTO_IPV6:
10155 /*
10156 * IPPROTO_IPV6 options are only supported for sockets
10157 * that are using IPv6 on the wire.
10158 */
10159 if (tcp->tcp_ipversion != IPV6_VERSION) {
10160 return (-1);
10161 }
10162 switch (name) {
10163 case IPV6_UNICAST_HOPS:
10164 *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops;
10165 break; /* goto sizeof (int) option return */
10166 case IPV6_BOUND_IF:
10167 /* Zero if not set */
10168 *i1 = tcp->tcp_bound_if;
10169 break; /* goto sizeof (int) option return */
10170 case IPV6_RECVPKTINFO:
10171 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)
10172 *i1 = 1;
10173 else
10174 *i1 = 0;
10175 break; /* goto sizeof (int) option return */
10176 case IPV6_RECVTCLASS:
10177 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)
10178 *i1 = 1;
10179 else
10180 *i1 = 0;
10181 break; /* goto sizeof (int) option return */
10182 case IPV6_RECVHOPLIMIT:
10183 if (tcp->tcp_ipv6_recvancillary &
10184 TCP_IPV6_RECVHOPLIMIT)
10185 *i1 = 1;
10186 else
10187 *i1 = 0;
10188 break; /* goto sizeof (int) option return */
10189 case IPV6_RECVHOPOPTS:
10190 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS)
10191 *i1 = 1;
10192 else
10193 *i1 = 0;
10194 break; /* goto sizeof (int) option return */
10195 case IPV6_RECVDSTOPTS:
10196 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS)
10197 *i1 = 1;
10198 else
10199 *i1 = 0;
10200 break; /* goto sizeof (int) option return */
10201 case _OLD_IPV6_RECVDSTOPTS:
10202 if (tcp->tcp_ipv6_recvancillary &
10203 TCP_OLD_IPV6_RECVDSTOPTS)
10204 *i1 = 1;
10205 else
10206 *i1 = 0;
10207 break; /* goto sizeof (int) option return */
10208 case IPV6_RECVRTHDR:
10209 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR)
10210 *i1 = 1;
10211 else
10212 *i1 = 0;
10213 break; /* goto sizeof (int) option return */
10214 case IPV6_RECVRTHDRDSTOPTS:
10215 if (tcp->tcp_ipv6_recvancillary &
10216 TCP_IPV6_RECVRTDSTOPTS)
10217 *i1 = 1;
10218 else
10219 *i1 = 0;
10220 break; /* goto sizeof (int) option return */
10221 case IPV6_PKTINFO: {
10222 /* XXX assumes that caller has room for max size! */
10223 struct in6_pktinfo *pkti;
10224
10225 pkti = (struct in6_pktinfo *)ptr;
10226 if (ipp->ipp_fields & IPPF_IFINDEX)
10227 pkti->ipi6_ifindex = ipp->ipp_ifindex;
10228 else
10229 pkti->ipi6_ifindex = 0;
10230 if (ipp->ipp_fields & IPPF_ADDR)
10231 pkti->ipi6_addr = ipp->ipp_addr;
10232 else
10233 pkti->ipi6_addr = ipv6_all_zeros;
10234 return (sizeof (struct in6_pktinfo));
10235 }
10236 case IPV6_TCLASS:
10237 if (ipp->ipp_fields & IPPF_TCLASS)
10238 *i1 = ipp->ipp_tclass;
10239 else
10240 *i1 = IPV6_FLOW_TCLASS(
10241 IPV6_DEFAULT_VERS_AND_FLOW);
10242 break; /* goto sizeof (int) option return */
10243 case IPV6_NEXTHOP: {
10244 sin6_t *sin6 = (sin6_t *)ptr;
10245
10246 if (!(ipp->ipp_fields & IPPF_NEXTHOP))
10247 return (0);
10248 *sin6 = sin6_null;
10249 sin6->sin6_family = AF_INET6;
10250 sin6->sin6_addr = ipp->ipp_nexthop;
10251 return (sizeof (sin6_t));
10252 }
10253 case IPV6_HOPOPTS:
10254 if (!(ipp->ipp_fields & IPPF_HOPOPTS))
10255 return (0);
10256 if (ipp->ipp_hopoptslen <= tcp->tcp_label_len)
10257 return (0);
10258 bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len,
10259 ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len);
10260 if (tcp->tcp_label_len > 0) {
10261 ptr[0] = ((char *)ipp->ipp_hopopts)[0];
10262 ptr[1] = (ipp->ipp_hopoptslen -
10263 tcp->tcp_label_len + 7) / 8 - 1;
10264 }
10265 return (ipp->ipp_hopoptslen - tcp->tcp_label_len);
10266 case IPV6_RTHDRDSTOPTS:
10267 if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
10268 return (0);
10269 bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
10270 return (ipp->ipp_rtdstoptslen);
10271 case IPV6_RTHDR:
10272 if (!(ipp->ipp_fields & IPPF_RTHDR))
10273 return (0);
10274 bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
10275 return (ipp->ipp_rthdrlen);
10276 case IPV6_DSTOPTS:
10277 if (!(ipp->ipp_fields & IPPF_DSTOPTS))
10278 return (0);
10279 bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
10280 return (ipp->ipp_dstoptslen);
10281 case IPV6_SRC_PREFERENCES:
10282 return (ip6_get_src_preferences(connp,
10283 (uint32_t *)ptr));
10284 case IPV6_PATHMTU: {
10285 struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr;
10286
10287 if (tcp->tcp_state < TCPS_ESTABLISHED)
10288 return (-1);
10289
10290 return (ip_fill_mtuinfo(&connp->conn_remv6,
10291 connp->conn_fport, mtuinfo,
10292 connp->conn_netstack));
10293 }
10294 default:
10295 return (-1);
10296 }
10297 break;
10298 default:
10299 return (-1);
10300 }
10301 return (sizeof (int));
10302 }
10303
10304 /*
10305 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
10306 * Parameters are assumed to be verified by the caller.
10307 */
10308 /* ARGSUSED */
10309 int
10310 tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
10311 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
10312 void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
10313 {
10314 conn_t *connp = Q_TO_CONN(q);
10315 tcp_t *tcp = connp->conn_tcp;
10316 int *i1 = (int *)invalp;
10317 boolean_t onoff = (*i1 == 0) ? 0 : 1;
10318 boolean_t checkonly;
10319 int reterr;
10320 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
10321
10322 switch (optset_context) {
10323 case SETFN_OPTCOM_CHECKONLY:
10324 checkonly = B_TRUE;
10325 /*
10326 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
10327 * inlen != 0 implies value supplied and
10328 * we have to "pretend" to set it.
10329 * inlen == 0 implies that there is no
10330 * value part in T_CHECK request and just validation
10331 * done elsewhere should be enough, we just return here.
10332 */
10333 if (inlen == 0) {
10334 *outlenp = 0;
10335 return (0);
10336 }
10337 break;
10338 case SETFN_OPTCOM_NEGOTIATE:
10339 checkonly = B_FALSE;
10340 break;
10341 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
10342 case SETFN_CONN_NEGOTIATE:
10343 checkonly = B_FALSE;
10344 /*
10345 * Negotiating local and "association-related" options
10346 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
10347 * primitives is allowed by XTI, but we choose
10348 * to not implement this style negotiation for Internet
10349 * protocols (We interpret it is a must for OSI world but
10350 * optional for Internet protocols) for all options.
10351 * [ Will do only for the few options that enable test
10352 * suites that our XTI implementation of this feature
10353 * works for transports that do allow it ]
10354 */
10355 if (!tcp_allow_connopt_set(level, name)) {
10356 *outlenp = 0;
10357 return (EINVAL);
10358 }
10359 break;
10360 default:
10361 /*
10362 * We should never get here
10363 */
10364 *outlenp = 0;
10365 return (EINVAL);
10366 }
10367
10368 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
10369 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
10370
10371 /*
10372 * For TCP, we should have no ancillary data sent down
10373 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
10374 * has to be zero.
10375 */
10376 ASSERT(thisdg_attrs == NULL);
10377
10378 /*
10379 * For fixed length options, no sanity check
10380 * of passed in length is done. It is assumed *_optcom_req()
10381 * routines do the right thing.
10382 */
10383
10384 switch (level) {
10385 case SOL_SOCKET:
10386 switch (name) {
10387 case SO_LINGER: {
10388 struct linger *lgr = (struct linger *)invalp;
10389
10390 if (!checkonly) {
10391 if (lgr->l_onoff) {
10392 tcp->tcp_linger = 1;
10393 tcp->tcp_lingertime = lgr->l_linger;
10394 } else {
10395 tcp->tcp_linger = 0;
10396 tcp->tcp_lingertime = 0;
10397 }
10398 /* struct copy */
10399 *(struct linger *)outvalp = *lgr;
10400 } else {
10401 if (!lgr->l_onoff) {
10402 ((struct linger *)
10403 outvalp)->l_onoff = 0;
10404 ((struct linger *)
10405 outvalp)->l_linger = 0;
10406 } else {
10407 /* struct copy */
10408 *(struct linger *)outvalp = *lgr;
10409 }
10410 }
10411 *outlenp = sizeof (struct linger);
10412 return (0);
10413 }
10414 case SO_DEBUG:
10415 if (!checkonly)
10416 tcp->tcp_debug = onoff;
10417 break;
10418 case SO_KEEPALIVE:
10419 if (checkonly) {
10420 /* T_CHECK case */
10421 break;
10422 }
10423
10424 if (!onoff) {
10425 if (tcp->tcp_ka_enabled) {
10426 if (tcp->tcp_ka_tid != 0) {
10427 (void) TCP_TIMER_CANCEL(tcp,
10428 tcp->tcp_ka_tid);
10429 tcp->tcp_ka_tid = 0;
10430 }
10431 tcp->tcp_ka_enabled = 0;
10432 }
10433 break;
10434 }
10435 if (!tcp->tcp_ka_enabled) {
10436 /* Crank up the keepalive timer */
10437 tcp->tcp_ka_last_intrvl = 0;
10438 tcp->tcp_ka_tid = TCP_TIMER(tcp,
10439 tcp_keepalive_killer,
10440 MSEC_TO_TICK(tcp->tcp_ka_interval));
10441 tcp->tcp_ka_enabled = 1;
10442 }
10443 break;
10444 case SO_DONTROUTE:
10445 /*
10446 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are
10447 * only of interest to IP. We track them here only so
10448 * that we can report their current value.
10449 */
10450 if (!checkonly) {
10451 tcp->tcp_dontroute = onoff;
10452 tcp->tcp_connp->conn_dontroute = onoff;
10453 }
10454 break;
10455 case SO_USELOOPBACK:
10456 if (!checkonly) {
10457 tcp->tcp_useloopback = onoff;
10458 tcp->tcp_connp->conn_loopback = onoff;
10459 }
10460 break;
10461 case SO_BROADCAST:
10462 if (!checkonly) {
10463 tcp->tcp_broadcast = onoff;
10464 tcp->tcp_connp->conn_broadcast = onoff;
10465 }
10466 break;
10467 case SO_REUSEADDR:
10468 if (!checkonly) {
10469 tcp->tcp_reuseaddr = onoff;
10470 tcp->tcp_connp->conn_reuseaddr = onoff;
10471 }
10472 break;
10473 case SO_OOBINLINE:
10474 if (!checkonly)
10475 tcp->tcp_oobinline = onoff;
10476 break;
10477 case SO_DGRAM_ERRIND:
10478 if (!checkonly)
10479 tcp->tcp_dgram_errind = onoff;
10480 break;
10481 case SO_SNDBUF: {
10482 if (*i1 > tcps->tcps_max_buf) {
10483 *outlenp = 0;
10484 return (ENOBUFS);
10485 }
10486 if (checkonly)
10487 break;
10488
10489 tcp->tcp_xmit_hiwater = *i1;
10490 if (tcps->tcps_snd_lowat_fraction != 0)
10491 tcp->tcp_xmit_lowater =
10492 tcp->tcp_xmit_hiwater /
10493 tcps->tcps_snd_lowat_fraction;
10494 (void) tcp_maxpsz_set(tcp, B_TRUE);
10495 /*
10496 * If we are flow-controlled, recheck the condition.
10497 * There are apps that increase SO_SNDBUF size when
10498 * flow-controlled (EWOULDBLOCK), and expect the flow
10499 * control condition to be lifted right away.
10500 */
10501 mutex_enter(&tcp->tcp_non_sq_lock);
10502 if (tcp->tcp_flow_stopped &&
10503 TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
10504 tcp_clrqfull(tcp);
10505 }
10506 mutex_exit(&tcp->tcp_non_sq_lock);
10507 break;
10508 }
10509 case SO_RCVBUF:
10510 if (*i1 > tcps->tcps_max_buf) {
10511 *outlenp = 0;
10512 return (ENOBUFS);
10513 }
10514 /* Silently ignore zero */
10515 if (!checkonly && *i1 != 0) {
10516 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
10517 (void) tcp_rwnd_set(tcp, *i1);
10518 }
10519 /*
10520 * XXX should we return the rwnd here
10521 * and tcp_opt_get ?
10522 */
10523 break;
10524 case SO_SND_COPYAVOID:
10525 if (!checkonly) {
10526 /* we only allow enable at most once for now */
10527 if (tcp->tcp_loopback ||
10528 (tcp->tcp_kssl_ctx != NULL) ||
10529 (!tcp->tcp_snd_zcopy_aware &&
10530 (onoff != 1 || !tcp_zcopy_check(tcp)))) {
10531 *outlenp = 0;
10532 return (EOPNOTSUPP);
10533 }
10534 tcp->tcp_snd_zcopy_aware = 1;
10535 }
10536 break;
10537 case SO_ALLZONES:
10538 /* Pass option along to IP level for handling */
10539 return (-EINVAL);
10540 case SO_ANON_MLP:
10541 /* Pass option along to IP level for handling */
10542 return (-EINVAL);
10543 case SO_MAC_EXEMPT:
10544 /* Pass option along to IP level for handling */
10545 return (-EINVAL);
10546 case SO_EXCLBIND:
10547 if (!checkonly)
10548 tcp->tcp_exclbind = onoff;
10549 break;
10550 default:
10551 *outlenp = 0;
10552 return (EINVAL);
10553 }
10554 break;
10555 case IPPROTO_TCP:
10556 switch (name) {
10557 case TCP_NODELAY:
10558 if (!checkonly)
10559 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
10560 break;
10561 case TCP_NOTIFY_THRESHOLD:
10562 if (!checkonly)
10563 tcp->tcp_first_timer_threshold = *i1;
10564 break;
10565 case TCP_ABORT_THRESHOLD:
10566 if (!checkonly)
10567 tcp->tcp_second_timer_threshold = *i1;
10568 break;
10569 case TCP_CONN_NOTIFY_THRESHOLD:
10570 if (!checkonly)
10571 tcp->tcp_first_ctimer_threshold = *i1;
10572 break;
10573 case TCP_CONN_ABORT_THRESHOLD:
10574 if (!checkonly)
10575 tcp->tcp_second_ctimer_threshold = *i1;
10576 break;
10577 case TCP_RECVDSTADDR:
10578 if (tcp->tcp_state > TCPS_LISTEN)
10579 return (EOPNOTSUPP);
10580 if (!checkonly)
10581 tcp->tcp_recvdstaddr = onoff;
10582 break;
10583 case TCP_ANONPRIVBIND:
10584 if ((reterr = secpolicy_net_privaddr(cr, 0,
10585 IPPROTO_TCP)) != 0) {
10586 *outlenp = 0;
10587 return (reterr);
10588 }
10589 if (!checkonly) {
10590 tcp->tcp_anon_priv_bind = onoff;
10591 }
10592 break;
10593 case TCP_EXCLBIND:
10594 if (!checkonly)
10595 tcp->tcp_exclbind = onoff;
10596 break; /* goto sizeof (int) option return */
10597 case TCP_INIT_CWND: {
10598 uint32_t init_cwnd = *((uint32_t *)invalp);
10599
10600 if (checkonly)
10601 break;
10602
10603 /*
10604 * Only allow socket with network configuration
10605 * privilege to set the initial cwnd to be larger
10606 * than allowed by RFC 3390.
10607 */
10608 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
10609 tcp->tcp_init_cwnd = init_cwnd;
10610 break;
10611 }
10612 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
10613 *outlenp = 0;
10614 return (reterr);
10615 }
10616 if (init_cwnd > TCP_MAX_INIT_CWND) {
10617 *outlenp = 0;
10618 return (EINVAL);
10619 }
10620 tcp->tcp_init_cwnd = init_cwnd;
10621 break;
10622 }
10623 case TCP_KEEPALIVE_THRESHOLD:
10624 if (checkonly)
10625 break;
10626
10627 if (*i1 < tcps->tcps_keepalive_interval_low ||
10628 *i1 > tcps->tcps_keepalive_interval_high) {
10629 *outlenp = 0;
10630 return (EINVAL);
10631 }
10632 if (*i1 != tcp->tcp_ka_interval) {
10633 tcp->tcp_ka_interval = *i1;
10634 /*
10635 * Check if we need to restart the
10636 * keepalive timer.
10637 */
10638 if (tcp->tcp_ka_tid != 0) {
10639 ASSERT(tcp->tcp_ka_enabled);
10640 (void) TCP_TIMER_CANCEL(tcp,
10641 tcp->tcp_ka_tid);
10642 tcp->tcp_ka_last_intrvl = 0;
10643 tcp->tcp_ka_tid = TCP_TIMER(tcp,
10644 tcp_keepalive_killer,
10645 MSEC_TO_TICK(tcp->tcp_ka_interval));
10646 }
10647 }
10648 break;
10649 case TCP_KEEPALIVE_ABORT_THRESHOLD:
10650 if (!checkonly) {
10651 if (*i1 <
10652 tcps->tcps_keepalive_abort_interval_low ||
10653 *i1 >
10654 tcps->tcps_keepalive_abort_interval_high) {
10655 *outlenp = 0;
10656 return (EINVAL);
10657 }
10658 tcp->tcp_ka_abort_thres = *i1;
10659 }
10660 break;
10661 case TCP_CORK:
10662 if (!checkonly) {
10663 /*
10664 * if tcp->tcp_cork was set and is now
10665 * being unset, we have to make sure that
10666 * the remaining data gets sent out. Also
10667 * unset tcp->tcp_cork so that tcp_wput_data()
10668 * can send data even if it is less than mss
10669 */
10670 if (tcp->tcp_cork && onoff == 0 &&
10671 tcp->tcp_unsent > 0) {
10672 tcp->tcp_cork = B_FALSE;
10673 tcp_wput_data(tcp, NULL, B_FALSE);
10674 }
10675 tcp->tcp_cork = onoff;
10676 }
10677 break;
10678 default:
10679 *outlenp = 0;
10680 return (EINVAL);
10681 }
10682 break;
10683 case IPPROTO_IP:
10684 if (tcp->tcp_family != AF_INET) {
10685 *outlenp = 0;
10686 return (ENOPROTOOPT);
10687 }
10688 switch (name) {
10689 case IP_OPTIONS:
10690 case T_IP_OPTIONS:
10691 reterr = tcp_opt_set_header(tcp, checkonly,
10692 invalp, inlen);
10693 if (reterr) {
10694 *outlenp = 0;
10695 return (reterr);
10696 }
10697 /* OK return - copy input buffer into output buffer */
10698 if (invalp != outvalp) {
10699 /* don't trust bcopy for identical src/dst */
10700 bcopy(invalp, outvalp, inlen);
10701 }
10702 *outlenp = inlen;
10703 return (0);
10704 case IP_TOS:
10705 case T_IP_TOS:
10706 if (!checkonly) {
10707 tcp->tcp_ipha->ipha_type_of_service =
10708 (uchar_t)*i1;
10709 tcp->tcp_tos = (uchar_t)*i1;
10710 }
10711 break;
10712 case IP_TTL:
10713 if (!checkonly) {
10714 tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1;
10715 tcp->tcp_ttl = (uchar_t)*i1;
10716 }
10717 break;
10718 case IP_BOUND_IF:
10719 case IP_NEXTHOP:
10720 /* Handled at the IP level */
10721 return (-EINVAL);
10722 case IP_SEC_OPT:
10723 /*
10724 * We should not allow policy setting after
10725 * we start listening for connections.
10726 */
10727 if (tcp->tcp_state == TCPS_LISTEN) {
10728 return (EINVAL);
10729 } else {
10730 /* Handled at the IP level */
10731 return (-EINVAL);
10732 }
10733 default:
10734 *outlenp = 0;
10735 return (EINVAL);
10736 }
10737 break;
10738 case IPPROTO_IPV6: {
10739 ip6_pkt_t *ipp;
10740
10741 /*
10742 * IPPROTO_IPV6 options are only supported for sockets
10743 * that are using IPv6 on the wire.
10744 */
10745 if (tcp->tcp_ipversion != IPV6_VERSION) {
10746 *outlenp = 0;
10747 return (ENOPROTOOPT);
10748 }
10749 /*
10750 * Only sticky options; no ancillary data
10751 */
10752 ASSERT(thisdg_attrs == NULL);
10753 ipp = &tcp->tcp_sticky_ipp;
10754
10755 switch (name) {
10756 case IPV6_UNICAST_HOPS:
10757 /* -1 means use default */
10758 if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
10759 *outlenp = 0;
10760 return (EINVAL);
10761 }
10762 if (!checkonly) {
10763 if (*i1 == -1) {
10764 tcp->tcp_ip6h->ip6_hops =
10765 ipp->ipp_unicast_hops =
10766 (uint8_t)tcps->tcps_ipv6_hoplimit;
10767 ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
10768 /* Pass modified value to IP. */
10769 *i1 = tcp->tcp_ip6h->ip6_hops;
10770 } else {
10771 tcp->tcp_ip6h->ip6_hops =
10772 ipp->ipp_unicast_hops =
10773 (uint8_t)*i1;
10774 ipp->ipp_fields |= IPPF_UNICAST_HOPS;
10775 }
10776 reterr = tcp_build_hdrs(q, tcp);
10777 if (reterr != 0)
10778 return (reterr);
10779 }
10780 break;
10781 case IPV6_BOUND_IF:
10782 if (!checkonly) {
10783 int error = 0;
10784
10785 tcp->tcp_bound_if = *i1;
10786 error = ip_opt_set_ill(tcp->tcp_connp, *i1,
10787 B_TRUE, checkonly, level, name, mblk);
10788 if (error != 0) {
10789 *outlenp = 0;
10790 return (error);
10791 }
10792 }
10793 break;
10794 /*
10795 * Set boolean switches for ancillary data delivery
10796 */
10797 case IPV6_RECVPKTINFO:
10798 if (!checkonly) {
10799 if (onoff)
10800 tcp->tcp_ipv6_recvancillary |=
10801 TCP_IPV6_RECVPKTINFO;
10802 else
10803 tcp->tcp_ipv6_recvancillary &=
10804 ~TCP_IPV6_RECVPKTINFO;
10805 /* Force it to be sent up with the next msg */
10806 tcp->tcp_recvifindex = 0;
10807 }
10808 break;
10809 case IPV6_RECVTCLASS:
10810 if (!checkonly) {
10811 if (onoff)
10812 tcp->tcp_ipv6_recvancillary |=
10813 TCP_IPV6_RECVTCLASS;
10814 else
10815 tcp->tcp_ipv6_recvancillary &=
10816 ~TCP_IPV6_RECVTCLASS;
10817 }
10818 break;
10819 case IPV6_RECVHOPLIMIT:
10820 if (!checkonly) {
10821 if (onoff)
10822 tcp->tcp_ipv6_recvancillary |=
10823 TCP_IPV6_RECVHOPLIMIT;
10824 else
10825 tcp->tcp_ipv6_recvancillary &=
10826 ~TCP_IPV6_RECVHOPLIMIT;
10827 /* Force it to be sent up with the next msg */
10828 tcp->tcp_recvhops = 0xffffffffU;
10829 }
10830 break;
10831 case IPV6_RECVHOPOPTS:
10832 if (!checkonly) {
10833 if (onoff)
10834 tcp->tcp_ipv6_recvancillary |=
10835 TCP_IPV6_RECVHOPOPTS;
10836 else
10837 tcp->tcp_ipv6_recvancillary &=
10838 ~TCP_IPV6_RECVHOPOPTS;
10839 }
10840 break;
10841 case IPV6_RECVDSTOPTS:
10842 if (!checkonly) {
10843 if (onoff)
10844 tcp->tcp_ipv6_recvancillary |=
10845 TCP_IPV6_RECVDSTOPTS;
10846 else
10847 tcp->tcp_ipv6_recvancillary &=
10848 ~TCP_IPV6_RECVDSTOPTS;
10849 }
10850 break;
10851 case _OLD_IPV6_RECVDSTOPTS:
10852 if (!checkonly) {
10853 if (onoff)
10854 tcp->tcp_ipv6_recvancillary |=
10855 TCP_OLD_IPV6_RECVDSTOPTS;
10856 else
10857 tcp->tcp_ipv6_recvancillary &=
10858 ~TCP_OLD_IPV6_RECVDSTOPTS;
10859 }
10860 break;
10861 case IPV6_RECVRTHDR:
10862 if (!checkonly) {
10863 if (onoff)
10864 tcp->tcp_ipv6_recvancillary |=
10865 TCP_IPV6_RECVRTHDR;
10866 else
10867 tcp->tcp_ipv6_recvancillary &=
10868 ~TCP_IPV6_RECVRTHDR;
10869 }
10870 break;
10871 case IPV6_RECVRTHDRDSTOPTS:
10872 if (!checkonly) {
10873 if (onoff)
10874 tcp->tcp_ipv6_recvancillary |=
10875 TCP_IPV6_RECVRTDSTOPTS;
10876 else
10877 tcp->tcp_ipv6_recvancillary &=
10878 ~TCP_IPV6_RECVRTDSTOPTS;
10879 }
10880 break;
10881 case IPV6_PKTINFO:
10882 if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
10883 return (EINVAL);
10884 if (checkonly)
10885 break;
10886
10887 if (inlen == 0) {
10888 ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
10889 } else {
10890 struct in6_pktinfo *pkti;
10891
10892 pkti = (struct in6_pktinfo *)invalp;
10893 /*
10894 * RFC 3542 states that ipi6_addr must be
10895 * the unspecified address when setting the
10896 * IPV6_PKTINFO sticky socket option on a
10897 * TCP socket.
10898 */
10899 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
10900 return (EINVAL);
10901 /*
10902 * ip6_set_pktinfo() validates the source
10903 * address and interface index.
10904 */
10905 reterr = ip6_set_pktinfo(cr, tcp->tcp_connp,
10906 pkti, mblk);
10907 if (reterr != 0)
10908 return (reterr);
10909 ipp->ipp_ifindex = pkti->ipi6_ifindex;
10910 ipp->ipp_addr = pkti->ipi6_addr;
10911 if (ipp->ipp_ifindex != 0)
10912 ipp->ipp_fields |= IPPF_IFINDEX;
10913 else
10914 ipp->ipp_fields &= ~IPPF_IFINDEX;
10915 if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr))
10916 ipp->ipp_fields |= IPPF_ADDR;
10917 else
10918 ipp->ipp_fields &= ~IPPF_ADDR;
10919 }
10920 reterr = tcp_build_hdrs(q, tcp);
10921 if (reterr != 0)
10922 return (reterr);
10923 break;
10924 case IPV6_TCLASS:
10925 if (inlen != 0 && inlen != sizeof (int))
10926 return (EINVAL);
10927 if (checkonly)
10928 break;
10929
10930 if (inlen == 0) {
10931 ipp->ipp_fields &= ~IPPF_TCLASS;
10932 } else {
10933 if (*i1 > 255 || *i1 < -1)
10934 return (EINVAL);
10935 if (*i1 == -1) {
10936 ipp->ipp_tclass = 0;
10937 *i1 = 0;
10938 } else {
10939 ipp->ipp_tclass = *i1;
10940 }
10941 ipp->ipp_fields |= IPPF_TCLASS;
10942 }
10943 reterr = tcp_build_hdrs(q, tcp);
10944 if (reterr != 0)
10945 return (reterr);
10946 break;
10947 case IPV6_NEXTHOP:
10948 /*
10949 * IP will verify that the nexthop is reachable
10950 * and fail for sticky options.
10951 */
10952 if (inlen != 0 && inlen != sizeof (sin6_t))
10953 return (EINVAL);
10954 if (checkonly)
10955 break;
10956
10957 if (inlen == 0) {
10958 ipp->ipp_fields &= ~IPPF_NEXTHOP;
10959 } else {
10960 sin6_t *sin6 = (sin6_t *)invalp;
10961
10962 if (sin6->sin6_family != AF_INET6)
10963 return (EAFNOSUPPORT);
10964 if (IN6_IS_ADDR_V4MAPPED(
10965 &sin6->sin6_addr))
10966 return (EADDRNOTAVAIL);
10967 ipp->ipp_nexthop = sin6->sin6_addr;
10968 if (!IN6_IS_ADDR_UNSPECIFIED(
10969 &ipp->ipp_nexthop))
10970 ipp->ipp_fields |= IPPF_NEXTHOP;
10971 else
10972 ipp->ipp_fields &= ~IPPF_NEXTHOP;
10973 }
10974 reterr = tcp_build_hdrs(q, tcp);
10975 if (reterr != 0)
10976 return (reterr);
10977 break;
10978 case IPV6_HOPOPTS: {
10979 ip6_hbh_t *hopts = (ip6_hbh_t *)invalp;
10980
10981 /*
10982 * Sanity checks - minimum size, size a multiple of
10983 * eight bytes, and matching size passed in.
10984 */
10985 if (inlen != 0 &&
10986 inlen != (8 * (hopts->ip6h_len + 1)))
10987 return (EINVAL);
10988
10989 if (checkonly)
10990