1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26 /* Copyright (c) 1990 Mentat Inc. */
27
/*
 * Version identification for this file: the same keyword string is handed
 * to #pragma ident and kept in the global tcp_version array.
 */
#pragma ident	"%Z%%M% %I% %E% SMI"
const char	tcp_version[] = "%Z%%M% %I% %E% SMI";
30
31
32 #include <sys/types.h>
33 #include <sys/stream.h>
34 #include <sys/strsun.h>
35 #include <sys/strsubr.h>
36 #include <sys/stropts.h>
37 #include <sys/strlog.h>
38 #include <sys/strsun.h>
39 #define _SUN_TPI_VERSION 2
40 #include <sys/tihdr.h>
41 #include <sys/timod.h>
42 #include <sys/ddi.h>
43 #include <sys/sunddi.h>
44 #include <sys/suntpi.h>
45 #include <sys/xti_inet.h>
46 #include <sys/cmn_err.h>
47 #include <sys/debug.h>
48 #include <sys/sdt.h>
49 #include <sys/vtrace.h>
50 #include <sys/kmem.h>
51 #include <sys/ethernet.h>
52 #include <sys/cpuvar.h>
53 #include <sys/dlpi.h>
54 #include <sys/multidata.h>
55 #include <sys/multidata_impl.h>
56 #include <sys/pattr.h>
57 #include <sys/policy.h>
58 #include <sys/priv.h>
59 #include <sys/zone.h>
60 #include <sys/sunldi.h>
61
62 #include <sys/errno.h>
63 #include <sys/signal.h>
64 #include <sys/socket.h>
65 #include <sys/sockio.h>
66 #include <sys/isa_defs.h>
67 #include <sys/md5.h>
68 #include <sys/random.h>
69 #include <sys/sodirect.h>
70 #include <sys/uio.h>
71 #include <netinet/in.h>
72 #include <netinet/tcp.h>
73 #include <netinet/ip6.h>
74 #include <netinet/icmp6.h>
75 #include <net/if.h>
76 #include <net/route.h>
77 #include <inet/ipsec_impl.h>
78
79 #include <inet/common.h>
80 #include <inet/ip.h>
81 #include <inet/ip_impl.h>
82 #include <inet/ip6.h>
83 #include <inet/ip_ndp.h>
84 #include <inet/mi.h>
85 #include <inet/mib2.h>
86 #include <inet/nd.h>
87 #include <inet/optcom.h>
88 #include <inet/snmpcom.h>
89 #include <inet/kstatcom.h>
90 #include <inet/tcp.h>
91 #include <inet/tcp_impl.h>
92 #include <net/pfkeyv2.h>
93 #include <inet/ipsec_info.h>
94 #include <inet/ipdrop.h>
95 #include <inet/tcp_trace.h>
96
97 #include <inet/ipclassifier.h>
98 #include <inet/ip_ire.h>
99 #include <inet/ip_ftable.h>
100 #include <inet/ip_if.h>
101 #include <inet/ipp_common.h>
102 #include <inet/ip_netinfo.h>
103 #include <sys/squeue.h>
104 #include <inet/kssl/ksslapi.h>
105 #include <sys/tsol/label.h>
106 #include <sys/tsol/tnet.h>
107 #include <rpc/pmap_prot.h>
108
109 /*
110 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
111 *
112 * (Read the detailed design doc in PSARC case directory)
113 *
114 * The entire tcp state is contained in tcp_t and conn_t structure
115 * which are allocated in tandem using ipcl_conn_create() and passing
116 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
117 * the references on the tcp_t. The tcp_t structure is never compressed
118 * and packets always land on the correct TCP perimeter from the time
119 * eager is created till the time tcp_t dies (as such the old mentat
120 * TCP global queue is not used for detached state and no IPSEC checking
121 * is required). The global queue is still allocated to send out resets
122 * for connection which have no listeners and IP directly calls
123 * tcp_xmit_listeners_reset() which does any policy check.
124 *
125 * Protection and Synchronisation mechanism:
126 *
127 * The tcp data structure does not use any kind of lock for protecting
128 * its state but instead uses 'squeues' for mutual exclusion from various
129 * read and write side threads. To access a tcp member, the thread should
130 * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
131 * squeue_fill). Since the squeues allow a direct function call, caller
132 * can pass any tcp function having prototype of edesc_t as argument
133 * (different from traditional STREAMs model where packets come in only
134 * designated entry points). The list of functions that can be directly
135 * called via squeue are listed before the usual function prototype.
136 *
137 * Referencing:
138 *
139 * TCP is MT-Hot and we use a reference based scheme to make sure that the
140 * tcp structure doesn't disappear when it's needed. When the application
141 * creates an outgoing connection or accepts an incoming connection, we
142 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
143 * The IP reference is just a symbolic reference since ip_tcpclose()
144 * looks at tcp structure after tcp_close_output() returns which could
145 * have dropped the last TCP reference. So as long as the connection is
146 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
147 * conn_t. The classifier puts its own reference when the connection is
148 * inserted in listen or connected hash. Anytime a thread needs to enter
149 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
150 * on write side or by doing a classify on read side and then puts a
151 * reference on the conn before doing squeue_enter/tryenter/fill. For
152 * read side, the classifier itself puts the reference under fanout lock
153 * to make sure that tcp can't disappear before it gets processed. The
154 * squeue will drop this reference automatically so the called function
155 * doesn't have to do a DEC_REF.
156 *
157 * Opening a new connection:
158 *
159 * The outgoing connection open is pretty simple. tcp_open() does the
160 * work in creating the conn/tcp structure and initializing it. The
161 * squeue assignment is done based on the CPU the application
162 * is running on. So for outbound connections, processing is always done
163 * on application CPU which might be different from the incoming CPU
164 * being interrupted by the NIC. An optimal way would be to figure out
165 * the NIC <-> CPU binding at listen time, and assign the outgoing
166 * connection to the squeue attached to the CPU that will be interrupted
167 * for incoming packets (we know the NIC based on the bind IP address).
168 * This might seem like a problem if more data is going out but the
169 * fact is that in most cases the transmit is ACK driven transmit where
170 * the outgoing data normally sits on TCP's xmit queue waiting to be
171 * transmitted.
172 *
173 * Accepting a connection:
174 *
175 * This is a more interesting case because of various races involved in
176 * establishing an eager in its own perimeter. Read the meta comment on
177 * top of tcp_conn_request(). But briefly, the squeue is picked by
178 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
179 *
180 * Closing a connection:
181 *
182 * The close is fairly straight forward. tcp_close() calls tcp_close_output()
183 * via squeue to do the close and mark the tcp as detached if the connection
184 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
185 * reference but tcp_close() always drops IP's reference. So if tcp was
186 * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
187 * and 1 because it is in classifier's connected hash. This is the condition
188 * we use to determine that its OK to clean up the tcp outside of squeue
189 * when time wait expires (check the ref under fanout and conn_lock and
190 * if it is 2, remove it from fanout hash and kill it).
191 *
192 * Although close just drops the necessary references and marks the
193 * tcp_detached state, tcp_close needs to know the tcp_detached has been
194 * set (under squeue) before letting the STREAM go away (because a
195 * inbound packet might attempt to go up the STREAM while the close
196 * has happened and tcp_detached is not set). So a special lock and
197 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
198 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
199 * tcp_detached.
200 *
201 * Special provisions and fast paths:
202 *
203 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
204 * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
205 * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
206 * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
207 * check to send packets directly to tcp_rput_data via squeue. Everyone
208 * else comes through tcp_input() on the read side.
209 *
210 * We also make special provisions for sockfs by marking tcp_issocket
211 * whenever we have only sockfs on top of TCP. This allows us to skip
212 * putting the tcp in acceptor hash since a sockfs listener can never
213 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
214 * since eager has already been allocated and the accept now happens
215 * on acceptor STREAM. There is a big blob of comment on top of
216 * tcp_conn_request explaining the new accept. When socket is POP'd,
217 * sockfs sends us an ioctl to mark the fact and we go back to old
218 * behaviour. Once tcp_issocket is unset, it's never set for the
219 * life of that connection.
220 *
221 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
222 * two consolidation private KAPIs are used to enqueue M_DATA mblk_t's
223 * directly to the socket (sodirect) and start an asynchronous copyout
224 * to a user-land receive-side buffer (uioa) when a blocking socket read
225 * (e.g. read, recv, ...) is pending.
226 *
227 * This is accomplished when tcp_issocket is set and tcp_sodirect is not
228 * NULL so points to an sodirect_t and if marked enabled then we enqueue
229 * all mblk_t's directly to the socket.
230 *
231 * Further, if the sodirect_t sod_uioa and if marked enabled (due to a
232 * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous
233 * copyout will be started directly to the user-land uio buffer. Also, as we
234 * have a pending read, TCP's push logic can take into account the number of
235 * bytes to be received and only awake the blocked read()er when the uioa_t
236 * byte count has been satisfied.
237 *
238 * IPsec notes :
239 *
240 * Since a packet is always executed on the correct TCP perimeter
241 * all IPsec processing is deferred to IP including checking new
242 * connections and setting IPSEC policies for new connection. The
243 * only exception is tcp_xmit_listeners_reset() which is called
244 * directly from IP and needs to policy check to see if TH_RST
245 * can be sent out.
246 *
247 * PFHooks notes :
248 *
249 * For mdt case, one meta buffer contains multiple packets. Mblks for every
250 * packet are assembled and passed to the hooks. When packets are blocked,
251 * or boundary of any packet is changed, the mdt processing is stopped, and
252 * packets of the meta buffer are sent to the IP path one by one.
253 */
254
/*
 * Values accepted by the squeue switch variables below:
 *	1: squeue_enter_nodrain
 *	2: squeue_enter
 *	3: squeue_fill
 */
int	tcp_squeue_close = 2;	/* Settable in /etc/system */
int	tcp_squeue_wput = 2;
263
264 squeue_func_t tcp_squeue_close_proc;
265 squeue_func_t tcp_squeue_wput_proc;
266
/*
 * Macros for sodirect:
 *
 * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
 * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
 * if it exists and is enabled, else to NULL. When "sodp" is left
 * non-NULL its sod_lock has been entered and is still held, so the
 * caller owns the matching mutex_exit(). Note, in the current
 * sodirect implementation the sod_lock must not be held across any
 * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
 * will result as sod_lock is the streamhead stdata.sd_lock.
 *
 * The macro expands to a single do { } while (0) statement so that all
 * of its steps stay together when it is the body of an unbraced if or
 * else; invoke it with a trailing semicolon. "sodp" must be an
 * assignable lvalue and is evaluated several times.
 *
 * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
 * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
 * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
 * being used when sodirect code paths should be. No lock is taken.
 */

#define	SOD_PTR_ENTER(tcp, sodp)					\
do {									\
	(sodp) = (tcp)->tcp_sodirect;					\
									\
	if ((sodp) != NULL) {						\
		mutex_enter((sodp)->sod_lock);				\
		if (!((sodp)->sod_state & SOD_ENABLED)) {		\
			/* Not enabled: drop the lock, report NULL. */	\
			mutex_exit((sodp)->sod_lock);			\
			(sodp) = NULL;					\
		}							\
	}								\
} while (0)

#define	SOD_NOT_ENABLED(tcp)						\
	((tcp)->tcp_sodirect == NULL ||					\
	!((tcp)->tcp_sodirect->sod_state & SOD_ENABLED))
297
/*
 * Threshold for the tiny-write optimisation: a write must be no larger
 * than this before we try to copy it into the mblk on the tail of the
 * transmit queue. Values above sixteen show little additional speedup,
 * and zero turns the optimisation off.
 */
int	tcp_tx_pull_len = 16;
305
306 /*
307 * TCP Statistics.
308 *
309 * How TCP statistics work.
310 *
311 * There are two types of statistics invoked by two macros.
312 *
313 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
314 * supposed to be used in non MT-hot paths of the code.
315 *
316 * TCP_DBGSTAT(tcps, name) does atomic increment of a named stat counter. It is
317 * supposed to be used for DEBUG purposes and may be used on a hot path.
318 *
319 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
320 * (use "kstat tcp" to get them).
321 *
322 * There is also additional debugging facility that marks tcp_clean_death()
323 * instances and saves them in tcp_t structure. It is triggered by
324 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
325 * tcp_clean_death() calls that counts the number of times each tag was hit. It
326 * is triggered by TCP_CLD_COUNTERS define.
327 *
328 * How to add new counters.
329 *
330 * 1) Add a field in the tcp_stat structure describing your counter.
331 * 2) Add a line in the template in tcp_kstat2_init() with the name
332 * of the counter.
333 *
334 * IMPORTANT!! - make sure that both are in sync !!
335 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
336 *
337 * Please avoid using private counters which are not kstat-exported.
338 *
339 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
340 * in tcp_t structure.
341 *
342 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
343 */
344
/*
 * TCP_DEBUG_COUNTER may be preset by the build; otherwise it follows DEBUG.
 */
#if !defined(TCP_DEBUG_COUNTER)
#if defined(DEBUG)
#define	TCP_DEBUG_COUNTER	1
#else
#define	TCP_DEBUG_COUNTER	0
#endif	/* DEBUG */
#endif	/* !TCP_DEBUG_COUNTER */

/* Non-zero enables the per-tag tcp_clean_death() counter array. */
#define	TCP_CLD_COUNTERS	0

/* Clean-death tagging switch and the number of possible tags. */
#define	TCP_TAG_CLEAN_DEATH	1
#define	TCP_MAX_CLEAN_DEATH_TAG	32
357
#if defined(lint)
/* Referenced by the lint-only variants of the statistics macros. */
static int _lint_dummy_;
#endif	/* lint */

/*
 * TCP_CLD_STAT(x) bumps the counter for clean-death tag x when
 * TCP_CLD_COUNTERS is enabled; otherwise it expands to nothing (or to a
 * dummy ASSERT under lint).
 */
#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define	TCP_CLD_STAT(x)	tcp_clean_death_stat[x]++
#else	/* !TCP_CLD_COUNTERS */
#if defined(lint)
#define	TCP_CLD_STAT(x)	ASSERT(_lint_dummy_ == 0);
#else
#define	TCP_CLD_STAT(x)
#endif	/* lint */
#endif	/* TCP_CLD_COUNTERS */
370
/*
 * Debug statistics: atomically add one to the named 64-bit counter, either
 * in the given tcp_stack_t (TCP_DBGSTAT) or in the global tcp_g_statistics
 * (TCP_G_DBGSTAT). Both compile away when TCP_DEBUG_COUNTER is zero.
 */
#if TCP_DEBUG_COUNTER
#define	TCP_DBGSTAT(tcps, x)	\
	atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
#define	TCP_G_DBGSTAT(x)	\
	atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
#else	/* !TCP_DEBUG_COUNTER */
#if defined(lint)
#define	TCP_DBGSTAT(tcps, x)	ASSERT(_lint_dummy_ == 0);
#define	TCP_G_DBGSTAT(x)	ASSERT(_lint_dummy_ == 0);
#else
#define	TCP_DBGSTAT(tcps, x)
#define	TCP_G_DBGSTAT(x)
#endif	/* lint */
#endif	/* TCP_DEBUG_COUNTER */

/* Plain (non-atomic) post-increment of a global counter. */
#define	TCP_G_STAT(x)	(tcp_g_statistics.x.value.ui64++)
385
386 tcp_g_stat_t tcp_g_statistics;
387 kstat_t *tcp_g_kstat;
388
/*
 * Call either ip_output or ip_output_v6 through the conn's conn_send
 * vector. This replaces putnext() calls on the tcp write side.
 *
 *	connp	conn_t pointer; evaluated more than once
 *	q	must be a write-side queue (ASSERTed via QREADR)
 *	mp	message to send
 *
 * The macro is a single do { } while (0) statement, so it can be used as
 * the body of an unbraced if/else; invoke it with a trailing semicolon.
 * Every argument is parenthesized, and the local is given a name that is
 * unlikely to capture an identifier appearing in an argument.
 */
#define	CALL_IP_WPUT(connp, q, mp)					\
do {									\
	tcp_stack_t	*wput_tcps;					\
									\
	wput_tcps = (connp)->conn_netstack->netstack_tcp;		\
	ASSERT(((q)->q_flag & QREADR) == 0);				\
	TCP_DBGSTAT(wput_tcps, tcp_ip_output);				\
	(connp)->conn_send((connp), (mp), (q), IP_WPUT);		\
} while (0)
401
/*
 * Timestamp comparisons. The difference is taken first and then viewed as
 * a signed 32-bit value, so the result stays correct across wraparound.
 */
#define	TSTMP_LT(a, b)	((int32_t)((a)-(b)) < 0)
#define	TSTMP_GEQ(a, b)	(!((int32_t)((a)-(b)) < 0))
405
406 /*
407 * Parameters for TCP Initial Send Sequence number (ISS) generation. When
408 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
409 * by adding three components: a time component which grows by 1 every 4096
410 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
411 * a per-connection component which grows by 125000 for every new connection;
412 * and an "extra" component that grows by a random amount centered
413 * approximately on 64000. This causes the ISS generator to cycle every
414 * 4.89 hours if no TCP connections are made, and faster if connections are
415 * made.
416 *
417 * When tcp_strong_iss is set to 0, ISS is calculated by adding two
418 * components: a time component which grows by 250000 every second; and
419 * a per-connection component which grows by 125000 for every new connection.
420 *
421 * A third method, when tcp_strong_iss is set to 2, for generating ISS is
422 * prescribed by Steve Bellovin. This involves adding time, the 125000 per
423 * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
424 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
425 * password.
426 */
/* ISS time-component growth per second (the tcp_strong_iss == 0 method). */
#define	ISS_INCR	250000
/* Right shift applied to nanoseconds: 2^12 = 4096 ns per ISS time step. */
#define	ISS_NSEC_SHT	12
429
430 static sin_t sin_null; /* Zero address for quick clears */
431 static sin6_t sin6_null; /* Zero address for quick clears */
432
/*
 * The urgent pointer is interpreted the 4.3BSD way rather than as RFC 1122
 * specifies. Moving to the RFC 1122 behavior would be an incompatible
 * change for protocols such as telnet and rlogin.
 */
#define	TCP_OLD_URP_INTERPRETATION	1
439
/* True when tcp is detached and its tcp_hard_binding flag is clear. */
#define	TCP_IS_DETACHED_NONEAGER(tcp)	\
	(TCP_IS_DETACHED(tcp) && !(tcp)->tcp_hard_binding)
443
/*
 * TCP reassembly macros. Messages on the reassembly queue are chained
 * through b_cont, so b_next and b_prev are borrowed to carry the starting
 * and ending sequence numbers respectively. tcp_reass() goes through
 * these accessors so the casts and assignments live in one place.
 */
#define	TCP_REASS_SET_SEQ(mp, u)	\
	((mp)->b_next = (mblk_t *)(uintptr_t)(u))
#define	TCP_REASS_SEQ(mp)	((uint32_t)(uintptr_t)((mp)->b_next))

#define	TCP_REASS_SET_END(mp, u)	\
	((mp)->b_prev = (mblk_t *)(uintptr_t)(u))
#define	TCP_REASS_END(mp)	((uint32_t)(uintptr_t)((mp)->b_prev))
456
457 /*
458 * Implementation of TCP Timers.
459 * =============================
460 *
461 * INTERFACE:
462 *
463 * There are two basic functions and one macro dealing with tcp timers:
464 *
465 * timeout_id_t tcp_timeout(connp, func, time)
466 * clock_t tcp_timeout_cancel(connp, timeout_id)
467 * TCP_TIMER_RESTART(tcp, intvl)
468 *
469 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
470 * after 'time' ticks passed. The function called by timeout() must adhere to
471 * the same restrictions as a driver soft interrupt handler - it must not sleep
472 * or call other functions that might sleep. The value returned is the opaque
473 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
474 * cancel the request. The call to tcp_timeout() may fail in which case it
475 * returns zero. This is different from the timeout(9F) function which never
476 * fails.
477 *
478 * The call-back function 'func' always receives 'connp' as its single
479 * argument. It is always executed in the squeue corresponding to the tcp
480 * structure. The tcp structure is guaranteed to be present at the time the
481 * call-back is called.
482 *
483 * NOTE: The call-back function 'func' is never called if tcp is in
484 * the TCPS_CLOSED state.
485 *
486 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
487 * request. locks acquired by the call-back routine should not be held across
488 * the call to tcp_timeout_cancel() or a deadlock may result.
489 *
490 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
491 * Otherwise, it returns an integer value greater than or equal to 0. In
492 * particular, if the call-back function is already placed on the squeue, it can
493 * not be canceled.
494 *
495 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
496 * within squeue context corresponding to the tcp instance. Since the
497 * call-back is also called via the same squeue, there are no race
498 * conditions described in untimeout(9F) manual page since all calls are
499 * strictly serialized.
500 *
501 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
502 * stored in tcp_timer_tid and starts a new one using
503 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
504 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
505 * field.
506 *
507 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
508 * call-back may still be called, so it is possible tcp_timer() will be
509 * called several times. This should not be a problem since tcp_timer()
510 * should always check the tcp instance state.
511 *
512 *
513 * IMPLEMENTATION:
514 *
515 * TCP timers are implemented using three-stage process. The call to
516 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
517 * when the timer expires. The tcp_timer_callback() arranges the call of the
518 * tcp_timer_handler() function via squeue corresponding to the tcp
519 * instance. The tcp_timer_handler() calls actual requested timeout call-back
520 * and passes tcp instance as an argument to it. Information is passed between
521 * stages using the tcp_timer_t structure which contains the connp pointer, the
522 * tcp call-back to call and the timeout id returned by the timeout(9F).
523 *
524 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
525 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
526 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
527 * returns the pointer to this mblk.
528 *
529 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
530 * looks like a normal mblk without actual dblk attached to it.
531 *
532 * To optimize performance each tcp instance holds a small cache of timer
533 * mblocks. In the current implementation it caches up to two timer mblocks per
534 * tcp instance. The cache is preserved over tcp frees and is only freed when
535 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
536 * timer processing happens on a corresponding squeue, the cache manipulation
537 * does not require any locks. Experiments show that majority of timer mblocks
538 * allocations are satisfied from the tcp cache and do not involve kmem calls.
539 *
540 * The tcp_timeout() places a refhold on the connp instance which guarantees
541 * that it will be present at the time the call-back function fires. The
542 * tcp_timer_handler() drops the reference after calling the call-back, so the
543 * call-back function does not need to manipulate the references explicitly.
544 */
545
546 typedef struct tcp_timer_s {
547 conn_t *connp;
548 void (*tcpt_proc)(void *);
549 timeout_id_t tcpt_tid;
550 } tcp_timer_t;
551
552 static kmem_cache_t *tcp_timercache;
553 kmem_cache_t *tcp_sack_info_cache;
554 kmem_cache_t *tcp_iphc_cache;
555
556 /*
557 * For scalability, we must not run a timer for every TCP connection
558 * in TIME_WAIT state. To see why, consider (for time wait interval of
559 * 4 minutes):
560 * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
561 *
562 * This list is ordered by time, so you need only delete from the head
563 * until you get to entries which aren't old enough to delete yet.
564 * The list consists of only the detached TIME_WAIT connections.
565 *
566 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
567 * becomes detached TIME_WAIT (either by changing the state and already
568 * being detached or the other way around). This means that the TIME_WAIT
569 * state can be extended (up to doubled) if the connection doesn't become
570 * detached for a long time.
571 *
572 * The list manipulations (including tcp_time_wait_next/prev)
573 * are protected by the tcp_time_wait_lock. The content of the
574 * detached TIME_WAIT connections is protected by the normal perimeters.
575 *
576 * This list is per squeue and squeues are shared across the tcp_stack_t's.
577 * Things on tcp_time_wait_head remain associated with the tcp_stack_t
578 * and conn_netstack.
579 * The tcp_t's that are added to tcp_free_list are disassociated and
580 * have NULL tcp_tcps and conn_netstack pointers.
581 */
582 typedef struct tcp_squeue_priv_s {
583 kmutex_t tcp_time_wait_lock;
584 timeout_id_t tcp_time_wait_tid;
585 tcp_t *tcp_time_wait_head;
586 tcp_t *tcp_time_wait_tail;
587 tcp_t *tcp_free_list;
588 uint_t tcp_free_list_cnt;
589 } tcp_squeue_priv_t;
590
/*
 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
 * The interval is 5 seconds, written in microseconds and converted with
 * drv_usectohz(); that period seems to give the best results.
 */
#define	TCP_TIME_WAIT_DELAY	drv_usectohz(5 * 1000 * 1000)
596
597 /*
598 * To prevent memory hog, limit the number of entries in tcp_free_list
599 * to 1% of available memory / number of cpus
600 */
601 uint_t tcp_free_list_max_cnt = 0;
602
/* Transmit-side low and high water marks. */
#define	TCP_XMIT_LOWATER	4096
#define	TCP_XMIT_HIWATER	49152

/* Receive-side low and high water marks. */
#define	TCP_RECV_LOWATER	2048
#define	TCP_RECV_HIWATER	49152
607
/*
 * PAWS needs a timer for 24 days. This is the number of ticks in 24 days.
 *
 * The first factor is widened to clock_t before multiplying so that the
 * product is computed in clock_t rather than in int; computed in int,
 * 24*24*60*60*hz overflows once hz exceeds roughly 1035.
 */
#define	PAWS_TIMEOUT	((clock_t)24 * 24 * 60 * 60 * hz)
612
/* Transport interface data unit size. */
#define	TIDUSZ	4096
614
/*
 * Bind hash list size and hash function. The size has to be a power of 2
 * because the hash simply masks the host-order port with (size - 1).
 */
#define	TCP_BIND_FANOUT_SIZE	512
#define	TCP_BIND_HASH(lport)	((TCP_BIND_FANOUT_SIZE - 1) & ntohs(lport))
/*
 * Size of listen and acceptor hash list. It has to be a power of 2 for
 * hashing, since the acceptor id is reduced with a (size - 1) mask. On
 * ILP32 the id is shifted right by 8 bits before masking.
 */
#define	TCP_FANOUT_SIZE		256

#if defined(_ILP32)
#define	TCP_ACCEPTOR_HASH(accid)					\
	((TCP_FANOUT_SIZE - 1) & ((uint_t)(accid) >> 8))
#else
#define	TCP_ACCEPTOR_HASH(accid)					\
	((TCP_FANOUT_SIZE - 1) & (uint_t)(accid))
#endif	/* _ILP32 */
634
/* Address cache size and hash: low bits of the host-order address. */
#define	IP_ADDR_CACHE_SIZE	2048
#define	IP_ADDR_CACHE_HASH(faddr)	\
	((IP_ADDR_CACHE_SIZE - 1) & ntohl(faddr))
638
/*
 * Hash for HSPs uses all 32 bits, since both networks and hosts are in
 * table: the four bytes of the address are XORed together and the result
 * is reduced modulo the table size.
 *
 * "addr" is parenthesized at every use so that an expression argument
 * (e.g. a | b) is hashed as a whole; it is evaluated four times.
 */
#define	TCP_HSP_HASH_SIZE	256

#define	TCP_HSP_HASH(addr)						\
	((((addr) >> 24) ^ ((addr) >> 16) ^				\
	    ((addr) >> 8) ^ (addr)) % TCP_HSP_HASH_SIZE)
645
646 /*
647 * TCP options struct returned from tcp_parse_options.
648 */
649 typedef struct tcp_opt_s {
650 uint32_t tcp_opt_mss;
651 uint32_t tcp_opt_wscale;
652 uint32_t tcp_opt_ts_val;
653 uint32_t tcp_opt_ts_ecr;
654 tcp_t *tcp;
655 } tcp_opt_t;
656
/*
 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing:
 * NOP, NOP, TSTAMP kind, length 10, packed into one 32-bit word whose
 * byte order follows the host endianness.
 */

#if defined(_BIG_ENDIAN)
#define	TCPOPT_NOP_NOP_TSTAMP						\
	((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TSTAMP << 8) | 10)
#else
#define	TCPOPT_NOP_NOP_TSTAMP						\
	((10 << 24) | (TCPOPT_TSTAMP << 16) | (TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif	/* _BIG_ENDIAN */
668
/*
 * Bit flags returned from tcp_parse_options, one per option seen.
 */
#define	TCP_OPT_MSS_PRESENT	0x01
#define	TCP_OPT_WSCALE_PRESENT	0x02
#define	TCP_OPT_TSTAMP_PRESENT	0x04
#define	TCP_OPT_SACK_OK_PRESENT	0x08
#define	TCP_OPT_SACK_PRESENT	0x10
677
/*
 * TCP option lengths.
 * NOTE(review): the _REAL_ variants look like the option length plus its
 * padding; confirm against the code that builds the options.
 */
#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_MAXSEG_LEN	4
#define	TCPOPT_WS_LEN		3
#define	TCPOPT_REAL_WS_LEN	(TCPOPT_WS_LEN + 1)
#define	TCPOPT_TSTAMP_LEN	10
#define	TCPOPT_REAL_TS_LEN	(TCPOPT_TSTAMP_LEN + 2)
#define	TCPOPT_SACK_OK_LEN	2
#define	TCPOPT_REAL_SACK_OK_LEN	(TCPOPT_SACK_OK_LEN + 2)
#define	TCPOPT_REAL_SACK_LEN	4
#define	TCPOPT_MAX_SACK_LEN	36
#define	TCPOPT_HEADER_LEN	2
690
/* TCP cwnd burst factors. */
#define	TCP_CWND_INFINITE	65535
#define	TCP_CWND_SS		3
#define	TCP_CWND_NORMAL		5

/* Largest initial cwin allowed at start or restart. */
#define	TCP_MAX_INIT_CWND	8
698
/*
 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller. If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd:
 *	MIN(def_max_init_cwnd * mss, MIN(4 * mss, MAX(2 * mss, 4380)))
 * with the 4380 bytes rounded down to a whole number of segments.
 * If the upper layer has set tcp_init_cwnd, just use it (times mss)
 * as tcp_cwnd. In both cases tcp_cwnd_cnt is reset to 0.
 *
 * Every argument is parenthesized at each use, so expression arguments
 * (e.g. "a + b" for def_max_init_cwnd, or "&x" for tcp) behave as
 * expected; the arguments are evaluated more than once. mss must be
 * non-zero. The macro is a single do { } while (0) statement; invoke it
 * with a trailing semicolon.
 */
#define	SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd)			\
do {									\
	if ((tcp)->tcp_init_cwnd == 0) {				\
		(tcp)->tcp_cwnd = MIN((def_max_init_cwnd) * (mss),	\
		    MIN(4 * (mss), MAX(2 * (mss),			\
		    4380 / (mss) * (mss))));				\
	} else {							\
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss);		\
	}								\
	(tcp)->tcp_cwnd_cnt = 0;					\
} while (0)
718
719 /* TCP Timer control structure */
720 typedef struct tcpt_s {
721 pfv_t tcpt_pfv; /* The routine we are to call */
722 tcp_t *tcpt_tcp; /* The parameter we are to pass in */
723 } tcpt_t;
724
725 /* Host Specific Parameter structure */
726 typedef struct tcp_hsp {
727 struct tcp_hsp *tcp_hsp_next;
728 in6_addr_t tcp_hsp_addr_v6;
729 in6_addr_t tcp_hsp_subnet_v6;
730 uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */
731 int32_t tcp_hsp_sendspace;
732 int32_t tcp_hsp_recvspace;
733 int32_t tcp_hsp_tstamp;
734 } tcp_hsp_t;
735 #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6)
736 #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6)
737
738 /*
739 * Functions called directly via squeue having a prototype of edesc_t.
740 */
741 void tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
742 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
743 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
744 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
745 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
746 void tcp_input(void *arg, mblk_t *mp, void *arg2);
747 void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
748 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2);
749 void tcp_output(void *arg, mblk_t *mp, void *arg2);
750 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
751 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
752 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
753
754
/*
 * Prototypes for TCP functions.  Those declared without "static" have
 * external linkage.
 */
static void	tcp_random_init(void);
int		tcp_random(void);
static void	tcp_accept(tcp_t *tcp, mblk_t *mp);
static void	tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
		    tcp_t *eager);
static int	tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
    int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
    boolean_t user_specified);
static void	tcp_closei_local(tcp_t *tcp);
static void	tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
		    mblk_t *idmp, mblk_t **defermp);
static void	tcp_connect(tcp_t *tcp, mblk_t *mp);
static void	tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
		    in_port_t dstport, uint_t srcid);
static void	tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
		    in_port_t dstport, uint32_t flowinfo, uint_t srcid,
		    uint32_t scope_id);
static int	tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void	tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void	tcp_disconnect(tcp_t *tcp, mblk_t *mp);
/* The char argument takes one of the DISP_* format values defined below. */
static char	*tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void	tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void	tcp_eager_unlink(tcp_t *tcp);
static void	tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
		    int unixerr);
static void	tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
		    int tlierr, int unixerr);
static int	tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
		    char *value, caddr_t cp, cred_t *cr);
static int	tcp_tpistate(tcp_t *tcp);
static void	tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
    int caller_holds_lock);
static void	tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t	*tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
void		tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void	tcp_acceptor_hash_remove(tcp_t *tcp);
static void	tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void	tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
void		tcp_g_q_setup(tcp_stack_t *);
void		tcp_g_q_create(tcp_stack_t *);
void		tcp_g_q_destroy(tcp_stack_t *);
static int	tcp_header_init_ipv4(tcp_t *tcp);
static int	tcp_header_init_ipv6(tcp_t *tcp);
int		tcp_init(tcp_t *tcp, queue_t *q);
static int	tcp_init_values(tcp_t *tcp);
static mblk_t	*tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
static mblk_t	*tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
		    t_scalar_t addr_length);
static void	tcp_ip_ire_mark_advice(tcp_t *tcp);
static void	tcp_ip_notify(tcp_t *tcp);
static mblk_t	*tcp_ire_mp(mblk_t *mp);
static void	tcp_iss_init(tcp_t *tcp);
static void	tcp_keepalive_killer(void *arg);
static int	tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void	tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
static int	tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
		    int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int		tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
int		tcp_opt_set(queue_t *q, uint_t optset_context, int level,
		    int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
		    uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
		    mblk_t *mblk);
static void	tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
static int	tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
		    uchar_t *ptr, uint_t len);
/*
 * The *_get handlers take (q, mp, cp, cr); the *_set handlers take an
 * additional "value" string before cp.
 */
static int	tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
    tcp_stack_t *);
static int	tcp_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
static int	tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static void	tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static mblk_t	*tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void	tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void	tcp_reinit(tcp_t *tcp);
static void	tcp_reinit_values(tcp_t *tcp);
static void	tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
		    tcp_t *thisstream, cred_t *cr);
849
/* Prototypes for TCP functions (continued). */
static uint_t	tcp_rcv_drain(queue_t *q, tcp_t *tcp);
static void	tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void	tcp_ss_rexmit(tcp_t *tcp);
static mblk_t	*tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
static void	tcp_process_options(tcp_t *, tcph_t *);
static void	tcp_rput_common(tcp_t *tcp, mblk_t *mp);
static void	tcp_rsrv(queue_t *q);
static int	tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
static int	tcp_snmp_state(tcp_t *tcp);
/* The *_report handlers below all take (q, mp, cp, cr). */
static int	tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static int	tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
		    caddr_t cp, cred_t *cr);
static int	tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
		    cred_t *cr);
static void	tcp_timer(void *arg);
static void	tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
    boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
static void	tcp_wput_sock(queue_t *q, mblk_t *mp);
void		tcp_wput_accept(queue_t *q, mblk_t *mp);
static void	tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void	tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void	tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
/* tcp_send() and tcp_multisend() are declared with identical parameters. */
static int	tcp_send(queue_t *q, tcp_t *tcp, const int mss,
		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
		    const int mdt_thres);
static int	tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
		    const int tcp_hdr_len, const int tcp_tcp_hdr_len,
		    const int num_sack_blk, int *usable, uint_t *snxt,
		    int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
		    const int mdt_thres);
static void	tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
		    int num_sack_blk);
static void	tcp_wsrv(queue_t *q);
static int	tcp_xmit_end(tcp_t *tcp);
static void	tcp_ack_timer(void *arg);
static mblk_t	*tcp_ack_mp(tcp_t *tcp);
static void	tcp_xmit_early_reset(char *str, mblk_t *mp,
		    uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
		    zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
static void	tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
		    uint32_t ack, int ctl);
static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *);
static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *);
static int	setmaxps(queue_t *q, int maxpsz);
static void	tcp_set_rto(tcp_t *, time_t);
static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
		    boolean_t, boolean_t);
static void	tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
		    boolean_t ipsec_mctl);
static mblk_t	*tcp_setsockopt_mp(int level, int cmd,
		    char *opt, int optlen);
static int	tcp_build_hdrs(queue_t *, tcp_t *);
static void	tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
		    uint32_t seg_seq, uint32_t seg_ack, int seg_len,
		    tcph_t *tcph);
boolean_t	tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
boolean_t	tcp_reserved_port_add(int, in_port_t *, in_port_t *);
boolean_t	tcp_reserved_port_del(in_port_t, in_port_t);
boolean_t	tcp_reserved_port_check(in_port_t, tcp_stack_t *);
static tcp_t	*tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *);
static int	tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
static mblk_t	*tcp_mdt_info_mp(mblk_t *);
static void	tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
static int	tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
		    const boolean_t, const uint32_t, const uint32_t,
		    const uint32_t, const uint32_t, tcp_stack_t *);
static void	tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
		    const uint_t, const uint_t, boolean_t *);
static mblk_t	*tcp_lso_info_mp(mblk_t *);
static void	tcp_lso_update(tcp_t *, ill_lso_capab_t *);
static void	tcp_send_data(tcp_t *, queue_t *, mblk_t *);
/* Declared extern here; presumably defined outside this file - confirm. */
extern mblk_t	*tcp_timermp_alloc(int);
extern void	tcp_timermp_free(tcp_t *);
static void	tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void	tcp_stop_lingering(tcp_t *tcp);
static void	tcp_close_linger_timeout(void *arg);
static void	*tcp_stack_init(netstackid_t stackid, netstack_t *ns);
static void	tcp_stack_shutdown(netstackid_t stackid, void *arg);
static void	tcp_stack_fini(netstackid_t stackid, void *arg);
static void	*tcp_g_kstat_init(tcp_g_stat_t *);
static void	tcp_g_kstat_fini(kstat_t *);
static void	*tcp_kstat_init(netstackid_t, tcp_stack_t *);
static void	tcp_kstat_fini(netstackid_t, kstat_t *);
static void	*tcp_kstat2_init(netstackid_t, tcp_stat_t *);
static void	tcp_kstat2_fini(netstackid_t, kstat_t *);
static int	tcp_kstat_update(kstat_t *kp, int rw);
void		tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
static int	tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
		    tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int	tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
		    tcph_t *tcph, mblk_t *idmp);
static squeue_func_t tcp_squeue_switch(int);
958
/*
 * STREAMS open/close routines.  tcp_openv4, tcp_openv6, tcp_close and
 * tcpclose_accept are installed in the qinit tables below.  tcp_open takes
 * one extra boolean_t argument compared with tcp_openv4/tcp_openv6
 * (presumably selecting the IP version - confirm).
 */
static int	tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int	tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
static int	tcp_close(queue_t *, int);
static int	tcpclose_accept(queue_t *);
964
static void	tcp_squeue_add(squeue_t *);
static boolean_t tcp_zcopy_check(tcp_t *);
static void	tcp_zcopy_notify(tcp_t *);
static mblk_t	*tcp_zcopy_disable(tcp_t *, mblk_t *);
static mblk_t	*tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
static void	tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);

/* Declared extern here; presumably defined outside this file - confirm. */
extern void	tcp_kssl_input(tcp_t *, mblk_t *);

/*
 * These two use the same (void *arg, mblk_t *mp, void *arg2) form as the
 * squeue entry points declared earlier in this file.
 */
void	tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
void	tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
976
977 /*
978 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
979 *
980 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
981 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
982 * (defined in tcp.h) needs to be filled in and passed into the kernel
983 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
984 * structure contains the four-tuple of a TCP connection and a range of TCP
985 * states (specified by ac_start and ac_end). The use of wildcard addresses
986 * and ports is allowed. Connections with a matching four tuple and a state
987 * within the specified range will be aborted. The valid states for the
988 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
989 * inclusive.
990 *
991 * An application which has its connection aborted by this ioctl will receive
992 * an error that is dependent on the connection state at the time of the abort.
993 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
994 * though a RST packet has been received. If the connection state is equal to
995 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
996 * and all resources associated with the connection will be freed.
997 */
/* Prototypes for the TCP_IOC_ABORT_CONN routines; all are file-local. */
static mblk_t	*tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void	tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void	tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
static int	tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
static void	tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int	tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
		    boolean_t, tcp_stack_t *);
1005
/*
 * STREAMS module_info for the read side.  Per module_info(9S) the
 * initializers are, in order: module id, module name, minimum packet size,
 * maximum packet size, high water mark, low water mark.
 */
static struct module_info tcp_rinfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};

/*
 * STREAMS module_info for the write side; same field order as tcp_rinfo,
 * with literal water marks of 127 (high) and 16 (low).
 */
static struct module_info tcp_winfo =  {
	TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};
1013
/*
 * Entry points for TCP as a device. The normal case which supports
 * the TCP functionality.
 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
 *
 * Per qinit(9S) the initializers are, in order: put procedure, service
 * procedure, open routine, close routine, admin routine (unused, NULL),
 * and module_info pointer.
 */
struct qinit tcp_rinitv4 = {
	NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo
};

/* Same as tcp_rinitv4 except that the open routine is tcp_openv6. */
struct qinit tcp_rinitv6 = {
	NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo
};

/* Write side shared by both devices (see tcpinfov4 and tcpinfov6). */
struct qinit tcp_winit = {
	(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};

/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
	(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
1035
/*
 * Entry points for TCP as an acceptor STREAM opened by sockfs when doing
 * an accept. Avoid allocating data structures since eager has already
 * been created.
 *
 * NOTE(review): the read-side table below points at tcp_winfo rather than
 * tcp_rinfo, unlike the other read-side tables in this file - confirm
 * whether that is intentional.
 */
struct qinit tcp_acceptor_rinit = {
	NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
};

/* Write side for the acceptor STREAM: put procedure only, no service. */
struct qinit tcp_acceptor_winit = {
	(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};
1048
/*
 * Entry points for TCP loopback (read side only)
 * The open routine is only used for reopens, thus no need to
 * have a separate one for tcp_openv6.
 *
 * Unlike the other qinit tables in this file, this one also fills in the
 * trailing members after the module_info pointer: a NULL statistics
 * pointer, tcp_fuse_rrw, tcp_fuse_rinfop and STRUIOT_STANDARD.
 */
struct qinit tcp_loopback_rinit = {
	(pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0,
	&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
};
1058
/*
 * For AF_INET aka /dev/tcp
 * Pairs the IPv4 read-side table with the shared write-side table.
 */
struct streamtab tcpinfov4 = {
	&tcp_rinitv4, &tcp_winit
};

/*
 * For AF_INET6 aka /dev/tcp6
 * Pairs the IPv6 read-side table with the shared write-side table.
 */
struct streamtab tcpinfov6 = {
	&tcp_rinitv6, &tcp_winit
};
1068
/*
 * Have to ensure that tcp_g_q_close is not done by an
 * interrupt thread.
 * NOTE(review): presumably the close is dispatched to this task queue
 * for that reason - confirm against the tcp_g_q code.
 */
static taskq_t *tcp_taskq;
1074
1075 /*
1076 * TCP has a private interface for other kernel modules to reserve a
1077 * port range for them to use. Once reserved, TCP will not use any ports
1078 * in the range. This interface relies on the TCP_EXCLBIND feature. If
1079 * the semantics of TCP_EXCLBIND is changed, implementation of this interface
1080 * has to be verified.
1081 *
 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges.  Each port
 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports.  A port
 * range is [port a, port b] inclusive.  And each port range is between
 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
 *
 * Note that the default anonymous port range starts from 32768.  There is
 * no port "collision" between that and the reserved port range.  If there
 * is port collision (because the default smallest anonymous port is lowered
 * or some apps specifically bind to ports in the reserved port range), the
 * system may not be able to reserve a port range even if there are enough
 * unbound ports, as a reserved port range contains consecutive ports.
1093 */
/* Maximum number of reserved port ranges. */
#define	TCP_RESERVED_PORTS_ARRAY_MAX_SIZE	5
/* Maximum number of ports a single reserved range may cover. */
#define	TCP_RESERVED_PORTS_RANGE_MAX		1000
/* Bounds between which every reserved range must lie. */
#define	TCP_SMALLEST_RESERVED_PORT		10240
#define	TCP_LARGEST_RESERVED_PORT		20480

/* Structure to represent those reserved port ranges. */
typedef struct tcp_rport_s {
	in_port_t	lo_port;	/* presumably first port of range */
	in_port_t	hi_port;	/* presumably last port of range */
	/*
	 * NOTE(review): presumably the placeholder tcp_t's that hold the
	 * ports of this range (see tcp_alloc_temp_tcp) - confirm.
	 */
	tcp_t		**temp_tcp_array;
} tcp_rport_t;
1105
/* Settable only in /etc/system. Move to ndd? */
boolean_t tcp_icmp_source_quench = B_FALSE;
1108
/*
 * Following assumes TPI alignment requirements stay along 32 bit
 * boundaries
 *
 * ROUNDUP32(x) rounds x up to the next multiple of sizeof (int32_t).
 * Because the mask is built from sizeof, the result has an unsigned type
 * at least as wide as size_t.
 */
#define	ROUNDUP32(x) \
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
1115
/*
 * Template for response to info request.
 * This is the variant whose ADDR_size is sizeof (sin_t).
 */
static struct T_info_ack tcp_g_t_info_ack = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin_t),		/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};

/*
 * Same template as tcp_g_t_info_ack, except that ADDR_size is
 * sizeof (sin6_t).
 */
static struct T_info_ack tcp_g_t_info_ack_v6 = {
	T_INFO_ACK,		/* PRIM_type */
	0,			/* TSDU_size */
	T_INFINITE,		/* ETSDU_size */
	T_INVALID,		/* CDATA_size */
	T_INVALID,		/* DDATA_size */
	sizeof (sin6_t),	/* ADDR_size */
	0,			/* OPT_size - not initialized here */
	TIDUSZ,			/* TIDU_size */
	T_COTS_ORD,		/* SERV_type */
	TCPS_IDLE,		/* CURRENT_state */
	(XPG4_1|EXPINLINE)	/* PROVIDER_flag */
};
1144
/*
 * Time units.  MS is the base unit (a long), so every value built from
 * these macros is a count of milliseconds.
 */
#define	MS	1L
#define	SECONDS	(1000 * MS)
#define	MINUTES	(60 * SECONDS)
#define	HOURS	(60 * MINUTES)
#define	DAYS	(24 * HOURS)

/* Largest value representable in a uint32_t (all bits set). */
#define	PARAM_MAX (~(uint32_t)0)

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)
1161
/*
 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
 * layer header.  It has to be a multiple of 4.
 *
 * The initializer is { min, max, value, name }, in the same column order
 * as the lcl_tcp_param_arr table.  The tcps_wroff_xtra macro expands to a
 * member access and is therefore meant to follow a "->" or "." applied to
 * an object that has a tcps_wroff_xtra_param pointer member.
 */
static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
#define	tcps_wroff_xtra	tcps_wroff_xtra_param->tcp_param_val
1168
/*
 * All of these are alterable, within the min/max values given, at run time.
 * Note that the default value of "tcp_time_wait_interval" is one minute,
 * which is shorter than the four minutes (2 * MSL) given in the TCP spec.
 */
/*
 * Each row is { min, max, value, name }.  Entries written with the
 * MS/SECONDS/MINUTES/HOURS/DAYS macros are in milliseconds.
 * NOTE(review): rows are presumably referenced by position through
 * accessor macros defined elsewhere, so confirm before inserting or
 * reordering entries.
 */
/* BEGIN CSTYLED */
static tcpparam_t	lcl_tcp_param_arr[] = {
 /*min		max		value		name */
 { 1*SECONDS,	10*MINUTES,	1*MINUTES,	"tcp_time_wait_interval"},
 { 1,		PARAM_MAX,	128,		"tcp_conn_req_max_q" },
 { 0,		PARAM_MAX,	1024,		"tcp_conn_req_max_q0" },
 { 1,		1024,		1,		"tcp_conn_req_min" },
 { 0*MS,	20*SECONDS,	0*MS,		"tcp_conn_grace_period" },
 { 128,		(1<<30),	1024*1024,	"tcp_cwnd_max" },
 { 0,		10,		0,		"tcp_debug" },
 { 1024,	(32*1024),	1024,		"tcp_smallest_nonpriv_port"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_cinterval"},
 { 1*SECONDS,	PARAM_MAX,	3*MINUTES,	"tcp_ip_abort_linterval"},
 { 500*MS,	PARAM_MAX,	8*MINUTES,	"tcp_ip_abort_interval"},
 { 1*SECONDS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_cinterval"},
 { 500*MS,	PARAM_MAX,	10*SECONDS,	"tcp_ip_notify_interval"},
 { 1,		255,		64,		"tcp_ipv4_ttl"},
 { 10*SECONDS,	10*DAYS,	2*HOURS,	"tcp_keepalive_interval"},
 { 0,		100,		10,		"tcp_maxpsz_multiplier" },
 { 1,		TCP_MSS_MAX_IPV4, 536,		"tcp_mss_def_ipv4"},
 { 1,		TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
 { 1,		TCP_MSS_MAX,	108,		"tcp_mss_min"},
 { 1,		(64*1024)-1,	(4*1024)-1,	"tcp_naglim_def"},
 { 1*MS,	20*SECONDS,	3*SECONDS,	"tcp_rexmit_interval_initial"},
 { 1*MS,	2*HOURS,	60*SECONDS,	"tcp_rexmit_interval_max"},
 { 1*MS,	2*HOURS,	400*MS,		"tcp_rexmit_interval_min"},
 { 1*MS,	1*MINUTES,	100*MS,		"tcp_deferred_ack_interval" },
 { 0,		16,		0,		"tcp_snd_lowat_fraction" },
 { 0,		128000,		0,		"tcp_sth_rcv_hiwat" },
 { 0,		128000,		0,		"tcp_sth_rcv_lowat" },
 { 1,		10000,		3,		"tcp_dupack_fast_retransmit" },
 { 0,		1,		0,		"tcp_ignore_path_mtu" },
 { 1024,	TCP_MAX_PORT,	32*1024,	"tcp_smallest_anon_port"},
 { 1024,	TCP_MAX_PORT,	TCP_MAX_PORT,	"tcp_largest_anon_port"},
 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
 { 1,		65536,		4,		"tcp_recv_hiwat_minmss"},
 { 1*SECONDS,	PARAM_MAX,	675*SECONDS,	"tcp_fin_wait_2_flush_interval"},
 { 0,		TCP_MSS_MAX,	64,		"tcp_co_min"},
 { 8192,	(1<<30),	1024*1024,	"tcp_max_buf"},
/*
 * Question: What default value should I set for tcp_strong_iss?
 */
 { 0,		2,		1,		"tcp_strong_iss"},
 { 0,		65536,		20,		"tcp_rtt_updates"},
 { 0,		1,		1,		"tcp_wscale_always"},
 { 0,		1,		0,		"tcp_tstamp_always"},
 { 0,		1,		1,		"tcp_tstamp_if_wscale"},
 { 0*MS,	2*HOURS,	0*MS,		"tcp_rexmit_interval_extra"},
 { 0,		16,		2,		"tcp_deferred_acks_max"},
 { 1,		16384,		4,		"tcp_slow_start_after_idle"},
 { 1,		4,		4,		"tcp_slow_start_initial"},
 { 10*MS,	50*MS,		20*MS,		"tcp_co_timer_interval"},
 { 0,		2,		2,		"tcp_sack_permitted"},
 { 0,		1,		0,		"tcp_trace"},
 { 0,		1,		1,		"tcp_compression_enabled"},
 { 0,		IPV6_MAX_HOPS,	IPV6_DEFAULT_HOPS,	"tcp_ipv6_hoplimit"},
 { 1,		TCP_MSS_MAX_IPV6, 1220,		"tcp_mss_def_ipv6"},
 { 1,		TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
 { 0,		1,		0,		"tcp_rev_src_routes"},
 { 10*MS,	500*MS,		50*MS,		"tcp_local_dack_interval"},
 { 100*MS,	60*SECONDS,	1*SECONDS,	"tcp_ndd_get_info_interval"},
 { 0,		16,		8,		"tcp_local_dacks_max"},
 { 0,		2,		1,		"tcp_ecn_permitted"},
 { 0,		1,		1,		"tcp_rst_sent_rate_enabled"},
 { 0,		PARAM_MAX,	40,		"tcp_rst_sent_rate"},
 { 0,		100*MS,		50*MS,		"tcp_push_timer_interval"},
 { 0,		1,		0,		"tcp_use_smss_as_mss_opt"},
 { 0,		PARAM_MAX,	8*MINUTES,	"tcp_keepalive_abort_interval"},
};
/* END CSTYLED */
1246
/*
 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
 * each header fragment in the header buffer.  Each parameter value has
 * to be a multiple of 4 (32-bit aligned).
 *
 * Initializers are { min, max, value, name }.
 */
static tcpparam_t lcl_tcp_mdt_head_param =
	{ 32, 256, 32, "tcp_mdt_hdr_head_min" };
static tcpparam_t lcl_tcp_mdt_tail_param =
	{ 0,  256, 32, "tcp_mdt_hdr_tail_min" };
/* Member-access shorthands; meant to follow "->" or "." on the owner. */
#define	tcps_mdt_hdr_head_min	tcps_mdt_head_param->tcp_param_val
#define	tcps_mdt_hdr_tail_min	tcps_mdt_tail_param->tcp_param_val

/*
 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
 * the maximum number of payload buffers associated per Multidata.
 * Both the maximum and the default are MULTIDATA_MAX_PBUFS.
 */
static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
	{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
#define	tcps_mdt_max_pbufs	tcps_mdt_max_pbufs_param->tcp_param_val
1266
/*
 * Round up the value to the nearest mss.
 *
 * A value that is already a positive multiple of mss is returned
 * unchanged.  Because the expression works on (value - 1), a signed value
 * of 0 yields mss rather than 0, and an unsigned 0 wraps around before
 * the division.  Both arguments are evaluated more than once.
 */
#define	MSS_ROUNDUP(value, mss)		((((value) - 1) / (mss) + 1) * (mss))
1269
/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 *
 * tcp: connection whose tcp_ipversion selects the header layout.
 * iph: pointer to the start of the IP header; it is treated as an ipha_t
 *	when tcp_ipversion is IPV4_VERSION and as an ip6_t otherwise.
 *
 * In both cases the two ECN bits are cleared first and then set to
 * IPH_ECN_ECT0.  For IPv6 those bits sit at bit positions 20-21 of the
 * host-order ip6_vcf word, hence the htonl() on mask and value.
 *
 * The body is wrapped in do { } while (0) so that "SET_ECT(tcp, iph);"
 * is a single statement and can safely be used as the body of an if
 * that has an else.  Both arguments are evaluated more than once.
 */
#define	SET_ECT(tcp, iph) \
	do { \
		if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
			/* We need to clear the code point first. */ \
			((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
			((ipha_t *)(iph))->ipha_type_of_service |= \
			    IPH_ECN_ECT0; \
		} else { \
			((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
			((ip6_t *)(iph))->ip6_vcf |= \
			    htonl(IPH_ECN_ECT0 << 20); \
		} \
		/* CONSTCOND */ \
	} while (0)
1286
/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/* Message text for a non-privileged caller asking for ndd info too often. */
#define	NDD_TOO_QUICK_MSG \
	"ndd get info rate too high for non-privileged users, try again " \
	"later.\n"
/* Message text used when the output buffer is exhausted. */
#define	NDD_OUT_OF_BUF_MSG	"<< Out of buffer >>\n"

/* True when the mblk's data block has STRUIO_ZC set in db_struioflag. */
#define	IS_VMLOANED_MBLK(mp) \
	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
1303
1304
/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
boolean_t tcp_mdt_chain = B_TRUE;

/*
 * MDT threshold in the form of effective send MSS multiplier; we take
 * the MDT path if the amount of unsent data exceeds the threshold value
 * (default threshold is 1*SMSS).
 */
uint_t tcp_mdt_smss_threshold = 1;

uint32_t do_tcpzcopy = 1;		/* 0: disable, 1: enable, 2: force */

/*
 * Forces all connections to obey the value of the tcps_maxpsz_multiplier
 * tunable settable via NDD.  Otherwise, the per-connection behavior is
 * determined dynamically during tcp_adapt_ire(), which is the default.
 */
boolean_t tcp_static_maxpsz = B_FALSE;

/* Settable in /etc/system */
/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
uint32_t tcp_random_anon_port = 1;
1327
/*
 * To be able to reach an eager in Q0 that can be dropped when a new SYN
 * request arrives while Q0 is full, a separate doubly linked list is
 * maintained.  This list allows an eager to be selected from Q0 in O(1)
 * time, which avoids spending too much time walking through the long list
 * of eagers in Q0 when tcp_drop_q0() is called.  Each member of this list
 * has to be a member of Q0.
 * This list is headed by the listener's tcp_t.  When the list is empty,
 * both pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0 -
 * of the listener's tcp_t point to the listener's tcp_t itself.
 *
 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
 * in the list.  MAKE_UNDROPPABLE() takes the eager out of the list.
 * These macros do not affect the eager's membership in Q0.
 */
1343
1344
/*
 * Insert eager at the front of the listener's drop list, immediately
 * after the listener, which acts as the list head.  This is a no-op when
 * eager is already linked, that is when its tcp_eager_next_drop_q0 is
 * non-NULL.
 *
 * Wrapped in do { } while (0) so that "MAKE_DROPPABLE(l, e);" is a single
 * statement and is safe as the body of an if that has an else.  Both
 * arguments are evaluated more than once.
 */
#define	MAKE_DROPPABLE(listener, eager) \
	do { \
		if ((eager)->tcp_eager_next_drop_q0 == NULL) { \
			(listener)->tcp_eager_next_drop_q0-> \
			    tcp_eager_prev_drop_q0 = (eager); \
			(eager)->tcp_eager_prev_drop_q0 = (listener); \
			(eager)->tcp_eager_next_drop_q0 = \
			    (listener)->tcp_eager_next_drop_q0; \
			(listener)->tcp_eager_next_drop_q0 = (eager); \
		} \
		/* CONSTCOND */ \
	} while (0)
1354
/*
 * Unlink eager from the drop list it is on and clear both of its link
 * pointers.  This is a no-op when eager is not linked, that is when its
 * tcp_eager_next_drop_q0 is NULL.
 *
 * Wrapped in do { } while (0) so that "MAKE_UNDROPPABLE(e);" is a single
 * statement and is safe as the body of an if that has an else.  The
 * argument is evaluated more than once.
 */
#define	MAKE_UNDROPPABLE(eager) \
	do { \
		if ((eager)->tcp_eager_next_drop_q0 != NULL) { \
			(eager)->tcp_eager_next_drop_q0-> \
			    tcp_eager_prev_drop_q0 = \
			    (eager)->tcp_eager_prev_drop_q0; \
			(eager)->tcp_eager_prev_drop_q0-> \
			    tcp_eager_next_drop_q0 = \
			    (eager)->tcp_eager_next_drop_q0; \
			(eager)->tcp_eager_prev_drop_q0 = NULL; \
			(eager)->tcp_eager_next_drop_q0 = NULL; \
		} \
		/* CONSTCOND */ \
	} while (0)
1364
/*
 * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
 * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
 * data, TCP will not respond with an ACK.  RFC 793 requires that
 * TCP responds with an ACK for such a bogus ACK.  By not following
 * the RFC, we prevent TCP from getting into an ACK storm if somehow
 * an attacker successfully spoofs an acceptable segment to our
 * peer; or when our peer is "confused."
 *
 * The default threshold is 10.
 */
uint32_t tcp_drop_ack_unsent_cnt = 10;
1375
/*
 * Hook functions to enable cluster networking
 * On non-clustered systems these vectors must always be NULL.
 *
 * cl_inet_connect and cl_inet_disconnect are invoked, when non-NULL,
 * through the CL_INET_CONNECT() and CL_INET_DISCONNECT() macros below.
 */

void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family,
			    uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family,
			    uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family,
			    uint8_t *laddrp, in_port_t lport,
			    uint8_t *faddrp, in_port_t fport) = NULL;
void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family,
			    uint8_t *laddrp, in_port_t lport,
			    uint8_t *faddrp, in_port_t fport) = NULL;

/*
 * The following are defined in ip.c
 */
extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family,
			    uint8_t *laddrp);
extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
			    uint8_t *laddrp, uint8_t *faddrp);
1399
/*
 * Call the cl_inet_connect hook, if one is installed, with the local and
 * foreign address/port of tcp.  The addresses are taken from the IP header
 * template (tcp_ipha for IPv4, tcp_ip6h for IPv6).  Nothing is reported
 * when the source address in that header is unspecified (zero).
 *
 * NOTE(review): the expansion is a brace block, so "CL_INET_CONNECT(tcp);"
 * cannot be used as the unbraced body of an if that has an else.
 */
#define	CL_INET_CONNECT(tcp)	{			\
	if (cl_inet_connect != NULL) {			\
		/*					\
		 * Running in cluster mode - register active connection	\
		 * information				\
		 */					\
		if ((tcp)->tcp_ipversion == IPV4_VERSION) {	\
			if ((tcp)->tcp_ipha->ipha_src != 0) {	\
				(*cl_inet_connect)(IPPROTO_TCP, AF_INET,\
				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
				    (in_port_t)(tcp)->tcp_fport);	\
			}					\
		} else {					\
			if (!IN6_IS_ADDR_UNSPECIFIED(		\
			    &(tcp)->tcp_ip6h->ip6_src)) {\
				(*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\
				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
				    (in_port_t)(tcp)->tcp_fport);	\
			}					\
		}						\
	}							\
}
1426
/*
 * Call the cl_inet_disconnect hook, if one is installed, with the local
 * and foreign address/port of tcp.  Unlike CL_INET_CONNECT(), the local
 * address is taken from tcp_ip_src (IPv4) or tcp_ip_src_v6 (IPv6) rather
 * than from the header template; the foreign address still comes from the
 * header template.  Nothing is reported when that local address is
 * unspecified (zero).
 *
 * NOTE(review): the expansion is a brace block, so
 * "CL_INET_DISCONNECT(tcp);" cannot be used as the unbraced body of an if
 * that has an else.
 */
#define	CL_INET_DISCONNECT(tcp)	{				\
	if (cl_inet_disconnect != NULL) {				\
		/*							\
		 * Running in cluster mode - deregister active		\
		 * connection information				\
		 */							\
		if ((tcp)->tcp_ipversion == IPV4_VERSION) {		\
			if ((tcp)->tcp_ip_src != 0) {			\
				(*cl_inet_disconnect)(IPPROTO_TCP,	\
				    AF_INET,				\
				    (uint8_t *)(&((tcp)->tcp_ip_src)),\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ipha->ipha_dst)),\
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		} else {						\
			if (!IN6_IS_ADDR_UNSPECIFIED(			\
			    &(tcp)->tcp_ip_src_v6)) {		\
				(*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\
				    (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ip6h->ip6_dst)),\
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		}							\
	}								\
}
1456
/*
 * Cluster networking hook for traversing current connection list.
 * This routine is used to extract the current list of live connections
 * which must continue to be dispatched to this node.
 */
int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);

/* File-local variant that additionally takes the tcp_stack_t to walk. */
static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
    void *arg, tcp_stack_t *tcps);
1466
/*
 * Fire the DTRACE_IP7 "send" probe for a packet.  The conn_t argument is
 * always passed as NULL and the trailing int argument as 0.
 *
 * NOTE(review): the expansion already ends with a semicolon, so writing
 * "DTRACE_IP_FASTPATH(...);" produces an extra empty statement; avoid
 * using it as the unbraced body of an if that has an else.
 */
#define	DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h)			\
	DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *,	\
	    iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha,		\
	    ip6_t *, ip6h, int, 0);
1471
1472 /*
1473 * Figure out the value of window scale opton. Note that the rwnd is
1474 * ASSUMED to be rounded up to the nearest MSS before the calculation.
1475 * We cannot find the scale value and then do a round up of tcp_rwnd
1476 * because the scale value may not be correct after that.
1477 *
1478 * Set the compiler flag to make this function inline.
1479 */
1480 static void
1481 tcp_set_ws_value(tcp_t *tcp)
1482 {
1483 int i;
1484 uint32_t rwnd = tcp->tcp_rwnd;
1485
1486 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
1487 i++, rwnd >>= 1)
1488 ;
1489 tcp->tcp_rcv_ws = i;
1490 }
1491
1492 /*
1493 * Remove a connection from the list of detached TIME_WAIT connections.
1494 * It returns B_FALSE if it can't remove the connection from the list
1495 * as the connection has already been removed from the list due to an
1496 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
1497 */
1498 static boolean_t
1499 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
1500 {
1501 boolean_t locked = B_FALSE;
1502
1503 if (tcp_time_wait == NULL) {
1504 tcp_time_wait = *((tcp_squeue_priv_t **)
1505 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
1506 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1507 locked = B_TRUE;
1508 } else {
1509 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
1510 }
1511
1512 if (tcp->tcp_time_wait_expire == 0) {
1513 ASSERT(tcp->tcp_time_wait_next == NULL);
1514 ASSERT(tcp->tcp_time_wait_prev == NULL);
1515 if (locked)
1516 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1517 return (B_FALSE);
1518 }
1519 ASSERT(TCP_IS_DETACHED(tcp));
1520 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1521
1522 if (tcp == tcp_time_wait->tcp_time_wait_head) {
1523 ASSERT(tcp->tcp_time_wait_prev == NULL);
1524 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
1525 if (tcp_time_wait->tcp_time_wait_head != NULL) {
1526 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
1527 NULL;
1528 } else {
1529 tcp_time_wait->tcp_time_wait_tail = NULL;
1530 }
1531 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
1532 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
1533 ASSERT(tcp->tcp_time_wait_next == NULL);
1534 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
1535 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1536 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
1537 } else {
1538 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
1539 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
1540 tcp->tcp_time_wait_prev->tcp_time_wait_next =
1541 tcp->tcp_time_wait_next;
1542 tcp->tcp_time_wait_next->tcp_time_wait_prev =
1543 tcp->tcp_time_wait_prev;
1544 }
1545 tcp->tcp_time_wait_next = NULL;
1546 tcp->tcp_time_wait_prev = NULL;
1547 tcp->tcp_time_wait_expire = 0;
1548
1549 if (locked)
1550 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1551 return (B_TRUE);
1552 }
1553
1554 /*
1555 * Add a connection to the list of detached TIME_WAIT connections
1556 * and set its time to expire.
1557 */
1558 static void
1559 tcp_time_wait_append(tcp_t *tcp)
1560 {
1561 tcp_stack_t *tcps = tcp->tcp_tcps;
1562 tcp_squeue_priv_t *tcp_time_wait =
1563 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
1564 SQPRIVATE_TCP));
1565
1566 tcp_timers_stop(tcp);
1567
1568 /* Freed above */
1569 ASSERT(tcp->tcp_timer_tid == 0);
1570 ASSERT(tcp->tcp_ack_tid == 0);
1571
1572 /* must have happened at the time of detaching the tcp */
1573 ASSERT(tcp->tcp_ptpahn == NULL);
1574 ASSERT(tcp->tcp_flow_stopped == 0);
1575 ASSERT(tcp->tcp_time_wait_next == NULL);
1576 ASSERT(tcp->tcp_time_wait_prev == NULL);
1577 ASSERT(tcp->tcp_time_wait_expire == NULL);
1578 ASSERT(tcp->tcp_listener == NULL);
1579
1580 tcp->tcp_time_wait_expire = ddi_get_lbolt();
1581 /*
1582 * The value computed below in tcp->tcp_time_wait_expire may
1583 * appear negative or wrap around. That is ok since our
1584 * interest is only in the difference between the current lbolt
1585 * value and tcp->tcp_time_wait_expire. But the value should not
1586 * be zero, since it means the tcp is not in the TIME_WAIT list.
1587 * The corresponding comparison in tcp_time_wait_collector() uses
1588 * modular arithmetic.
1589 */
1590 tcp->tcp_time_wait_expire +=
1591 drv_usectohz(tcps->tcps_time_wait_interval * 1000);
1592 if (tcp->tcp_time_wait_expire == 0)
1593 tcp->tcp_time_wait_expire = 1;
1594
1595 ASSERT(TCP_IS_DETACHED(tcp));
1596 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1597 ASSERT(tcp->tcp_time_wait_next == NULL);
1598 ASSERT(tcp->tcp_time_wait_prev == NULL);
1599 TCP_DBGSTAT(tcps, tcp_time_wait);
1600
1601 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1602 if (tcp_time_wait->tcp_time_wait_head == NULL) {
1603 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
1604 tcp_time_wait->tcp_time_wait_head = tcp;
1605 } else {
1606 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1607 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
1608 TCPS_TIME_WAIT);
1609 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
1610 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
1611 }
1612 tcp_time_wait->tcp_time_wait_tail = tcp;
1613 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1614 }
1615
1616 /* ARGSUSED */
1617 void
1618 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
1619 {
1620 conn_t *connp = (conn_t *)arg;
1621 tcp_t *tcp = connp->conn_tcp;
1622 tcp_stack_t *tcps = tcp->tcp_tcps;
1623
1624 ASSERT(tcp != NULL);
1625 if (tcp->tcp_state == TCPS_CLOSED) {
1626 return;
1627 }
1628
1629 ASSERT((tcp->tcp_family == AF_INET &&
1630 tcp->tcp_ipversion == IPV4_VERSION) ||
1631 (tcp->tcp_family == AF_INET6 &&
1632 (tcp->tcp_ipversion == IPV4_VERSION ||
1633 tcp->tcp_ipversion == IPV6_VERSION)));
1634 ASSERT(!tcp->tcp_listener);
1635
1636 TCP_STAT(tcps, tcp_time_wait_reap);
1637 ASSERT(TCP_IS_DETACHED(tcp));
1638
1639 /*
1640 * Because they have no upstream client to rebind or tcp_close()
1641 * them later, we axe the connection here and now.
1642 */
1643 tcp_close_detached(tcp);
1644 }
1645
1646 /*
1647 * Remove cached/latched IPsec references.
1648 */
1649 void
1650 tcp_ipsec_cleanup(tcp_t *tcp)
1651 {
1652 conn_t *connp = tcp->tcp_connp;
1653
1654 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1655
1656 if (connp->conn_latch != NULL) {
1657 IPLATCH_REFRELE(connp->conn_latch,
1658 connp->conn_netstack);
1659 connp->conn_latch = NULL;
1660 }
1661 if (connp->conn_policy != NULL) {
1662 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
1663 connp->conn_policy = NULL;
1664 }
1665 }
1666
1667 /*
1668 * Cleaup before placing on free list.
1669 * Disassociate from the netstack/tcp_stack_t since the freelist
1670 * is per squeue and not per netstack.
1671 */
1672 void
1673 tcp_cleanup(tcp_t *tcp)
1674 {
1675 mblk_t *mp;
1676 char *tcp_iphc;
1677 int tcp_iphc_len;
1678 int tcp_hdr_grown;
1679 tcp_sack_info_t *tcp_sack_info;
1680 conn_t *connp = tcp->tcp_connp;
1681 tcp_stack_t *tcps = tcp->tcp_tcps;
1682 netstack_t *ns = tcps->tcps_netstack;
1683
1684 tcp_bind_hash_remove(tcp);
1685
1686 /* Cleanup that which needs the netstack first */
1687 tcp_ipsec_cleanup(tcp);
1688
1689 tcp_free(tcp);
1690
1691 /* Release any SSL context */
1692 if (tcp->tcp_kssl_ent != NULL) {
1693 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
1694 tcp->tcp_kssl_ent = NULL;
1695 }
1696
1697 if (tcp->tcp_kssl_ctx != NULL) {
1698 kssl_release_ctx(tcp->tcp_kssl_ctx);
1699 tcp->tcp_kssl_ctx = NULL;
1700 }
1701 tcp->tcp_kssl_pending = B_FALSE;
1702
1703 conn_delete_ire(connp, NULL);
1704
1705 /*
1706 * Since we will bzero the entire structure, we need to
1707 * remove it and reinsert it in global hash list. We
1708 * know the walkers can't get to this conn because we
1709 * had set CONDEMNED flag earlier and checked reference
1710 * under conn_lock so walker won't pick it and when we
1711 * go the ipcl_globalhash_remove() below, no walker
1712 * can get to it.
1713 */
1714 ipcl_globalhash_remove(connp);
1715
1716 /*
1717 * Now it is safe to decrement the reference counts.
1718 * This might be the last reference on the netstack and TCPS
1719 * in which case it will cause the tcp_g_q_close and
1720 * the freeing of the IP Instance.
1721 */
1722 connp->conn_netstack = NULL;
1723 netstack_rele(ns);
1724 ASSERT(tcps != NULL);
1725 tcp->tcp_tcps = NULL;
1726 TCPS_REFRELE(tcps);
1727
1728 /* Save some state */
1729 mp = tcp->tcp_timercache;
1730
1731 tcp_sack_info = tcp->tcp_sack_info;
1732 tcp_iphc = tcp->tcp_iphc;
1733 tcp_iphc_len = tcp->tcp_iphc_len;
1734 tcp_hdr_grown = tcp->tcp_hdr_grown;
1735
1736 if (connp->conn_cred != NULL) {
1737 crfree(connp->conn_cred);
1738 connp->conn_cred = NULL;
1739 }
1740 if (connp->conn_peercred != NULL) {
1741 crfree(connp->conn_peercred);
1742 connp->conn_peercred = NULL;
1743 }
1744 ipcl_conn_cleanup(connp);
1745 connp->conn_flags = IPCL_TCPCONN;
1746 bzero(tcp, sizeof (tcp_t));
1747
1748 /* restore the state */
1749 tcp->tcp_timercache = mp;
1750
1751 tcp->tcp_sack_info = tcp_sack_info;
1752 tcp->tcp_iphc = tcp_iphc;
1753 tcp->tcp_iphc_len = tcp_iphc_len;
1754 tcp->tcp_hdr_grown = tcp_hdr_grown;
1755
1756 tcp->tcp_connp = connp;
1757
1758 ASSERT(connp->conn_tcp == tcp);
1759 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1760 connp->conn_state_flags = CONN_INCIPIENT;
1761 ASSERT(connp->conn_ulp == IPPROTO_TCP);
1762 ASSERT(connp->conn_ref == 1);
1763 }
1764
1765 /*
1766 * Blows away all tcps whose TIME_WAIT has expired. List traversal
1767 * is done forwards from the head.
1768 * This walks all stack instances since
1769 * tcp_time_wait remains global across all stacks.
1770 */
1771 /* ARGSUSED */
1772 void
1773 tcp_time_wait_collector(void *arg)
1774 {
1775 tcp_t *tcp;
1776 clock_t now;
1777 mblk_t *mp;
1778 conn_t *connp;
1779 kmutex_t *lock;
1780 boolean_t removed;
1781
1782 squeue_t *sqp = (squeue_t *)arg;
1783 tcp_squeue_priv_t *tcp_time_wait =
1784 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1785
1786 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1787 tcp_time_wait->tcp_time_wait_tid = 0;
1788
1789 if (tcp_time_wait->tcp_free_list != NULL &&
1790 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
1791 TCP_G_STAT(tcp_freelist_cleanup);
1792 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
1793 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1794 tcp->tcp_time_wait_next = NULL;
1795 tcp_time_wait->tcp_free_list_cnt--;
1796 ASSERT(tcp->tcp_tcps == NULL);
1797 CONN_DEC_REF(tcp->tcp_connp);
1798 }
1799 ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
1800 }
1801
1802 /*
1803 * In order to reap time waits reliably, we should use a
1804 * source of time that is not adjustable by the user -- hence
1805 * the call to ddi_get_lbolt().
1806 */
1807 now = ddi_get_lbolt();
1808 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
1809 /*
1810 * Compare times using modular arithmetic, since
1811 * lbolt can wrapover.
1812 */
1813 if ((now - tcp->tcp_time_wait_expire) < 0) {
1814 break;
1815 }
1816
1817 removed = tcp_time_wait_remove(tcp, tcp_time_wait);
1818 ASSERT(removed);
1819
1820 connp = tcp->tcp_connp;
1821 ASSERT(connp->conn_fanout != NULL);
1822 lock = &connp->conn_fanout->connf_lock;
1823 /*
1824 * This is essentially a TW reclaim fast path optimization for
1825 * performance where the timewait collector checks under the
1826 * fanout lock (so that no one else can get access to the
1827 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
1828 * the classifier hash list. If ref count is indeed 2, we can
1829 * just remove the conn under the fanout lock and avoid
1830 * cleaning up the conn under the squeue, provided that
1831 * clustering callbacks are not enabled. If clustering is
1832 * enabled, we need to make the clustering callback before
1833 * setting the CONDEMNED flag and after dropping all locks and
1834 * so we forego this optimization and fall back to the slow
1835 * path. Also please see the comments in tcp_closei_local
1836 * regarding the refcnt logic.
1837 *
1838 * Since we are holding the tcp_time_wait_lock, its better
1839 * not to block on the fanout_lock because other connections
1840 * can't add themselves to time_wait list. So we do a
1841 * tryenter instead of mutex_enter.
1842 */
1843 if (mutex_tryenter(lock)) {
1844 mutex_enter(&connp->conn_lock);
1845 if ((connp->conn_ref == 2) &&
1846 (cl_inet_disconnect == NULL)) {
1847 ipcl_hash_remove_locked(connp,
1848 connp->conn_fanout);
1849 /*
1850 * Set the CONDEMNED flag now itself so that
1851 * the refcnt cannot increase due to any
1852 * walker. But we have still not cleaned up
1853 * conn_ire_cache. This is still ok since
1854 * we are going to clean it up in tcp_cleanup
1855 * immediately and any interface unplumb
1856 * thread will wait till the ire is blown away
1857 */
1858 connp->conn_state_flags |= CONN_CONDEMNED;
1859 mutex_exit(lock);
1860 mutex_exit(&connp->conn_lock);
1861 if (tcp_time_wait->tcp_free_list_cnt <
1862 tcp_free_list_max_cnt) {
1863 /* Add to head of tcp_free_list */
1864 mutex_exit(
1865 &tcp_time_wait->tcp_time_wait_lock);
1866 tcp_cleanup(tcp);
1867 ASSERT(connp->conn_latch == NULL);
1868 ASSERT(connp->conn_policy == NULL);
1869 ASSERT(tcp->tcp_tcps == NULL);
1870 ASSERT(connp->conn_netstack == NULL);
1871
1872 mutex_enter(
1873 &tcp_time_wait->tcp_time_wait_lock);
1874 tcp->tcp_time_wait_next =
1875 tcp_time_wait->tcp_free_list;
1876 tcp_time_wait->tcp_free_list = tcp;
1877 tcp_time_wait->tcp_free_list_cnt++;
1878 continue;
1879 } else {
1880 /* Do not add to tcp_free_list */
1881 mutex_exit(
1882 &tcp_time_wait->tcp_time_wait_lock);
1883 tcp_bind_hash_remove(tcp);
1884 conn_delete_ire(tcp->tcp_connp, NULL);
1885 tcp_ipsec_cleanup(tcp);
1886 CONN_DEC_REF(tcp->tcp_connp);
1887 }
1888 } else {
1889 CONN_INC_REF_LOCKED(connp);
1890 mutex_exit(lock);
1891 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1892 mutex_exit(&connp->conn_lock);
1893 /*
1894 * We can reuse the closemp here since conn has
1895 * detached (otherwise we wouldn't even be in
1896 * time_wait list). tcp_closemp_used can safely
1897 * be changed without taking a lock as no other
1898 * thread can concurrently access it at this
1899 * point in the connection lifecycle.
1900 */
1901
1902 if (tcp->tcp_closemp.b_prev == NULL)
1903 tcp->tcp_closemp_used = B_TRUE;
1904 else
1905 cmn_err(CE_PANIC,
1906 "tcp_timewait_collector: "
1907 "concurrent use of tcp_closemp: "
1908 "connp %p tcp %p\n", (void *)connp,
1909 (void *)tcp);
1910
1911 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1912 mp = &tcp->tcp_closemp;
1913 squeue_fill(connp->conn_sqp, mp,
1914 tcp_timewait_output, connp,
1915 SQTAG_TCP_TIMEWAIT);
1916 }
1917 } else {
1918 mutex_enter(&connp->conn_lock);
1919 CONN_INC_REF_LOCKED(connp);
1920 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1921 mutex_exit(&connp->conn_lock);
1922 /*
1923 * We can reuse the closemp here since conn has
1924 * detached (otherwise we wouldn't even be in
1925 * time_wait list). tcp_closemp_used can safely
1926 * be changed without taking a lock as no other
1927 * thread can concurrently access it at this
1928 * point in the connection lifecycle.
1929 */
1930
1931 if (tcp->tcp_closemp.b_prev == NULL)
1932 tcp->tcp_closemp_used = B_TRUE;
1933 else
1934 cmn_err(CE_PANIC, "tcp_timewait_collector: "
1935 "concurrent use of tcp_closemp: "
1936 "connp %p tcp %p\n", (void *)connp,
1937 (void *)tcp);
1938
1939 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1940 mp = &tcp->tcp_closemp;
1941 squeue_fill(connp->conn_sqp, mp,
1942 tcp_timewait_output, connp, 0);
1943 }
1944 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1945 }
1946
1947 if (tcp_time_wait->tcp_free_list != NULL)
1948 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
1949
1950 tcp_time_wait->tcp_time_wait_tid =
1951 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY);
1952 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1953 }
1954 /*
1955 * Reply to a clients T_CONN_RES TPI message. This function
1956 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1957 * on the acceptor STREAM and processed in tcp_wput_accept().
1958 * Read the block comment on top of tcp_conn_request().
1959 */
1960 static void
1961 tcp_accept(tcp_t *listener, mblk_t *mp)
1962 {
1963 tcp_t *acceptor;
1964 tcp_t *eager;
1965 tcp_t *tcp;
1966 struct T_conn_res *tcr;
1967 t_uscalar_t acceptor_id;
1968 t_scalar_t seqnum;
1969 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */
1970 mblk_t *ok_mp;
1971 mblk_t *mp1;
1972 tcp_stack_t *tcps = listener->tcp_tcps;
1973
1974 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1975 tcp_err_ack(listener, mp, TPROTO, 0);
1976 return;
1977 }
1978 tcr = (struct T_conn_res *)mp->b_rptr;
1979
1980 /*
1981 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1982 * read side queue of the streams device underneath us i.e. the
1983 * read side queue of 'ip'. Since we can't deference QUEUE_ptr we
1984 * look it up in the queue_hash. Under LP64 it sends down the
1985 * minor_t of the accepting endpoint.
1986 *
1987 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1988 * fanout hash lock is held.
1989 * This prevents any thread from entering the acceptor queue from
1990 * below (since it has not been hard bound yet i.e. any inbound
1991 * packets will arrive on the listener or default tcp queue and
1992 * go through tcp_lookup).
1993 * The CONN_INC_REF will prevent the acceptor from closing.
1994 *
1995 * XXX It is still possible for a tli application to send down data
1996 * on the accepting stream while another thread calls t_accept.
1997 * This should not be a problem for well-behaved applications since
1998 * the T_OK_ACK is sent after the queue swapping is completed.
1999 *
2000 * If the accepting fd is the same as the listening fd, avoid
2001 * queue hash lookup since that will return an eager listener in a
2002 * already established state.
2003 */
2004 acceptor_id = tcr->ACCEPTOR_id;
2005 mutex_enter(&listener->tcp_eager_lock);
2006 if (listener->tcp_acceptor_id == acceptor_id) {
2007 eager = listener->tcp_eager_next_q;
2008 /* only count how many T_CONN_INDs so don't count q0 */
2009 if ((listener->tcp_conn_req_cnt_q != 1) ||
2010 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
2011 mutex_exit(&listener->tcp_eager_lock);
2012 tcp_err_ack(listener, mp, TBADF, 0);
2013 return;
2014 }
2015 if (listener->tcp_conn_req_cnt_q0 != 0) {
2016 /* Throw away all the eagers on q0. */
2017 tcp_eager_cleanup(listener, 1);
2018 }
2019 if (listener->tcp_syn_defense) {
2020 listener->tcp_syn_defense = B_FALSE;
2021 if (listener->tcp_ip_addr_cache != NULL) {
2022 kmem_free(listener->tcp_ip_addr_cache,
2023 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2024 listener->tcp_ip_addr_cache = NULL;
2025 }
2026 }
2027 /*
2028 * Transfer tcp_conn_req_max to the eager so that when
2029 * a disconnect occurs we can revert the endpoint to the
2030 * listen state.
2031 */
2032 eager->tcp_conn_req_max = listener->tcp_conn_req_max;
2033 ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
2034 /*
2035 * Get a reference on the acceptor just like the
2036 * tcp_acceptor_hash_lookup below.
2037 */
2038 acceptor = listener;
2039 CONN_INC_REF(acceptor->tcp_connp);
2040 } else {
2041 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
2042 if (acceptor == NULL) {
2043 if (listener->tcp_debug) {
2044 (void) strlog(TCP_MOD_ID, 0, 1,
2045 SL_ERROR|SL_TRACE,
2046 "tcp_accept: did not find acceptor 0x%x\n",
2047 acceptor_id);
2048 }
2049 mutex_exit(&listener->tcp_eager_lock);
2050 tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
2051 return;
2052 }
2053 /*
2054 * Verify acceptor state. The acceptable states for an acceptor
2055 * include TCPS_IDLE and TCPS_BOUND.
2056 */
2057 switch (acceptor->tcp_state) {
2058 case TCPS_IDLE:
2059 /* FALLTHRU */
2060 case TCPS_BOUND:
2061 break;
2062 default:
2063 CONN_DEC_REF(acceptor->tcp_connp);
2064 mutex_exit(&listener->tcp_eager_lock);
2065 tcp_err_ack(listener, mp, TOUTSTATE, 0);
2066 return;
2067 }
2068 }
2069
2070 /* The listener must be in TCPS_LISTEN */
2071 if (listener->tcp_state != TCPS_LISTEN) {
2072 CONN_DEC_REF(acceptor->tcp_connp);
2073 mutex_exit(&listener->tcp_eager_lock);
2074 tcp_err_ack(listener, mp, TOUTSTATE, 0);
2075 return;
2076 }
2077
2078 /*
2079 * Rendezvous with an eager connection request packet hanging off
2080 * 'tcp' that has the 'seqnum' tag. We tagged the detached open
2081 * tcp structure when the connection packet arrived in
2082 * tcp_conn_request().
2083 */
2084 seqnum = tcr->SEQ_number;
2085 eager = listener;
2086 do {
2087 eager = eager->tcp_eager_next_q;
2088 if (eager == NULL) {
2089 CONN_DEC_REF(acceptor->tcp_connp);
2090 mutex_exit(&listener->tcp_eager_lock);
2091 tcp_err_ack(listener, mp, TBADSEQ, 0);
2092 return;
2093 }
2094 } while (eager->tcp_conn_req_seqnum != seqnum);
2095 mutex_exit(&listener->tcp_eager_lock);
2096
2097 /*
2098 * At this point, both acceptor and listener have 2 ref
2099 * that they begin with. Acceptor has one additional ref
2100 * we placed in lookup while listener has 3 additional
2101 * ref for being behind the squeue (tcp_accept() is
2102 * done on listener's squeue); being in classifier hash;
2103 * and eager's ref on listener.
2104 */
2105 ASSERT(listener->tcp_connp->conn_ref >= 5);
2106 ASSERT(acceptor->tcp_connp->conn_ref >= 3);
2107
2108 /*
2109 * The eager at this point is set in its own squeue and
2110 * could easily have been killed (tcp_accept_finish will
2111 * deal with that) because of a TH_RST so we can only
2112 * ASSERT for a single ref.
2113 */
2114 ASSERT(eager->tcp_connp->conn_ref >= 1);
2115
2116 /* Pre allocate the stroptions mblk also */
2117 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
2118 if (opt_mp == NULL) {
2119 CONN_DEC_REF(acceptor->tcp_connp);
2120 CONN_DEC_REF(eager->tcp_connp);
2121 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2122 return;
2123 }
2124 DB_TYPE(opt_mp) = M_SETOPTS;
2125 opt_mp->b_wptr += sizeof (struct stroptions);
2126
2127 /*
2128 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
2129 * from listener to acceptor. The message is chained on opt_mp
2130 * which will be sent onto eager's squeue.
2131 */
2132 if (listener->tcp_bound_if != 0) {
2133 /* allocate optmgmt req */
2134 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2135 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
2136 sizeof (int));
2137 if (mp1 != NULL)
2138 linkb(opt_mp, mp1);
2139 }
2140 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
2141 uint_t on = 1;
2142
2143 /* allocate optmgmt req */
2144 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2145 IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
2146 if (mp1 != NULL)
2147 linkb(opt_mp, mp1);
2148 }
2149
2150 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */
2151 if ((mp1 = copymsg(mp)) == NULL) {
2152 CONN_DEC_REF(acceptor->tcp_connp);
2153 CONN_DEC_REF(eager->tcp_connp);
2154 freemsg(opt_mp);
2155 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2156 return;
2157 }
2158
2159 tcr = (struct T_conn_res *)mp1->b_rptr;
2160
2161 /*
2162 * This is an expanded version of mi_tpi_ok_ack_alloc()
2163 * which allocates a larger mblk and appends the new
2164 * local address to the ok_ack. The address is copied by
2165 * soaccept() for getsockname().
2166 */
2167 {
2168 int extra;
2169
2170 extra = (eager->tcp_family == AF_INET) ?
2171 sizeof (sin_t) : sizeof (sin6_t);
2172
2173 /*
2174 * Try to re-use mp, if possible. Otherwise, allocate
2175 * an mblk and return it as ok_mp. In any case, mp
2176 * is no longer usable upon return.
2177 */
2178 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
2179 CONN_DEC_REF(acceptor->tcp_connp);
2180 CONN_DEC_REF(eager->tcp_connp);
2181 freemsg(opt_mp);
2182 /* Original mp has been freed by now, so use mp1 */
2183 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
2184 return;
2185 }
2186
2187 mp = NULL; /* We should never use mp after this point */
2188
2189 switch (extra) {
2190 case sizeof (sin_t): {
2191 sin_t *sin = (sin_t *)ok_mp->b_wptr;
2192
2193 ok_mp->b_wptr += extra;
2194 sin->sin_family = AF_INET;
2195 sin->sin_port = eager->tcp_lport;
2196 sin->sin_addr.s_addr =
2197 eager->tcp_ipha->ipha_src;
2198 break;
2199 }
2200 case sizeof (sin6_t): {
2201 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
2202
2203 ok_mp->b_wptr += extra;
2204 sin6->sin6_family = AF_INET6;
2205 sin6->sin6_port = eager->tcp_lport;
2206 if (eager->tcp_ipversion == IPV4_VERSION) {
2207 sin6->sin6_flowinfo = 0;
2208 IN6_IPADDR_TO_V4MAPPED(
2209 eager->tcp_ipha->ipha_src,
2210 &sin6->sin6_addr);
2211 } else {
2212 ASSERT(eager->tcp_ip6h != NULL);
2213 sin6->sin6_flowinfo =
2214 eager->tcp_ip6h->ip6_vcf &
2215 ~IPV6_VERS_AND_FLOW_MASK;
2216 sin6->sin6_addr =
2217 eager->tcp_ip6h->ip6_src;
2218 }
2219 sin6->sin6_scope_id = 0;
2220 sin6->__sin6_src_id = 0;
2221 break;
2222 }
2223 default:
2224 break;
2225 }
2226 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
2227 }
2228
2229 /*
2230 * If there are no options we know that the T_CONN_RES will
2231 * succeed. However, we can't send the T_OK_ACK upstream until
2232 * the tcp_accept_swap is done since it would be dangerous to
2233 * let the application start using the new fd prior to the swap.
2234 */
2235 tcp_accept_swap(listener, acceptor, eager);
2236
2237 /*
2238 * tcp_accept_swap unlinks eager from listener but does not drop
2239 * the eager's reference on the listener.
2240 */
2241 ASSERT(eager->tcp_listener == NULL);
2242 ASSERT(listener->tcp_connp->conn_ref >= 5);
2243
2244 /*
2245 * The eager is now associated with its own queue. Insert in
2246 * the hash so that the connection can be reused for a future
2247 * T_CONN_RES.
2248 */
2249 tcp_acceptor_hash_insert(acceptor_id, eager);
2250
2251 /*
2252 * We now do the processing of options with T_CONN_RES.
2253 * We delay till now since we wanted to have queue to pass to
2254 * option processing routines that points back to the right
2255 * instance structure which does not happen until after
2256 * tcp_accept_swap().
2257 *
2258 * Note:
2259 * The sanity of the logic here assumes that whatever options
2260 * are appropriate to inherit from listner=>eager are done
2261 * before this point, and whatever were to be overridden (or not)
2262 * in transfer logic from eager=>acceptor in tcp_accept_swap().
2263 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
2264 * before its ACCEPTOR_id comes down in T_CONN_RES ]
2265 * This may not be true at this point in time but can be fixed
2266 * independently. This option processing code starts with
2267 * the instantiated acceptor instance and the final queue at
2268 * this point.
2269 */
2270
2271 if (tcr->OPT_length != 0) {
2272 /* Options to process */
2273 int t_error = 0;
2274 int sys_error = 0;
2275 int do_disconnect = 0;
2276
2277 if (tcp_conprim_opt_process(eager, mp1,
2278 &do_disconnect, &t_error, &sys_error) < 0) {
2279 eager->tcp_accept_error = 1;
2280 if (do_disconnect) {
2281 /*
2282 * An option failed which does not allow
2283 * connection to be accepted.
2284 *
2285 * We allow T_CONN_RES to succeed and
2286 * put a T_DISCON_IND on the eager queue.
2287 */
2288 ASSERT(t_error == 0 && sys_error == 0);
2289 eager->tcp_send_discon_ind = 1;
2290 } else {
2291 ASSERT(t_error != 0);
2292 freemsg(ok_mp);
2293 /*
2294 * Original mp was either freed or set
2295 * to ok_mp above, so use mp1 instead.
2296 */
2297 tcp_err_ack(listener, mp1, t_error, sys_error);
2298 goto finish;
2299 }
2300 }
2301 /*
2302 * Most likely success in setting options (except if
2303 * eager->tcp_send_discon_ind set).
2304 * mp1 option buffer represented by OPT_length/offset
2305 * potentially modified and contains results of setting
2306 * options at this point
2307 */
2308 }
2309
2310 /* We no longer need mp1, since all options processing has passed */
2311 freemsg(mp1);
2312
2313 putnext(listener->tcp_rq, ok_mp);
2314
2315 mutex_enter(&listener->tcp_eager_lock);
2316 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
2317 tcp_t *tail;
2318 mblk_t *conn_ind;
2319
2320 /*
2321 * This path should not be executed if listener and
2322 * acceptor streams are the same.
2323 */
2324 ASSERT(listener != acceptor);
2325
2326 tcp = listener->tcp_eager_prev_q0;
2327 /*
2328 * listener->tcp_eager_prev_q0 points to the TAIL of the
2329 * deferred T_conn_ind queue. We need to get to the head of
2330 * the queue in order to send up T_conn_ind the same order as
2331 * how the 3WHS is completed.
2332 */
2333 while (tcp != listener) {
2334 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
2335 break;
2336 else
2337 tcp = tcp->tcp_eager_prev_q0;
2338 }
2339 ASSERT(tcp != listener);
2340 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
2341 ASSERT(conn_ind != NULL);
2342 tcp->tcp_conn.tcp_eager_conn_ind = NULL;
2343
2344 /* Move from q0 to q */
2345 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
2346 listener->tcp_conn_req_cnt_q0--;
2347 listener->tcp_conn_req_cnt_q++;
2348 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2349 tcp->tcp_eager_prev_q0;
2350 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2351 tcp->tcp_eager_next_q0;
2352 tcp->tcp_eager_prev_q0 = NULL;
2353 tcp->tcp_eager_next_q0 = NULL;
2354 tcp->tcp_conn_def_q0 = B_FALSE;
2355
2356 /* Make sure the tcp isn't in the list of droppables */
2357 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
2358 tcp->tcp_eager_prev_drop_q0 == NULL);
2359
2360 /*
2361 * Insert at end of the queue because sockfs sends
2362 * down T_CONN_RES in chronological order. Leaving
2363 * the older conn indications at front of the queue
2364 * helps reducing search time.
2365 */
2366 tail = listener->tcp_eager_last_q;
2367 if (tail != NULL)
2368 tail->tcp_eager_next_q = tcp;
2369 else
2370 listener->tcp_eager_next_q = tcp;
2371 listener->tcp_eager_last_q = tcp;
2372 tcp->tcp_eager_next_q = NULL;
2373 mutex_exit(&listener->tcp_eager_lock);
2374 putnext(tcp->tcp_rq, conn_ind);
2375 } else {
2376 mutex_exit(&listener->tcp_eager_lock);
2377 }
2378
2379 /*
2380 * Done with the acceptor - free it
2381 *
2382 * Note: from this point on, no access to listener should be made
2383 * as listener can be equal to acceptor.
2384 */
2385 finish:
2386 ASSERT(acceptor->tcp_detached);
2387 ASSERT(tcps->tcps_g_q != NULL);
2388 acceptor->tcp_rq = tcps->tcps_g_q;
2389 acceptor->tcp_wq = WR(tcps->tcps_g_q);
2390 (void) tcp_clean_death(acceptor, 0, 2);
2391 CONN_DEC_REF(acceptor->tcp_connp);
2392
2393 /*
2394 * In case we already received a FIN we have to make tcp_rput send
2395 * the ordrel_ind. This will also send up a window update if the window
2396 * has opened up.
2397 *
2398 * In the normal case of a successful connection acceptance
2399 * we give the O_T_BIND_REQ to the read side put procedure as an
2400 * indication that this was just accepted. This tells tcp_rput to
2401 * pass up any data queued in tcp_rcv_list.
2402 *
2403 * In the fringe case where options sent with T_CONN_RES failed and
2404 * we required, we would be indicating a T_DISCON_IND to blow
2405 * away this connection.
2406 */
2407
2408 /*
2409 * XXX: we currently have a problem if XTI application closes the
2410 * acceptor stream in between. This problem exists in on10-gate also
2411 * and is well know but nothing can be done short of major rewrite
2412 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
2413 * eager same squeue as listener (we can distinguish non socket
2414 * listeners at the time of handling a SYN in tcp_conn_request)
2415 * and do most of the work that tcp_accept_finish does here itself
2416 * and then get behind the acceptor squeue to access the acceptor
2417 * queue.
2418 */
2419 /*
2420 * We already have a ref on tcp so no need to do one before squeue_fill
2421 */
2422 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
2423 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
2424 }
2425
2426 /*
2427 * Swap information between the eager and acceptor for a TLI/XTI client.
2428 * The sockfs accept is done on the acceptor stream and control goes
2429 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
2430 * called. In either case, both the eager and listener are in their own
2431 * perimeter (squeue) and the code has to deal with potential race.
2432 *
2433 * See the block comment on top of tcp_accept() and tcp_wput_accept().
2434 */
2435 static void
2436 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
2437 {
2438 conn_t *econnp, *aconnp;
2439
2440 ASSERT(eager->tcp_rq == listener->tcp_rq);
2441 ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
2442 ASSERT(!eager->tcp_hard_bound);
2443 ASSERT(!TCP_IS_SOCKET(acceptor));
2444 ASSERT(!TCP_IS_SOCKET(eager));
2445 ASSERT(!TCP_IS_SOCKET(listener));
2446
2447 acceptor->tcp_detached = B_TRUE;
2448 /*
2449 * To permit stream re-use by TLI/XTI, the eager needs a copy of
2450 * the acceptor id.
2451 */
2452 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
2453
2454 /* remove eager from listen list... */
2455 mutex_enter(&listener->tcp_eager_lock);
2456 tcp_eager_unlink(eager);
2457 ASSERT(eager->tcp_eager_next_q == NULL &&
2458 eager->tcp_eager_last_q == NULL);
2459 ASSERT(eager->tcp_eager_next_q0 == NULL &&
2460 eager->tcp_eager_prev_q0 == NULL);
2461 mutex_exit(&listener->tcp_eager_lock);
2462 eager->tcp_rq = acceptor->tcp_rq;
2463 eager->tcp_wq = acceptor->tcp_wq;
2464
2465 econnp = eager->tcp_connp;
2466 aconnp = acceptor->tcp_connp;
2467
2468 eager->tcp_rq->q_ptr = econnp;
2469 eager->tcp_wq->q_ptr = econnp;
2470
2471 /*
2472 * In the TLI/XTI loopback case, we are inside the listener's squeue,
2473 * which might be a different squeue from our peer TCP instance.
2474 * For TCP Fusion, the peer expects that whenever tcp_detached is
2475 * clear, our TCP queues point to the acceptor's queues. Thus, use
2476 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
2477 * above reach global visibility prior to the clearing of tcp_detached.
2478 */
2479 membar_producer();
2480 eager->tcp_detached = B_FALSE;
2481
2482 ASSERT(eager->tcp_ack_tid == 0);
2483
2484 econnp->conn_dev = aconnp->conn_dev;
2485 econnp->conn_minor_arena = aconnp->conn_minor_arena;
2486 ASSERT(econnp->conn_minor_arena != NULL);
2487 if (eager->tcp_cred != NULL)
2488 crfree(eager->tcp_cred);
2489 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
2490 ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
2491 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
2492
2493 aconnp->conn_cred = NULL;
2494
2495 econnp->conn_zoneid = aconnp->conn_zoneid;
2496 econnp->conn_allzones = aconnp->conn_allzones;
2497
2498 econnp->conn_mac_exempt = aconnp->conn_mac_exempt;
2499 aconnp->conn_mac_exempt = B_FALSE;
2500
2501 ASSERT(aconnp->conn_peercred == NULL);
2502
2503 /* Do the IPC initialization */
2504 CONN_INC_REF(econnp);
2505
2506 econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
2507 econnp->conn_af_isv6 = aconnp->conn_af_isv6;
2508 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;
2509
2510 /* Done with old IPC. Drop its ref on its connp */
2511 CONN_DEC_REF(aconnp);
2512 }
2513
2514
2515 /*
2516 * Adapt to the information, such as rtt and rtt_sd, provided from the
2517 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
2518 *
2519 * Checks for multicast and broadcast destination address.
2520 * Returns zero on failure; non-zero if ok.
2521 *
2522 * Note that the MSS calculation here is based on the info given in
2523 * the IRE. We do not do any calculation based on TCP options. They
2524 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
2525 * knows which options to use.
2526 *
2527 * Note on how TCP gets its parameters for a connection.
2528 *
2529 * When a tcp_t structure is allocated, it gets all the default parameters.
2530 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
2531 * spipe, rpipe, ... from the route metrics. Route metric overrides the
2532 * default. But if there is an associated tcp_host_param, it will override
2533 * the metrics.
2534 *
2535 * An incoming SYN with a multicast or broadcast destination address, is dropped
2536 * in 1 of 2 places.
2537 *
2538 * 1. If the packet was received over the wire it is dropped in
2539 * ip_rput_process_broadcast()
2540 *
2541 * 2. If the packet was received through internal IP loopback, i.e. the packet
2542 * was generated and received on the same machine, it is dropped in
2543 * ip_wput_local()
2544 *
2545 * An incoming SYN with a multicast or broadcast source address is always
2546 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
2547 * reject an attempt to connect to a broadcast or multicast (destination)
2548 * address.
2549 */
2550 static int
2551 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
2552 {
2553 tcp_hsp_t *hsp;
2554 ire_t *ire;
2555 ire_t *sire = NULL;
2556 iulp_t *ire_uinfo = NULL;
2557 uint32_t mss_max;
2558 uint32_t mss;
2559 boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
2560 conn_t *connp = tcp->tcp_connp;
2561 boolean_t ire_cacheable = B_FALSE;
2562 zoneid_t zoneid = connp->conn_zoneid;
2563 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
2564 MATCH_IRE_SECATTR;
2565 ts_label_t *tsl = crgetlabel(CONN_CRED(connp));
2566 ill_t *ill = NULL;
2567 boolean_t incoming = (ire_mp == NULL);
2568 tcp_stack_t *tcps = tcp->tcp_tcps;
2569 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
2570
2571 ASSERT(connp->conn_ire_cache == NULL);
2572
2573 if (tcp->tcp_ipversion == IPV4_VERSION) {
2574
2575 if (CLASSD(tcp->tcp_connp->conn_rem)) {
2576 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2577 return (0);
2578 }
2579 /*
2580 * If IP_NEXTHOP is set, then look for an IRE_CACHE
2581 * for the destination with the nexthop as gateway.
2582 * ire_ctable_lookup() is used because this particular
2583 * ire, if it exists, will be marked private.
2584 * If that is not available, use the interface ire
2585 * for the nexthop.
2586 *
2587 * TSol: tcp_update_label will detect label mismatches based
2588 * only on the destination's label, but that would not
2589 * detect label mismatches based on the security attributes
2590 * of routes or next hop gateway. Hence we need to pass the
2591 * label to ire_ftable_lookup below in order to locate the
2592 * right prefix (and/or) ire cache. Similarly we also need
2593 * pass the label to the ire_cache_lookup below to locate
2594 * the right ire that also matches on the label.
2595 */
2596 if (tcp->tcp_connp->conn_nexthop_set) {
2597 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
2598 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
2599 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW,
2600 ipst);
2601 if (ire == NULL) {
2602 ire = ire_ftable_lookup(
2603 tcp->tcp_connp->conn_nexthop_v4,
2604 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
2605 tsl, match_flags, ipst);
2606 if (ire == NULL)
2607 return (0);
2608 } else {
2609 ire_uinfo = &ire->ire_uinfo;
2610 }
2611 } else {
2612 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
2613 zoneid, tsl, ipst);
2614 if (ire != NULL) {
2615 ire_cacheable = B_TRUE;
2616 ire_uinfo = (ire_mp != NULL) ?
2617 &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2618 &ire->ire_uinfo;
2619
2620 } else {
2621 if (ire_mp == NULL) {
2622 ire = ire_ftable_lookup(
2623 tcp->tcp_connp->conn_rem,
2624 0, 0, 0, NULL, &sire, zoneid, 0,
2625 tsl, (MATCH_IRE_RECURSIVE |
2626 MATCH_IRE_DEFAULT), ipst);
2627 if (ire == NULL)
2628 return (0);
2629 ire_uinfo = (sire != NULL) ?
2630 &sire->ire_uinfo :
2631 &ire->ire_uinfo;
2632 } else {
2633 ire = (ire_t *)ire_mp->b_rptr;
2634 ire_uinfo =
2635 &((ire_t *)
2636 ire_mp->b_rptr)->ire_uinfo;
2637 }
2638 }
2639 }
2640 ASSERT(ire != NULL);
2641
2642 if ((ire->ire_src_addr == INADDR_ANY) ||
2643 (ire->ire_type & IRE_BROADCAST)) {
2644 /*
2645 * ire->ire_mp is non null when ire_mp passed in is used
2646 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2647 */
2648 if (ire->ire_mp == NULL)
2649 ire_refrele(ire);
2650 if (sire != NULL)
2651 ire_refrele(sire);
2652 return (0);
2653 }
2654
2655 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
2656 ipaddr_t src_addr;
2657
2658 /*
2659 * ip_bind_connected() has stored the correct source
2660 * address in conn_src.
2661 */
2662 src_addr = tcp->tcp_connp->conn_src;
2663 tcp->tcp_ipha->ipha_src = src_addr;
2664 /*
2665 * Copy of the src addr. in tcp_t is needed
2666 * for the lookup funcs.
2667 */
2668 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
2669 }
2670 /*
2671 * Set the fragment bit so that IP will tell us if the MTU
2672 * should change. IP tells us the latest setting of
2673 * ip_path_mtu_discovery through ire_frag_flag.
2674 */
2675 if (ipst->ips_ip_path_mtu_discovery) {
2676 tcp->tcp_ipha->ipha_fragment_offset_and_flags =
2677 htons(IPH_DF);
2678 }
2679 /*
2680 * If ire_uinfo is NULL, this is the IRE_INTERFACE case
2681 * for IP_NEXTHOP. No cache ire has been found for the
2682 * destination and we are working with the nexthop's
2683 * interface ire. Since we need to forward all packets
2684 * to the nexthop first, we "blindly" set tcp_localnet
2685 * to false, eventhough the destination may also be
2686 * onlink.
2687 */
2688 if (ire_uinfo == NULL)
2689 tcp->tcp_localnet = 0;
2690 else
2691 tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
2692 } else {
2693 /*
2694 * For incoming connection ire_mp = NULL
2695 * For outgoing connection ire_mp != NULL
2696 * Technically we should check conn_incoming_ill
2697 * when ire_mp is NULL and conn_outgoing_ill when
2698 * ire_mp is non-NULL. But this is performance
2699 * critical path and for IPV*_BOUND_IF, outgoing
2700 * and incoming ill are always set to the same value.
2701 */
2702 ill_t *dst_ill = NULL;
2703 ipif_t *dst_ipif = NULL;
2704
2705 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
2706
2707 if (connp->conn_outgoing_ill != NULL) {
2708 /* Outgoing or incoming path */
2709 int err;
2710
2711 dst_ill = conn_get_held_ill(connp,
2712 &connp->conn_outgoing_ill, &err);
2713 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
2714 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
2715 return (0);
2716 }
2717 match_flags |= MATCH_IRE_ILL;
2718 dst_ipif = dst_ill->ill_ipif;
2719 }
2720 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
2721 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst);
2722
2723 if (ire != NULL) {
2724 ire_cacheable = B_TRUE;
2725 ire_uinfo = (ire_mp != NULL) ?
2726 &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2727 &ire->ire_uinfo;
2728 } else {
2729 if (ire_mp == NULL) {
2730 ire = ire_ftable_lookup_v6(
2731 &tcp->tcp_connp->conn_remv6,
2732 0, 0, 0, dst_ipif, &sire, zoneid,
2733 0, tsl, match_flags, ipst);
2734 if (ire == NULL) {
2735 if (dst_ill != NULL)
2736 ill_refrele(dst_ill);
2737 return (0);
2738 }
2739 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
2740 &ire->ire_uinfo;
2741 } else {
2742 ire = (ire_t *)ire_mp->b_rptr;
2743 ire_uinfo =
2744 &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
2745 }
2746 }
2747 if (dst_ill != NULL)
2748 ill_refrele(dst_ill);
2749
2750 ASSERT(ire != NULL);
2751 ASSERT(ire_uinfo != NULL);
2752
2753 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
2754 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
2755 /*
2756 * ire->ire_mp is non null when ire_mp passed in is used
2757 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2758 */
2759 if (ire->ire_mp == NULL)
2760 ire_refrele(ire);
2761 if (sire != NULL)
2762 ire_refrele(sire);
2763 return (0);
2764 }
2765
2766 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
2767 in6_addr_t src_addr;
2768
2769 /*
2770 * ip_bind_connected_v6() has stored the correct source
2771 * address per IPv6 addr. selection policy in
2772 * conn_src_v6.
2773 */
2774 src_addr = tcp->tcp_connp->conn_srcv6;
2775
2776 tcp->tcp_ip6h->ip6_src = src_addr;
2777 /*
2778 * Copy of the src addr. in tcp_t is needed
2779 * for the lookup funcs.
2780 */
2781 tcp->tcp_ip_src_v6 = src_addr;
2782 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src,
2783 &connp->conn_srcv6));
2784 }
2785 tcp->tcp_localnet =
2786 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
2787 }
2788
2789 /*
2790 * This allows applications to fail quickly when connections are made
2791 * to dead hosts. Hosts can be labeled dead by adding a reject route
2792 * with both the RTF_REJECT and RTF_PRIVATE flags set.
2793 */
2794 if ((ire->ire_flags & RTF_REJECT) &&
2795 (ire->ire_flags & RTF_PRIVATE))
2796 goto error;
2797
2798 /*
2799 * Make use of the cached rtt and rtt_sd values to calculate the
2800 * initial RTO. Note that they are already initialized in
2801 * tcp_init_values().
2802 * If ire_uinfo is NULL, i.e., we do not have a cache ire for
2803 * IP_NEXTHOP, but instead are using the interface ire for the
2804 * nexthop, then we do not use the ire_uinfo from that ire to
2805 * do any initializations.
2806 */
2807 if (ire_uinfo != NULL) {
2808 if (ire_uinfo->iulp_rtt != 0) {
2809 clock_t rto;
2810
2811 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
2812 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
2813 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2814 tcps->tcps_rexmit_interval_extra +
2815 (tcp->tcp_rtt_sa >> 5);
2816
2817 if (rto > tcps->tcps_rexmit_interval_max) {
2818 tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
2819 } else if (rto < tcps->tcps_rexmit_interval_min) {
2820 tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
2821 } else {
2822 tcp->tcp_rto = rto;
2823 }
2824 }
2825 if (ire_uinfo->iulp_ssthresh != 0)
2826 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
2827 else
2828 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2829 if (ire_uinfo->iulp_spipe > 0) {
2830 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
2831 tcps->tcps_max_buf);
2832 if (tcps->tcps_snd_lowat_fraction != 0)
2833 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2834 tcps->tcps_snd_lowat_fraction;
2835 (void) tcp_maxpsz_set(tcp, B_TRUE);
2836 }
2837 /*
2838 * Note that up till now, acceptor always inherits receive
2839 * window from the listener. But if there is a metrics
2840 * associated with a host, we should use that instead of
2841 * inheriting it from listener. Thus we need to pass this
2842 * info back to the caller.
2843 */
2844 if (ire_uinfo->iulp_rpipe > 0) {
2845 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe,
2846 tcps->tcps_max_buf);
2847 }
2848
2849 if (ire_uinfo->iulp_rtomax > 0) {
2850 tcp->tcp_second_timer_threshold =
2851 ire_uinfo->iulp_rtomax;
2852 }
2853
2854 /*
2855 * Use the metric option settings, iulp_tstamp_ok and
2856 * iulp_wscale_ok, only for active open. What this means
2857 * is that if the other side uses timestamp or window
2858 * scale option, TCP will also use those options. That
2859 * is for passive open. If the application sets a
2860 * large window, window scale is enabled regardless of
2861 * the value in iulp_wscale_ok. This is the behavior
2862 * since 2.6. So we keep it.
2863 * The only case left in passive open processing is the
2864 * check for SACK.
2865 * For ECN, it should probably be like SACK. But the
2866 * current value is binary, so we treat it like the other
2867 * cases. The metric only controls active open.For passive
2868 * open, the ndd param, tcp_ecn_permitted, controls the
2869 * behavior.
2870 */
2871 if (!tcp_detached) {
2872 /*
2873 * The if check means that the following can only
2874 * be turned on by the metrics only IRE, but not off.
2875 */
2876 if (ire_uinfo->iulp_tstamp_ok)
2877 tcp->tcp_snd_ts_ok = B_TRUE;
2878 if (ire_uinfo->iulp_wscale_ok)
2879 tcp->tcp_snd_ws_ok = B_TRUE;
2880 if (ire_uinfo->iulp_sack == 2)
2881 tcp->tcp_snd_sack_ok = B_TRUE;
2882 if (ire_uinfo->iulp_ecn_ok)
2883 tcp->tcp_ecn_ok = B_TRUE;
2884 } else {
2885 /*
2886 * Passive open.
2887 *
2888 * As above, the if check means that SACK can only be
2889 * turned on by the metric only IRE.
2890 */
2891 if (ire_uinfo->iulp_sack > 0) {
2892 tcp->tcp_snd_sack_ok = B_TRUE;
2893 }
2894 }
2895 }
2896
2897
2898 /*
2899 * XXX: Note that currently, ire_max_frag can be as small as 68
2900 * because of PMTUd. So tcp_mss may go to negative if combined
2901 * length of all those options exceeds 28 bytes. But because
2902 * of the tcp_mss_min check below, we may not have a problem if
2903 * tcp_mss_min is of a reasonable value. The default is 1 so
2904 * the negative problem still exists. And the check defeats PMTUd.
2905 * In fact, if PMTUd finds that the MSS should be smaller than
2906 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
2907 * value.
2908 *
2909 * We do not deal with that now. All those problems related to
2910 * PMTUd will be fixed later.
2911 */
2912 ASSERT(ire->ire_max_frag != 0);
2913 mss = tcp->tcp_if_mtu = ire->ire_max_frag;
2914 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) {
2915 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) {
2916 mss = MIN(mss, IPV6_MIN_MTU);
2917 }
2918 }
2919
2920 /* Sanity check for MSS value. */
2921 if (tcp->tcp_ipversion == IPV4_VERSION)
2922 mss_max = tcps->tcps_mss_max_ipv4;
2923 else
2924 mss_max = tcps->tcps_mss_max_ipv6;
2925
2926 if (tcp->tcp_ipversion == IPV6_VERSION &&
2927 (ire->ire_frag_flag & IPH_FRAG_HDR)) {
2928 /*
2929 * After receiving an ICMPv6 "packet too big" message with a
2930 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
2931 * will insert a 8-byte fragment header in every packet; we
2932 * reduce the MSS by that amount here.
2933 */
2934 mss -= sizeof (ip6_frag_t);
2935 }
2936
2937 if (tcp->tcp_ipsec_overhead == 0)
2938 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
2939
2940 mss -= tcp->tcp_ipsec_overhead;
2941
2942 if (mss < tcps->tcps_mss_min)
2943 mss = tcps->tcps_mss_min;
2944 if (mss > mss_max)
2945 mss = mss_max;
2946
2947 /* Note that this is the maximum MSS, excluding all options. */
2948 tcp->tcp_mss = mss;
2949
2950 /*
2951 * Initialize the ISS here now that we have the full connection ID.
2952 * The RFC 1948 method of initial sequence number generation requires
2953 * knowledge of the full connection ID before setting the ISS.
2954 */
2955
2956 tcp_iss_init(tcp);
2957
2958 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL))
2959 tcp->tcp_loopback = B_TRUE;
2960
2961 if (tcp->tcp_ipversion == IPV4_VERSION) {
2962 hsp = tcp_hsp_lookup(tcp->tcp_remote, tcps);
2963 } else {
2964 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6, tcps);
2965 }
2966
2967 if (hsp != NULL) {
2968 /* Only modify if we're going to make them bigger */
2969 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) {
2970 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace;
2971 if (tcps->tcps_snd_lowat_fraction != 0)
2972 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2973 tcps->tcps_snd_lowat_fraction;
2974 }
2975
2976 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) {
2977 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace;
2978 }
2979
2980 /* Copy timestamp flag only for active open */
2981 if (!tcp_detached)
2982 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp;
2983 }
2984
2985 if (sire != NULL)
2986 IRE_REFRELE(sire);
2987
2988 /*
2989 * If we got an IRE_CACHE and an ILL, go through their properties;
2990 * otherwise, this is deferred until later when we have an IRE_CACHE.
2991 */
2992 if (tcp->tcp_loopback ||
2993 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) {
2994 /*
2995 * For incoming, see if this tcp may be MDT-capable. For
2996 * outgoing, this process has been taken care of through
2997 * tcp_rput_other.
2998 */
2999 tcp_ire_ill_check(tcp, ire, ill, incoming);
3000 tcp->tcp_ire_ill_check_done = B_TRUE;
3001 }
3002
3003 mutex_enter(&connp->conn_lock);
3004 /*
3005 * Make sure that conn is not marked incipient
3006 * for incoming connections. A blind
3007 * removal of incipient flag is cheaper than
3008 * check and removal.
3009 */
3010 connp->conn_state_flags &= ~CONN_INCIPIENT;
3011
3012 /*
3013 * Must not cache forwarding table routes
3014 * or recache an IRE after the conn_t has
3015 * had conn_ire_cache cleared and is flagged
3016 * unusable, (see the CONN_CACHE_IRE() macro).
3017 */
3018 if (ire_cacheable && CONN_CACHE_IRE(connp)) {
3019 rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
3020 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
3021 connp->conn_ire_cache = ire;
3022 IRE_UNTRACE_REF(ire);
3023 rw_exit(&ire->ire_bucket->irb_lock);
3024 mutex_exit(&connp->conn_lock);
3025 return (1);
3026 }
3027 rw_exit(&ire->ire_bucket->irb_lock);
3028 }
3029 mutex_exit(&connp->conn_lock);
3030
3031 if (ire->ire_mp == NULL)
3032 ire_refrele(ire);
3033 return (1);
3034
3035 error:
3036 if (ire->ire_mp == NULL)
3037 ire_refrele(ire);
3038 if (sire != NULL)
3039 ire_refrele(sire);
3040 return (0);
3041 }
3042
3043 /*
3044 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a
3045 * O_T_BIND_REQ/T_BIND_REQ message.
3046 */
3047 static void
3048 tcp_bind(tcp_t *tcp, mblk_t *mp)
3049 {
3050 sin_t *sin;
3051 sin6_t *sin6;
3052 mblk_t *mp1;
3053 in_port_t requested_port;
3054 in_port_t allocated_port;
3055 struct T_bind_req *tbr;
3056 boolean_t bind_to_req_port_only;
3057 boolean_t backlog_update = B_FALSE;
3058 boolean_t user_specified;
3059 in6_addr_t v6addr;
3060 ipaddr_t v4addr;
3061 uint_t origipversion;
3062 int err;
3063 queue_t *q = tcp->tcp_wq;
3064 conn_t *connp = tcp->tcp_connp;
3065 mlp_type_t addrtype, mlptype;
3066 zone_t *zone;
3067 cred_t *cr;
3068 in_port_t mlp_port;
3069 tcp_stack_t *tcps = tcp->tcp_tcps;
3070
3071 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
3072 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
3073 if (tcp->tcp_debug) {
3074 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3075 "tcp_bind: bad req, len %u",
3076 (uint_t)(mp->b_wptr - mp->b_rptr));
3077 }
3078 tcp_err_ack(tcp, mp, TPROTO, 0);
3079 return;
3080 }
3081 /* Make sure the largest address fits */
3082 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
3083 if (mp1 == NULL) {
3084 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3085 return;
3086 }
3087 mp = mp1;
3088 tbr = (struct T_bind_req *)mp->b_rptr;
3089 if (tcp->tcp_state >= TCPS_BOUND) {
3090 if ((tcp->tcp_state == TCPS_BOUND ||
3091 tcp->tcp_state == TCPS_LISTEN) &&
3092 tcp->tcp_conn_req_max != tbr->CONIND_number &&
3093 tbr->CONIND_number > 0) {
3094 /*
3095 * Handle listen() increasing CONIND_number.
3096 * This is more "liberal" then what the TPI spec
3097 * requires but is needed to avoid a t_unbind
3098 * when handling listen() since the port number
3099 * might be "stolen" between the unbind and bind.
3100 */
3101 backlog_update = B_TRUE;
3102 goto do_bind;
3103 }
3104 if (tcp->tcp_debug) {
3105 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3106 "tcp_bind: bad state, %d", tcp->tcp_state);
3107 }
3108 tcp_err_ack(tcp, mp, TOUTSTATE, 0);
3109 return;
3110 }
3111 origipversion = tcp->tcp_ipversion;
3112
3113 switch (tbr->ADDR_length) {
3114 case 0: /* request for a generic port */
3115 tbr->ADDR_offset = sizeof (struct T_bind_req);
3116 if (tcp->tcp_family == AF_INET) {
3117 tbr->ADDR_length = sizeof (sin_t);
3118 sin = (sin_t *)&tbr[1];
3119 *sin = sin_null;
3120 sin->sin_family = AF_INET;
3121 mp->b_wptr = (uchar_t *)&sin[1];
3122 tcp->tcp_ipversion = IPV4_VERSION;
3123 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr);
3124 } else {
3125 ASSERT(tcp->tcp_family == AF_INET6);
3126 tbr->ADDR_length = sizeof (sin6_t);
3127 sin6 = (sin6_t *)&tbr[1];
3128 *sin6 = sin6_null;
3129 sin6->sin6_family = AF_INET6;
3130 mp->b_wptr = (uchar_t *)&sin6[1];
3131 tcp->tcp_ipversion = IPV6_VERSION;
3132 V6_SET_ZERO(v6addr);
3133 }
3134 requested_port = 0;
3135 break;
3136
3137 case sizeof (sin_t): /* Complete IPv4 address */
3138 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
3139 sizeof (sin_t));
3140 if (sin == NULL || !OK_32PTR((char *)sin)) {
3141 if (tcp->tcp_debug) {
3142 (void) strlog(TCP_MOD_ID, 0, 1,
3143 SL_ERROR|SL_TRACE,
3144 "tcp_bind: bad address parameter, "
3145 "offset %d, len %d",
3146 tbr->ADDR_offset, tbr->ADDR_length);
3147 }
3148 tcp_err_ack(tcp, mp, TPROTO, 0);
3149 return;
3150 }
3151 /*
3152 * With sockets sockfs will accept bogus sin_family in
3153 * bind() and replace it with the family used in the socket
3154 * call.
3155 */
3156 if (sin->sin_family != AF_INET ||
3157 tcp->tcp_family != AF_INET) {
3158 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3159 return;
3160 }
3161 requested_port = ntohs(sin->sin_port);
3162 tcp->tcp_ipversion = IPV4_VERSION;
3163 v4addr = sin->sin_addr.s_addr;
3164 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
3165 break;
3166
3167 case sizeof (sin6_t): /* Complete IPv6 address */
3168 sin6 = (sin6_t *)mi_offset_param(mp,
3169 tbr->ADDR_offset, sizeof (sin6_t));
3170 if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
3171 if (tcp->tcp_debug) {
3172 (void) strlog(TCP_MOD_ID, 0, 1,
3173 SL_ERROR|SL_TRACE,
3174 "tcp_bind: bad IPv6 address parameter, "
3175 "offset %d, len %d", tbr->ADDR_offset,
3176 tbr->ADDR_length);
3177 }
3178 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
3179 return;
3180 }
3181 if (sin6->sin6_family != AF_INET6 ||
3182 tcp->tcp_family != AF_INET6) {
3183 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3184 return;
3185 }
3186 requested_port = ntohs(sin6->sin6_port);
3187 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
3188 IPV4_VERSION : IPV6_VERSION;
3189 v6addr = sin6->sin6_addr;
3190 break;
3191
3192 default:
3193 if (tcp->tcp_debug) {
3194 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3195 "tcp_bind: bad address length, %d",
3196 tbr->ADDR_length);
3197 }
3198 tcp_err_ack(tcp, mp, TBADADDR, 0);
3199 return;
3200 }
3201 tcp->tcp_bound_source_v6 = v6addr;
3202
3203 /* Check for change in ipversion */
3204 if (origipversion != tcp->tcp_ipversion) {
3205 ASSERT(tcp->tcp_family == AF_INET6);
3206 err = tcp->tcp_ipversion == IPV6_VERSION ?
3207 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
3208 if (err) {
3209 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3210 return;
3211 }
3212 }
3213
3214 /*
3215 * Initialize family specific fields. Copy of the src addr.
3216 * in tcp_t is needed for the lookup funcs.
3217 */
3218 if (tcp->tcp_ipversion == IPV6_VERSION) {
3219 tcp->tcp_ip6h->ip6_src = v6addr;
3220 } else {
3221 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
3222 }
3223 tcp->tcp_ip_src_v6 = v6addr;
3224
3225 /*
3226 * For O_T_BIND_REQ:
3227 * Verify that the target port/addr is available, or choose
3228 * another.
3229 * For T_BIND_REQ:
3230 * Verify that the target port/addr is available or fail.
3231 * In both cases when it succeeds the tcp is inserted in the
3232 * bind hash table. This ensures that the operation is atomic
3233 * under the lock on the hash bucket.
3234 */
3235 bind_to_req_port_only = requested_port != 0 &&
3236 tbr->PRIM_type != O_T_BIND_REQ;
3237 /*
3238 * Get a valid port (within the anonymous range and should not
3239 * be a privileged one) to use if the user has not given a port.
3240 * If multiple threads are here, they may all start with
3241 * with the same initial port. But, it should be fine as long as
3242 * tcp_bindi will ensure that no two threads will be assigned
3243 * the same port.
3244 *
3245 * NOTE: XXX If a privileged process asks for an anonymous port, we
3246 * still check for ports only in the range > tcp_smallest_non_priv_port,
3247 * unless TCP_ANONPRIVBIND option is set.
3248 */
3249 mlptype = mlptSingle;
3250 mlp_port = requested_port;
3251 if (requested_port == 0) {
3252 requested_port = tcp->tcp_anon_priv_bind ?
3253 tcp_get_next_priv_port(tcp) :
3254 tcp_update_next_port(tcps->tcps_next_port_to_try,
3255 tcp, B_TRUE);
3256 if (requested_port == 0) {
3257 tcp_err_ack(tcp, mp, TNOADDR, 0);
3258 return;
3259 }
3260 user_specified = B_FALSE;
3261
3262 /*
3263 * If the user went through one of the RPC interfaces to create
3264 * this socket and RPC is MLP in this zone, then give him an
3265 * anonymous MLP.
3266 */
3267 cr = DB_CREDDEF(mp, tcp->tcp_cred);
3268 if (connp->conn_anon_mlp && is_system_labeled()) {
3269 zone = crgetzone(cr);
3270 addrtype = tsol_mlp_addr_type(zone->zone_id,
3271 IPV6_VERSION, &v6addr,
3272 tcps->tcps_netstack->netstack_ip);
3273 if (addrtype == mlptSingle) {
3274 tcp_err_ack(tcp, mp, TNOADDR, 0);
3275 return;
3276 }
3277 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3278 PMAPPORT, addrtype);
3279 mlp_port = PMAPPORT;
3280 }
3281 } else {
3282 int i;
3283 boolean_t priv = B_FALSE;
3284
3285 /*
3286 * If the requested_port is in the well-known privileged range,
3287 * verify that the stream was opened by a privileged user.
3288 * Note: No locks are held when inspecting tcp_g_*epriv_ports
3289 * but instead the code relies on:
3290 * - the fact that the address of the array and its size never
3291 * changes
3292 * - the atomic assignment of the elements of the array
3293 */
3294 cr = DB_CREDDEF(mp, tcp->tcp_cred);
3295 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
3296 priv = B_TRUE;
3297 } else {
3298 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
3299 if (requested_port ==
3300 tcps->tcps_g_epriv_ports[i]) {
3301 priv = B_TRUE;
3302 break;
3303 }
3304 }
3305 }
3306 if (priv) {
3307 if (secpolicy_net_privaddr(cr, requested_port,
3308 IPPROTO_TCP) != 0) {
3309 if (tcp->tcp_debug) {
3310 (void) strlog(TCP_MOD_ID, 0, 1,
3311 SL_ERROR|SL_TRACE,
3312 "tcp_bind: no priv for port %d",
3313 requested_port);
3314 }
3315 tcp_err_ack(tcp, mp, TACCES, 0);
3316 return;
3317 }
3318 }
3319 user_specified = B_TRUE;
3320
3321 if (is_system_labeled()) {
3322 zone = crgetzone(cr);
3323 addrtype = tsol_mlp_addr_type(zone->zone_id,
3324 IPV6_VERSION, &v6addr,
3325 tcps->tcps_netstack->netstack_ip);
3326 if (addrtype == mlptSingle) {
3327 tcp_err_ack(tcp, mp, TNOADDR, 0);
3328 return;
3329 }
3330 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3331 requested_port, addrtype);
3332 }
3333 }
3334
3335 if (mlptype != mlptSingle) {
3336 if (secpolicy_net_bindmlp(cr) != 0) {
3337 if (tcp->tcp_debug) {
3338 (void) strlog(TCP_MOD_ID, 0, 1,
3339 SL_ERROR|SL_TRACE,
3340 "tcp_bind: no priv for multilevel port %d",
3341 requested_port);
3342 }
3343 tcp_err_ack(tcp, mp, TACCES, 0);
3344 return;
3345 }
3346
3347 /*
3348 * If we're specifically binding a shared IP address and the
3349 * port is MLP on shared addresses, then check to see if this
3350 * zone actually owns the MLP. Reject if not.
3351 */
3352 if (mlptype == mlptShared && addrtype == mlptShared) {
3353 /*
3354 * No need to handle exclusive-stack zones since
3355 * ALL_ZONES only applies to the shared stack.
3356 */
3357 zoneid_t mlpzone;
3358
3359 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
3360 htons(mlp_port));
3361 if (connp->conn_zoneid != mlpzone) {
3362 if (tcp->tcp_debug) {
3363 (void) strlog(TCP_MOD_ID, 0, 1,
3364 SL_ERROR|SL_TRACE,
3365 "tcp_bind: attempt to bind port "
3366 "%d on shared addr in zone %d "
3367 "(should be %d)",
3368 mlp_port, connp->conn_zoneid,
3369 mlpzone);
3370 }
3371 tcp_err_ack(tcp, mp, TACCES, 0);
3372 return;
3373 }
3374 }
3375
3376 if (!user_specified) {
3377 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3378 requested_port, B_TRUE);
3379 if (err != 0) {
3380 if (tcp->tcp_debug) {
3381 (void) strlog(TCP_MOD_ID, 0, 1,
3382 SL_ERROR|SL_TRACE,
3383 "tcp_bind: cannot establish anon "
3384 "MLP for port %d",
3385 requested_port);
3386 }
3387 tcp_err_ack(tcp, mp, TSYSERR, err);
3388 return;
3389 }
3390 connp->conn_anon_port = B_TRUE;
3391 }
3392 connp->conn_mlp_type = mlptype;
3393 }
3394
3395 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
3396 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
3397
3398 if (allocated_port == 0) {
3399 connp->conn_mlp_type = mlptSingle;
3400 if (connp->conn_anon_port) {
3401 connp->conn_anon_port = B_FALSE;
3402 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3403 requested_port, B_FALSE);
3404 }
3405 if (bind_to_req_port_only) {
3406 if (tcp->tcp_debug) {
3407 (void) strlog(TCP_MOD_ID, 0, 1,
3408 SL_ERROR|SL_TRACE,
3409 "tcp_bind: requested addr busy");
3410 }
3411 tcp_err_ack(tcp, mp, TADDRBUSY, 0);
3412 } else {
3413 /* If we are out of ports, fail the bind. */
3414 if (tcp->tcp_debug) {
3415 (void) strlog(TCP_MOD_ID, 0, 1,
3416 SL_ERROR|SL_TRACE,
3417 "tcp_bind: out of ports?");
3418 }
3419 tcp_err_ack(tcp, mp, TNOADDR, 0);
3420 }
3421 return;
3422 }
3423 ASSERT(tcp->tcp_state == TCPS_BOUND);
3424 do_bind:
3425 if (!backlog_update) {
3426 if (tcp->tcp_family == AF_INET)
3427 sin->sin_port = htons(allocated_port);
3428 else
3429 sin6->sin6_port = htons(allocated_port);
3430 }
3431 if (tcp->tcp_family == AF_INET) {
3432 if (tbr->CONIND_number != 0) {
3433 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3434 sizeof (sin_t));
3435 } else {
3436 /* Just verify the local IP address */
3437 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN);
3438 }
3439 } else {
3440 if (tbr->CONIND_number != 0) {
3441 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3442 sizeof (sin6_t));
3443 } else {
3444 /* Just verify the local IP address */
3445 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3446 IPV6_ADDR_LEN);
3447 }
3448 }
3449 if (mp1 == NULL) {
3450 if (connp->conn_anon_port) {
3451 connp->conn_anon_port = B_FALSE;
3452 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3453 requested_port, B_FALSE);
3454 }
3455 connp->conn_mlp_type = mlptSingle;
3456 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3457 return;
3458 }
3459
3460 tbr->PRIM_type = T_BIND_ACK;
3461 mp->b_datap->db_type = M_PCPROTO;
3462
3463 /* Chain in the reply mp for tcp_rput() */
3464 mp1->b_cont = mp;
3465 mp = mp1;
3466
3467 tcp->tcp_conn_req_max = tbr->CONIND_number;
3468 if (tcp->tcp_conn_req_max) {
3469 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
3470 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473 /*
3474 * If this is a listener, do not reset the eager list
3475 * and other stuffs. Note that we don't check if the
3476 * existing eager list meets the new tcp_conn_req_max
3477 * requirement.
3478 */
3479 if (tcp->tcp_state != TCPS_LISTEN) {
3480 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3481 tcp_t *, tcp, int32_t, TCPS_LISTEN);
3482 tcp->tcp_state = TCPS_LISTEN;
3483 /* Initialize the chain. Don't need the eager_lock */
3484 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3485 tcp->tcp_eager_next_drop_q0 = tcp;
3486 tcp->tcp_eager_prev_drop_q0 = tcp;
3487 tcp->tcp_second_ctimer_threshold =
3488 tcps->tcps_ip_abort_linterval;
3489 }
3490 }
3491
3492 /*
3493 * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3494 * processing continues in tcp_rput_other().
3495 *
3496 * We need to make sure that the conn_recv is set to a non-null
3497 * value before we insert the conn into the classifier table.
3498 * This is to avoid a race with an incoming packet which does an
3499 * ipcl_classify().
3500 */
3501 connp->conn_recv = tcp_conn_request;
3502 if (tcp->tcp_family == AF_INET6) {
3503 ASSERT(tcp->tcp_connp->conn_af_isv6);
3504 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp);
3505 } else {
3506 ASSERT(!tcp->tcp_connp->conn_af_isv6);
3507 mp = ip_bind_v4(q, mp, tcp->tcp_connp);
3508 }
3509 /*
3510 * If the bind cannot complete immediately
3511 * IP will arrange to call tcp_rput_other
3512 * when the bind completes.
3513 */
3514 if (mp != NULL) {
3515 tcp_rput_other(tcp, mp);
3516 } else {
3517 /*
3518 * Bind will be resumed later. Need to ensure
3519 * that conn doesn't disappear when that happens.
3520 * This will be decremented in ip_resume_tcp_bind().
3521 */
3522 CONN_INC_REF(tcp->tcp_connp);
3523 }
3524 }
3525
3526
3527 /*
3528 * If the "bind_to_req_port_only" parameter is set, if the requested port
3529 * number is available, return it, If not return 0
3530 *
3531 * If "bind_to_req_port_only" parameter is not set and
3532 * If the requested port number is available, return it. If not, return
3533 * the first anonymous port we happen across. If no anonymous ports are
3534 * available, return 0. addr is the requested local address, if any.
3535 *
3536 * In either case, when succeeding update the tcp_t to record the port number
3537 * and insert it in the bind hash table.
3538 *
3539 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
3540 * without setting SO_REUSEADDR. This is needed so that they
3541 * can be viewed as two independent transport protocols.
3542 */
3543 static in_port_t
3544 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
3545 int reuseaddr, boolean_t quick_connect,
3546 boolean_t bind_to_req_port_only, boolean_t user_specified)
3547 {
3548 /* number of times we have run around the loop */
3549 int count = 0;
3550 /* maximum number of times to run around the loop */
3551 int loopmax;
3552 conn_t *connp = tcp->tcp_connp;
3553 zoneid_t zoneid = connp->conn_zoneid;
3554 tcp_stack_t *tcps = tcp->tcp_tcps;
3555
3556 /*
3557 * Lookup for free addresses is done in a loop and "loopmax"
3558 * influences how long we spin in the loop
3559 */
3560 if (bind_to_req_port_only) {
3561 /*
3562 * If the requested port is busy, don't bother to look
3563 * for a new one. Setting loop maximum count to 1 has
3564 * that effect.
3565 */
3566 loopmax = 1;
3567 } else {
3568 /*
3569 * If the requested port is busy, look for a free one
3570 * in the anonymous port range.
3571 * Set loopmax appropriately so that one does not look
3572 * forever in the case all of the anonymous ports are in use.
3573 */
3574 if (tcp->tcp_anon_priv_bind) {
3575 /*
3576 * loopmax =
3577 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
3578 */
3579 loopmax = IPPORT_RESERVED -
3580 tcps->tcps_min_anonpriv_port;
3581 } else {
3582 loopmax = (tcps->tcps_largest_anon_port -
3583 tcps->tcps_smallest_anon_port + 1);
3584 }
3585 }
3586 do {
3587 uint16_t lport;
3588 tf_t *tbf;
3589 tcp_t *ltcp;
3590 conn_t *lconnp;
3591
3592 lport = htons(port);
3593
3594 /*
3595 * Ensure that the tcp_t is not currently in the bind hash.
3596 * Hold the lock on the hash bucket to ensure that
3597 * the duplicate check plus the insertion is an atomic
3598 * operation.
3599 *
3600 * This function does an inline lookup on the bind hash list
3601 * Make sure that we access only members of tcp_t
3602 * and that we don't look at tcp_tcp, since we are not
3603 * doing a CONN_INC_REF.
3604 */
3605 tcp_bind_hash_remove(tcp);
3606 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
3607 mutex_enter(&tbf->tf_lock);
3608 for (ltcp = tbf->tf_tcp; ltcp != NULL;
3609 ltcp = ltcp->tcp_bind_hash) {
3610 boolean_t not_socket;
3611 boolean_t exclbind;
3612
3613 if (lport != ltcp->tcp_lport)
3614 continue;
3615
3616 lconnp = ltcp->tcp_connp;
3617
3618 /*
3619 * On a labeled system, we must treat bindings to ports
3620 * on shared IP addresses by sockets with MAC exemption
3621 * privilege as being in all zones, as there's
3622 * otherwise no way to identify the right receiver.
3623 */
3624 if (!(IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) ||
3625 IPCL_ZONE_MATCH(connp,
3626 ltcp->tcp_connp->conn_zoneid)) &&
3627 !lconnp->conn_mac_exempt &&
3628 !connp->conn_mac_exempt)
3629 continue;
3630
3631 /*
3632 * If TCP_EXCLBIND is set for either the bound or
3633 * binding endpoint, the semantics of bind
3634 * is changed according to the following.
3635 *
3636 * spec = specified address (v4 or v6)
3637 * unspec = unspecified address (v4 or v6)
3638 * A = specified addresses are different for endpoints
3639 *
3640 * bound bind to allowed
3641 * -------------------------------------
3642 * unspec unspec no
3643 * unspec spec no
3644 * spec unspec no
3645 * spec spec yes if A
3646 *
3647 * For labeled systems, SO_MAC_EXEMPT behaves the same
3648 * as TCP_EXCLBIND, except that zoneid is ignored.
3649 *
3650 * Note:
3651 *
3652 * 1. Because of TLI semantics, an endpoint can go
3653 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
3654 * TCPS_BOUND, depending on whether it is originally
3655 * a listener or not. That is why we need to check
3656 * for states greater than or equal to TCPS_BOUND
3657 * here.
3658 *
3659 * 2. Ideally, we should only check for state equals
3660 * to TCPS_LISTEN. And the following check should be
3661 * added.
3662 *
3663 * if (ltcp->tcp_state == TCPS_LISTEN ||
3664 * !reuseaddr || !ltcp->tcp_reuseaddr) {
3665 * ...
3666 * }
3667 *
3668 * The semantics will be changed to this. If the
3669 * endpoint on the list is in state not equal to
3670 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
3671 * set, let the bind succeed.
3672 *
3673 * Because of (1), we cannot do that for TLI
3674 * endpoints. But we can do that for socket endpoints.
3675 * If in future, we can change this going back
3676 * semantics, we can use the above check for TLI also.
3677 */
3678 not_socket = !(TCP_IS_SOCKET(ltcp) &&
3679 TCP_IS_SOCKET(tcp));
3680 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind;
3681
3682 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt ||
3683 (exclbind && (not_socket ||
3684 ltcp->tcp_state <= TCPS_ESTABLISHED))) {
3685 if (V6_OR_V4_INADDR_ANY(
3686 ltcp->tcp_bound_source_v6) ||
3687 V6_OR_V4_INADDR_ANY(*laddr) ||
3688 IN6_ARE_ADDR_EQUAL(laddr,
3689 <cp->tcp_bound_source_v6)) {
3690 break;
3691 }
3692 continue;
3693 }
3694
3695 /*
3696 * Check ipversion to allow IPv4 and IPv6 sockets to
3697 * have disjoint port number spaces, if *_EXCLBIND
3698 * is not set and only if the application binds to a
3699 * specific port. We use the same autoassigned port
3700 * number space for IPv4 and IPv6 sockets.
3701 */
3702 if (tcp->tcp_ipversion != ltcp->tcp_ipversion &&
3703 bind_to_req_port_only)
3704 continue;
3705
3706 /*
3707 * Ideally, we should make sure that the source
3708 * address, remote address, and remote port in the
3709 * four tuple for this tcp-connection is unique.
3710 * However, trying to find out the local source
3711 * address would require too much code duplication
3712 * with IP, since IP needs needs to have that code
3713 * to support userland TCP implementations.
3714 */
3715 if (quick_connect &&
3716 (ltcp->tcp_state > TCPS_LISTEN) &&
3717 ((tcp->tcp_fport != ltcp->tcp_fport) ||
3718 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
3719 <cp->tcp_remote_v6)))
3720 continue;
3721
3722 if (!reuseaddr) {
3723 /*
3724 * No socket option SO_REUSEADDR.
3725 * If existing port is bound to
3726 * a non-wildcard IP address
3727 * and the requesting stream is
3728 * bound to a distinct
3729 * different IP addresses
3730 * (non-wildcard, also), keep
3731 * going.
3732 */
3733 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
3734 !V6_OR_V4_INADDR_ANY(
3735 ltcp->tcp_bound_source_v6) &&
3736 !IN6_ARE_ADDR_EQUAL(laddr,
3737 <cp->tcp_bound_source_v6))
3738 continue;
3739 if (ltcp->tcp_state >= TCPS_BOUND) {
3740 /*
3741 * This port is being used and
3742 * its state is >= TCPS_BOUND,
3743 * so we can't bind to it.
3744 */
3745 break;
3746 }
3747 } else {
3748 /*
3749 * socket option SO_REUSEADDR is set on the
3750 * binding tcp_t.
3751 *
3752 * If two streams are bound to
3753 * same IP address or both addr
3754 * and bound source are wildcards
3755 * (INADDR_ANY), we want to stop
3756 * searching.
3757 * We have found a match of IP source
3758 * address and source port, which is
3759 * refused regardless of the
3760 * SO_REUSEADDR setting, so we break.
3761 */
3762 if (IN6_ARE_ADDR_EQUAL(laddr,
3763 <cp->tcp_bound_source_v6) &&
3764 (ltcp->tcp_state == TCPS_LISTEN ||
3765 ltcp->tcp_state == TCPS_BOUND))
3766 break;
3767 }
3768 }
3769 if (ltcp != NULL) {
3770 /* The port number is busy */
3771 mutex_exit(&tbf->tf_lock);
3772 } else {
3773 /*
3774 * This port is ours. Insert in fanout and mark as
3775 * bound to prevent others from getting the port
3776 * number.
3777 */
3778 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3779 tcp_t *, tcp, int32_t, TCPS_BOUND);
3780 tcp->tcp_state = TCPS_BOUND;
3781 tcp->tcp_lport = htons(port);
3782 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3783
3784 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3785 tcp->tcp_lport)] == tbf);
3786 tcp_bind_hash_insert(tbf, tcp, 1);
3787
3788 mutex_exit(&tbf->tf_lock);
3789
3790 /*
3791 * We don't want tcp_next_port_to_try to "inherit"
3792 * a port number supplied by the user in a bind.
3793 */
3794 if (user_specified)
3795 return (port);
3796
3797 /*
3798 * This is the only place where tcp_next_port_to_try
3799 * is updated. After the update, it may or may not
3800 * be in the valid range.
3801 */
3802 if (!tcp->tcp_anon_priv_bind)
3803 tcps->tcps_next_port_to_try = port + 1;
3804 return (port);
3805 }
3806
3807 if (tcp->tcp_anon_priv_bind) {
3808 port = tcp_get_next_priv_port(tcp);
3809 } else {
3810 if (count == 0 && user_specified) {
3811 /*
3812 * We may have to return an anonymous port. So
3813 * get one to start with.
3814 */
3815 port =
3816 tcp_update_next_port(
3817 tcps->tcps_next_port_to_try,
3818 tcp, B_TRUE);
3819 user_specified = B_FALSE;
3820 } else {
3821 port = tcp_update_next_port(port + 1, tcp,
3822 B_FALSE);
3823 }
3824 }
3825 if (port == 0)
3826 break;
3827
3828 /*
3829 * Don't let this loop run forever in the case where
3830 * all of the anonymous ports are in use.
3831 */
3832 } while (++count < loopmax);
3833 return (0);
3834 }
3835
3836 /*
3837 * tcp_clean_death / tcp_close_detached must not be called more than once
3838 * on a tcp. Thus every function that potentially calls tcp_clean_death
3839 * must check for the tcp state before calling tcp_clean_death.
3840 * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper,
3841 * tcp_timer_handler, all check for the tcp state.
3842 */
3843 /* ARGSUSED */
3844 void
3845 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2)
3846 {
3847 tcp_t *tcp = ((conn_t *)arg)->conn_tcp;
3848
3849 freemsg(mp);
3850 if (tcp->tcp_state > TCPS_BOUND)
3851 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
3852 ETIMEDOUT, 5);
3853 }
3854
3855 /*
3856 * We are dying for some reason. Try to do it gracefully. (May be called
3857 * as writer.)
3858 *
3859 * Return -1 if the structure was not cleaned up (if the cleanup had to be
3860 * done by a service procedure).
3861 * TBD - Should the return value distinguish between the tcp_t being
3862 * freed and it being reinitialized?
3863 */
3864 static int
3865 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
3866 {
3867 mblk_t *mp;
3868 queue_t *q;
3869 tcp_stack_t *tcps = tcp->tcp_tcps;
3870 sodirect_t *sodp;
3871
3872 TCP_CLD_STAT(tag);
3873
3874 #if TCP_TAG_CLEAN_DEATH
3875 tcp->tcp_cleandeathtag = tag;
3876 #endif
3877
3878 if (tcp->tcp_fused)
3879 tcp_unfuse(tcp);
3880
3881 if (tcp->tcp_linger_tid != 0 &&
3882 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
3883 tcp_stop_lingering(tcp);
3884 }
3885
3886 ASSERT(tcp != NULL);
3887 ASSERT((tcp->tcp_family == AF_INET &&
3888 tcp->tcp_ipversion == IPV4_VERSION) ||
3889 (tcp->tcp_family == AF_INET6 &&
3890 (tcp->tcp_ipversion == IPV4_VERSION ||
3891 tcp->tcp_ipversion == IPV6_VERSION)));
3892
3893 if (TCP_IS_DETACHED(tcp)) {
3894 if (tcp->tcp_hard_binding) {
3895 /*
3896 * Its an eager that we are dealing with. We close the
3897 * eager but in case a conn_ind has already gone to the
3898 * listener, let tcp_accept_finish() send a discon_ind
3899 * to the listener and drop the last reference. If the
3900 * listener doesn't even know about the eager i.e. the
3901 * conn_ind hasn't gone up, blow away the eager and drop
3902 * the last reference as well. If the conn_ind has gone
3903 * up, state should be BOUND. tcp_accept_finish
3904 * will figure out that the connection has received a
3905 * RST and will send a DISCON_IND to the application.
3906 */
3907 tcp_closei_local(tcp);
3908 if (!tcp->tcp_tconnind_started) {
3909 CONN_DEC_REF(tcp->tcp_connp);
3910 } else {
3911 DTRACE_TCP4(state__change, void, NULL,
3912 conn_t *, NULL, tcp_t *, tcp, int32_t,
3913 TCPS_BOUND);
3914 tcp->tcp_state = TCPS_BOUND;
3915 }
3916 } else {
3917 tcp_close_detached(tcp);
3918 }
3919 return (0);
3920 }
3921
3922 TCP_STAT(tcps, tcp_clean_death_nondetached);
3923
3924 /*
3925 * If T_ORDREL_IND has not been sent yet (done when service routine
3926 * is run) postpone cleaning up the endpoint until service routine
3927 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3928 * client_errno since tcp_close uses the client_errno field.
3929 */
3930 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3931 if (err != 0)
3932 tcp->tcp_client_errno = err;
3933
3934 tcp->tcp_deferred_clean_death = B_TRUE;
3935 return (-1);
3936 }
3937
3938 /* If sodirect, not anymore */
3939 SOD_PTR_ENTER(tcp, sodp);
3940 if (sodp != NULL) {
3941 tcp->tcp_sodirect = NULL;
3942 mutex_exit(sodp->sod_lock);
3943 }
3944
3945 q = tcp->tcp_rq;
3946
3947 /* Trash all inbound data */
3948 flushq(q, FLUSHALL);
3949
3950 /*
3951 * If we are at least part way open and there is error
3952 * (err==0 implies no error)
3953 * notify our client by a T_DISCON_IND.
3954 */
3955 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
3956 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
3957 !TCP_IS_SOCKET(tcp)) {
3958 /*
3959 * Send M_FLUSH according to TPI. Because sockets will
3960 * (and must) ignore FLUSHR we do that only for TPI
3961 * endpoints and sockets in STREAMS mode.
3962 */
3963 (void) putnextctl1(q, M_FLUSH, FLUSHR);
3964 }
3965 if (tcp->tcp_debug) {
3966 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
3967 "tcp_clean_death: discon err %d", err);
3968 }
3969 mp = mi_tpi_discon_ind(NULL, err, 0);
3970 if (mp != NULL) {
3971 putnext(q, mp);
3972 } else {
3973 if (tcp->tcp_debug) {
3974 (void) strlog(TCP_MOD_ID, 0, 1,
3975 SL_ERROR|SL_TRACE,
3976 "tcp_clean_death, sending M_ERROR");
3977 }
3978 (void) putnextctl1(q, M_ERROR, EPROTO);
3979 }
3980 if (tcp->tcp_state <= TCPS_SYN_RCVD) {
3981 /* SYN_SENT or SYN_RCVD */
3982 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
3983 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
3984 /* ESTABLISHED or CLOSE_WAIT */
3985 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
3986 }
3987 }
3988
3989 tcp_reinit(tcp);
3990 return (-1);
3991 }
3992
3993 /*
3994 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
3995 * to expire, stop the wait and finish the close.
3996 */
3997 static void
3998 tcp_stop_lingering(tcp_t *tcp)
3999 {
4000 clock_t delta = 0;
4001 tcp_stack_t *tcps = tcp->tcp_tcps;
4002
4003 tcp->tcp_linger_tid = 0;
4004 if (tcp->tcp_state > TCPS_LISTEN) {
4005 tcp_acceptor_hash_remove(tcp);
4006 mutex_enter(&tcp->tcp_non_sq_lock);
4007 if (tcp->tcp_flow_stopped) {
4008 tcp_clrqfull(tcp);
4009 }
4010 mutex_exit(&tcp->tcp_non_sq_lock);
4011
4012 if (tcp->tcp_timer_tid != 0) {
4013 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4014 tcp->tcp_timer_tid = 0;
4015 }
4016 /*
4017 * Need to cancel those timers which will not be used when
4018 * TCP is detached. This has to be done before the tcp_wq
4019 * is set to the global queue.
4020 */
4021 tcp_timers_stop(tcp);
4022
4023
4024 tcp->tcp_detached = B_TRUE;
4025 ASSERT(tcps->tcps_g_q != NULL);
4026 tcp->tcp_rq = tcps->tcps_g_q;
4027 tcp->tcp_wq = WR(tcps->tcps_g_q);
4028
4029 if (tcp->tcp_state == TCPS_TIME_WAIT) {
4030 tcp_time_wait_append(tcp);
4031 TCP_DBGSTAT(tcps, tcp_detach_time_wait);
4032 goto finish;
4033 }
4034
4035 /*
4036 * If delta is zero the timer event wasn't executed and was
4037 * successfully canceled. In this case we need to restart it
4038 * with the minimal delta possible.
4039 */
4040 if (delta >= 0) {
4041 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4042 delta ? delta : 1);
4043 }
4044 } else {
4045 tcp_closei_local(tcp);
4046 CONN_DEC_REF(tcp->tcp_connp);
4047 }
4048 finish:
4049 /* Signal closing thread that it can complete close */
4050 mutex_enter(&tcp->tcp_closelock);
4051 tcp->tcp_detached = B_TRUE;
4052 ASSERT(tcps->tcps_g_q != NULL);
4053 tcp->tcp_rq = tcps->tcps_g_q;
4054 tcp->tcp_wq = WR(tcps->tcps_g_q);
4055 tcp->tcp_closed = 1;
4056 cv_signal(&tcp->tcp_closecv);
4057 mutex_exit(&tcp->tcp_closelock);
4058 }
4059
4060 /*
4061 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
4062 * expires.
4063 */
4064 static void
4065 tcp_close_linger_timeout(void *arg)
4066 {
4067 conn_t *connp = (conn_t *)arg;
4068 tcp_t *tcp = connp->conn_tcp;
4069
4070 tcp->tcp_client_errno = ETIMEDOUT;
4071 tcp_stop_lingering(tcp);
4072 }
4073
4074 static int
4075 tcp_close(queue_t *q, int flags)
4076 {
4077 conn_t *connp = Q_TO_CONN(q);
4078 tcp_t *tcp = connp->conn_tcp;
4079 mblk_t *mp = &tcp->tcp_closemp;
4080 boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
4081 mblk_t *bp;
4082
4083 ASSERT(WR(q)->q_next == NULL);
4084 ASSERT(connp->conn_ref >= 2);
4085
4086 /*
4087 * We are being closed as /dev/tcp or /dev/tcp6.
4088 *
4089 * Mark the conn as closing. ill_pending_mp_add will not
4090 * add any mp to the pending mp list, after this conn has
4091 * started closing. Same for sq_pending_mp_add
4092 */
4093 mutex_enter(&connp->conn_lock);
4094 connp->conn_state_flags |= CONN_CLOSING;
4095 if (connp->conn_oper_pending_ill != NULL)
4096 conn_ioctl_cleanup_reqd = B_TRUE;
4097 CONN_INC_REF_LOCKED(connp);
4098 mutex_exit(&connp->conn_lock);
4099 tcp->tcp_closeflags = (uint8_t)flags;
4100 ASSERT(connp->conn_ref >= 3);
4101
4102 /*
4103 * tcp_closemp_used is used below without any protection of a lock
4104 * as we don't expect any one else to use it concurrently at this
4105 * point otherwise it would be a major defect.
4106 */
4107
4108 if (mp->b_prev == NULL)
4109 tcp->tcp_closemp_used = B_TRUE;
4110 else
4111 cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
4112 "connp %p tcp %p\n", (void *)connp, (void *)tcp);
4113
4114 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
4115
4116 (*tcp_squeue_close_proc)(connp->conn_sqp, mp,
4117 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
4118
4119 mutex_enter(&tcp->tcp_closelock);
4120 while (!tcp->tcp_closed) {
4121 if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
4122 /*
4123 * The cv_wait_sig() was interrupted. We now do the
4124 * following:
4125 *
4126 * 1) If the endpoint was lingering, we allow this
4127 * to be interrupted by cancelling the linger timeout
4128 * and closing normally.
4129 *
4130 * 2) Revert to calling cv_wait()
4131 *
4132 * We revert to using cv_wait() to avoid an
4133 * infinite loop which can occur if the calling
4134 * thread is higher priority than the squeue worker
4135 * thread and is bound to the same cpu.
4136 */
4137 if (tcp->tcp_linger && tcp->tcp_lingertime > 0) {
4138 mutex_exit(&tcp->tcp_closelock);
4139 /* Entering squeue, bump ref count. */
4140 CONN_INC_REF(connp);
4141 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
4142 squeue_enter(connp->conn_sqp, bp,
4143 tcp_linger_interrupted, connp,
4144 SQTAG_IP_TCP_CLOSE);
4145 mutex_enter(&tcp->tcp_closelock);
4146 }
4147 break;
4148 }
4149 }
4150 while (!tcp->tcp_closed)
4151 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
4152 mutex_exit(&tcp->tcp_closelock);
4153
4154 /*
4155 * In the case of listener streams that have eagers in the q or q0
4156 * we wait for the eagers to drop their reference to us. tcp_rq and
4157 * tcp_wq of the eagers point to our queues. By waiting for the
4158 * refcnt to drop to 1, we are sure that the eagers have cleaned
4159 * up their queue pointers and also dropped their references to us.
4160 */
4161 if (tcp->tcp_wait_for_eagers) {
4162 mutex_enter(&connp->conn_lock);
4163 while (connp->conn_ref != 1) {
4164 cv_wait(&connp->conn_cv, &connp->conn_lock);
4165 }
4166 mutex_exit(&connp->conn_lock);
4167 }
4168 /*
4169 * ioctl cleanup. The mp is queued in the
4170 * ill_pending_mp or in the sq_pending_mp.
4171 */
4172 if (conn_ioctl_cleanup_reqd)
4173 conn_ioctl_cleanup(connp);
4174
4175 qprocsoff(q);
4176 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4177
4178 tcp->tcp_cpid = -1;
4179
4180 /*
4181 * Drop IP's reference on the conn. This is the last reference
4182 * on the connp if the state was less than established. If the
4183 * connection has gone into timewait state, then we will have
4184 * one ref for the TCP and one more ref (total of two) for the
4185 * classifier connected hash list (a timewait connections stays
4186 * in connected hash till closed).
4187 *
4188 * We can't assert the references because there might be other
4189 * transient reference places because of some walkers or queued
4190 * packets in squeue for the timewait state.
4191 */
4192 CONN_DEC_REF(connp);
4193 q->q_ptr = WR(q)->q_ptr = NULL;
4194 return (0);
4195 }
4196
4197 static int
4198 tcpclose_accept(queue_t *q)
4199 {
4200 vmem_t *minor_arena;
4201 dev_t conn_dev;
4202
4203 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
4204
4205 /*
4206 * We had opened an acceptor STREAM for sockfs which is
4207 * now being closed due to some error.
4208 */
4209 qprocsoff(q);
4210
4211 minor_arena = (vmem_t *)WR(q)->q_ptr;
4212 conn_dev = (dev_t)RD(q)->q_ptr;
4213 ASSERT(minor_arena != NULL);
4214 ASSERT(conn_dev != 0);
4215 inet_minor_free(minor_arena, conn_dev);
4216 q->q_ptr = WR(q)->q_ptr = NULL;
4217 return (0);
4218 }
4219
4220 /*
4221 * Called by tcp_close() routine via squeue when lingering is
4222 * interrupted by a signal.
4223 */
4224
4225 /* ARGSUSED */
4226 static void
4227 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2)
4228 {
4229 conn_t *connp = (conn_t *)arg;
4230 tcp_t *tcp = connp->conn_tcp;
4231
4232 freeb(mp);
4233 if (tcp->tcp_linger_tid != 0 &&
4234 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
4235 tcp_stop_lingering(tcp);
4236 tcp->tcp_client_errno = EINTR;
4237 }
4238 }
4239
4240 /*
4241 * Called by streams close routine via squeues when our client blows off her
4242 * descriptor, we take this to mean: "close the stream state NOW, close the tcp
4243 * connection politely" When SO_LINGER is set (with a non-zero linger time and
4244 * it is not a nonblocking socket) then this routine sleeps until the FIN is
4245 * acked.
4246 *
4247 * NOTE: tcp_close potentially returns error when lingering.
4248 * However, the stream head currently does not pass these errors
4249 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
4250 * errors to the application (from tsleep()) and not errors
4251 * like ECONNRESET caused by receiving a reset packet.
4252 */
4253
4254 /* ARGSUSED */
4255 static void
4256 tcp_close_output(void *arg, mblk_t *mp, void *arg2)
4257 {
4258 char *msg;
4259 conn_t *connp = (conn_t *)arg;
4260 tcp_t *tcp = connp->conn_tcp;
4261 clock_t delta = 0;
4262 tcp_stack_t *tcps = tcp->tcp_tcps;
4263
4264 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
4265 (connp->conn_fanout == NULL && connp->conn_ref >= 3));
4266
4267 /* Cancel any pending timeout */
4268 if (tcp->tcp_ordrelid != 0) {
4269 if (tcp->tcp_timeout) {
4270 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid);
4271 }
4272 tcp->tcp_ordrelid = 0;
4273 tcp->tcp_timeout = B_FALSE;
4274 }
4275
4276 mutex_enter(&tcp->tcp_eager_lock);
4277 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
4278 /* Cleanup for listener */
4279 tcp_eager_cleanup(tcp, 0);
4280 tcp->tcp_wait_for_eagers = 1;
4281 }
4282 mutex_exit(&tcp->tcp_eager_lock);
4283
4284 connp->conn_mdt_ok = B_FALSE;
4285 tcp->tcp_mdt = B_FALSE;
4286
4287 connp->conn_lso_ok = B_FALSE;
4288 tcp->tcp_lso = B_FALSE;
4289
4290 msg = NULL;
4291 switch (tcp->tcp_state) {
4292 case TCPS_CLOSED:
4293 case TCPS_IDLE:
4294 case TCPS_BOUND:
4295 case TCPS_LISTEN:
4296 break;
4297 case TCPS_SYN_SENT:
4298 msg = "tcp_close, during connect";
4299 break;
4300 case TCPS_SYN_RCVD:
4301 /*
4302 * Close during the connect 3-way handshake
4303 * but here there may or may not be pending data
4304 * already on queue. Process almost same as in
4305 * the ESTABLISHED state.
4306 */
4307 /* FALLTHRU */
4308 default:
4309 if (tcp->tcp_sodirect != NULL) {
4310 /* Ok, no more sodirect */
4311 tcp->tcp_sodirect = NULL;
4312 }
4313
4314 if (tcp->tcp_fused)
4315 tcp_unfuse(tcp);
4316
4317 /*
4318 * If SO_LINGER has set a zero linger time, abort the
4319 * connection with a reset.
4320 */
4321 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
4322 msg = "tcp_close, zero lingertime";
4323 break;
4324 }
4325
4326 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding);
4327 /*
4328 * Abort connection if there is unread data queued.
4329 */
4330 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
4331 msg = "tcp_close, unread data";
4332 break;
4333 }
4334 /*
4335 * tcp_hard_bound is now cleared thus all packets go through
4336 * tcp_lookup. This fact is used by tcp_detach below.
4337 *
4338 * We have done a qwait() above which could have possibly
4339 * drained more messages in turn causing transition to a
4340 * different state. Check whether we have to do the rest
4341 * of the processing or not.
4342 */
4343 if (tcp->tcp_state <= TCPS_LISTEN)
4344 break;
4345
4346 /*
4347 * Transmit the FIN before detaching the tcp_t.
4348 * After tcp_detach returns this queue/perimeter
4349 * no longer owns the tcp_t thus others can modify it.
4350 */
4351 (void) tcp_xmit_end(tcp);
4352
4353 /*
4354 * If lingering on close then wait until the fin is acked,
4355 * the SO_LINGER time passes, or a reset is sent/received.
4356 */
4357 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
4358 !(tcp->tcp_fin_acked) &&
4359 tcp->tcp_state >= TCPS_ESTABLISHED) {
4360 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
4361 tcp->tcp_client_errno = EWOULDBLOCK;
4362 } else if (tcp->tcp_client_errno == 0) {
4363
4364 ASSERT(tcp->tcp_linger_tid == 0);
4365
4366 tcp->tcp_linger_tid = TCP_TIMER(tcp,
4367 tcp_close_linger_timeout,
4368 tcp->tcp_lingertime * hz);
4369
4370 /* tcp_close_linger_timeout will finish close */
4371 if (tcp->tcp_linger_tid == 0)
4372 tcp->tcp_client_errno = ENOSR;
4373 else
4374 return;
4375 }
4376
4377 /*
4378 * Check if we need to detach or just close
4379 * the instance.
4380 */
4381 if (tcp->tcp_state <= TCPS_LISTEN)
4382 break;
4383 }
4384
4385 /*
4386 * Make sure that no other thread will access the tcp_rq of
4387 * this instance (through lookups etc.) as tcp_rq will go
4388 * away shortly.
4389 */
4390 tcp_acceptor_hash_remove(tcp);
4391
4392 mutex_enter(&tcp->tcp_non_sq_lock);
4393 if (tcp->tcp_flow_stopped) {
4394 tcp_clrqfull(tcp);
4395 }
4396 mutex_exit(&tcp->tcp_non_sq_lock);
4397
4398 if (tcp->tcp_timer_tid != 0) {
4399 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4400 tcp->tcp_timer_tid = 0;
4401 }
4402 /*
4403 * Need to cancel those timers which will not be used when
4404 * TCP is detached. This has to be done before the tcp_wq
4405 * is set to the global queue.
4406 */
4407 tcp_timers_stop(tcp);
4408
4409 tcp->tcp_detached = B_TRUE;
4410 if (tcp->tcp_state == TCPS_TIME_WAIT) {
4411 tcp_time_wait_append(tcp);
4412 TCP_DBGSTAT(tcps, tcp_detach_time_wait);
4413 ASSERT(connp->conn_ref >= 3);
4414 goto finish;
4415 }
4416
4417 /*
4418 * If delta is zero the timer event wasn't executed and was
4419 * successfully canceled. In this case we need to restart it
4420 * with the minimal delta possible.
4421 */
4422 if (delta >= 0)
4423 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4424 delta ? delta : 1);
4425
4426 ASSERT(connp->conn_ref >= 3);
4427 goto finish;
4428 }
4429
4430 /* Detach did not complete. Still need to remove q from stream. */
4431 if (msg) {
4432 if (tcp->tcp_state == TCPS_ESTABLISHED ||
4433 tcp->tcp_state == TCPS_CLOSE_WAIT)
4434 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
4435 if (tcp->tcp_state == TCPS_SYN_SENT ||
4436 tcp->tcp_state == TCPS_SYN_RCVD)
4437 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
4438 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST);
4439 }
4440
4441 tcp_closei_local(tcp);
4442 CONN_DEC_REF(connp);
4443 ASSERT(connp->conn_ref >= 2);
4444
4445 finish:
4446 /*
4447 * Although packets are always processed on the correct
4448 * tcp's perimeter and access is serialized via squeue's,
4449 * IP still needs a queue when sending packets in time_wait
4450 * state so use WR(tcps_g_q) till ip_output() can be
4451 * changed to deal with just connp. For read side, we
4452 * could have set tcp_rq to NULL but there are some cases
4453 * in tcp_rput_data() from early days of this code which
4454 * do a putnext without checking if tcp is closed. Those
4455 * need to be identified before both tcp_rq and tcp_wq
4456 * can be set to NULL and tcps_g_q can disappear forever.
4457 */
4458 mutex_enter(&tcp->tcp_closelock);
4459 /*
4460 * Don't change the queues in the case of a listener that has
4461 * eagers in its q or q0. It could surprise the eagers.
4462 * Instead wait for the eagers outside the squeue.
4463 */
4464 if (!tcp->tcp_wait_for_eagers) {
4465 tcp->tcp_detached = B_TRUE;
4466 /*
4467 * When default queue is closing we set tcps_g_q to NULL
4468 * after the close is done.
4469 */
4470 ASSERT(tcps->tcps_g_q != NULL);
4471 tcp->tcp_rq = tcps->tcps_g_q;
4472 tcp->tcp_wq = WR(tcps->tcps_g_q);
4473 }
4474
4475 /* Signal tcp_close() to finish closing. */
4476 tcp->tcp_closed = 1;
4477 cv_signal(&tcp->tcp_closecv);
4478 mutex_exit(&tcp->tcp_closelock);
4479 }
4480
4481
4482 /*
4483 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp.
4484 * Some stream heads get upset if they see these later on as anything but NULL.
4485 */
4486 static void
4487 tcp_close_mpp(mblk_t **mpp)
4488 {
4489 mblk_t *mp;
4490
4491 if ((mp = *mpp) != NULL) {
4492 do {
4493 mp->b_next = NULL;
4494 mp->b_prev = NULL;
4495 } while ((mp = mp->b_cont) != NULL);
4496
4497 mp = *mpp;
4498 *mpp = NULL;
4499 freemsg(mp);
4500 }
4501 }
4502
4503 /* Do detached close. */
4504 static void
4505 tcp_close_detached(tcp_t *tcp)
4506 {
4507 if (tcp->tcp_fused)
4508 tcp_unfuse(tcp);
4509
4510 /*
4511 * Clustering code serializes TCP disconnect callbacks and
4512 * cluster tcp list walks by blocking a TCP disconnect callback
4513 * if a cluster tcp list walk is in progress. This ensures
4514 * accurate accounting of TCPs in the cluster code even though
4515 * the TCP list walk itself is not atomic.
4516 */
4517 tcp_closei_local(tcp);
4518 CONN_DEC_REF(tcp->tcp_connp);
4519 }
4520
/*
 * Stop all TCP timers of the given tcp.
 *
 * Each of the four timer ids kept in the tcp_t (tcp_timer_tid, tcp_ka_tid,
 * tcp_ack_tid and tcp_push_tid) is examined in turn; an id that is non-zero
 * is cancelled through TCP_TIMER_CANCEL() and then reset to 0.  The return
 * value of the cancel is deliberately ignored.
 */
void
tcp_timers_stop(tcp_t *tcp)
{
	if (tcp->tcp_timer_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
		tcp->tcp_timer_tid = 0;
	}
	if (tcp->tcp_ka_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
		tcp->tcp_ka_tid = 0;
	}
	if (tcp->tcp_ack_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
		tcp->tcp_ack_tid = 0;
	}
	if (tcp->tcp_push_tid != 0) {
		(void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
		tcp->tcp_push_tid = 0;
	}
}
4544
/*
 * The tcp_t is going away.  Remove it from all lists and set it
 * to TCPS_CLOSED.  The freeing up of memory is deferred until
 * tcp_inactive.  This is needed since a thread in tcp_rput might have
 * done a CONN_INC_REF on this structure before it was removed from the
 * hashes.
 *
 * In order, this routine:
 *  - removes a non-socket tcp from the acceptor hash,
 *  - folds the per-connection segment counters into the stack MIB,
 *  - unlinks an eager from its listener if the conn_ind has not started,
 *  - stops all timers and frees a listener's IP address cache,
 *  - clears write-side flow control if it was asserted,
 *  - removes the tcp from the bind hash, the time wait list (if in
 *    TIME_WAIT) and the classifier hash,
 *  - marks the conn CONN_CONDEMNED and releases its cached ire,
 *  - sets the state to TCPS_CLOSED and releases kssl and IPsec state.
 */
static void
tcp_closei_local(tcp_t *tcp)
{
	ire_t	*ire;
	conn_t	*connp = tcp->tcp_connp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;

	if (!TCP_IS_SOCKET(tcp))
		tcp_acceptor_hash_remove(tcp);

	/* Move the segment counts into the MIB and restart them from zero. */
	UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
	tcp->tcp_ibsegs = 0;
	UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
	tcp->tcp_obsegs = 0;

	/*
	 * If we are an eager connection hanging off a listener that
	 * hasn't formally accepted the connection yet, get off his
	 * list and blow off any data that we have accumulated.
	 */
	if (tcp->tcp_listener != NULL) {
		tcp_t	*listener = tcp->tcp_listener;
		mutex_enter(&listener->tcp_eager_lock);
		/*
		 * tcp_tconnind_started == B_TRUE means that the
		 * conn_ind has already gone to listener. At
		 * this point, eager will be closed but we
		 * leave it in listeners eager list so that
		 * if listener decides to close without doing
		 * accept, we can clean this up. In tcp_wput_accept
		 * we take care of the case of accept on closed
		 * eager.
		 */
		if (!tcp->tcp_tconnind_started) {
			tcp_eager_unlink(tcp);
			mutex_exit(&listener->tcp_eager_lock);
			/*
			 * We don't want to have any pointers to the
			 * listener queue, after we have released our
			 * reference on the listener
			 */
			ASSERT(tcps->tcps_g_q != NULL);
			tcp->tcp_rq = tcps->tcps_g_q;
			tcp->tcp_wq = WR(tcps->tcps_g_q);
			CONN_DEC_REF(listener->tcp_connp);
		} else {
			mutex_exit(&listener->tcp_eager_lock);
		}
	}

	/* Stop all the timers */
	tcp_timers_stop(tcp);

	/* A listener may own an IP address cache; release it. */
	if (tcp->tcp_state == TCPS_LISTEN) {
		if (tcp->tcp_ip_addr_cache) {
			kmem_free((void *)tcp->tcp_ip_addr_cache,
			    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
			tcp->tcp_ip_addr_cache = NULL;
		}
	}
	/* tcp_flow_stopped is examined and cleared under tcp_non_sq_lock. */
	mutex_enter(&tcp->tcp_non_sq_lock);
	if (tcp->tcp_flow_stopped)
		tcp_clrqfull(tcp);
	mutex_exit(&tcp->tcp_non_sq_lock);

	tcp_bind_hash_remove(tcp);
	/*
	 * If the tcp_time_wait_collector (which runs outside the squeue)
	 * is trying to remove this tcp from the time wait list, we will
	 * block in tcp_time_wait_remove while trying to acquire the
	 * tcp_time_wait_lock.  The logic in tcp_time_wait_collector also
	 * requires the ipcl_hash_remove to be ordered after the
	 * tcp_time_wait_remove for the refcnt checks to work correctly.
	 */
	if (tcp->tcp_state == TCPS_TIME_WAIT)
		(void) tcp_time_wait_remove(tcp, NULL);
	CL_INET_DISCONNECT(tcp);
	ipcl_hash_remove(connp);

	/*
	 * Delete the cached ire in conn_ire_cache and also mark
	 * the conn as CONDEMNED
	 */
	mutex_enter(&connp->conn_lock);
	connp->conn_state_flags |= CONN_CONDEMNED;
	ire = connp->conn_ire_cache;
	connp->conn_ire_cache = NULL;
	mutex_exit(&connp->conn_lock);
	/* The ire reference is dropped only after conn_lock is released. */
	if (ire != NULL)
		IRE_REFRELE_NOTR(ire);

	/*
	 * Need to cleanup any pending ioctls
	 * NOTE(review): nothing below visibly handles ioctls; this comment
	 * may be a leftover -- confirm before relying on it.
	 */
	ASSERT(tcp->tcp_time_wait_next == NULL);
	ASSERT(tcp->tcp_time_wait_prev == NULL);
	ASSERT(tcp->tcp_time_wait_expire == 0);
	/* The state-change probe fires only for a fully bound conn. */
	if (connp->conn_fully_bound) {
		DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
		    tcp_t *, tcp, int32_t, TCPS_CLOSED);
	}
	tcp->tcp_state = TCPS_CLOSED;

	/* Release any SSL context */
	if (tcp->tcp_kssl_ent != NULL) {
		kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
		tcp->tcp_kssl_ent = NULL;
	}
	if (tcp->tcp_kssl_ctx != NULL) {
		kssl_release_ctx(tcp->tcp_kssl_ctx);
		tcp->tcp_kssl_ctx = NULL;
	}
	tcp->tcp_kssl_pending = B_FALSE;

	tcp_ipsec_cleanup(tcp);
}
4666
4667 /*
4668 * tcp is dying (called from ipcl_conn_destroy and error cases).
4669 * Free the tcp_t in either case.
4670 */
4671 void
4672 tcp_free(tcp_t *tcp)
4673 {
4674 mblk_t *mp;
4675 ip6_pkt_t *ipp;
4676
4677 ASSERT(tcp != NULL);
4678 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
4679
4680 tcp->tcp_rq = NULL;
4681 tcp->tcp_wq = NULL;
4682
4683 tcp_close_mpp(&tcp->tcp_xmit_head);
4684 tcp_close_mpp(&tcp->tcp_reass_head);
4685 if (tcp->tcp_rcv_list != NULL) {
4686 /* Free b_next chain */
4687 tcp_close_mpp(&tcp->tcp_rcv_list);
4688 }
4689 if ((mp = tcp->tcp_urp_mp) != NULL) {
4690 freemsg(mp);
4691 }
4692 if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
4693 freemsg(mp);
4694 }
4695
4696 if (tcp->tcp_fused_sigurg_mp != NULL) {
4697 freeb(tcp->tcp_fused_sigurg_mp);
4698 tcp->tcp_fused_sigurg_mp = NULL;
4699 }
4700
4701 if (tcp->tcp_sack_info != NULL) {
4702 if (tcp->tcp_notsack_list != NULL) {
4703 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
4704 }
4705 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
4706 }
4707
4708 if (tcp->tcp_hopopts != NULL) {
4709 mi_free(tcp->tcp_hopopts);
4710 tcp->tcp_hopopts = NULL;
4711 tcp->tcp_hopoptslen = 0;
4712 }
4713 ASSERT(tcp->tcp_hopoptslen == 0);
4714 if (tcp->tcp_dstopts != NULL) {
4715 mi_free(tcp->tcp_dstopts);
4716 tcp->tcp_dstopts = NULL;
4717 tcp->tcp_dstoptslen = 0;
4718 }
4719 ASSERT(tcp->tcp_dstoptslen == 0);
4720 if (tcp->tcp_rtdstopts != NULL) {
4721 mi_free(tcp->tcp_rtdstopts);
4722 tcp->tcp_rtdstopts = NULL;
4723 tcp->tcp_rtdstoptslen = 0;
4724 }
4725 ASSERT(tcp->tcp_rtdstoptslen == 0);
4726 if (tcp->tcp_rthdr != NULL) {
4727 mi_free(tcp->tcp_rthdr);
4728 tcp->tcp_rthdr = NULL;
4729 tcp->tcp_rthdrlen = 0;
4730 }
4731 ASSERT(tcp->tcp_rthdrlen == 0);
4732
4733 ipp = &tcp->tcp_sticky_ipp;
4734 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
4735 IPPF_RTHDR))
4736 ip6_pkt_free(ipp);
4737
4738 /*
4739 * Free memory associated with the tcp/ip header template.
4740 */
4741
4742 if (tcp->tcp_iphc != NULL)
4743 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
4744
4745 /*
4746 * Following is really a blowing away a union.
4747 * It happens to have exactly two members of identical size
4748 * the following code is enough.
4749 */
4750 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
4751
4752 if (tcp->tcp_tracebuf != NULL) {
4753 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t));
4754 tcp->tcp_tracebuf = NULL;
4755 }
4756 }
4757
4758
4759 /*
4760 * Put a connection confirmation message upstream built from the
4761 * address information within 'iph' and 'tcph'. Report our success or failure.
4762 */
4763 static boolean_t
4764 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
4765 mblk_t **defermp)
4766 {
4767 sin_t sin;
4768 sin6_t sin6;
4769 mblk_t *mp;
4770 char *optp = NULL;
4771 int optlen = 0;
4772 cred_t *cr;
4773
4774 if (defermp != NULL)
4775 *defermp = NULL;
4776
4777 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
4778 /*
4779 * Return in T_CONN_CON results of option negotiation through
4780 * the T_CONN_REQ. Note: If there is an real end-to-end option
4781 * negotiation, then what is received from remote end needs
4782 * to be taken into account but there is no such thing (yet?)
4783 * in our TCP/IP.
4784 * Note: We do not use mi_offset_param() here as
4785 * tcp_opts_conn_req contents do not directly come from
4786 * an application and are either generated in kernel or
4787 * from user input that was already verified.
4788 */
4789 mp = tcp->tcp_conn.tcp_opts_conn_req;
4790 optp = (char *)(mp->b_rptr +
4791 ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
4792 optlen = (int)
4793 ((struct T_conn_req *)mp->b_rptr)->OPT_length;
4794 }
4795
4796 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
4797 ipha_t *ipha = (ipha_t *)iphdr;
4798
4799 /* packet is IPv4 */
4800 if (tcp->tcp_family == AF_INET) {
4801 sin = sin_null;
4802 sin.sin_addr.s_addr = ipha->ipha_src;
4803 sin.sin_port = *(uint16_t *)tcph->th_lport;
4804 sin.sin_family = AF_INET;
4805 mp = mi_tpi_conn_con(NULL, (char *)&sin,
4806 (int)sizeof (sin_t), optp, optlen);
4807 } else {
4808 sin6 = sin6_null;
4809 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4810 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4811 sin6.sin6_family = AF_INET6;
4812 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4813 (int)sizeof (sin6_t), optp, optlen);
4814
4815 }
4816 } else {
4817 ip6_t *ip6h = (ip6_t *)iphdr;
4818
4819 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
4820 ASSERT(tcp->tcp_family == AF_INET6);
4821 sin6 = sin6_null;
4822 sin6.sin6_addr = ip6h->ip6_src;
4823 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4824 sin6.sin6_family = AF_INET6;
4825 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4826 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4827 (int)sizeof (sin6_t), optp, optlen);
4828 }
4829
4830 if (!mp)
4831 return (B_FALSE);
4832
4833 if ((cr = DB_CRED(idmp)) != NULL) {
4834 mblk_setcred(mp, cr);
4835 DB_CPID(mp) = DB_CPID(idmp);
4836 }
4837
4838 if (defermp == NULL)
4839 putnext(tcp->tcp_rq, mp);
4840 else
4841 *defermp = mp;
4842
4843 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
4844 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
4845 return (B_TRUE);
4846 }
4847
4848 /*
4849 * Defense for the SYN attack -
4850 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
4851 * one from the list of droppable eagers. This list is a subset of q0.
4852 * see comments before the definition of MAKE_DROPPABLE().
4853 * 2. Don't drop a SYN request before its first timeout. This gives every
4854 * request at least til the first timeout to complete its 3-way handshake.
4855 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
4856 * requests currently on the queue that has timed out. This will be used
4857 * as an indicator of whether an attack is under way, so that appropriate
4858 * actions can be taken. (It's incremented in tcp_timer() and decremented
4859 * either when eager goes into ESTABLISHED, or gets freed up.)
4860 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
4861 * # of timeout drops back to <= q0len/32 => SYN alert off
4862 */
4863 static boolean_t
4864 tcp_drop_q0(tcp_t *tcp)
4865 {
4866 tcp_t *eager;
4867 mblk_t *mp;
4868 tcp_stack_t *tcps = tcp->tcp_tcps;
4869
4870 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
4871 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
4872
4873 /* Pick oldest eager from the list of droppable eagers */
4874 eager = tcp->tcp_eager_prev_drop_q0;
4875
4876 /* If list is empty. return B_FALSE */
4877 if (eager == tcp) {
4878 return (B_FALSE);
4879 }
4880
4881 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
4882 if ((mp = allocb(0, BPRI_HI)) == NULL)
4883 return (B_FALSE);
4884
4885 /*
4886 * Take this eager out from the list of droppable eagers since we are
4887 * going to drop it.
4888 */
4889 MAKE_UNDROPPABLE(eager);
4890
4891 if (tcp->tcp_debug) {
4892 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
4893 "tcp_drop_q0: listen half-open queue (max=%d) overflow"
4894 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
4895 tcp->tcp_conn_req_cnt_q0,
4896 tcp_display(tcp, NULL, DISP_PORT_ONLY));
4897 }
4898
4899 BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop);
4900
4901 /* Put a reference on the conn as we are enqueueing it in the sqeue */
4902 CONN_INC_REF(eager->tcp_connp);
4903
4904 /* Mark the IRE created for this SYN request temporary */
4905 tcp_ip_ire_mark_advice(eager);
4906 squeue_fill(eager->tcp_connp->conn_sqp, mp,
4907 tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0);
4908
4909 return (B_TRUE);
4910 }
4911
4912 int
4913 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
4914 tcph_t *tcph, uint_t ipvers, mblk_t *idmp)
4915 {
4916 tcp_t *ltcp = lconnp->conn_tcp;
4917 tcp_t *tcp = connp->conn_tcp;
4918 mblk_t *tpi_mp;
4919 ipha_t *ipha;
4920 ip6_t *ip6h;
4921 sin6_t sin6;
4922 in6_addr_t v6dst;
4923 int err;
4924 int ifindex = 0;
4925 cred_t *cr;
4926 tcp_stack_t *tcps = tcp->tcp_tcps;
4927
4928 if (ipvers == IPV4_VERSION) {
4929 ipha = (ipha_t *)mp->b_rptr;
4930
4931 connp->conn_send = ip_output;
4932 connp->conn_recv = tcp_input;
4933
4934 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
4935 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
4936
4937 sin6 = sin6_null;
4938 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4939 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
4940 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4941 sin6.sin6_family = AF_INET6;
4942 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst,
4943 lconnp->conn_zoneid, tcps->tcps_netstack);
4944 if (tcp->tcp_recvdstaddr) {
4945 sin6_t sin6d;
4946
4947 sin6d = sin6_null;
4948 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst,
4949 &sin6d.sin6_addr);
4950 sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4951 sin6d.sin6_family = AF_INET;
4952 tpi_mp = mi_tpi_extconn_ind(NULL,
4953 (char *)&sin6d, sizeof (sin6_t),
4954 (char *)&tcp,
4955 (t_scalar_t)sizeof (intptr_t),
4956 (char *)&sin6d, sizeof (sin6_t),
4957 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4958 } else {
4959 tpi_mp = mi_tpi_conn_ind(NULL,
4960 (char *)&sin6, sizeof (sin6_t),
4961 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4962 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4963 }
4964 } else {
4965 ip6h = (ip6_t *)mp->b_rptr;
4966
4967 connp->conn_send = ip_output_v6;
4968 connp->conn_recv = tcp_input;
4969
4970 connp->conn_srcv6 = ip6h->ip6_dst;
4971 connp->conn_remv6 = ip6h->ip6_src;
4972
4973 /* db_cksumstuff is set at ip_fanout_tcp_v6 */
4974 ifindex = (int)DB_CKSUMSTUFF(mp);
4975 DB_CKSUMSTUFF(mp) = 0;
4976
4977 sin6 = sin6_null;
4978 sin6.sin6_addr = ip6h->ip6_src;
4979 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4980 sin6.sin6_family = AF_INET6;
4981 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4982 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
4983 lconnp->conn_zoneid, tcps->tcps_netstack);
4984
4985 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
4986 /* Pass up the scope_id of remote addr */
4987 sin6.sin6_scope_id = ifindex;
4988 } else {
4989 sin6.sin6_scope_id = 0;
4990 }
4991 if (tcp->tcp_recvdstaddr) {
4992 sin6_t sin6d;
4993
4994 sin6d = sin6_null;
4995 sin6.sin6_addr = ip6h->ip6_dst;
4996 sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4997 sin6d.sin6_family = AF_INET;
4998 tpi_mp = mi_tpi_extconn_ind(NULL,
4999 (char *)&sin6d, sizeof (sin6_t),
5000 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
5001 (char *)&sin6d, sizeof (sin6_t),
5002 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5003 } else {
5004 tpi_mp = mi_tpi_conn_ind(NULL,
5005 (char *)&sin6, sizeof (sin6_t),
5006 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
5007 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5008 }
5009 }
5010
5011 if (tpi_mp == NULL)
5012 return (ENOMEM);
5013
5014 connp->conn_fport = *(uint16_t *)tcph->th_lport;
5015 connp->conn_lport = *(uint16_t *)tcph->th_fport;
5016 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER);
5017 connp->conn_fully_bound = B_FALSE;
5018
5019 if (tcps->tcps_trace)
5020 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
5021
5022 /* Inherit information from the "parent" */
5023 tcp->tcp_ipversion = ltcp->tcp_ipversion;
5024 tcp->tcp_family = ltcp->tcp_family;
5025 tcp->tcp_wq = ltcp->tcp_wq;
5026 tcp->tcp_rq = ltcp->tcp_rq;
5027 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
5028 tcp->tcp_detached = B_TRUE;
5029 if ((err = tcp_init_values(tcp)) != 0) {
5030 freemsg(tpi_mp);
5031 return (err);
5032 }
5033
5034 if (ipvers == IPV4_VERSION) {
5035 if ((err = tcp_header_init_ipv4(tcp)) != 0) {
5036 freemsg(tpi_mp);
5037 return (err);
5038 }
5039 ASSERT(tcp->tcp_ipha != NULL);
5040 } else {
5041 /* ifindex must be already set */
5042 ASSERT(ifindex != 0);
5043
5044 if (ltcp->tcp_bound_if != 0) {
5045 /*
5046 * Set newtcp's bound_if equal to
5047 * listener's value. If ifindex is
5048 * not the same as ltcp->tcp_bound_if,
5049 * it must be a packet for the ipmp group
5050 * of interfaces
5051 */
5052 tcp->tcp_bound_if = ltcp->tcp_bound_if;
5053 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
5054 tcp->tcp_bound_if = ifindex;
5055 }
5056
5057 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
5058 tcp->tcp_recvifindex = 0;
5059 tcp->tcp_recvhops = 0xffffffffU;
5060 ASSERT(tcp->tcp_ip6h != NULL);
5061 }
5062
5063 tcp->tcp_lport = ltcp->tcp_lport;
5064
5065 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) {
5066 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) {
5067 /*
5068 * Listener had options of some sort; eager inherits.
5069 * Free up the eager template and allocate one
5070 * of the right size.
5071 */
5072 if (tcp->tcp_hdr_grown) {
5073 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
5074 } else {
5075 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
5076 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
5077 }
5078 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len,
5079 KM_NOSLEEP);
5080 if (tcp->tcp_iphc == NULL) {
5081 tcp->tcp_iphc_len = 0;
5082 freemsg(tpi_mp);
5083 return (ENOMEM);
5084 }
5085 tcp->tcp_iphc_len = ltcp->tcp_iphc_len;
5086 tcp->tcp_hdr_grown = B_TRUE;
5087 }
5088 tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5089 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5090 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5091 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops;
5092 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf;
5093
5094 /*
5095 * Copy the IP+TCP header template from listener to eager
5096 */
5097 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5098 if (tcp->tcp_ipversion == IPV6_VERSION) {
5099 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt ==
5100 IPPROTO_RAW) {
5101 tcp->tcp_ip6h =
5102 (ip6_t *)(tcp->tcp_iphc +
5103 sizeof (ip6i_t));
5104 } else {
5105 tcp->tcp_ip6h =
5106 (ip6_t *)(tcp->tcp_iphc);
5107 }
5108 tcp->tcp_ipha = NULL;
5109 } else {
5110 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5111 tcp->tcp_ip6h = NULL;
5112 }
5113 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5114 tcp->tcp_ip_hdr_len);
5115 } else {
5116 /*
5117 * only valid case when ipversion of listener and
5118 * eager differ is when listener is IPv6 and
5119 * eager is IPv4.
5120 * Eager header template has been initialized to the
5121 * maximum v4 header sizes, which includes space for
5122 * TCP and IP options.
5123 */
5124 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) &&
5125 (tcp->tcp_ipversion == IPV4_VERSION));
5126 ASSERT(tcp->tcp_iphc_len >=
5127 TCP_MAX_COMBINED_HEADER_LENGTH);
5128 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5129 /* copy IP header fields individually */
5130 tcp->tcp_ipha->ipha_ttl =
5131 ltcp->tcp_ip6h->ip6_hops;
5132 bcopy(ltcp->tcp_tcph->th_lport,
5133 tcp->tcp_tcph->th_lport, sizeof (ushort_t));
5134 }
5135
5136 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5137 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport,
5138 sizeof (in_port_t));
5139
5140 if (ltcp->tcp_lport == 0) {
5141 tcp->tcp_lport = *(in_port_t *)tcph->th_fport;
5142 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport,
5143 sizeof (in_port_t));
5144 }
5145
5146 if (tcp->tcp_ipversion == IPV4_VERSION) {
5147 ASSERT(ipha != NULL);
5148 tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5149 tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5150
5151 /* Source routing option copyover (reverse it) */
5152 if (tcps->tcps_rev_src_routes)
5153 tcp_opt_reverse(tcp, ipha);
5154 } else {
5155 ASSERT(ip6h != NULL);
5156 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src;
5157 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst;
5158 }
5159
5160 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5161 ASSERT(!tcp->tcp_tconnind_started);
5162 /*
5163 * If the SYN contains a credential, it's a loopback packet; attach
5164 * the credential to the TPI message.
5165 */
5166 if ((cr = DB_CRED(idmp)) != NULL) {
5167 mblk_setcred(tpi_mp, cr);
5168 DB_CPID(tpi_mp) = DB_CPID(idmp);
5169 }
5170 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5171
5172 /* Inherit the listener's SSL protection state */
5173
5174 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5175 kssl_hold_ent(tcp->tcp_kssl_ent);
5176 tcp->tcp_kssl_pending = B_TRUE;
5177 }
5178
5179 return (0);
5180 }
5181
5182
5183 int
5184 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
5185 tcph_t *tcph, mblk_t *idmp)
5186 {
5187 tcp_t *ltcp = lconnp->conn_tcp;
5188 tcp_t *tcp = connp->conn_tcp;
5189 sin_t sin;
5190 mblk_t *tpi_mp = NULL;
5191 int err;
5192 cred_t *cr;
5193 tcp_stack_t *tcps = tcp->tcp_tcps;
5194
5195 sin = sin_null;
5196 sin.sin_addr.s_addr = ipha->ipha_src;
5197 sin.sin_port = *(uint16_t *)tcph->th_lport;
5198 sin.sin_family = AF_INET;
5199 if (ltcp->tcp_recvdstaddr) {
5200 sin_t sind;
5201
5202 sind = sin_null;
5203 sind.sin_addr.s_addr = ipha->ipha_dst;
5204 sind.sin_port = *(uint16_t *)tcph->th_fport;
5205 sind.sin_family = AF_INET;
5206 tpi_mp = mi_tpi_extconn_ind(NULL,
5207 (char *)&sind, sizeof (sin_t), (char *)&tcp,
5208 (t_scalar_t)sizeof (intptr_t), (char *)&sind,
5209 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5210 } else {
5211 tpi_mp = mi_tpi_conn_ind(NULL,
5212 (char *)&sin, sizeof (sin_t),
5213 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
5214 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5215 }
5216
5217 if (tpi_mp == NULL) {
5218 return (ENOMEM);
5219 }
5220
5221 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER);
5222 connp->conn_send = ip_output;
5223 connp->conn_recv = tcp_input;
5224 connp->conn_fully_bound = B_FALSE;
5225
5226 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
5227 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
5228 connp->conn_fport = *(uint16_t *)tcph->th_lport;
5229 connp->conn_lport = *(uint16_t *)tcph->th_fport;
5230
5231 if (tcps->tcps_trace) {
5232 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
5233 }
5234
5235 /* Inherit information from the "parent" */
5236 tcp->tcp_ipversion = ltcp->tcp_ipversion;
5237 tcp->tcp_family = ltcp->tcp_family;
5238 tcp->tcp_wq = ltcp->tcp_wq;
5239 tcp->tcp_rq = ltcp->tcp_rq;
5240 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
5241 tcp->tcp_detached = B_TRUE;
5242 if ((err = tcp_init_values(tcp)) != 0) {
5243 freemsg(tpi_mp);
5244 return (err);
5245 }
5246
5247 /*
5248 * Let's make sure that eager tcp template has enough space to
5249 * copy IPv4 listener's tcp template. Since the conn_t structure is
5250 * preserved and tcp_iphc_len is also preserved, an eager conn_t may
5251 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or
5252 * more (in case of re-allocation of conn_t with tcp-IPv6 template with
5253 * extension headers or with ip6i_t struct). Note that bcopy() below
5254 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_
5255 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener.
5256 */
5257 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
5258 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH);
5259
5260 tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5261 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5262 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5263 tcp->tcp_ttl = ltcp->tcp_ttl;
5264 tcp->tcp_tos = ltcp->tcp_tos;
5265
5266 /* Copy the IP+TCP header template from listener to eager */
5267 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5268 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5269 tcp->tcp_ip6h = NULL;
5270 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5271 tcp->tcp_ip_hdr_len);
5272
5273 /* Initialize the IP addresses and Ports */
5274 tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5275 tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5276 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5277 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t));
5278
5279 /* Source routing option copyover (reverse it) */
5280 if (tcps->tcps_rev_src_routes)
5281 tcp_opt_reverse(tcp, ipha);
5282
5283 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5284 ASSERT(!tcp->tcp_tconnind_started);
5285
5286 /*
5287 * If the SYN contains a credential, it's a loopback packet; attach
5288 * the credential to the TPI message.
5289 */
5290 if ((cr = DB_CRED(idmp)) != NULL) {
5291 mblk_setcred(tpi_mp, cr);
5292 DB_CPID(tpi_mp) = DB_CPID(idmp);
5293 }
5294 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5295
5296 /* Inherit the listener's SSL protection state */
5297 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5298 kssl_hold_ent(tcp->tcp_kssl_ent);
5299 tcp->tcp_kssl_pending = B_TRUE;
5300 }
5301
5302 return (0);
5303 }
5304
5305 /*
5306 * sets up conn for ipsec.
5307 * if the first mblk is M_CTL it is consumed and mpp is updated.
5308 * in case of error mpp is freed.
5309 */
5310 conn_t *
5311 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
5312 {
5313 conn_t *connp = tcp->tcp_connp;
5314 conn_t *econnp;
5315 squeue_t *new_sqp;
5316 mblk_t *first_mp = *mpp;
5317 mblk_t *mp = *mpp;
5318 boolean_t mctl_present = B_FALSE;
5319 uint_t ipvers;
5320
5321 econnp = tcp_get_conn(sqp, tcp->tcp_tcps);
5322 if (econnp == NULL) {
5323 freemsg(first_mp);
5324 return (NULL);
5325 }
5326 if (DB_TYPE(mp) == M_CTL) {
5327 if (mp->b_cont == NULL ||
5328 mp->b_cont->b_datap->db_type != M_DATA) {
5329 freemsg(first_mp);
5330 return (NULL);
5331 }
5332 mp = mp->b_cont;
5333 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) {
5334 freemsg(first_mp);
5335 return (NULL);
5336 }
5337
5338 mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
5339 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5340 mctl_present = B_TRUE;
5341 } else {
5342 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY);
5343 mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5344 }
5345
5346 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5347 DB_CKSUMSTART(mp) = 0;
5348
5349 ASSERT(OK_32PTR(mp->b_rptr));
5350 ipvers = IPH_HDR_VERSION(mp->b_rptr);
5351 if (ipvers == IPV4_VERSION) {
5352 uint16_t *up;
5353 uint32_t ports;
5354 ipha_t *ipha;
5355
5356 ipha = (ipha_t *)mp->b_rptr;
5357 up = (uint16_t *)((uchar_t *)ipha +
5358 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET);
5359 ports = *(uint32_t *)up;
5360 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP,
5361 ipha->ipha_dst, ipha->ipha_src, ports);
5362 } else {
5363 uint16_t *up;
5364 uint32_t ports;
5365 uint16_t ip_hdr_len;
5366 uint8_t *nexthdrp;
5367 ip6_t *ip6h;
5368 tcph_t *tcph;
5369
5370 ip6h = (ip6_t *)mp->b_rptr;
5371 if (ip6h->ip6_nxt == IPPROTO_TCP) {
5372 ip_hdr_len = IPV6_HDR_LEN;
5373 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len,
5374 &nexthdrp) || *nexthdrp != IPPROTO_TCP) {
5375 CONN_DEC_REF(econnp);
5376 freemsg(first_mp);
5377 return (NULL);
5378 }
5379 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5380 up = (uint16_t *)tcph->th_lport;
5381 ports = *(uint32_t *)up;
5382 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP,
5383 ip6h->ip6_dst, ip6h->ip6_src, ports);
5384 }
5385
5386 /*
5387 * The caller already ensured that there is a sqp present.
5388 */
5389 econnp->conn_sqp = new_sqp;
5390
5391 if (connp->conn_policy != NULL) {
5392 ipsec_in_t *ii;
5393 ii = (ipsec_in_t *)(first_mp->b_rptr);
5394 ASSERT(ii->ipsec_in_policy == NULL);
5395 IPPH_REFHOLD(connp->conn_policy);
5396 ii->ipsec_in_policy = connp->conn_policy;
5397
5398 first_mp->b_datap->db_type = IPSEC_POLICY_SET;
5399 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) {
5400 CONN_DEC_REF(econnp);
5401 freemsg(first_mp);
5402 return (NULL);
5403 }
5404 }
5405
5406 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) {
5407 CONN_DEC_REF(econnp);
5408 freemsg(first_mp);
5409 return (NULL);
5410 }
5411
5412 /*
5413 * If we know we have some policy, pass the "IPSEC"
5414 * options size TCP uses this adjust the MSS.
5415 */
5416 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp);
5417 if (mctl_present) {
5418 freeb(first_mp);
5419 *mpp = mp;
5420 }
5421
5422 return (econnp);
5423 }
5424
5425 /*
5426 * tcp_get_conn/tcp_free_conn
5427 *
5428 * tcp_get_conn is used to get a clean tcp connection structure.
5429 * It tries to reuse the connections put on the freelist by the
5430 * time_wait_collector failing which it goes to kmem_cache. This
5431 * way has two benefits compared to just allocating from and
5432 * freeing to kmem_cache.
5433 * 1) The time_wait_collector can free (which includes the cleanup)
5434 * outside the squeue. So when the interrupt comes, we have a clean
5435 * connection sitting in the freelist. Obviously, this buys us
5436 * performance.
5437 *
5438 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request
5439 * has multiple disadvantages - tying up the squeue during alloc, and the
5440 * fact that IPSec policy initialization has to happen here which
5441 * requires us sending a M_CTL and checking for it i.e. real ugliness.
5442 * But allocating the conn/tcp in IP land is also not the best since
5443 * we can't check the 'q' and 'q0' which are protected by squeue and
5444 * blindly allocate memory which might have to be freed here if we are
5445 * not allowed to accept the connection. By using the freelist and
5446 * putting the conn/tcp back in freelist, we don't pay a penalty for
5447 * allocating memory without checking 'q/q0' and freeing it if we can't
5448 * accept the connection.
5449 *
 * Care should be taken to put the conn back in the same squeue's freelist
 * from which it was allocated. Best results are obtained if conn is
 * allocated from listener's squeue and freed to the same. Time wait
 * collector will free up the freelist if the connection ends up sitting
 * there for too long.
5455 */
5456 void *
5457 tcp_get_conn(void *arg, tcp_stack_t *tcps)
5458 {
5459 tcp_t *tcp = NULL;
5460 conn_t *connp = NULL;
5461 squeue_t *sqp = (squeue_t *)arg;
5462 tcp_squeue_priv_t *tcp_time_wait;
5463 netstack_t *ns;
5464
5465 tcp_time_wait =
5466 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
5467
5468 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
5469 tcp = tcp_time_wait->tcp_free_list;
5470 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
5471 if (tcp != NULL) {
5472 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
5473 tcp_time_wait->tcp_free_list_cnt--;
5474 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5475 tcp->tcp_time_wait_next = NULL;
5476 connp = tcp->tcp_connp;
5477 connp->conn_flags |= IPCL_REUSED;
5478
5479 ASSERT(tcp->tcp_tcps == NULL);
5480 ASSERT(connp->conn_netstack == NULL);
5481 ns = tcps->tcps_netstack;
5482 netstack_hold(ns);
5483 connp->conn_netstack = ns;
5484 tcp->tcp_tcps = tcps;
5485 TCPS_REFHOLD(tcps);
5486 ipcl_globalhash_insert(connp);
5487 return ((void *)connp);
5488 }
5489 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5490 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
5491 tcps->tcps_netstack)) == NULL)
5492 return (NULL);
5493 tcp = connp->conn_tcp;
5494 tcp->tcp_tcps = tcps;
5495 TCPS_REFHOLD(tcps);
5496 return ((void *)connp);
5497 }
5498
5499 /*
5500 * Update the cached label for the given tcp_t. This should be called once per
5501 * connection, and before any packets are sent or tcp_process_options is
5502 * invoked. Returns B_FALSE if the correct label could not be constructed.
5503 */
5504 static boolean_t
5505 tcp_update_label(tcp_t *tcp, const cred_t *cr)
5506 {
5507 conn_t *connp = tcp->tcp_connp;
5508
5509 if (tcp->tcp_ipversion == IPV4_VERSION) {
5510 uchar_t optbuf[IP_MAX_OPT_LENGTH];
5511 int added;
5512
5513 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf,
5514 connp->conn_mac_exempt,
5515 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
5516 return (B_FALSE);
5517
5518 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len);
5519 if (added == -1)
5520 return (B_FALSE);
5521 tcp->tcp_hdr_len += added;
5522 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added);
5523 tcp->tcp_ip_hdr_len += added;
5524 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) {
5525 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3;
5526 added = tsol_prepend_option(optbuf, tcp->tcp_ipha,
5527 tcp->tcp_hdr_len);
5528 if (added == -1)
5529 return (B_FALSE);
5530 tcp->tcp_hdr_len += added;
5531 tcp->tcp_tcph = (tcph_t *)
5532 ((uchar_t *)tcp->tcp_tcph + added);
5533 tcp->tcp_ip_hdr_len += added;
5534 }
5535 } else {
5536 uchar_t optbuf[TSOL_MAX_IPV6_OPTION];
5537
5538 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf,
5539 connp->conn_mac_exempt,
5540 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
5541 return (B_FALSE);
5542 if (tsol_update_sticky(&tcp->tcp_sticky_ipp,
5543 &tcp->tcp_label_len, optbuf) != 0)
5544 return (B_FALSE);
5545 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0)
5546 return (B_FALSE);
5547 }
5548
5549 connp->conn_ulp_labeled = 1;
5550
5551 return (B_TRUE);
5552 }
5553
5554 /* BEGIN CSTYLED */
5555 /*
5556 *
5557 * The sockfs ACCEPT path:
5558 * =======================
5559 *
5560 * The eager is now established in its own perimeter as soon as SYN is
5561 * received in tcp_conn_request(). When sockfs receives conn_ind, it
5562 * completes the accept processing on the acceptor STREAM. The sending
5563 * of conn_ind part is common for both sockfs listener and a TLI/XTI
5564 * listener but a TLI/XTI listener completes the accept processing
5565 * on the listener perimeter.
5566 *
5567 * Common control flow for 3 way handshake:
5568 * ----------------------------------------
5569 *
5570 * incoming SYN (listener perimeter) -> tcp_rput_data()
5571 * -> tcp_conn_request()
5572 *
5573 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data()
5574 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind()
5575 *
5576 * Sockfs ACCEPT Path:
5577 * -------------------
5578 *
5579 * open acceptor stream (tcp_open allocates tcp_wput_accept()
5580 * as STREAM entry point)
5581 *
5582 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept()
5583 *
 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager
 * association (we are not behind eager's squeue but sockfs is protecting us
 * and no one knows about this stream yet). The STREAMS entry point q->q_info
 * is changed to point at tcp_wput().
5588 *
5589 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to
5590 * listener (done on listener's perimeter).
5591 *
5592 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish
5593 * accept.
5594 *
5595 * TLI/XTI client ACCEPT path:
5596 * ---------------------------
5597 *
5598 * soaccept() sends T_CONN_RES on the listener STREAM.
5599 *
5600 * tcp_accept() -> tcp_accept_swap() complete the processing and send
5601 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()).
5602 *
5603 * Locks:
5604 * ======
5605 *
 * listener->tcp_eager_lock protects listener->tcp_eager_next_q0 and
 * listener->tcp_eager_next_q.
5608 *
5609 * Referencing:
5610 * ============
5611 *
5612 * 1) We start out in tcp_conn_request by eager placing a ref on
5613 * listener and listener adding eager to listeners->tcp_eager_next_q0.
5614 *
5615 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
5616 * doing so we place a ref on the eager. This ref is finally dropped at the
5617 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
5618 * reference is dropped by the squeue framework.
5619 *
5620 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
5621 *
 * The reference must be released by the same entity that added the reference.
5623 * In the above scheme, the eager is the entity that adds and releases the
5624 * references. Note that tcp_accept_finish executes in the squeue of the eager
5625 * (albeit after it is attached to the acceptor stream). Though 1. executes
5626 * in the listener's squeue, the eager is nascent at this point and the
5627 * reference can be considered to have been added on behalf of the eager.
5628 *
5629 * Eager getting a Reset or listener closing:
5630 * ==========================================
5631 *
5632 * Once the listener and eager are linked, the listener never does the unlink.
 * If the listener needs to close, tcp_eager_cleanup() is called which queues
 * a message on every eager's perimeter. The eager then does the unlink, clears
5635 * any pointers to the listener's queue and drops the reference to the
5636 * listener. The listener waits in tcp_close outside the squeue until its
5637 * refcount has dropped to 1. This ensures that the listener has waited for
5638 * all eagers to clear their association with the listener.
5639 *
5640 * Similarly, if eager decides to go away, it can unlink itself and close.
5641 * When the T_CONN_RES comes down, we check if eager has closed. Note that
5642 * the reference to eager is still valid because of the extra ref we put
5643 * in tcp_send_conn_ind.
5644 *
5645 * Listener can always locate the eager under the protection
5646 * of the listener->tcp_eager_lock, and then do a refhold
5647 * on the eager during the accept processing.
5648 *
5649 * The acceptor stream accesses the eager in the accept processing
5650 * based on the ref placed on eager before sending T_conn_ind.
5651 * The only entity that can negate this refhold is a listener close
5652 * which is mutually exclusive with an active acceptor stream.
5653 *
5654 * Eager's reference on the listener
5655 * ===================================
5656 *
5657 * If the accept happens (even on a closed eager) the eager drops its
5658 * reference on the listener at the start of tcp_accept_finish. If the
5659 * eager is killed due to an incoming RST before the T_conn_ind is sent up,
5660 * the reference is dropped in tcp_closei_local. If the listener closes,
5661 * the reference is dropped in tcp_eager_kill. In all cases the reference
5662 * is dropped while executing in the eager's context (squeue).
5663 */
5664 /* END CSTYLED */
5665
/* Process the SYN packet, mp, directed at the listener 'tcp' */

/*
 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
 * tcp_rput_data will not see any SYN packets.
 *
 * Arguments:
 *	arg	- the listener's conn_t.
 *	mp	- the SYN; its dblk must be flagged STRUIO_EAGER or
 *		  STRUIO_POLICY, otherwise it is dropped.
 *	arg2	- handed to tcp_get_conn()/tcp_get_ipsec_conn(); tcp_get_conn()
 *		  treats it as the squeue whose free list is searched.
 *
 * On success an eager conn is created, linked on the listener's q0 list,
 * inserted in the classifier, a SYN-ACK is sent and mp is freed.
 *
 * Exit labels:
 *	error	- the eager is already linked to the listener: queue
 *		  tcp_eager_kill on the eager's squeue, then offer mp to any
 *		  connected conn the classifier finds, else free it.
 *	error3	- drop the reference on econnp, then fall into error2.
 *	error2	- fire the DTrace receive probe and free mp.
 */
/* ARGSUSED */
void
tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
{
	tcph_t		*tcph;
	uint32_t	seg_seq;
	tcp_t		*eager;
	uint_t		ipvers;
	ipha_t		*ipha;
	ip6_t		*ip6h;
	int		err;
	conn_t		*econnp = NULL;
	squeue_t	*new_sqp;
	mblk_t		*mp1;
	uint_t		ip_hdr_len;
	conn_t		*connp = (conn_t *)arg;
	tcp_t		*tcp = connp->conn_tcp;
	cred_t		*credp;
	tcp_stack_t	*tcps = tcp->tcp_tcps;
	ip_stack_t	*ipst;

	/* Only a listener may accept a SYN. */
	if (tcp->tcp_state != TCPS_LISTEN)
		goto error2;

	ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0);

	/* Drop the SYN if the queue of completed connections is full. */
	mutex_enter(&tcp->tcp_eager_lock);
	if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
		mutex_exit(&tcp->tcp_eager_lock);
		TCP_STAT(tcps, tcp_listendrop);
		BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
		if (tcp->tcp_debug) {
			(void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
			    "tcp_conn_request: listen backlog (max=%d) "
			    "overflow (%d pending) on %s",
			    tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
			    tcp_display(tcp, NULL, DISP_PORT_ONLY));
		}
		goto error2;
	}

	if (tcp->tcp_conn_req_cnt_q0 >=
	    tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
		/*
		 * Q0 is full. Drop a pending half-open req from the queue
		 * to make room for the new SYN req. Also mark the time we
		 * drop a SYN.
		 *
		 * A more aggressive defense against SYN attack will
		 * be to set the "tcp_syn_defense" flag now.
		 */
		TCP_STAT(tcps, tcp_listendropq0);
		tcp->tcp_last_rcv_lbolt = lbolt64;
		if (!tcp_drop_q0(tcp)) {
			/* Nothing could be evicted; drop the new SYN. */
			mutex_exit(&tcp->tcp_eager_lock);
			BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
			if (tcp->tcp_debug) {
				(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
				    "tcp_conn_request: listen half-open queue "
				    "(max=%d) full (%d pending) on %s",
				    tcps->tcps_conn_req_max_q0,
				    tcp->tcp_conn_req_cnt_q0,
				    tcp_display(tcp, NULL,
				    DISP_PORT_ONLY));
			}
			goto error2;
		}
	}
	mutex_exit(&tcp->tcp_eager_lock);

	/*
	 * IP adds STRUIO_EAGER and ensures that the received packet is
	 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6
	 * link local address. If IPSec is enabled, db_struioflag has
	 * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER);
	 * otherwise an error case if neither of them is set.
	 */
	if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
		/*
		 * The squeue for the eager travels in DB_CKSUMSTART(mp);
		 * take it, then clear the field and the flag.
		 */
		new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
		DB_CKSUMSTART(mp) = 0;
		mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
		econnp = (conn_t *)tcp_get_conn(arg2, tcps);
		if (econnp == NULL)
			goto error2;
		ASSERT(econnp->conn_netstack == connp->conn_netstack);
		econnp->conn_sqp = new_sqp;
	} else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) {
		/*
		 * mp is updated in tcp_get_ipsec_conn().
		 */
		econnp = tcp_get_ipsec_conn(tcp, arg2, &mp);
		if (econnp == NULL) {
			/*
			 * mp freed by tcp_get_ipsec_conn.
			 */
			return;
		}
		ASSERT(econnp->conn_netstack == connp->conn_netstack);
	} else {
		goto error2;
	}

	ASSERT(DB_TYPE(mp) == M_DATA);

	/* Locate the TCP header behind the IPv4 or IPv6 header. */
	ipvers = IPH_HDR_VERSION(mp->b_rptr);
	ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION);
	ASSERT(OK_32PTR(mp->b_rptr));
	if (ipvers == IPV4_VERSION) {
		ipha = (ipha_t *)mp->b_rptr;
		ip_hdr_len = IPH_HDR_LENGTH(ipha);
		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	} else {
		ip6h = (ip6_t *)mp->b_rptr;
		ip_hdr_len = ip_hdr_length_v6(mp, ip6h);
		tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
	}

	/*
	 * An AF_INET listener only ever sees IPv4 packets (ipha is valid);
	 * any other listener goes through the v6 creation routine, which is
	 * told the packet's IP version.
	 */
	if (tcp->tcp_family == AF_INET) {
		ASSERT(ipvers == IPV4_VERSION);
		err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp);
	} else {
		err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp);
	}

	if (err)
		goto error3;

	eager = econnp->conn_tcp;

	/* Inherit various TCP parameters from the listener */
	eager->tcp_naglim = tcp->tcp_naglim;
	eager->tcp_first_timer_threshold =
	    tcp->tcp_first_timer_threshold;
	eager->tcp_second_timer_threshold =
	    tcp->tcp_second_timer_threshold;

	eager->tcp_first_ctimer_threshold =
	    tcp->tcp_first_ctimer_threshold;
	eager->tcp_second_ctimer_threshold =
	    tcp->tcp_second_ctimer_threshold;

	/*
	 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics.
	 * If it does not, the eager's receive window will be set to the
	 * listener's receive window later in this function.
	 */
	eager->tcp_rwnd = 0;

	/*
	 * Inherit listener's tcp_init_cwnd. Need to do this before
	 * calling tcp_process_options() where tcp_mss_set() is called
	 * to set the initial cwnd.
	 */
	eager->tcp_init_cwnd = tcp->tcp_init_cwnd;

	/*
	 * Zones: tcp_adapt_ire() and tcp_send_data() both need the
	 * zone id before the accept is completed in tcp_wput_accept().
	 */
	econnp->conn_zoneid = connp->conn_zoneid;
	econnp->conn_allzones = connp->conn_allzones;

	/* Copy nexthop information from listener to eager */
	if (connp->conn_nexthop_set) {
		econnp->conn_nexthop_set = connp->conn_nexthop_set;
		econnp->conn_nexthop_v4 = connp->conn_nexthop_v4;
	}

	/*
	 * TSOL: tsol_input_proc() needs the eager's cred before the
	 * eager is accepted. The eager shares the listener's cred, so a
	 * hold is taken on its behalf.
	 */
	econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred;
	crhold(credp);

	/*
	 * If the caller has the process-wide flag set, then default to MAC
	 * exempt mode. This allows read-down to unlabeled hosts.
	 */
	if (getpflags(NET_MAC_AWARE, credp) != 0)
		econnp->conn_mac_exempt = B_TRUE;

	if (is_system_labeled()) {
		cred_t *cr;

		/*
		 * For a listener that is not mlptSingle, label with the cred
		 * attached to the SYN (also kept, with a hold, as the peer
		 * cred) when there is one; in every other case label with
		 * the eager's own cred.
		 */
		if (connp->conn_mlp_type != mlptSingle) {
			cr = econnp->conn_peercred = DB_CRED(mp);
			if (cr != NULL)
				crhold(cr);
			else
				cr = econnp->conn_cred;
			DTRACE_PROBE2(mlp_syn_accept, conn_t *,
			    econnp, cred_t *, cr)
		} else {
			cr = econnp->conn_cred;
			DTRACE_PROBE2(syn_accept, conn_t *,
			    econnp, cred_t *, cr)
		}

		if (!tcp_update_label(eager, cr)) {
			DTRACE_PROBE3(
			    tx__ip__log__error__connrequest__tcp,
			    char *, "eager connp(1) label on SYN mp(2) failed",
			    conn_t *, econnp, mblk_t *, mp);
			goto error3;
		}
	}

	eager->tcp_hard_binding = B_TRUE;

	tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
	    TCP_BIND_HASH(eager->tcp_lport)], eager, 0);

	CL_INET_CONNECT(eager);

	/*
	 * No need to check for multicast destination since ip will only pass
	 * up multicasts to those that have expressed interest
	 * TODO: what about rejecting broadcasts?
	 * Also check that source is not a multicast or broadcast address.
	 *
	 * DTrace tcp:::state-change is probed a little further down,
	 * where it is set for the second time.
	 */
	eager->tcp_state = TCPS_SYN_RCVD;


	/*
	 * There should be no ire in the mp as we are being called after
	 * receiving the SYN.
	 */
	ASSERT(tcp_ire_mp(mp) == NULL);

	/*
	 * Adapt our mss, ttl, ... according to information provided in IRE.
	 */

	if (tcp_adapt_ire(eager, NULL) == 0) {
		/* Undo the bind_hash_insert */
		tcp_bind_hash_remove(eager);
		goto error3;
	}

	/*
	 * DTrace the first SYN as a tcp:::receive. This is placed after
	 * tcp_adapt_ire() so that tcp->tcp_loopback has been set.
	 */
	DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
	    mp->b_rptr, tcp_t *, tcp, tcph_t *, tcph);

	/* Process all TCP options. */
	tcp_process_options(eager, tcph);

	/* Is the other end ECN capable? */
	if (tcps->tcps_ecn_permitted >= 1 &&
	    (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
		eager->tcp_ecn_ok = B_TRUE;
	}

	/*
	 * listener->tcp_rq->q_hiwat should be the default window size or a
	 * window size changed via SO_RCVBUF option. First round up the
	 * eager's tcp_rwnd to the nearest MSS. Then find out the window
	 * scale option value if needed. Call tcp_rwnd_set() to finish the
	 * setting.
	 *
	 * Note if there is a rpipe metric associated with the remote host,
	 * we should not inherit receive window size from listener.
	 */
	eager->tcp_rwnd = MSS_ROUNDUP(
	    (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat :
	    eager->tcp_rwnd), eager->tcp_mss);
	if (eager->tcp_snd_ws_ok)
		tcp_set_ws_value(eager);
	/*
	 * Note that this is the only place tcp_rwnd_set() is called for
	 * accepting a connection. We need to call it here instead of
	 * after the 3-way handshake because we need to tell the other
	 * side our rwnd in the SYN-ACK segment.
	 */
	(void) tcp_rwnd_set(eager, eager->tcp_rwnd);

	/*
	 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ
	 * via soaccept()->soinheritoptions() which essentially applies
	 * all the listener options to the new STREAM. The options that we
	 * need to take care of are:
	 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST,
	 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER,
	 * SO_SNDBUF, SO_RCVBUF.
	 *
	 * SO_RCVBUF:	tcp_rwnd_set() above takes care of it.
	 * SO_SNDBUF:	Set the tcp_xmit_hiwater for the eager. When
	 *		tcp_maxpsz_set() gets called later from
	 *		tcp_accept_finish(), the option takes effect.
	 *
	 */
	/* Set the TCP options */
	eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater;
	eager->tcp_dgram_errind = tcp->tcp_dgram_errind;
	eager->tcp_oobinline = tcp->tcp_oobinline;
	eager->tcp_reuseaddr = tcp->tcp_reuseaddr;
	eager->tcp_broadcast = tcp->tcp_broadcast;
	eager->tcp_useloopback = tcp->tcp_useloopback;
	eager->tcp_dontroute = tcp->tcp_dontroute;
	eager->tcp_linger = tcp->tcp_linger;
	eager->tcp_lingertime = tcp->tcp_lingertime;
	if (tcp->tcp_ka_enabled)
		eager->tcp_ka_enabled = 1;

	/* Set the IP options */
	econnp->conn_broadcast = connp->conn_broadcast;
	econnp->conn_loopback = connp->conn_loopback;
	econnp->conn_dontroute = connp->conn_dontroute;
	econnp->conn_reuseaddr = connp->conn_reuseaddr;

	/* Put a ref on the listener for the eager. */
	CONN_INC_REF(connp);
	/* Link the eager in directly after the listener on the q0 list. */
	mutex_enter(&tcp->tcp_eager_lock);
	tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
	eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
	tcp->tcp_eager_next_q0 = eager;
	eager->tcp_eager_prev_q0 = tcp;

	/* Set tcp_listener before adding it to tcp_conn_fanout */
	eager->tcp_listener = tcp;
	eager->tcp_saved_listener = tcp;

	/*
	 * Tag this detached tcp vector for later retrieval
	 * by our listener client in tcp_accept().
	 */
	eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum;
	tcp->tcp_conn_req_cnt_q0++;
	if (++tcp->tcp_conn_req_seqnum == -1) {
		/*
		 * -1 is "special" and defined in TPI as something
		 * that should never be used in T_CONN_IND
		 */
		++tcp->tcp_conn_req_seqnum;
	}
	mutex_exit(&tcp->tcp_eager_lock);

	if (tcp->tcp_syn_defense) {
		/* Don't drop the SYN that comes from a good IP source */
		ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache);
		if (addr_cache != NULL && eager->tcp_remote ==
		    addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
			eager->tcp_dontdrop = B_TRUE;
		}
	}

	/*
	 * We need to insert the eager in its own perimeter but as soon
	 * as we do that, we expose the eager to the classifier and
	 * should not touch any field outside the eager's perimeter.
	 * So do all the work necessary before inserting the eager
	 * in its own perimeter. Be optimistic that ipcl_conn_insert()
	 * will succeed but undo everything if it fails.
	 */
	seg_seq = ABE32_TO_U32(tcph->th_seq);
	eager->tcp_irs = seg_seq;
	eager->tcp_rack = seg_seq;
	eager->tcp_rnxt = seg_seq + 1;
	U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
	BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
	DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, eager,
	    int32_t, TCPS_SYN_RCVD);
	eager->tcp_state = TCPS_SYN_RCVD;
	/* Build the SYN-ACK now; it is sent only after the classifier insert. */
	mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
	    NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
	if (mp1 == NULL) {
		/*
		 * Increment the ref count as we are going to be
		 * enqueuing an mp in squeue
		 */
		CONN_INC_REF(econnp);
		goto error;
	}
	DB_CPID(mp1) = tcp->tcp_cpid;
	eager->tcp_cpid = tcp->tcp_cpid;
	eager->tcp_open_time = lbolt64;

	/*
	 * We need to start the rto timer. In normal case, we start
	 * the timer after sending the packet on the wire (or at
	 * least believing that packet was sent by waiting for
	 * CALL_IP_WPUT() to return). Since this is the first packet
	 * being sent on the wire for the eager, our initial tcp_rto
	 * is at least tcp_rexmit_interval_min which is a fairly
	 * large value to allow the algorithm to adjust slowly to large
	 * fluctuations of RTT during first few transmissions.
	 *
	 * Starting the timer first and then sending the packet in this
	 * case shouldn't make much difference since tcp_rexmit_interval_min
	 * is of the order of several 100ms and starting the timer
	 * first and then sending the packet will result in difference
	 * of few micro seconds.
	 *
	 * Without this optimization, we are forced to hold the fanout
	 * lock across the ipcl_bind_insert() and sending the packet
	 * so that we don't race against an incoming packet (maybe RST)
	 * for this eager.
	 *
	 * It is necessary to acquire an extra reference on the eager
	 * at this point and hold it until after tcp_send_data() to
	 * ensure against an eager close race.
	 */

	CONN_INC_REF(eager->tcp_connp);

	TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT);
	TCP_TIMER_RESTART(eager, eager->tcp_rto);


	/*
	 * Insert the eager in its own perimeter now. We are ready to deal
	 * with any packets on eager.
	 */
	if (eager->tcp_ipversion == IPV4_VERSION) {
		if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) {
			goto error;
		}
	} else {
		if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) {
			goto error;
		}
	}

	/* mark conn as fully-bound */
	econnp->conn_fully_bound = B_TRUE;

	/* Send the SYN-ACK */
	tcp_send_data(eager, eager->tcp_wq, mp1);
	/* Release the extra reference taken before the timer was started. */
	CONN_DEC_REF(eager->tcp_connp);
	freemsg(mp);

	return;
error:
	/*
	 * The eager is already linked to the listener. Discard the SYN-ACK
	 * (mp1 may be NULL here) and queue tcp_eager_kill on the eager's
	 * squeue, using the eager's tcp_closemp as the message.
	 */
	freemsg(mp1);
	eager->tcp_closemp_used = B_TRUE;
	TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
	squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill,
	    econnp, SQTAG_TCP_CONN_REQ_2);

	/*
	 * If a connection already exists, send the mp to that connection so
	 * that it can be appropriately dealt with.
	 */
	ipst = tcps->tcps_netstack->netstack_ip;

	/* Note: econnp is reused below for the conn found by the classifier. */
	if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) {
		if (!IPCL_IS_CONNECTED(econnp)) {
			/*
			 * Something bad happened. ipcl_conn_insert()
			 * failed because a connection already existed
			 * in connected hash but we can't find it
			 * anymore (someone blew it away). Just
			 * free this message and hopefully remote
			 * will retransmit at which time the SYN can be
			 * treated as a new connection or dealt with
			 * a TH_RST if a connection already exists.
			 */
			/* NOTE(review): presumably ipcl_classify() held a ref. */
			CONN_DEC_REF(econnp);
			freemsg(mp);
		} else {
			squeue_fill(econnp->conn_sqp, mp, tcp_input,
			    econnp, SQTAG_TCP_CONN_REQ_1);
		}
	} else {
		/* Nobody wants this packet */
		freemsg(mp);
	}
	return;
error3:
	/* Drop our reference on the eager conn, then fall through. */
	CONN_DEC_REF(econnp);
error2:
	/*
	 * DTrace this tcp:::receive event, as we skipped the previous receive
	 * probe. For DTrace only, we find the IP header length so that the
	 * TCP header can be found.
	 */
	ipvers = IPH_HDR_VERSION(mp->b_rptr);
	if (OK_32PTR(mp->b_rptr) &&
	    (ipvers == IPV4_VERSION || ipvers == IPV6_VERSION)) {
		if (ipvers == IPV4_VERSION)
			ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)mp->b_rptr);
		else
			ip_hdr_len = ip_hdr_length_v6(mp, (ip6_t *)mp->b_rptr);
		DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL,
		    void_ip_t *, mp->b_rptr, tcp_t *, NULL, tcph_t *,
		    &mp->b_rptr[ip_hdr_len]);
	}

	freemsg(mp);
}
6168
6169 /*
6170 * In an ideal case of vertical partition in NUMA architecture, its
6171 * beneficial to have the listener and all the incoming connections
6172 * tied to the same squeue. The other constraint is that incoming
6173 * connections should be tied to the squeue attached to interrupted
6174 * CPU for obvious locality reason so this leaves the listener to
6175 * be tied to the same squeue. Our only problem is that when listener
6176 * is binding, the CPU that will get interrupted by the NIC whose
6177 * IP address the listener is binding to is not even known. So
6178 * the code below allows us to change that binding at the time the
6179 * CPU is interrupted by virtue of incoming connection's squeue.
6180 *
6181 * This is usefull only in case of a listener bound to a specific IP
6182 * address. For other kind of listeners, they get bound the
6183 * very first time and there is no attempt to rebind them.
6184 */
6185 void
6186 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
6187 {
6188 conn_t *connp = (conn_t *)arg;
6189 squeue_t *sqp = (squeue_t *)arg2;
6190 squeue_t *new_sqp;
6191 uint32_t conn_flags;
6192
6193 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
6194 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
6195 } else {
6196 goto done;
6197 }
6198
6199 if (connp->conn_fanout == NULL)
6200 goto done;
6201
6202 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
6203 mutex_enter(&connp->conn_fanout->connf_lock);
6204 mutex_enter(&connp->conn_lock);
6205 /*
6206 * No one from read or write side can access us now
6207 * except for already queued packets on this squeue.
6208 * But since we haven't changed the squeue yet, they
6209 * can't execute. If they are processed after we have
6210 * changed the squeue, they are sent back to the
6211 * correct squeue down below.
6212 * But a listner close can race with processing of
6213 * incoming SYN. If incoming SYN processing changes
6214 * the squeue then the listener close which is waiting
6215 * to enter the squeue would operate on the wrong
6216 * squeue. Hence we don't change the squeue here unless
6217 * the refcount is exactly the minimum refcount. The
6218 * minimum refcount of 4 is counted as - 1 each for
6219 * TCP and IP, 1 for being in the classifier hash, and
6220 * 1 for the mblk being processed.
6221 */
6222
6223 if (connp->conn_ref != 4 ||
6224 connp->conn_tcp->tcp_state != TCPS_LISTEN) {
6225 mutex_exit(&connp->conn_lock);
6226 mutex_exit(&connp->conn_fanout->connf_lock);
6227 goto done;
6228 }
6229 if (connp->conn_sqp != new_sqp) {
6230 while (connp->conn_sqp != new_sqp)
6231 (void) casptr(&connp->conn_sqp, sqp, new_sqp);
6232 }
6233
6234 do {
6235 conn_flags = connp->conn_flags;
6236 conn_flags |= IPCL_FULLY_BOUND;
6237 (void) cas32(&connp->conn_flags, connp->conn_flags,
6238 conn_flags);
6239 } while (!(connp->conn_flags & IPCL_FULLY_BOUND));
6240
6241 mutex_exit(&connp->conn_fanout->connf_lock);
6242 mutex_exit(&connp->conn_lock);
6243 }
6244
6245 done:
6246 if (connp->conn_sqp != sqp) {
6247 CONN_INC_REF(connp);
6248 squeue_fill(connp->conn_sqp, mp,
6249 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND);
6250 } else {
6251 tcp_conn_request(connp, mp, sqp);
6252 }
6253 }
6254
6255 /*
6256 * Successful connect request processing begins when our client passes
6257 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes
6258 * our T_OK_ACK reply message upstream. The control flow looks like this:
6259 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP
6260 * upstream <- tcp_rput() <- IP
6261 * After various error checks are completed, tcp_connect() lays
6262 * the target address and port into the composite header template,
6263 * preallocates the T_OK_ACK reply message, construct a full 12 byte bind
6264 * request followed by an IRE request, and passes the three mblk message
6265 * down to IP looking like this:
6266 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client
6267 * Processing continues in tcp_rput() when we receive the following message:
6268 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client
6269 * After consuming the first two mblks, tcp_rput() calls tcp_timer(),
6270 * to fire off the connection request, and then passes the T_OK_ACK mblk
6271 * upstream that we filled in below. There are, of course, numerous
6272 * error conditions along the way which truncate the processing described
6273 * above.
6274 */
6275 static void
6276 tcp_connect(tcp_t *tcp, mblk_t *mp)
6277 {
6278 sin_t *sin;
6279 sin6_t *sin6;
6280 queue_t *q = tcp->tcp_wq;
6281 struct T_conn_req *tcr;
6282 ipaddr_t *dstaddrp;
6283 in_port_t dstport;
6284 uint_t srcid;
6285
6286 tcr = (struct T_conn_req *)mp->b_rptr;
6287
6288 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
6289 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
6290 tcp_err_ack(tcp, mp, TPROTO, 0);
6291 return;
6292 }
6293
6294 /*
6295 * Determine packet type based on type of address passed in
6296 * the request should contain an IPv4 or IPv6 address.
6297 * Make sure that address family matches the type of
6298 * family of the the address passed down
6299 */
6300 switch (tcr->DEST_length) {
6301 default:
6302 tcp_err_ack(tcp, mp, TBADADDR, 0);
6303 return;
6304
6305 case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
6306 /*
6307 * XXX: The check for valid DEST_length was not there
6308 * in earlier releases and some buggy
6309 * TLI apps (e.g Sybase) got away with not feeding
6310 * in sin_zero part of address.
6311 * We allow that bug to keep those buggy apps humming.
6312 * Test suites require the check on DEST_length.
6313 * We construct a new mblk with valid DEST_length
6314 * free the original so the rest of the code does
6315 * not have to keep track of this special shorter
6316 * length address case.
6317 */
6318 mblk_t *nmp;
6319 struct T_conn_req *ntcr;
6320 sin_t *nsin;
6321
6322 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
6323 tcr->OPT_length, BPRI_HI);
6324 if (nmp == NULL) {
6325 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
6326 return;
6327 }
6328 ntcr = (struct T_conn_req *)nmp->b_rptr;
6329 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
6330 ntcr->PRIM_type = T_CONN_REQ;
6331 ntcr->DEST_length = sizeof (sin_t);
6332 ntcr->DEST_offset = sizeof (struct T_conn_req);
6333
6334 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
6335 *nsin = sin_null;
6336 /* Get pointer to shorter address to copy from original mp */
6337 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6338 tcr->DEST_length); /* extract DEST_length worth of sin_t */
6339 if (sin == NULL || !OK_32PTR((char *)sin)) {
6340 freemsg(nmp);
6341 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6342 return;
6343 }
6344 nsin->sin_family = sin->sin_family;
6345 nsin->sin_port = sin->sin_port;
6346 nsin->sin_addr = sin->sin_addr;
6347 /* Note:nsin->sin_zero zero-fill with sin_null assign above */
6348 nmp->b_wptr = (uchar_t *)&nsin[1];
6349 if (tcr->OPT_length != 0) {
6350 ntcr->OPT_length = tcr->OPT_length;
6351 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
6352 bcopy((uchar_t *)tcr + tcr->OPT_offset,
6353 (uchar_t *)ntcr + ntcr->OPT_offset,
6354 tcr->OPT_length);
6355 nmp->b_wptr += tcr->OPT_length;
6356 }
6357 freemsg(mp); /* original mp freed */
6358 mp = nmp; /* re-initialize original variables */
6359 tcr = ntcr;
6360 }
6361 /* FALLTHRU */
6362
6363 case sizeof (sin_t):
6364 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6365 sizeof (sin_t));
6366 if (sin == NULL || !OK_32PTR((char *)sin)) {
6367 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6368 return;
6369 }
6370 if (tcp->tcp_family != AF_INET ||
6371 sin->sin_family != AF_INET) {
6372 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6373 return;
6374 }
6375 if (sin->sin_port == 0) {
6376 tcp_err_ack(tcp, mp, TBADADDR, 0);
6377 return;
6378 }
6379 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
6380 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6381 return;
6382 }
6383
6384 break;
6385
6386 case sizeof (sin6_t):
6387 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
6388 sizeof (sin6_t));
6389 if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
6390 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6391 return;
6392 }
6393 if (tcp->tcp_family != AF_INET6 ||
6394 sin6->sin6_family != AF_INET6) {
6395 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6396 return;
6397 }
6398 if (sin6->sin6_port == 0) {
6399 tcp_err_ack(tcp, mp, TBADADDR, 0);
6400 return;
6401 }
6402 break;
6403 }
6404 /*
6405 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
6406 * should key on their sequence number and cut them loose.
6407 */
6408
6409 /*
6410 * If options passed in, feed it for verification and handling
6411 */
6412 if (tcr->OPT_length != 0) {
6413 mblk_t *ok_mp;
6414 mblk_t *discon_mp;
6415 mblk_t *conn_opts_mp;
6416 int t_error, sys_error, do_disconnect;
6417
6418 conn_opts_mp = NULL;
6419
6420 if (tcp_conprim_opt_process(tcp, mp,
6421 &do_disconnect, &t_error, &sys_error) < 0) {
6422 if (do_disconnect) {
6423 ASSERT(t_error == 0 && sys_error == 0);
6424 discon_mp = mi_tpi_discon_ind(NULL,
6425 ECONNREFUSED, 0);
6426 if (!discon_mp) {
6427 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6428 TSYSERR, ENOMEM);
6429 return;
6430 }
6431 ok_mp = mi_tpi_ok_ack_alloc(mp);
6432 if (!ok_mp) {
6433 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6434 TSYSERR, ENOMEM);
6435 return;
6436 }
6437 qreply(q, ok_mp);
6438 qreply(q, discon_mp); /* no flush! */
6439 } else {
6440 ASSERT(t_error != 0);
6441 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
6442 sys_error);
6443 }
6444 return;
6445 }
6446 /*
6447 * Success in setting options, the mp option buffer represented
6448 * by OPT_length/offset has been potentially modified and
6449 * contains results of option processing. We copy it in
6450 * another mp to save it for potentially influencing returning
6451 * it in T_CONN_CONN.
6452 */
6453 if (tcr->OPT_length != 0) { /* there are resulting options */
6454 conn_opts_mp = copyb(mp);
6455 if (!conn_opts_mp) {
6456 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6457 TSYSERR, ENOMEM);
6458 return;
6459 }
6460 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
6461 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
6462 /*
6463 * Note:
6464 * These resulting option negotiation can include any
6465 * end-to-end negotiation options but there no such
6466 * thing (yet?) in our TCP/IP.
6467 */
6468 }
6469 }
6470
6471 /*
6472 * If we're connecting to an IPv4-mapped IPv6 address, we need to
6473 * make sure that the template IP header in the tcp structure is an
6474 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We
6475 * need to this before we call tcp_bindi() so that the port lookup
6476 * code will look for ports in the correct port space (IPv4 and
6477 * IPv6 have separate port spaces).
6478 */
6479 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
6480 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6481 int err = 0;
6482
6483 err = tcp_header_init_ipv4(tcp);
6484 if (err != 0) {
6485 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6486 goto connect_failed;
6487 }
6488 if (tcp->tcp_lport != 0)
6489 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
6490 }
6491
6492 if (tcp->tcp_issocket) {
6493 /*
6494 * TCP is _D_SODIRECT and sockfs is directly above so save
6495 * the shared sonode sodirect_t pointer (if any) to enable
6496 * TCP sodirect.
6497 */
6498 tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq);
6499 }
6500
6501 switch (tcp->tcp_state) {
6502 case TCPS_IDLE:
6503 /*
6504 * We support quick connect, refer to comments in
6505 * tcp_connect_*()
6506 */
6507 /* FALLTHRU */
6508 case TCPS_BOUND:
6509 case TCPS_LISTEN:
6510 if (tcp->tcp_family == AF_INET6) {
6511 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6512 tcp_connect_ipv6(tcp, mp,
6513 &sin6->sin6_addr,
6514 sin6->sin6_port, sin6->sin6_flowinfo,
6515 sin6->__sin6_src_id, sin6->sin6_scope_id);
6516 return;
6517 }
6518 /*
6519 * Destination adress is mapped IPv6 address.
6520 * Source bound address should be unspecified or
6521 * IPv6 mapped address as well.
6522 */
6523 if (!IN6_IS_ADDR_UNSPECIFIED(
6524 &tcp->tcp_bound_source_v6) &&
6525 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
6526 mp = mi_tpi_err_ack_alloc(mp, TSYSERR,
6527 EADDRNOTAVAIL);
6528 break;
6529 }
6530 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
6531 dstport = sin6->sin6_port;
6532 srcid = sin6->__sin6_src_id;
6533 } else {
6534 dstaddrp = &sin->sin_addr.s_addr;
6535 dstport = sin->sin_port;
6536 srcid = 0;
6537 }
6538
6539 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid);
6540 return;
6541 default:
6542 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0);
6543 break;
6544 }
6545 /*
6546 * Note: Code below is the "failure" case
6547 */
6548 /* return error ack and blow away saved option results if any */
6549 connect_failed:
6550 if (mp != NULL)
6551 putnext(tcp->tcp_rq, mp);
6552 else {
6553 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6554 TSYSERR, ENOMEM);
6555 }
6556 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6557 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6558 }
6559
6560 /*
6561 * Handle connect to IPv4 destinations, including connections for AF_INET6
6562 * sockets connecting to IPv4 mapped IPv6 destinations.
6563 */
6564 static void
6565 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
6566 uint_t srcid)
6567 {
6568 tcph_t *tcph;
6569 mblk_t *mp1;
6570 ipaddr_t dstaddr = *dstaddrp;
6571 int32_t oldstate;
6572 uint16_t lport;
6573 tcp_stack_t *tcps = tcp->tcp_tcps;
6574
6575 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
6576
6577 /* Check for attempt to connect to INADDR_ANY */
6578 if (dstaddr == INADDR_ANY) {
6579 /*
6580 * SunOS 4.x and 4.3 BSD allow an application
6581 * to connect a TCP socket to INADDR_ANY.
6582 * When they do this, the kernel picks the
6583 * address of one interface and uses it
6584 * instead. The kernel usually ends up
6585 * picking the address of the loopback
6586 * interface. This is an undocumented feature.
6587 * However, we provide the same thing here
6588 * in order to have source and binary
6589 * compatibility with SunOS 4.x.
6590 * Update the T_CONN_REQ (sin/sin6) since it is used to
6591 * generate the T_CONN_CON.
6592 */
6593 dstaddr = htonl(INADDR_LOOPBACK);
6594 *dstaddrp = dstaddr;
6595 }
6596
6597 /* Handle __sin6_src_id if socket not bound to an IP address */
6598 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) {
6599 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6,
6600 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
6601 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6,
6602 tcp->tcp_ipha->ipha_src);
6603 }
6604
6605 /*
6606 * Don't let an endpoint connect to itself. Note that
6607 * the test here does not catch the case where the
6608 * source IP addr was left unspecified by the user. In
6609 * this case, the source addr is set in tcp_adapt_ire()
6610 * using the reply to the T_BIND message that we send
6611 * down to IP here and the check is repeated in tcp_rput_other.
6612 */
6613 if (dstaddr == tcp->tcp_ipha->ipha_src &&
6614 dstport == tcp->tcp_lport) {
6615 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6616 goto failed;
6617 }
6618
6619 tcp->tcp_ipha->ipha_dst = dstaddr;
6620 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6);
6621
6622 /*
6623 * Massage a source route if any putting the first hop
6624 * in iph_dst. Compute a starting value for the checksum which
6625 * takes into account that the original iph_dst should be
6626 * included in the checksum but that ip will include the
6627 * first hop in the source route in the tcp checksum.
6628 */
6629 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack);
6630 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6631 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
6632 (tcp->tcp_ipha->ipha_dst & 0xffff));
6633 if ((int)tcp->tcp_sum < 0)
6634 tcp->tcp_sum--;
6635 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6636 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6637 (tcp->tcp_sum >> 16));
6638 tcph = tcp->tcp_tcph;
6639 *(uint16_t *)tcph->th_fport = dstport;
6640 tcp->tcp_fport = dstport;
6641
6642 oldstate = tcp->tcp_state;
6643 /*
6644 * At this point the remote destination address and remote port fields
6645 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6646 * have to see which state tcp was in so we can take apropriate action.
6647 */
6648 if (oldstate == TCPS_IDLE) {
6649 /*
6650 * We support a quick connect capability here, allowing
6651 * clients to transition directly from IDLE to SYN_SENT
6652 * tcp_bindi will pick an unused port, insert the connection
6653 * in the bind hash and transition to BOUND state.
6654 */
6655 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6656 tcp, B_TRUE);
6657 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6658 B_FALSE, B_FALSE);
6659 if (lport == 0) {
6660 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6661 goto failed;
6662 }
6663 }
6664 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6665 int32_t, TCPS_SYN_SENT);
6666 tcp->tcp_state = TCPS_SYN_SENT;
6667
6668 /*
6669 * TODO: allow data with connect requests
6670 * by unlinking M_DATA trailers here and
6671 * linking them in behind the T_OK_ACK mblk.
6672 * The tcp_rput() bind ack handler would then
6673 * feed them to tcp_wput_data() rather than call
6674 * tcp_timer().
6675 */
6676 mp = mi_tpi_ok_ack_alloc(mp);
6677 if (!mp) {
6678 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6679 tcp_t *, tcp, int32_t, oldstate);
6680 tcp->tcp_state = oldstate;
6681 goto failed;
6682 }
6683 if (tcp->tcp_family == AF_INET) {
6684 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6685 sizeof (ipa_conn_t));
6686 } else {
6687 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6688 sizeof (ipa6_conn_t));
6689 }
6690 if (mp1) {
6691 /*
6692 * We need to make sure that the conn_recv is set to a non-null
6693 * value before we insert the conn_t into the classifier table.
6694 * This is to avoid a race with an incoming packet which does
6695 * an ipcl_classify().
6696 */
6697 tcp->tcp_connp->conn_recv = tcp_input;
6698
6699 /* Hang onto the T_OK_ACK for later. */
6700 linkb(mp1, mp);
6701 mblk_setcred(mp1, tcp->tcp_cred);
6702 if (tcp->tcp_family == AF_INET)
6703 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6704 else {
6705 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6706 &tcp->tcp_sticky_ipp);
6707 }
6708 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6709 tcp->tcp_active_open = 1;
6710 /*
6711 * If the bind cannot complete immediately
6712 * IP will arrange to call tcp_rput_other
6713 * when the bind completes.
6714 */
6715 if (mp1 != NULL)
6716 tcp_rput_other(tcp, mp1);
6717 return;
6718 }
6719 /* Error case */
6720 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6721 int32_t, oldstate);
6722 tcp->tcp_state = oldstate;
6723 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6724
6725 failed:
6726 /* return error ack and blow away saved option results if any */
6727 if (mp != NULL)
6728 putnext(tcp->tcp_rq, mp);
6729 else {
6730 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6731 TSYSERR, ENOMEM);
6732 }
6733 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6734 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6735
6736 }
6737
6738 /*
6739 * Handle connect to IPv6 destinations.
6740 */
6741 static void
6742 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
6743 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
6744 {
6745 tcph_t *tcph;
6746 mblk_t *mp1;
6747 ip6_rthdr_t *rth;
6748 int32_t oldstate;
6749 uint16_t lport;
6750 tcp_stack_t *tcps = tcp->tcp_tcps;
6751
6752 ASSERT(tcp->tcp_family == AF_INET6);
6753
6754 /*
6755 * If we're here, it means that the destination address is a native
6756 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A
6757 * reason why it might not be IPv6 is if the socket was bound to an
6758 * IPv4-mapped IPv6 address.
6759 */
6760 if (tcp->tcp_ipversion != IPV6_VERSION) {
6761 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6762 goto failed;
6763 }
6764
6765 /*
6766 * Interpret a zero destination to mean loopback.
6767 * Update the T_CONN_REQ (sin/sin6) since it is used to
6768 * generate the T_CONN_CON.
6769 */
6770 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) {
6771 *dstaddrp = ipv6_loopback;
6772 }
6773
6774 /* Handle __sin6_src_id if socket not bound to an IP address */
6775 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
6776 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src,
6777 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
6778 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src;
6779 }
6780
6781 /*
6782 * Take care of the scope_id now and add ip6i_t
6783 * if ip6i_t is not already allocated through TCP
6784 * sticky options. At this point tcp_ip6h does not
6785 * have dst info, thus use dstaddrp.
6786 */
6787 if (scope_id != 0 &&
6788 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
6789 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
6790 ip6i_t *ip6i;
6791
6792 ipp->ipp_ifindex = scope_id;
6793 ip6i = (ip6i_t *)tcp->tcp_iphc;
6794
6795 if ((ipp->ipp_fields & IPPF_HAS_IP6I) &&
6796 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) {
6797 /* Already allocated */
6798 ip6i->ip6i_flags |= IP6I_IFINDEX;
6799 ip6i->ip6i_ifindex = ipp->ipp_ifindex;
6800 ipp->ipp_fields |= IPPF_SCOPE_ID;
6801 } else {
6802 int reterr;
6803
6804 ipp->ipp_fields |= IPPF_SCOPE_ID;
6805 if (ipp->ipp_fields & IPPF_HAS_IP6I)
6806 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n"));
6807 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp);
6808 if (reterr != 0)
6809 goto failed;
6810 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n"));
6811 }
6812 }
6813
6814 /*
6815 * Don't let an endpoint connect to itself. Note that
6816 * the test here does not catch the case where the
6817 * source IP addr was left unspecified by the user. In
6818 * this case, the source addr is set in tcp_adapt_ire()
6819 * using the reply to the T_BIND message that we send
6820 * down to IP here and the check is repeated in tcp_rput_other.
6821 */
6822 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) &&
6823 (dstport == tcp->tcp_lport)) {
6824 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6825 goto failed;
6826 }
6827
6828 tcp->tcp_ip6h->ip6_dst = *dstaddrp;
6829 tcp->tcp_remote_v6 = *dstaddrp;
6830 tcp->tcp_ip6h->ip6_vcf =
6831 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
6832 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
6833
6834
6835 /*
6836 * Massage a routing header (if present) putting the first hop
6837 * in ip6_dst. Compute a starting value for the checksum which
6838 * takes into account that the original ip6_dst should be
6839 * included in the checksum but that ip will include the
6840 * first hop in the source route in the tcp checksum.
6841 */
6842 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph);
6843 if (rth != NULL) {
6844 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth,
6845 tcps->tcps_netstack);
6846 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6847 (tcp->tcp_sum >> 16));
6848 } else {
6849 tcp->tcp_sum = 0;
6850 }
6851
6852 tcph = tcp->tcp_tcph;
6853 *(uint16_t *)tcph->th_fport = dstport;
6854 tcp->tcp_fport = dstport;
6855
6856 oldstate = tcp->tcp_state;
6857 /*
6858 * At this point the remote destination address and remote port fields
6859 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6860 * have to see which state tcp was in so we can take apropriate action.
6861 */
6862 if (oldstate == TCPS_IDLE) {
6863 /*
6864 * We support a quick connect capability here, allowing
6865 * clients to transition directly from IDLE to SYN_SENT
6866 * tcp_bindi will pick an unused port, insert the connection
6867 * in the bind hash and transition to BOUND state.
6868 */
6869 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6870 tcp, B_TRUE);
6871 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6872 B_FALSE, B_FALSE);
6873 if (lport == 0) {
6874 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6875 goto failed;
6876 }
6877 }
6878 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6879 int32_t, TCPS_SYN_SENT);
6880 tcp->tcp_state = TCPS_SYN_SENT;
6881 /*
6882 * TODO: allow data with connect requests
6883 * by unlinking M_DATA trailers here and
6884 * linking them in behind the T_OK_ACK mblk.
6885 * The tcp_rput() bind ack handler would then
6886 * feed them to tcp_wput_data() rather than call
6887 * tcp_timer().
6888 */
6889 mp = mi_tpi_ok_ack_alloc(mp);
6890 if (!mp) {
6891 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6892 tcp_t *, tcp, int32_t, oldstate);
6893 tcp->tcp_state = oldstate;
6894 goto failed;
6895 }
6896 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6897 if (mp1) {
6898 /*
6899 * We need to make sure that the conn_recv is set to a non-null
6900 * value before we insert the conn_t into the classifier table.
6901 * This is to avoid a race with an incoming packet which does
6902 * an ipcl_classify().
6903 */
6904 tcp->tcp_connp->conn_recv = tcp_input;
6905
6906 /* Hang onto the T_OK_ACK for later. */
6907 linkb(mp1, mp);
6908 mblk_setcred(mp1, tcp->tcp_cred);
6909 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6910 &tcp->tcp_sticky_ipp);
6911 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6912 tcp->tcp_active_open = 1;
6913 /* ip_bind_v6() may return ACK or ERROR */
6914 if (mp1 != NULL)
6915 tcp_rput_other(tcp, mp1);
6916 return;
6917 }
6918 /* Error case */
6919 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6920 int32_t, oldstate);
6921 tcp->tcp_state = oldstate;
6922 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6923
6924 failed:
6925 /* return error ack and blow away saved option results if any */
6926 if (mp != NULL)
6927 putnext(tcp->tcp_rq, mp);
6928 else {
6929 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6930 TSYSERR, ENOMEM);
6931 }
6932 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6933 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6934 }
6935
6936 /*
6937 * We need a stream q for detached closing tcp connections
6938 * to use. Our client hereby indicates that this q is the
6939 * one to use.
6940 */
6941 static void
6942 tcp_def_q_set(tcp_t *tcp, mblk_t *mp)
6943 {
6944 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
6945 queue_t *q = tcp->tcp_wq;
6946 tcp_stack_t *tcps = tcp->tcp_tcps;
6947
6948 #ifdef NS_DEBUG
6949 (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n",
6950 tcps->tcps_netstack->netstack_stackid);
6951 #endif
6952 mp->b_datap->db_type = M_IOCACK;
6953 iocp->ioc_count = 0;
6954 mutex_enter(&tcps->tcps_g_q_lock);
6955 if (tcps->tcps_g_q != NULL) {
6956 mutex_exit(&tcps->tcps_g_q_lock);
6957 iocp->ioc_error = EALREADY;
6958 } else {
6959 mblk_t *mp1;
6960
6961 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0);
6962 if (mp1 == NULL) {
6963 mutex_exit(&tcps->tcps_g_q_lock);
6964 iocp->ioc_error = ENOMEM;
6965 } else {
6966 tcps->tcps_g_q = tcp->tcp_rq;
6967 mutex_exit(&tcps->tcps_g_q_lock);
6968 iocp->ioc_error = 0;
6969 iocp->ioc_rval = 0;
6970 /*
6971 * We are passing tcp_sticky_ipp as NULL
6972 * as it is not useful for tcp_default queue
6973 *
6974 * Set conn_recv just in case.
6975 */
6976 tcp->tcp_connp->conn_recv = tcp_conn_request;
6977
6978 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL);
6979 if (mp1 != NULL)
6980 tcp_rput_other(tcp, mp1);
6981 }
6982 }
6983 qreply(q, mp);
6984 }
6985
6986 /*
6987 * Our client hereby directs us to reject the connection request
6988 * that tcp_conn_request() marked with 'seqnum'. Rejection consists
6989 * of sending the appropriate RST, not an ICMP error.
6990 */
6991 static void
6992 tcp_disconnect(tcp_t *tcp, mblk_t *mp)
6993 {
6994 tcp_t *ltcp = NULL;
6995 t_scalar_t seqnum;
6996 conn_t *connp;
6997 tcp_stack_t *tcps = tcp->tcp_tcps;
6998
6999 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
7000 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
7001 tcp_err_ack(tcp, mp, TPROTO, 0);
7002 return;
7003 }
7004
7005 /*
7006 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
7007 * when the stream is in BOUND state. Do not send a reset,
7008 * since the destination IP address is not valid, and it can
7009 * be the initialized value of all zeros (broadcast address).
7010 *
7011 * If TCP has sent down a bind request to IP and has not
7012 * received the reply, reject the request. Otherwise, TCP
7013 * will be confused.
7014 */
7015 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
7016 if (tcp->tcp_debug) {
7017 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
7018 "tcp_disconnect: bad state, %d", tcp->tcp_state);
7019 }
7020 tcp_err_ack(tcp, mp, TOUTSTATE, 0);
7021 return;
7022 }
7023
7024 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
7025
7026 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
7027
7028 /*
7029 * According to TPI, for non-listeners, ignore seqnum
7030 * and disconnect.
7031 * Following interpretation of -1 seqnum is historical
7032 * and implied TPI ? (TPI only states that for T_CONN_IND,
7033 * a valid seqnum should not be -1).
7034 *
7035 * -1 means disconnect everything
7036 * regardless even on a listener.
7037 */
7038
7039 int old_state = tcp->tcp_state;
7040 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
7041
7042 /*
7043 * The connection can't be on the tcp_time_wait_head list
7044 * since it is not detached.
7045 */
7046 ASSERT(tcp->tcp_time_wait_next == NULL);
7047 ASSERT(tcp->tcp_time_wait_prev == NULL);
7048 ASSERT(tcp->tcp_time_wait_expire == 0);
7049 ltcp = NULL;
7050 /*
7051 * If it used to be a listener, check to make sure no one else
7052 * has taken the port before switching back to LISTEN state.
7053 */
7054 if (tcp->tcp_ipversion == IPV4_VERSION) {
7055 connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7056 tcp->tcp_ipha->ipha_src,
7057 tcp->tcp_connp->conn_zoneid, ipst);
7058 if (connp != NULL)
7059 ltcp = connp->conn_tcp;
7060 } else {
7061 /* Allow tcp_bound_if listeners? */
7062 connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7063 &tcp->tcp_ip6h->ip6_src, 0,
7064 tcp->tcp_connp->conn_zoneid, ipst);
7065 if (connp != NULL)
7066 ltcp = connp->conn_tcp;
7067 }
7068 if (tcp->tcp_conn_req_max && ltcp == NULL) {
7069 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7070 tcp_t *, tcp, int32_t, TCPS_LISTEN);
7071 tcp->tcp_state = TCPS_LISTEN;
7072 } else if (old_state > TCPS_BOUND) {
7073 tcp->tcp_conn_req_max = 0;
7074 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7075 tcp_t *, tcp, int32_t, TCPS_BOUND);
7076 tcp->tcp_state = TCPS_BOUND;
7077 }
7078 if (ltcp != NULL)
7079 CONN_DEC_REF(ltcp->tcp_connp);
7080 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7081 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7082 } else if (old_state == TCPS_ESTABLISHED ||
7083 old_state == TCPS_CLOSE_WAIT) {
7084 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7085 }
7086
7087 if (tcp->tcp_fused)
7088 tcp_unfuse(tcp);
7089
7090 mutex_enter(&tcp->tcp_eager_lock);
7091 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7092 (tcp->tcp_conn_req_cnt_q != 0)) {
7093 tcp_eager_cleanup(tcp, 0);
7094 }
7095 mutex_exit(&tcp->tcp_eager_lock);
7096
7097 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
7098 tcp->tcp_rnxt, TH_RST | TH_ACK);
7099
7100 tcp_reinit(tcp);
7101
7102 if (old_state >= TCPS_ESTABLISHED) {
7103 /* Send M_FLUSH according to TPI */
7104 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
7105 }
7106 mp = mi_tpi_ok_ack_alloc(mp);
7107 if (mp)
7108 putnext(tcp->tcp_rq, mp);
7109 return;
7110 } else if (!tcp_eager_blowoff(tcp, seqnum)) {
7111 tcp_err_ack(tcp, mp, TBADSEQ, 0);
7112 return;
7113 }
7114 if (tcp->tcp_state >= TCPS_ESTABLISHED) {
7115 /* Send M_FLUSH according to TPI */
7116 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
7117 }
7118 mp = mi_tpi_ok_ack_alloc(mp);
7119 if (mp)
7120 putnext(tcp->tcp_rq, mp);
7121 }
7122
7123 /*
7124 * Diagnostic routine used to return a string associated with the tcp state.
7125 * Note that if the caller does not supply a buffer, it will use an internal
7126 * static string. This means that if multiple threads call this function at
7127 * the same time, output can be corrupted... Note also that this function
7128 * does not check the size of the supplied buffer. The caller has to make
7129 * sure that it is big enough.
7130 */
7131 static char *
7132 tcp_display(tcp_t *tcp, char *sup_buf, char format)
7133 {
7134 char buf1[30];
7135 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80];
7136 char *buf;
7137 char *cp;
7138 in6_addr_t local, remote;
7139 char local_addrbuf[INET6_ADDRSTRLEN];
7140 char remote_addrbuf[INET6_ADDRSTRLEN];
7141
7142 if (sup_buf != NULL)
7143 buf = sup_buf;
7144 else
7145 buf = priv_buf;
7146
7147 if (tcp == NULL)
7148 return ("NULL_TCP");
7149 switch (tcp->tcp_state) {
7150 case TCPS_CLOSED:
7151 cp = "TCP_CLOSED";
7152 break;
7153 case TCPS_IDLE:
7154 cp = "TCP_IDLE";
7155 break;
7156 case TCPS_BOUND:
7157 cp = "TCP_BOUND";
7158 break;
7159 case TCPS_LISTEN:
7160 cp = "TCP_LISTEN";
7161 break;
7162 case TCPS_SYN_SENT:
7163 cp = "TCP_SYN_SENT";
7164 break;
7165 case TCPS_SYN_RCVD:
7166 cp = "TCP_SYN_RCVD";
7167 break;
7168 case TCPS_ESTABLISHED:
7169 cp = "TCP_ESTABLISHED";
7170 break;
7171 case TCPS_CLOSE_WAIT:
7172 cp = "TCP_CLOSE_WAIT";
7173 break;
7174 case TCPS_FIN_WAIT_1:
7175 cp = "TCP_FIN_WAIT_1";
7176 break;
7177 case TCPS_CLOSING:
7178 cp = "TCP_CLOSING";
7179 break;
7180 case TCPS_LAST_ACK:
7181 cp = "TCP_LAST_ACK";
7182 break;
7183 case TCPS_FIN_WAIT_2:
7184 cp = "TCP_FIN_WAIT_2";
7185 break;
7186 case TCPS_TIME_WAIT:
7187 cp = "TCP_TIME_WAIT";
7188 break;
7189 default:
7190 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
7191 cp = buf1;
7192 break;
7193 }
7194 switch (format) {
7195 case DISP_ADDR_AND_PORT:
7196 if (tcp->tcp_ipversion == IPV4_VERSION) {
7197 /*
7198 * Note that we use the remote address in the tcp_b
7199 * structure. This means that it will print out
7200 * the real destination address, not the next hop's
7201 * address if source routing is used.
7202 */
7203 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local);
7204 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote);
7205
7206 } else {
7207 local = tcp->tcp_ip_src_v6;
7208 remote = tcp->tcp_remote_v6;
7209 }
7210 (void) inet_ntop(AF_INET6, &local, local_addrbuf,
7211 sizeof (local_addrbuf));
7212 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
7213 sizeof (remote_addrbuf));
7214 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
7215 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
7216 ntohs(tcp->tcp_fport), cp);
7217 break;
7218 case DISP_PORT_ONLY:
7219 default:
7220 (void) mi_sprintf(buf, "[%u, %u] %s",
7221 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
7222 break;
7223 }
7224
7225 return (buf);
7226 }
7227
7228 /*
7229 * Called via squeue to get on to eager's perimeter. It sends a
7230 * TH_RST if eager is in the fanout table. The listener wants the
7231 * eager to disappear either by means of tcp_eager_blowoff() or
7232 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
7233 * called (via squeue) if the eager cannot be inserted in the
7234 * fanout table in tcp_conn_request().
7235 */
7236 /* ARGSUSED */
7237 void
7238 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
7239 {
7240 conn_t *econnp = (conn_t *)arg;
7241 tcp_t *eager = econnp->conn_tcp;
7242 tcp_t *listener = eager->tcp_listener;
7243 tcp_stack_t *tcps = eager->tcp_tcps;
7244
7245 /*
7246 * We could be called because listener is closing. Since
7247 * the eager is using listener's queue's, its not safe.
7248 * Better use the default queue just to send the TH_RST
7249 * out.
7250 */
7251 ASSERT(tcps->tcps_g_q != NULL);
7252 eager->tcp_rq = tcps->tcps_g_q;
7253 eager->tcp_wq = WR(tcps->tcps_g_q);
7254
7255 /*
7256 * An eager's conn_fanout will be NULL if it's a duplicate
7257 * for an existing 4-tuples in the conn fanout table.
7258 * We don't want to send an RST out in such case.
7259 */
7260 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
7261 tcp_xmit_ctl("tcp_eager_kill, can't wait",
7262 eager, eager->tcp_snxt, 0, TH_RST);
7263 }
7264
7265 /* We are here because listener wants this eager gone */
7266 if (listener != NULL) {
7267 mutex_enter(&listener->tcp_eager_lock);
7268 tcp_eager_unlink(eager);
7269 if (eager->tcp_tconnind_started) {
7270 /*
7271 * The eager has sent a conn_ind up to the
7272 * listener but listener decides to close
7273 * instead. We need to drop the extra ref
7274 * placed on eager in tcp_rput_data() before
7275 * sending the conn_ind to listener.
7276 */
7277 CONN_DEC_REF(econnp);
7278 }
7279 mutex_exit(&listener->tcp_eager_lock);
7280 CONN_DEC_REF(listener->tcp_connp);
7281 }
7282
7283 if (eager->tcp_state > TCPS_BOUND)
7284 tcp_close_detached(eager);
7285 }
7286
7287 /*
7288 * Reset any eager connection hanging off this listener marked
7289 * with 'seqnum' and then reclaim it's resources.
7290 */
7291 static boolean_t
7292 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
7293 {
7294 tcp_t *eager;
7295 mblk_t *mp;
7296 tcp_stack_t *tcps = listener->tcp_tcps;
7297
7298 TCP_STAT(tcps, tcp_eager_blowoff_calls);
7299 eager = listener;
7300 mutex_enter(&listener->tcp_eager_lock);
7301 do {
7302 eager = eager->tcp_eager_next_q;
7303 if (eager == NULL) {
7304 mutex_exit(&listener->tcp_eager_lock);
7305 return (B_FALSE);
7306 }
7307 } while (eager->tcp_conn_req_seqnum != seqnum);
7308
7309 if (eager->tcp_closemp_used) {
7310 mutex_exit(&listener->tcp_eager_lock);
7311 return (B_TRUE);
7312 }
7313 eager->tcp_closemp_used = B_TRUE;
7314 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7315 CONN_INC_REF(eager->tcp_connp);
7316 mutex_exit(&listener->tcp_eager_lock);
7317 mp = &eager->tcp_closemp;
7318 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
7319 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF);
7320 return (B_TRUE);
7321 }
7322
7323 /*
7324 * Reset any eager connection hanging off this listener
7325 * and then reclaim it's resources.
7326 */
7327 static void
7328 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
7329 {
7330 tcp_t *eager;
7331 mblk_t *mp;
7332 tcp_stack_t *tcps = listener->tcp_tcps;
7333
7334 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7335
7336 if (!q0_only) {
7337 /* First cleanup q */
7338 TCP_STAT(tcps, tcp_eager_blowoff_q);
7339 eager = listener->tcp_eager_next_q;
7340 while (eager != NULL) {
7341 if (!eager->tcp_closemp_used) {
7342 eager->tcp_closemp_used = B_TRUE;
7343 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7344 CONN_INC_REF(eager->tcp_connp);
7345 mp = &eager->tcp_closemp;
7346 squeue_fill(eager->tcp_connp->conn_sqp, mp,
7347 tcp_eager_kill, eager->tcp_connp,
7348 SQTAG_TCP_EAGER_CLEANUP);
7349 }
7350 eager = eager->tcp_eager_next_q;
7351 }
7352 }
7353 /* Then cleanup q0 */
7354 TCP_STAT(tcps, tcp_eager_blowoff_q0);
7355 eager = listener->tcp_eager_next_q0;
7356 while (eager != listener) {
7357 if (!eager->tcp_closemp_used) {
7358 eager->tcp_closemp_used = B_TRUE;
7359 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7360 CONN_INC_REF(eager->tcp_connp);
7361 mp = &eager->tcp_closemp;
7362 squeue_fill(eager->tcp_connp->conn_sqp, mp,
7363 tcp_eager_kill, eager->tcp_connp,
7364 SQTAG_TCP_EAGER_CLEANUP_Q0);
7365 }
7366 eager = eager->tcp_eager_next_q0;
7367 }
7368 }
7369
7370 /*
7371 * If we are an eager connection hanging off a listener that hasn't
7372 * formally accepted the connection yet, get off his list and blow off
7373 * any data that we have accumulated.
7374 */
7375 static void
7376 tcp_eager_unlink(tcp_t *tcp)
7377 {
7378 tcp_t *listener = tcp->tcp_listener;
7379
7380 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7381 ASSERT(listener != NULL);
7382 if (tcp->tcp_eager_next_q0 != NULL) {
7383 ASSERT(tcp->tcp_eager_prev_q0 != NULL);
7384
7385 /* Remove the eager tcp from q0 */
7386 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
7387 tcp->tcp_eager_prev_q0;
7388 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
7389 tcp->tcp_eager_next_q0;
7390 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
7391 listener->tcp_conn_req_cnt_q0--;
7392
7393 tcp->tcp_eager_next_q0 = NULL;
7394 tcp->tcp_eager_prev_q0 = NULL;
7395
7396 /*
7397 * Take the eager out, if it is in the list of droppable
7398 * eagers.
7399 */
7400 MAKE_UNDROPPABLE(tcp);
7401
7402 if (tcp->tcp_syn_rcvd_timeout != 0) {
7403 /* we have timed out before */
7404 ASSERT(listener->tcp_syn_rcvd_timeout > 0);
7405 listener->tcp_syn_rcvd_timeout--;
7406 }
7407 } else {
7408 tcp_t **tcpp = &listener->tcp_eager_next_q;
7409 tcp_t *prev = NULL;
7410
7411 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
7412 if (tcpp[0] == tcp) {
7413 if (listener->tcp_eager_last_q == tcp) {
7414 /*
7415 * If we are unlinking the last
7416 * element on the list, adjust
7417 * tail pointer. Set tail pointer
7418 * to nil when list is empty.
7419 */
7420 ASSERT(tcp->tcp_eager_next_q == NULL);
7421 if (listener->tcp_eager_last_q ==
7422 listener->tcp_eager_next_q) {
7423 listener->tcp_eager_last_q =
7424 NULL;
7425 } else {
7426 /*
7427 * We won't get here if there
7428 * is only one eager in the
7429 * list.
7430 */
7431 ASSERT(prev != NULL);
7432 listener->tcp_eager_last_q =
7433 prev;
7434 }
7435 }
7436 tcpp[0] = tcp->tcp_eager_next_q;
7437 tcp->tcp_eager_next_q = NULL;
7438 tcp->tcp_eager_last_q = NULL;
7439 ASSERT(listener->tcp_conn_req_cnt_q > 0);
7440 listener->tcp_conn_req_cnt_q--;
7441 break;
7442 }
7443 prev = tcpp[0];
7444 }
7445 }
7446 tcp->tcp_listener = NULL;
7447 }
7448
7449 /* Shorthand to generate and send TPI error acks to our client */
7450 static void
7451 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
7452 {
7453 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
7454 putnext(tcp->tcp_rq, mp);
7455 }
7456
7457 /* Shorthand to generate and send TPI error acks to our client */
7458 static void
7459 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
7460 int t_error, int sys_error)
7461 {
7462 struct T_error_ack *teackp;
7463
7464 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
7465 M_PCPROTO, T_ERROR_ACK)) != NULL) {
7466 teackp = (struct T_error_ack *)mp->b_rptr;
7467 teackp->ERROR_prim = primitive;
7468 teackp->TLI_error = t_error;
7469 teackp->UNIX_error = sys_error;
7470 putnext(tcp->tcp_rq, mp);
7471 }
7472 }
7473
7474 /*
7475 * Note: No locks are held when inspecting tcp_g_*epriv_ports
7476 * but instead the code relies on:
7477 * - the fact that the address of the array and its size never changes
7478 * - the atomic assignment of the elements of the array
7479 */
7480 /* ARGSUSED */
7481 static int
7482 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
7483 {
7484 int i;
7485 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7486
7487 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7488 if (tcps->tcps_g_epriv_ports[i] != 0)
7489 (void) mi_mpprintf(mp, "%d ",
7490 tcps->tcps_g_epriv_ports[i]);
7491 }
7492 return (0);
7493 }
7494
7495 /*
7496 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7497 * threads from changing it at the same time.
7498 */
7499 /* ARGSUSED */
7500 static int
7501 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7502 cred_t *cr)
7503 {
7504 long new_value;
7505 int i;
7506 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7507
7508 /*
7509 * Fail the request if the new value does not lie within the
7510 * port number limits.
7511 */
7512 if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
7513 new_value <= 0 || new_value >= 65536) {
7514 return (EINVAL);
7515 }
7516
7517 mutex_enter(&tcps->tcps_epriv_port_lock);
7518 /* Check if the value is already in the list */
7519 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7520 if (new_value == tcps->tcps_g_epriv_ports[i]) {
7521 mutex_exit(&tcps->tcps_epriv_port_lock);
7522 return (EEXIST);
7523 }
7524 }
7525 /* Find an empty slot */
7526 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7527 if (tcps->tcps_g_epriv_ports[i] == 0)
7528 break;
7529 }
7530 if (i == tcps->tcps_g_num_epriv_ports) {
7531 mutex_exit(&tcps->tcps_epriv_port_lock);
7532 return (EOVERFLOW);
7533 }
7534 /* Set the new value */
7535 tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value;
7536 mutex_exit(&tcps->tcps_epriv_port_lock);
7537 return (0);
7538 }
7539
7540 /*
7541 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7542 * threads from changing it at the same time.
7543 */
7544 /* ARGSUSED */
7545 static int
7546 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7547 cred_t *cr)
7548 {
7549 long new_value;
7550 int i;
7551 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7552
7553 /*
7554 * Fail the request if the new value does not lie within the
7555 * port number limits.
7556 */
7557 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
7558 new_value >= 65536) {
7559 return (EINVAL);
7560 }
7561
7562 mutex_enter(&tcps->tcps_epriv_port_lock);
7563 /* Check that the value is already in the list */
7564 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7565 if (tcps->tcps_g_epriv_ports[i] == new_value)
7566 break;
7567 }
7568 if (i == tcps->tcps_g_num_epriv_ports) {
7569 mutex_exit(&tcps->tcps_epriv_port_lock);
7570 return (ESRCH);
7571 }
7572 /* Clear the value */
7573 tcps->tcps_g_epriv_ports[i] = 0;
7574 mutex_exit(&tcps->tcps_epriv_port_lock);
7575 return (0);
7576 }
7577
7578 /* Return the TPI/TLI equivalent of our current tcp_state */
7579 static int
7580 tcp_tpistate(tcp_t *tcp)
7581 {
7582 switch (tcp->tcp_state) {
7583 case TCPS_IDLE:
7584 return (TS_UNBND);
7585 case TCPS_LISTEN:
7586 /*
7587 * Return whether there are outstanding T_CONN_IND waiting
7588 * for the matching T_CONN_RES. Therefore don't count q0.
7589 */
7590 if (tcp->tcp_conn_req_cnt_q > 0)
7591 return (TS_WRES_CIND);
7592 else
7593 return (TS_IDLE);
7594 case TCPS_BOUND:
7595 return (TS_IDLE);
7596 case TCPS_SYN_SENT:
7597 return (TS_WCON_CREQ);
7598 case TCPS_SYN_RCVD:
7599 /*
7600 * Note: assumption: this has to the active open SYN_RCVD.
7601 * The passive instance is detached in SYN_RCVD stage of
7602 * incoming connection processing so we cannot get request
7603 * for T_info_ack on it.
7604 */
7605 return (TS_WACK_CRES);
7606 case TCPS_ESTABLISHED:
7607 return (TS_DATA_XFER);
7608 case TCPS_CLOSE_WAIT:
7609 return (TS_WREQ_ORDREL);
7610 case TCPS_FIN_WAIT_1:
7611 return (TS_WIND_ORDREL);
7612 case TCPS_FIN_WAIT_2:
7613 return (TS_WIND_ORDREL);
7614
7615 case TCPS_CLOSING:
7616 case TCPS_LAST_ACK:
7617 case TCPS_TIME_WAIT:
7618 case TCPS_CLOSED:
7619 /*
7620 * Following TS_WACK_DREQ7 is a rendition of "not
7621 * yet TS_IDLE" TPI state. There is no best match to any
7622 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
7623 * choose a value chosen that will map to TLI/XTI level
7624 * state of TSTATECHNG (state is process of changing) which
7625 * captures what this dummy state represents.
7626 */
7627 return (TS_WACK_DREQ7);
7628 default:
7629 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
7630 tcp->tcp_state, tcp_display(tcp, NULL,
7631 DISP_PORT_ONLY));
7632 return (TS_UNBND);
7633 }
7634 }
7635
7636 static void
7637 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
7638 {
7639 tcp_stack_t *tcps = tcp->tcp_tcps;
7640
7641 if (tcp->tcp_family == AF_INET6)
7642 *tia = tcp_g_t_info_ack_v6;
7643 else
7644 *tia = tcp_g_t_info_ack;
7645 tia->CURRENT_state = tcp_tpistate(tcp);
7646 tia->OPT_size = tcp_max_optsize;
7647 if (tcp->tcp_mss == 0) {
7648 /* Not yet set - tcp_open does not set mss */
7649 if (tcp->tcp_ipversion == IPV4_VERSION)
7650 tia->TIDU_size = tcps->tcps_mss_def_ipv4;
7651 else
7652 tia->TIDU_size = tcps->tcps_mss_def_ipv6;
7653 } else {
7654 tia->TIDU_size = tcp->tcp_mss;
7655 }
7656 /* TODO: Default ETSDU is 1. Is that correct for tcp? */
7657 }
7658
7659 /*
7660 * This routine responds to T_CAPABILITY_REQ messages. It is called by
7661 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from
7662 * tcp_g_t_info_ack. The current state of the stream is copied from
7663 * tcp_state.
7664 */
7665 static void
7666 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
7667 {
7668 t_uscalar_t cap_bits1;
7669 struct T_capability_ack *tcap;
7670
7671 if (MBLKL(mp) < sizeof (struct T_capability_req)) {
7672 freemsg(mp);
7673 return;
7674 }
7675
7676 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
7677
7678 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
7679 mp->b_datap->db_type, T_CAPABILITY_ACK);
7680 if (mp == NULL)
7681 return;
7682
7683 tcap = (struct T_capability_ack *)mp->b_rptr;
7684 tcap->CAP_bits1 = 0;
7685
7686 if (cap_bits1 & TC1_INFO) {
7687 tcp_copy_info(&tcap->INFO_ack, tcp);
7688 tcap->CAP_bits1 |= TC1_INFO;
7689 }
7690
7691 if (cap_bits1 & TC1_ACCEPTOR_ID) {
7692 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
7693 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
7694 }
7695
7696 putnext(tcp->tcp_rq, mp);
7697 }
7698
7699 /*
7700 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput.
7701 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
7702 * The current state of the stream is copied from tcp_state.
7703 */
7704 static void
7705 tcp_info_req(tcp_t *tcp, mblk_t *mp)
7706 {
7707 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
7708 T_INFO_ACK);
7709 if (!mp) {
7710 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7711 return;
7712 }
7713 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
7714 putnext(tcp->tcp_rq, mp);
7715 }
7716
7717 /* Respond to the TPI addr request */
7718 static void
7719 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
7720 {
7721 sin_t *sin;
7722 mblk_t *ackmp;
7723 struct T_addr_ack *taa;
7724
7725 /* Make it large enough for worst case */
7726 ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
7727 2 * sizeof (sin6_t), 1);
7728 if (ackmp == NULL) {
7729 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7730 return;
7731 }
7732
7733 if (tcp->tcp_ipversion == IPV6_VERSION) {
7734 tcp_addr_req_ipv6(tcp, ackmp);
7735 return;
7736 }
7737 taa = (struct T_addr_ack *)ackmp->b_rptr;
7738
7739 bzero(taa, sizeof (struct T_addr_ack));
7740 ackmp->b_wptr = (uchar_t *)&taa[1];
7741
7742 taa->PRIM_type = T_ADDR_ACK;
7743 ackmp->b_datap->db_type = M_PCPROTO;
7744
7745 /*
7746 * Note: Following code assumes 32 bit alignment of basic
7747 * data structures like sin_t and struct T_addr_ack.
7748 */
7749 if (tcp->tcp_state >= TCPS_BOUND) {
7750 /*
7751 * Fill in local address
7752 */
7753 taa->LOCADDR_length = sizeof (sin_t);
7754 taa->LOCADDR_offset = sizeof (*taa);
7755
7756 sin = (sin_t *)&taa[1];
7757
7758 /* Fill zeroes and then intialize non-zero fields */
7759 *sin = sin_null;
7760
7761 sin->sin_family = AF_INET;
7762
7763 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
7764 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport;
7765
7766 ackmp->b_wptr = (uchar_t *)&sin[1];
7767
7768 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7769 /*
7770 * Fill in Remote address
7771 */
7772 taa->REMADDR_length = sizeof (sin_t);
7773 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7774 taa->LOCADDR_length);
7775
7776 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7777 *sin = sin_null;
7778 sin->sin_family = AF_INET;
7779 sin->sin_addr.s_addr = tcp->tcp_remote;
7780 sin->sin_port = tcp->tcp_fport;
7781
7782 ackmp->b_wptr = (uchar_t *)&sin[1];
7783 }
7784 }
7785 putnext(tcp->tcp_rq, ackmp);
7786 }
7787
7788 /* Assumes that tcp_addr_req gets enough space and alignment */
7789 static void
7790 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp)
7791 {
7792 sin6_t *sin6;
7793 struct T_addr_ack *taa;
7794
7795 ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
7796 ASSERT(OK_32PTR(ackmp->b_rptr));
7797 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) +
7798 2 * sizeof (sin6_t));
7799
7800 taa = (struct T_addr_ack *)ackmp->b_rptr;
7801
7802 bzero(taa, sizeof (struct T_addr_ack));
7803 ackmp->b_wptr = (uchar_t *)&taa[1];
7804
7805 taa->PRIM_type = T_ADDR_ACK;
7806 ackmp->b_datap->db_type = M_PCPROTO;
7807
7808 /*
7809 * Note: Following code assumes 32 bit alignment of basic
7810 * data structures like sin6_t and struct T_addr_ack.
7811 */
7812 if (tcp->tcp_state >= TCPS_BOUND) {
7813 /*
7814 * Fill in local address
7815 */
7816 taa->LOCADDR_length = sizeof (sin6_t);
7817 taa->LOCADDR_offset = sizeof (*taa);
7818
7819 sin6 = (sin6_t *)&taa[1];
7820 *sin6 = sin6_null;
7821
7822 sin6->sin6_family = AF_INET6;
7823 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src;
7824 sin6->sin6_port = tcp->tcp_lport;
7825
7826 ackmp->b_wptr = (uchar_t *)&sin6[1];
7827
7828 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7829 /*
7830 * Fill in Remote address
7831 */
7832 taa->REMADDR_length = sizeof (sin6_t);
7833 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7834 taa->LOCADDR_length);
7835
7836 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7837 *sin6 = sin6_null;
7838 sin6->sin6_family = AF_INET6;
7839 sin6->sin6_flowinfo =
7840 tcp->tcp_ip6h->ip6_vcf &
7841 ~IPV6_VERS_AND_FLOW_MASK;
7842 sin6->sin6_addr = tcp->tcp_remote_v6;
7843 sin6->sin6_port = tcp->tcp_fport;
7844
7845 ackmp->b_wptr = (uchar_t *)&sin6[1];
7846 }
7847 }
7848 putnext(tcp->tcp_rq, ackmp);
7849 }
7850
7851 /*
7852 * Handle reinitialization of a tcp structure.
7853 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
7854 */
7855 static void
7856 tcp_reinit(tcp_t *tcp)
7857 {
7858 mblk_t *mp;
7859 int err;
7860 tcp_stack_t *tcps = tcp->tcp_tcps;
7861
7862 TCP_STAT(tcps, tcp_reinit_calls);
7863
7864 /* tcp_reinit should never be called for detached tcp_t's */
7865 ASSERT(tcp->tcp_listener == NULL);
7866 ASSERT((tcp->tcp_family == AF_INET &&
7867 tcp->tcp_ipversion == IPV4_VERSION) ||
7868 (tcp->tcp_family == AF_INET6 &&
7869 (tcp->tcp_ipversion == IPV4_VERSION ||
7870 tcp->tcp_ipversion == IPV6_VERSION)));
7871
7872 /* Cancel outstanding timers */
7873 tcp_timers_stop(tcp);
7874
7875 /*
7876 * Reset everything in the state vector, after updating global
7877 * MIB data from instance counters.
7878 */
7879 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
7880 tcp->tcp_ibsegs = 0;
7881 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
7882 tcp->tcp_obsegs = 0;
7883
7884 tcp_close_mpp(&tcp->tcp_xmit_head);
7885 if (tcp->tcp_snd_zcopy_aware)
7886 tcp_zcopy_notify(tcp);
7887 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
7888 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
7889 mutex_enter(&tcp->tcp_non_sq_lock);
7890 if (tcp->tcp_flow_stopped &&
7891 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
7892 tcp_clrqfull(tcp);
7893 }
7894 mutex_exit(&tcp->tcp_non_sq_lock);
7895 tcp_close_mpp(&tcp->tcp_reass_head);
7896 tcp->tcp_reass_tail = NULL;
7897 if (tcp->tcp_rcv_list != NULL) {
7898 /* Free b_next chain */
7899 tcp_close_mpp(&tcp->tcp_rcv_list);
7900 tcp->tcp_rcv_last_head = NULL;
7901 tcp->tcp_rcv_last_tail = NULL;
7902 tcp->tcp_rcv_cnt = 0;
7903 }
7904 tcp->tcp_rcv_last_tail = NULL;
7905
7906 if ((mp = tcp->tcp_urp_mp) != NULL) {
7907 freemsg(mp);
7908 tcp->tcp_urp_mp = NULL;
7909 }
7910 if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
7911 freemsg(mp);
7912 tcp->tcp_urp_mark_mp = NULL;
7913 }
7914 if (tcp->tcp_fused_sigurg_mp != NULL) {
7915 freeb(tcp->tcp_fused_sigurg_mp);
7916 tcp->tcp_fused_sigurg_mp = NULL;
7917 }
7918
7919 /*
7920 * Following is a union with two members which are
7921 * identical types and size so the following cleanup
7922 * is enough.
7923 */
7924 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
7925
7926 CL_INET_DISCONNECT(tcp);
7927
7928 /*
7929 * The connection can't be on the tcp_time_wait_head list
7930 * since it is not detached.
7931 */
7932 ASSERT(tcp->tcp_time_wait_next == NULL);
7933 ASSERT(tcp->tcp_time_wait_prev == NULL);
7934 ASSERT(tcp->tcp_time_wait_expire == 0);
7935
7936 if (tcp->tcp_kssl_pending) {
7937 tcp->tcp_kssl_pending = B_FALSE;
7938
7939 /* Don't reset if the initialized by bind. */
7940 if (tcp->tcp_kssl_ent != NULL) {
7941 kssl_release_ent(tcp->tcp_kssl_ent, NULL,
7942 KSSL_NO_PROXY);
7943 }
7944 }
7945 if (tcp->tcp_kssl_ctx != NULL) {
7946 kssl_release_ctx(tcp->tcp_kssl_ctx);
7947 tcp->tcp_kssl_ctx = NULL;
7948 }
7949
7950 /*
7951 * Reset/preserve other values
7952 */
7953 tcp_reinit_values(tcp);
7954 ipcl_hash_remove(tcp->tcp_connp);
7955 conn_delete_ire(tcp->tcp_connp, NULL);
7956 tcp_ipsec_cleanup(tcp);
7957
7958 if (tcp->tcp_conn_req_max != 0) {
7959 /*
7960 * This is the case when a TLI program uses the same
7961 * transport end point to accept a connection. This
7962 * makes the TCP both a listener and acceptor. When
7963 * this connection is closed, we need to set the state
7964 * back to TCPS_LISTEN. Make sure that the eager list
7965 * is reinitialized.
7966 *
7967 * Note that this stream is still bound to the four
7968 * tuples of the previous connection in IP. If a new
7969 * SYN with different foreign address comes in, IP will
7970 * not find it and will send it to the global queue. In
7971 * the global queue, TCP will do a tcp_lookup_listener()
7972 * to find this stream. This works because this stream
7973 * is only removed from connected hash.
7974 *
7975 */
7976 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7977 tcp_t *, tcp, int32_t, TCPS_LISTEN);
7978 tcp->tcp_state = TCPS_LISTEN;
7979 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7980 tcp->tcp_eager_next_drop_q0 = tcp;
7981 tcp->tcp_eager_prev_drop_q0 = tcp;
7982 tcp->tcp_connp->conn_recv = tcp_conn_request;
7983 if (tcp->tcp_family == AF_INET6) {
7984 ASSERT(tcp->tcp_connp->conn_af_isv6);
7985 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7986 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7987 } else {
7988 ASSERT(!tcp->tcp_connp->conn_af_isv6);
7989 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7990 tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7991 }
7992 } else {
7993 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7994 tcp_t *, tcp, int32_t, TCPS_BOUND);
7995 tcp->tcp_state = TCPS_BOUND;
7996 }
7997
7998 /*
7999 * Initialize to default values
8000 * Can't fail since enough header template space already allocated
8001 * at open().
8002 */
8003 err = tcp_init_values(tcp);
8004 ASSERT(err == 0);
8005 /* Restore state in tcp_tcph */
8006 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
8007 if (tcp->tcp_ipversion == IPV4_VERSION)
8008 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
8009 else
8010 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
8011 /*
8012 * Copy of the src addr. in tcp_t is needed in tcp_t
8013 * since the lookup funcs can only lookup on tcp_t
8014 */
8015 tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;
8016
8017 ASSERT(tcp->tcp_ptpbhn != NULL);
8018 tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat;
8019 tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
8020 tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ?
8021 tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
8022 }
8023
8024 /*
8025 * Force values to zero that need be zero.
8026 * Do not touch values asociated with the BOUND or LISTEN state
8027 * since the connection will end up in that state after the reinit.
8028 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
8029 * structure!
8030 */
8031 static void
8032 tcp_reinit_values(tcp)
8033 tcp_t *tcp;
8034 {
8035 tcp_stack_t *tcps = tcp->tcp_tcps;
8036
8037 #ifndef lint
8038 #define DONTCARE(x)
8039 #define PRESERVE(x)
8040 #else
8041 #define DONTCARE(x) ((x) = (x))
8042 #define PRESERVE(x) ((x) = (x))
8043 #endif /* lint */
8044
8045 PRESERVE(tcp->tcp_bind_hash);
8046 PRESERVE(tcp->tcp_ptpbhn);
8047 PRESERVE(tcp->tcp_acceptor_hash);
8048 PRESERVE(tcp->tcp_ptpahn);
8049
8050 /* Should be ASSERT NULL on these with new code! */
8051 ASSERT(tcp->tcp_time_wait_next == NULL);
8052 ASSERT(tcp->tcp_time_wait_prev == NULL);
8053 ASSERT(tcp->tcp_time_wait_expire == 0);
8054 PRESERVE(tcp->tcp_state);
8055 PRESERVE(tcp->tcp_rq);
8056 PRESERVE(tcp->tcp_wq);
8057
8058 ASSERT(tcp->tcp_xmit_head == NULL);
8059 ASSERT(tcp->tcp_xmit_last == NULL);
8060 ASSERT(tcp->tcp_unsent == 0);
8061 ASSERT(tcp->tcp_xmit_tail == NULL);
8062 ASSERT(tcp->tcp_xmit_tail_unsent == 0);
8063
8064 tcp->tcp_snxt = 0; /* Displayed in mib */
8065 tcp->tcp_suna = 0; /* Displayed in mib */
8066 tcp->tcp_swnd = 0;
8067 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */
8068
8069 ASSERT(tcp->tcp_ibsegs == 0);
8070 ASSERT(tcp->tcp_obsegs == 0);
8071
8072 if (tcp->tcp_iphc != NULL) {
8073 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8074 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
8075 }
8076
8077 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */
8078 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */
8079 DONTCARE(tcp->tcp_ipha);
8080 DONTCARE(tcp->tcp_ip6h);
8081 DONTCARE(tcp->tcp_ip_hdr_len);
8082 DONTCARE(tcp->tcp_tcph);
8083 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */
8084 tcp->tcp_valid_bits = 0;
8085
8086 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */
8087 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */
8088 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */
8089 tcp->tcp_last_rcv_lbolt = 0;
8090
8091 tcp->tcp_init_cwnd = 0;
8092
8093 tcp->tcp_urp_last_valid = 0;
8094 tcp->tcp_hard_binding = 0;
8095 tcp->tcp_hard_bound = 0;
8096 PRESERVE(tcp->tcp_cred);
8097 PRESERVE(tcp->tcp_cpid);
8098 PRESERVE(tcp->tcp_open_time);
8099 PRESERVE(tcp->tcp_exclbind);
8100
8101 tcp->tcp_fin_acked = 0;
8102 tcp->tcp_fin_rcvd = 0;
8103 tcp->tcp_fin_sent = 0;
8104 tcp->tcp_ordrel_done = 0;
8105
8106 tcp->tcp_debug = 0;
8107 tcp->tcp_dontroute = 0;
8108 tcp->tcp_broadcast = 0;
8109
8110 tcp->tcp_useloopback = 0;
8111 tcp->tcp_reuseaddr = 0;
8112 tcp->tcp_oobinline = 0;
8113 tcp->tcp_dgram_errind = 0;
8114
8115 tcp->tcp_detached = 0;
8116 tcp->tcp_bind_pending = 0;
8117 tcp->tcp_unbind_pending = 0;
8118 tcp->tcp_deferred_clean_death = 0;
8119
8120 tcp->tcp_snd_ws_ok = B_FALSE;
8121 tcp->tcp_snd_ts_ok = B_FALSE;
8122 tcp->tcp_linger = 0;
8123 tcp->tcp_ka_enabled = 0;
8124 tcp->tcp_zero_win_probe = 0;
8125
8126 tcp->tcp_loopback = 0;
8127 tcp->tcp_localnet = 0;
8128 tcp->tcp_syn_defense = 0;
8129 tcp->tcp_set_timer = 0;
8130
8131 tcp->tcp_active_open = 0;
8132 ASSERT(tcp->tcp_timeout == B_FALSE);
8133 tcp->tcp_rexmit = B_FALSE;
8134 tcp->tcp_xmit_zc_clean = B_FALSE;
8135
8136 tcp->tcp_snd_sack_ok = B_FALSE;
8137 PRESERVE(tcp->tcp_recvdstaddr);
8138 tcp->tcp_hwcksum = B_FALSE;
8139
8140 tcp->tcp_ire_ill_check_done = B_FALSE;
8141 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */
8142
8143 tcp->tcp_mdt = B_FALSE;
8144 tcp->tcp_mdt_hdr_head = 0;
8145 tcp->tcp_mdt_hdr_tail = 0;
8146
8147 tcp->tcp_conn_def_q0 = 0;
8148 tcp->tcp_ip_forward_progress = B_FALSE;
8149 tcp->tcp_anon_priv_bind = 0;
8150 tcp->tcp_ecn_ok = B_FALSE;
8151
8152 tcp->tcp_cwr = B_FALSE;
8153 tcp->tcp_ecn_echo_on = B_FALSE;
8154
8155 if (tcp->tcp_sack_info != NULL) {
8156 if (tcp->tcp_notsack_list != NULL) {
8157 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
8158 }
8159 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
8160 tcp->tcp_sack_info = NULL;
8161 }
8162
8163 tcp->tcp_rcv_ws = 0;
8164 tcp->tcp_snd_ws = 0;
8165 tcp->tcp_ts_recent = 0;
8166 tcp->tcp_rnxt = 0; /* Displayed in mib */
8167 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */
8168 tcp->tcp_if_mtu = 0;
8169
8170 ASSERT(tcp->tcp_reass_head == NULL);
8171 ASSERT(tcp->tcp_reass_tail == NULL);
8172
8173 tcp->tcp_cwnd_cnt = 0;
8174
8175 ASSERT(tcp->tcp_rcv_list == NULL);
8176 ASSERT(tcp->tcp_rcv_last_head == NULL);
8177 ASSERT(tcp->tcp_rcv_last_tail == NULL);
8178 ASSERT(tcp->tcp_rcv_cnt == 0);
8179
8180 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */
8181 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */
8182 tcp->tcp_csuna = 0;
8183
8184 tcp->tcp_rto = 0; /* Displayed in MIB */
8185 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
8186 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
8187 tcp->tcp_rtt_update = 0;
8188
8189 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
8190 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
8191
8192 tcp->tcp_rack = 0; /* Displayed in mib */
8193 tcp->tcp_rack_cnt = 0;
8194 tcp->tcp_rack_cur_max = 0;
8195 tcp->tcp_rack_abs_max = 0;
8196
8197 tcp->tcp_max_swnd = 0;
8198
8199 ASSERT(tcp->tcp_listener == NULL);
8200
8201 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */
8202
8203 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */
8204 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */
8205 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */
8206 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */
8207
8208 ASSERT(tcp->tcp_conn_req_cnt_q == 0);
8209 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
8210 PRESERVE(tcp->tcp_conn_req_max);
8211 PRESERVE(tcp->tcp_conn_req_seqnum);
8212
8213 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */
8214 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
8215 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
8216 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
8217 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
8218
8219 tcp->tcp_lingertime = 0;
8220
8221 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */
8222 ASSERT(tcp->tcp_urp_mp == NULL);
8223 ASSERT(tcp->tcp_urp_mark_mp == NULL);
8224 ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
8225
8226 ASSERT(tcp->tcp_eager_next_q == NULL);
8227 ASSERT(tcp->tcp_eager_last_q == NULL);
8228 ASSERT((tcp->tcp_eager_next_q0 == NULL &&
8229 tcp->tcp_eager_prev_q0 == NULL) ||
8230 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
8231 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
8232
8233 ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
8234 tcp->tcp_eager_prev_drop_q0 == NULL) ||
8235 tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
8236
8237 tcp->tcp_client_errno = 0;
8238
8239 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */
8240
8241 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */
8242
8243 PRESERVE(tcp->tcp_bound_source_v6);
8244 tcp->tcp_last_sent_len = 0;
8245 tcp->tcp_dupack_cnt = 0;
8246
8247 tcp->tcp_fport = 0; /* Displayed in MIB */
8248 PRESERVE(tcp->tcp_lport);
8249
8250 PRESERVE(tcp->tcp_acceptor_lockp);
8251
8252 ASSERT(tcp->tcp_ordrelid == 0);
8253 PRESERVE(tcp->tcp_acceptor_id);
8254 DONTCARE(tcp->tcp_ipsec_overhead);
8255
8256 /*
8257 * If tcp_tracing flag is ON (i.e. We have a trace buffer
8258 * in tcp structure and now tracing), Re-initialize all
8259 * members of tcp_traceinfo.
8260 */
8261 if (tcp->tcp_tracebuf != NULL) {
8262 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t));
8263 }
8264
8265 PRESERVE(tcp->tcp_family);
8266 if (tcp->tcp_family == AF_INET6) {
8267 tcp->tcp_ipversion = IPV6_VERSION;
8268 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
8269 } else {
8270 tcp->tcp_ipversion = IPV4_VERSION;
8271 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
8272 }
8273
8274 tcp->tcp_bound_if = 0;
8275 tcp->tcp_ipv6_recvancillary = 0;
8276 tcp->tcp_recvifindex = 0;
8277 tcp->tcp_recvhops = 0;
8278 tcp->tcp_closed = 0;
8279 tcp->tcp_cleandeathtag = 0;
8280 if (tcp->tcp_hopopts != NULL) {
8281 mi_free(tcp->tcp_hopopts);
8282 tcp->tcp_hopopts = NULL;
8283 tcp->tcp_hopoptslen = 0;
8284 }
8285 ASSERT(tcp->tcp_hopoptslen == 0);
8286 if (tcp->tcp_dstopts != NULL) {
8287 mi_free(tcp->tcp_dstopts);
8288 tcp->tcp_dstopts = NULL;
8289 tcp->tcp_dstoptslen = 0;
8290 }
8291 ASSERT(tcp->tcp_dstoptslen == 0);
8292 if (tcp->tcp_rtdstopts != NULL) {
8293 mi_free(tcp->tcp_rtdstopts);
8294 tcp->tcp_rtdstopts = NULL;
8295 tcp->tcp_rtdstoptslen = 0;
8296 }
8297 ASSERT(tcp->tcp_rtdstoptslen == 0);
8298 if (tcp->tcp_rthdr != NULL) {
8299 mi_free(tcp->tcp_rthdr);
8300 tcp->tcp_rthdr = NULL;
8301 tcp->tcp_rthdrlen = 0;
8302 }
8303 ASSERT(tcp->tcp_rthdrlen == 0);
8304 PRESERVE(tcp->tcp_drop_opt_ack_cnt);
8305
8306 /* Reset fusion-related fields */
8307 tcp->tcp_fused = B_FALSE;
8308 tcp->tcp_unfusable = B_FALSE;
8309 tcp->tcp_fused_sigurg = B_FALSE;
8310 tcp->tcp_direct_sockfs = B_FALSE;
8311 tcp->tcp_fuse_syncstr_stopped = B_FALSE;
8312 tcp->tcp_fuse_syncstr_plugged = B_FALSE;
8313 tcp->tcp_loopback_peer = NULL;
8314 tcp->tcp_fuse_rcv_hiwater = 0;
8315 tcp->tcp_fuse_rcv_unread_hiwater = 0;
8316 tcp->tcp_fuse_rcv_unread_cnt = 0;
8317
8318 tcp->tcp_lso = B_FALSE;
8319
8320 tcp->tcp_in_ack_unsent = 0;
8321 tcp->tcp_cork = B_FALSE;
8322 tcp->tcp_tconnind_started = B_FALSE;
8323
8324 PRESERVE(tcp->tcp_squeue_bytes);
8325
8326 ASSERT(tcp->tcp_kssl_ctx == NULL);
8327 ASSERT(!tcp->tcp_kssl_pending);
8328 PRESERVE(tcp->tcp_kssl_ent);
8329
8330 /* Sodirect */
8331 tcp->tcp_sodirect = NULL;
8332
8333 tcp->tcp_closemp_used = B_FALSE;
8334
8335 #ifdef DEBUG
8336 DONTCARE(tcp->tcmp_stk[0]);
8337 #endif
8338
8339
8340 #undef DONTCARE
8341 #undef PRESERVE
8342 }
8343
8344 /*
8345 * Allocate necessary resources and initialize state vector.
8346 * Guaranteed not to fail so that when an error is returned,
8347 * the caller doesn't need to do any additional cleanup.
8348 */
8349 int
8350 tcp_init(tcp_t *tcp, queue_t *q)
8351 {
8352 int err;
8353
8354 tcp->tcp_rq = q;
8355 tcp->tcp_wq = WR(q);
8356 /* DTrace ignores this - it isn't a tcp:::state-change */
8357 tcp->tcp_state = TCPS_IDLE;
8358 if ((err = tcp_init_values(tcp)) != 0)
8359 tcp_timers_stop(tcp);
8360 return (err);
8361 }
8362
8363 static int
8364 tcp_init_values(tcp_t *tcp)
8365 {
8366 int err;
8367 tcp_stack_t *tcps = tcp->tcp_tcps;
8368
8369 ASSERT((tcp->tcp_family == AF_INET &&
8370 tcp->tcp_ipversion == IPV4_VERSION) ||
8371 (tcp->tcp_family == AF_INET6 &&
8372 (tcp->tcp_ipversion == IPV4_VERSION ||
8373 tcp->tcp_ipversion == IPV6_VERSION)));
8374
8375 /*
8376 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
8377 * will be close to tcp_rexmit_interval_initial. By doing this, we
8378 * allow the algorithm to adjust slowly to large fluctuations of RTT
8379 * during first few transmissions of a connection as seen in slow
8380 * links.
8381 */
8382 tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
8383 tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
8384 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
8385 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
8386 tcps->tcps_conn_grace_period;
8387 if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
8388 tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
8389 tcp->tcp_timer_backoff = 0;
8390 tcp->tcp_ms_we_have_waited = 0;
8391 tcp->tcp_last_recv_time = lbolt;
8392 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
8393 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
8394 tcp->tcp_snd_burst = TCP_CWND_INFINITE;
8395
8396 tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier;
8397
8398 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
8399 tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
8400 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
8401 /*
8402 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
8403 * passive open.
8404 */
8405 tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
8406
8407 tcp->tcp_naglim = tcps->tcps_naglim_def;
8408
8409 /* NOTE: ISS is now set in tcp_adapt_ire(). */
8410
8411 tcp->tcp_mdt_hdr_head = 0;
8412 tcp->tcp_mdt_hdr_tail = 0;
8413
8414 /* Reset fusion-related fields */
8415 tcp->tcp_fused = B_FALSE;
8416 tcp->tcp_unfusable = B_FALSE;
8417 tcp->tcp_fused_sigurg = B_FALSE;
8418 tcp->tcp_direct_sockfs = B_FALSE;
8419 tcp->tcp_fuse_syncstr_stopped = B_FALSE;
8420 tcp->tcp_fuse_syncstr_plugged = B_FALSE;
8421 tcp->tcp_loopback_peer = NULL;
8422 tcp->tcp_fuse_rcv_hiwater = 0;
8423 tcp->tcp_fuse_rcv_unread_hiwater = 0;
8424 tcp->tcp_fuse_rcv_unread_cnt = 0;
8425
8426 /* Sodirect */
8427 tcp->tcp_sodirect = NULL;
8428
8429 /* Initialize the header template */
8430 if (tcp->tcp_ipversion == IPV4_VERSION) {
8431 err = tcp_header_init_ipv4(tcp);
8432 } else {
8433 err = tcp_header_init_ipv6(tcp);
8434 }
8435 if (err)
8436 return (err);
8437
8438 /*
8439 * Init the window scale to the max so tcp_rwnd_set() won't pare
8440 * down tcp_rwnd. tcp_adapt_ire() will set the right value later.
8441 */
8442 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
8443 tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat;
8444 tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat;
8445
8446 tcp->tcp_cork = B_FALSE;
8447 /*
8448 * Init the tcp_debug option. This value determines whether TCP
8449 * calls strlog() to print out debug messages. Doing this
8450 * initialization here means that this value is not inherited thru
8451 * tcp_reinit().
8452 */
8453 tcp->tcp_debug = tcps->tcps_dbg;
8454
8455 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
8456 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
8457
8458 return (0);
8459 }
8460
8461 /*
8462 * Initialize the IPv4 header. Loses any record of any IP options.
8463 */
8464 static int
8465 tcp_header_init_ipv4(tcp_t *tcp)
8466 {
8467 tcph_t *tcph;
8468 uint32_t sum;
8469 conn_t *connp;
8470 tcp_stack_t *tcps = tcp->tcp_tcps;
8471
8472 /*
8473 * This is a simple initialization. If there's
8474 * already a template, it should never be too small,
8475 * so reuse it. Otherwise, allocate space for the new one.
8476 */
8477 if (tcp->tcp_iphc == NULL) {
8478 ASSERT(tcp->tcp_iphc_len == 0);
8479 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8480 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8481 if (tcp->tcp_iphc == NULL) {
8482 tcp->tcp_iphc_len = 0;
8483 return (ENOMEM);
8484 }
8485 }
8486
8487 /* options are gone; may need a new label */
8488 connp = tcp->tcp_connp;
8489 connp->conn_mlp_type = mlptSingle;
8490 connp->conn_ulp_labeled = !is_system_labeled();
8491 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8492 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
8493 tcp->tcp_ip6h = NULL;
8494 tcp->tcp_ipversion = IPV4_VERSION;
8495 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t);
8496 tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8497 tcp->tcp_ip_hdr_len = sizeof (ipha_t);
8498 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t));
8499 tcp->tcp_ipha->ipha_version_and_hdr_length
8500 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS;
8501 tcp->tcp_ipha->ipha_ident = 0;
8502
8503 tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
8504 tcp->tcp_tos = 0;
8505 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
8506 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
8507 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP;
8508
8509 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t));
8510 tcp->tcp_tcph = tcph;
8511 tcph->th_offset_and_rsrvd[0] = (5 << 4);
8512 /*
8513 * IP wants our header length in the checksum field to
8514 * allow it to perform a single pseudo-header+checksum
8515 * calculation on behalf of TCP.
8516 * Include the adjustment for a source route once IP_OPTIONS is set.
8517 */
8518 sum = sizeof (tcph_t) + tcp->tcp_sum;
8519 sum = (sum >> 16) + (sum & 0xFFFF);
8520 U16_TO_ABE16(sum, tcph->th_sum);
8521 return (0);
8522 }
8523
8524 /*
8525 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers.
8526 */
8527 static int
8528 tcp_header_init_ipv6(tcp_t *tcp)
8529 {
8530 tcph_t *tcph;
8531 uint32_t sum;
8532 conn_t *connp;
8533 tcp_stack_t *tcps = tcp->tcp_tcps;
8534
8535 /*
8536 * This is a simple initialization. If there's
8537 * already a template, it should never be too small,
8538 * so reuse it. Otherwise, allocate space for the new one.
8539 * Ensure that there is enough space to "downgrade" the tcp_t
8540 * to an IPv4 tcp_t. This requires having space for a full load
8541 * of IPv4 options, as well as a full load of TCP options
8542 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space
8543 * than a v6 header and a TCP header with a full load of TCP options
8544 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes).
8545 * We want to avoid reallocation in the "downgraded" case when
8546 * processing outbound IPv4 options.
8547 */
8548 if (tcp->tcp_iphc == NULL) {
8549 ASSERT(tcp->tcp_iphc_len == 0);
8550 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8551 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8552 if (tcp->tcp_iphc == NULL) {
8553 tcp->tcp_iphc_len = 0;
8554 return (ENOMEM);
8555 }
8556 }
8557
8558 /* options are gone; may need a new label */
8559 connp = tcp->tcp_connp;
8560 connp->conn_mlp_type = mlptSingle;
8561 connp->conn_ulp_labeled = !is_system_labeled();
8562
8563 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8564 tcp->tcp_ipversion = IPV6_VERSION;
8565 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t);
8566 tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8567 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN;
8568 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
8569 tcp->tcp_ipha = NULL;
8570
8571 /* Initialize the header template */
8572
8573 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
8574 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t));
8575 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP;
8576 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit;
8577
8578 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN);
8579 tcp->tcp_tcph = tcph;
8580 tcph->th_offset_and_rsrvd[0] = (5 << 4);
8581 /*
8582 * IP wants our header length in the checksum field to
8583 * allow it to perform a single psuedo-header+checksum
8584 * calculation on behalf of TCP.
8585 * Include the adjustment for a source route when IPV6_RTHDR is set.
8586 */
8587 sum = sizeof (tcph_t) + tcp->tcp_sum;
8588 sum = (sum >> 16) + (sum & 0xFFFF);
8589 U16_TO_ABE16(sum, tcph->th_sum);
8590 return (0);
8591 }
8592
8593 /* At minimum we need 8 bytes in the TCP header for the lookup */
8594 #define ICMP_MIN_TCP_HDR 8
8595
8596 /*
8597 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages
8598 * passed up by IP. The message is always received on the correct tcp_t.
8599 * Assumes that IP has pulled up everything up to and including the ICMP header.
8600 */
8601 void
8602 tcp_icmp_error(tcp_t *tcp, mblk_t *mp)
8603 {
8604 icmph_t *icmph;
8605 ipha_t *ipha;
8606 int iph_hdr_length;
8607 tcph_t *tcph;
8608 boolean_t ipsec_mctl = B_FALSE;
8609 boolean_t secure;
8610 mblk_t *first_mp = mp;
8611 uint32_t new_mss;
8612 uint32_t ratio;
8613 size_t mp_size = MBLKL(mp);
8614 uint32_t seg_seq;
8615 tcp_stack_t *tcps = tcp->tcp_tcps;
8616
8617 /* Assume IP provides aligned packets - otherwise toss */
8618 if (!OK_32PTR(mp->b_rptr)) {
8619 freemsg(mp);
8620 return;
8621 }
8622
8623 /*
8624 * Since ICMP errors are normal data marked with M_CTL when sent
8625 * to TCP or UDP, we have to look for a IPSEC_IN value to identify
8626 * packets starting with an ipsec_info_t, see ipsec_info.h.
8627 */
8628 if ((mp_size == sizeof (ipsec_info_t)) &&
8629 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) {
8630 ASSERT(mp->b_cont != NULL);
8631 mp = mp->b_cont;
8632 /* IP should have done this */
8633 ASSERT(OK_32PTR(mp->b_rptr));
8634 mp_size = MBLKL(mp);
8635 ipsec_mctl = B_TRUE;
8636 }
8637
8638 /*
8639 * Verify that we have a complete outer IP header. If not, drop it.
8640 */
8641 if (mp_size < sizeof (ipha_t)) {
8642 noticmpv4:
8643 freemsg(first_mp);
8644 return;
8645 }
8646
8647 ipha = (ipha_t *)mp->b_rptr;
8648 /*
8649 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
8650 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
8651 */
8652 switch (IPH_HDR_VERSION(ipha)) {
8653 case IPV6_VERSION:
8654 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl);
8655 return;
8656 case IPV4_VERSION:
8657 break;
8658 default:
8659 goto noticmpv4;
8660 }
8661
8662 /* Skip past the outer IP and ICMP headers */
8663 iph_hdr_length = IPH_HDR_LENGTH(ipha);
8664 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
8665 /*
8666 * If we don't have the correct outer IP header length or if the ULP
8667 * is not IPPROTO_ICMP or if we don't have a complete inner IP header
8668 * send it upstream.
8669 */
8670 if (iph_hdr_length < sizeof (ipha_t) ||
8671 ipha->ipha_protocol != IPPROTO_ICMP ||
8672 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
8673 goto noticmpv4;
8674 }
8675 ipha = (ipha_t *)&icmph[1];
8676
8677 /* Skip past the inner IP and find the ULP header */
8678 iph_hdr_length = IPH_HDR_LENGTH(ipha);
8679 tcph = (tcph_t *)((char *)ipha + iph_hdr_length);
8680 /*
8681 * If we don't have the correct inner IP header length or if the ULP
8682 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
8683 * bytes of TCP header, drop it.
8684 */
8685 if (iph_hdr_length < sizeof (ipha_t) ||
8686 ipha->ipha_protocol != IPPROTO_TCP ||
8687 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) {
8688 goto noticmpv4;
8689 }
8690
8691 if (TCP_IS_DETACHED_NONEAGER(tcp)) {
8692 if (ipsec_mctl) {
8693 secure = ipsec_in_is_secure(first_mp);
8694 } else {
8695 secure = B_FALSE;
8696 }
8697 if (secure) {
8698 /*
8699 * If we are willing to accept this in clear
8700 * we don't have to verify policy.
8701 */
8702 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) {
8703 if (!tcp_check_policy(tcp, first_mp,
8704 ipha, NULL, secure, ipsec_mctl)) {
8705 /*
8706 * tcp_check_policy called
8707 * ip_drop_packet() on failure.
8708 */
8709 return;
8710 }
8711 }
8712 }
8713 } else if (ipsec_mctl) {
8714 /*
8715 * This is a hard_bound connection. IP has already
8716 * verified policy. We don't have to do it again.
8717 */
8718 freeb(first_mp);
8719 first_mp = mp;
8720 ipsec_mctl = B_FALSE;
8721 }
8722
8723 seg_seq = ABE32_TO_U32(tcph->th_seq);
8724 /*
8725 * TCP SHOULD check that the TCP sequence number contained in
8726 * payload of the ICMP error message is within the range
8727 * SND.UNA <= SEG.SEQ < SND.NXT.
8728 */
8729 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
8730 /*
8731 * If the ICMP message is bogus, should we kill the
8732 * connection, or should we just drop the bogus ICMP
8733 * message? It would probably make more sense to just
8734 * drop the message so that if this one managed to get
8735 * in, the real connection should not suffer.
8736 */
8737 goto noticmpv4;
8738 }
8739
8740 switch (icmph->icmph_type) {
8741 case ICMP_DEST_UNREACHABLE:
8742 switch (icmph->icmph_code) {
8743 case ICMP_FRAGMENTATION_NEEDED:
8744 /*
8745 * Reduce the MSS based on the new MTU. This will
8746 * eliminate any fragmentation locally.
8747 * N.B. There may well be some funny side-effects on
8748 * the local send policy and the remote receive policy.
8749 * Pending further research, we provide
8750 * tcp_ignore_path_mtu just in case this proves
8751 * disastrous somewhere.
8752 *
8753 * After updating the MSS, retransmit part of the
8754 * dropped segment using the new mss by calling
8755 * tcp_wput_data(). Need to adjust all those
8756 * params to make sure tcp_wput_data() work properly.
8757 */
8758 if (tcps->tcps_ignore_path_mtu)
8759 break;
8760
8761 /*
8762 * Decrease the MSS by time stamp options
8763 * IP options and IPSEC options. tcp_hdr_len
8764 * includes time stamp option and IP option
8765 * length.
8766 */
8767
8768 new_mss = ntohs(icmph->icmph_du_mtu) -
8769 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead;
8770
8771 /*
8772 * Only update the MSS if the new one is
8773 * smaller than the previous one. This is
8774 * to avoid problems when getting multiple
8775 * ICMP errors for the same MTU.
8776 */
8777 if (new_mss >= tcp->tcp_mss)
8778 break;
8779
8780 /*
8781 * Stop doing PMTU if new_mss is less than 68
8782 * or less than tcp_mss_min.
8783 * The value 68 comes from rfc 1191.
8784 */
8785 if (new_mss < MAX(68, tcps->tcps_mss_min))
8786 tcp->tcp_ipha->ipha_fragment_offset_and_flags =
8787 0;
8788
8789 ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8790 ASSERT(ratio >= 1);
8791 tcp_mss_set(tcp, new_mss, B_TRUE);
8792
8793 /*
8794 * Make sure we have something to
8795 * send.
8796 */
8797 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8798 (tcp->tcp_xmit_head != NULL)) {
8799 /*
8800 * Shrink tcp_cwnd in
8801 * proportion to the old MSS/new MSS.
8802 */
8803 tcp->tcp_cwnd = ratio * tcp->tcp_mss;
8804 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
8805 (tcp->tcp_unsent == 0)) {
8806 tcp->tcp_rexmit_max = tcp->tcp_fss;
8807 } else {
8808 tcp->tcp_rexmit_max = tcp->tcp_snxt;
8809 }
8810 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
8811 tcp->tcp_rexmit = B_TRUE;
8812 tcp->tcp_dupack_cnt = 0;
8813 tcp->tcp_snd_burst = TCP_CWND_SS;
8814 tcp_ss_rexmit(tcp);
8815 }
8816 break;
8817 case ICMP_PORT_UNREACHABLE:
8818 case ICMP_PROTOCOL_UNREACHABLE:
8819 switch (tcp->tcp_state) {
8820 case TCPS_SYN_SENT:
8821 case TCPS_SYN_RCVD:
8822 /*
8823 * ICMP can snipe away incipient
8824 * TCP connections as long as
8825 * seq number is same as initial
8826 * send seq number.
8827 */
8828 if (seg_seq == tcp->tcp_iss) {
8829 (void) tcp_clean_death(tcp,
8830 ECONNREFUSED, 6);
8831 }
8832 break;
8833 }
8834 break;
8835 case ICMP_HOST_UNREACHABLE:
8836 case ICMP_NET_UNREACHABLE:
8837 /* Record the error in case we finally time out. */
8838 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
8839 tcp->tcp_client_errno = EHOSTUNREACH;
8840 else
8841 tcp->tcp_client_errno = ENETUNREACH;
8842 if (tcp->tcp_state == TCPS_SYN_RCVD) {
8843 if (tcp->tcp_listener != NULL &&
8844 tcp->tcp_listener->tcp_syn_defense) {
8845 /*
8846 * Ditch the half-open connection if we
8847 * suspect a SYN attack is under way.
8848 */
8849 tcp_ip_ire_mark_advice(tcp);
8850 (void) tcp_clean_death(tcp,
8851 tcp->tcp_client_errno, 7);
8852 }
8853 }
8854 break;
8855 default:
8856 break;
8857 }
8858 break;
8859 case ICMP_SOURCE_QUENCH: {
8860 /*
8861 * use a global boolean to control
8862 * whether TCP should respond to ICMP_SOURCE_QUENCH.
8863 * The default is false.
8864 */
8865 if (tcp_icmp_source_quench) {
8866 /*
8867 * Reduce the sending rate as if we got a
8868 * retransmit timeout
8869 */
8870 uint32_t npkt;
8871
8872 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
8873 tcp->tcp_mss;
8874 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
8875 tcp->tcp_cwnd = tcp->tcp_mss;
8876 tcp->tcp_cwnd_cnt = 0;
8877 }
8878 break;
8879 }
8880 }
8881 freemsg(first_mp);
8882 }
8883
8884 /*
8885 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6
8886 * error messages passed up by IP.
8887 * Assumes that IP has pulled up all the extension headers as well
8888 * as the ICMPv6 header.
8889 */
8890 static void
8891 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl)
8892 {
8893 icmp6_t *icmp6;
8894 ip6_t *ip6h;
8895 uint16_t iph_hdr_length;
8896 tcpha_t *tcpha;
8897 uint8_t *nexthdrp;
8898 uint32_t new_mss;
8899 uint32_t ratio;
8900 boolean_t secure;
8901 mblk_t *first_mp = mp;
8902 size_t mp_size;
8903 uint32_t seg_seq;
8904 tcp_stack_t *tcps = tcp->tcp_tcps;
8905
8906 /*
8907 * The caller has determined if this is an IPSEC_IN packet and
8908 * set ipsec_mctl appropriately (see tcp_icmp_error).
8909 */
8910 if (ipsec_mctl)
8911 mp = mp->b_cont;
8912
8913 mp_size = MBLKL(mp);
8914
8915 /*
8916 * Verify that we have a complete IP header. If not, send it upstream.
8917 */
8918 if (mp_size < sizeof (ip6_t)) {
8919 noticmpv6:
8920 freemsg(first_mp);
8921 return;
8922 }
8923
8924 /*
8925 * Verify this is an ICMPV6 packet, else send it upstream.
8926 */
8927 ip6h = (ip6_t *)mp->b_rptr;
8928 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8929 iph_hdr_length = IPV6_HDR_LEN;
8930 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
8931 &nexthdrp) ||
8932 *nexthdrp != IPPROTO_ICMPV6) {
8933 goto noticmpv6;
8934 }
8935 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
8936 ip6h = (ip6_t *)&icmp6[1];
8937 /*
8938 * Verify if we have a complete ICMP and inner IP header.
8939 */
8940 if ((uchar_t *)&ip6h[1] > mp->b_wptr)
8941 goto noticmpv6;
8942
8943 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
8944 goto noticmpv6;
8945 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
8946 /*
8947 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
8948 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the
8949 * packet.
8950 */
8951 if ((*nexthdrp != IPPROTO_TCP) ||
8952 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
8953 goto noticmpv6;
8954 }
8955
8956 /*
8957 * ICMP errors come on the right queue or come on
8958 * listener/global queue for detached connections and
8959 * get switched to the right queue. If it comes on the
8960 * right queue, policy check has already been done by IP
8961 * and thus free the first_mp without verifying the policy.
8962 * If it has come for a non-hard bound connection, we need
8963 * to verify policy as IP may not have done it.
8964 */
8965 if (!tcp->tcp_hard_bound) {
8966 if (ipsec_mctl) {
8967 secure = ipsec_in_is_secure(first_mp);
8968 } else {
8969 secure = B_FALSE;
8970 }
8971 if (secure) {
8972 /*
8973 * If we are willing to accept this in clear
8974 * we don't have to verify policy.
8975 */
8976 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) {
8977 if (!tcp_check_policy(tcp, first_mp,
8978 NULL, ip6h, secure, ipsec_mctl)) {
8979 /*
8980 * tcp_check_policy called
8981 * ip_drop_packet() on failure.
8982 */
8983 return;
8984 }
8985 }
8986 }
8987 } else if (ipsec_mctl) {
8988 /*
8989 * This is a hard_bound connection. IP has already
8990 * verified policy. We don't have to do it again.
8991 */
8992 freeb(first_mp);
8993 first_mp = mp;
8994 ipsec_mctl = B_FALSE;
8995 }
8996
8997 seg_seq = ntohl(tcpha->tha_seq);
8998 /*
8999 * TCP SHOULD check that the TCP sequence number contained in
9000 * payload of the ICMP error message is within the range
9001 * SND.UNA <= SEG.SEQ < SND.NXT.
9002 */
9003 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
9004 /*
9005 * If the ICMP message is bogus, should we kill the
9006 * connection, or should we just drop the bogus ICMP
9007 * message? It would probably make more sense to just
9008 * drop the message so that if this one managed to get
9009 * in, the real connection should not suffer.
9010 */
9011 goto noticmpv6;
9012 }
9013
9014 switch (icmp6->icmp6_type) {
9015 case ICMP6_PACKET_TOO_BIG:
9016 /*
9017 * Reduce the MSS based on the new MTU. This will
9018 * eliminate any fragmentation locally.
9019 * N.B. There may well be some funny side-effects on
9020 * the local send policy and the remote receive policy.
9021 * Pending further research, we provide
9022 * tcp_ignore_path_mtu just in case this proves
9023 * disastrous somewhere.
9024 *
9025 * After updating the MSS, retransmit part of the
9026 * dropped segment using the new mss by calling
9027 * tcp_wput_data(). Need to adjust all those
9028 * params to make sure tcp_wput_data() work properly.
9029 */
9030 if (tcps->tcps_ignore_path_mtu)
9031 break;
9032
9033 /*
9034 * Decrease the MSS by time stamp options
9035 * IP options and IPSEC options. tcp_hdr_len
9036 * includes time stamp option and IP option
9037 * length.
9038 */
9039 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len -
9040 tcp->tcp_ipsec_overhead;
9041
9042 /*
9043 * Only update the MSS if the new one is
9044 * smaller than the previous one. This is
9045 * to avoid problems when getting multiple
9046 * ICMP errors for the same MTU.
9047 */
9048 if (new_mss >= tcp->tcp_mss)
9049 break;
9050
9051 ratio = tcp->tcp_cwnd / tcp->tcp_mss;
9052 ASSERT(ratio >= 1);
9053 tcp_mss_set(tcp, new_mss, B_TRUE);
9054
9055 /*
9056 * Make sure we have something to
9057 * send.
9058 */
9059 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
9060 (tcp->tcp_xmit_head != NULL)) {
9061 /*
9062 * Shrink tcp_cwnd in
9063 * proportion to the old MSS/new MSS.
9064 */
9065 tcp->tcp_cwnd = ratio * tcp->tcp_mss;
9066 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
9067 (tcp->tcp_unsent == 0)) {
9068 tcp->tcp_rexmit_max = tcp->tcp_fss;
9069 } else {
9070 tcp->tcp_rexmit_max = tcp->tcp_snxt;
9071 }
9072 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
9073 tcp->tcp_rexmit = B_TRUE;
9074 tcp->tcp_dupack_cnt = 0;
9075 tcp->tcp_snd_burst = TCP_CWND_SS;
9076 tcp_ss_rexmit(tcp);
9077 }
9078 break;
9079
9080 case ICMP6_DST_UNREACH:
9081 switch (icmp6->icmp6_code) {
9082 case ICMP6_DST_UNREACH_NOPORT:
9083 if (((tcp->tcp_state == TCPS_SYN_SENT) ||
9084 (tcp->tcp_state == TCPS_SYN_RCVD)) &&
9085 (seg_seq == tcp->tcp_iss)) {
9086 (void) tcp_clean_death(tcp,
9087 ECONNREFUSED, 8);
9088 }
9089 break;
9090
9091 case ICMP6_DST_UNREACH_ADMIN:
9092 case ICMP6_DST_UNREACH_NOROUTE:
9093 case ICMP6_DST_UNREACH_BEYONDSCOPE:
9094 case ICMP6_DST_UNREACH_ADDR:
9095 /* Record the error in case we finally time out. */
9096 tcp->tcp_client_errno = EHOSTUNREACH;
9097 if (((tcp->tcp_state == TCPS_SYN_SENT) ||
9098 (tcp->tcp_state == TCPS_SYN_RCVD)) &&
9099 (seg_seq == tcp->tcp_iss)) {
9100 if (tcp->tcp_listener != NULL &&
9101 tcp->tcp_listener->tcp_syn_defense) {
9102 /*
9103 * Ditch the half-open connection if we
9104 * suspect a SYN attack is under way.
9105 */
9106 tcp_ip_ire_mark_advice(tcp);
9107 (void) tcp_clean_death(tcp,
9108 tcp->tcp_client_errno, 9);
9109 }
9110 }
9111
9112
9113 break;
9114 default:
9115 break;
9116 }
9117 break;
9118
9119 case ICMP6_PARAM_PROB:
9120 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
9121 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
9122 (uchar_t *)ip6h + icmp6->icmp6_pptr ==
9123 (uchar_t *)nexthdrp) {
9124 if (tcp->tcp_state == TCPS_SYN_SENT ||
9125 tcp->tcp_state == TCPS_SYN_RCVD) {
9126 (void) tcp_clean_death(tcp,
9127 ECONNREFUSED, 10);
9128 }
9129 break;
9130 }
9131 break;
9132
9133 case ICMP6_TIME_EXCEEDED:
9134 default:
9135 break;
9136 }
9137 freemsg(first_mp);
9138 }
9139
9140 /*
9141 * IP recognizes seven kinds of bind requests:
9142 *
9143 * - A zero-length address binds only to the protocol number.
9144 *
9145 * - A 4-byte address is treated as a request to
9146 * validate that the address is a valid local IPv4
9147 * address, appropriate for an application to bind to.
9148 * IP does the verification, but does not make any note
9149 * of the address at this time.
9150 *
9151 * - A 16-byte address contains is treated as a request
9152 * to validate a local IPv6 address, as the 4-byte
9153 * address case above.
9154 *
9155 * - A 16-byte sockaddr_in to validate the local IPv4 address and also
9156 * use it for the inbound fanout of packets.
9157 *
9158 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
9159 * use it for the inbound fanout of packets.
9160 *
9161 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
9162 * information consisting of local and remote addresses
9163 * and ports. In this case, the addresses are both
9164 * validated as appropriate for this operation, and, if
9165 * so, the information is retained for use in the
9166 * inbound fanout.
9167 *
9168 * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
9169 * fanout information, like the 12-byte case above.
9170 *
9171 * IP will also fill in the IRE request mblk with information
9172 * regarding our peer. In all cases, we notify IP of our protocol
9173 * type by appending a single protocol byte to the bind request.
9174 */
9175 static mblk_t *
9176 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length)
9177 {
9178 char *cp;
9179 mblk_t *mp;
9180 struct T_bind_req *tbr;
9181 ipa_conn_t *ac;
9182 ipa6_conn_t *ac6;
9183 sin_t *sin;
9184 sin6_t *sin6;
9185
9186 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
9187 ASSERT((tcp->tcp_family == AF_INET &&
9188 tcp->tcp_ipversion == IPV4_VERSION) ||
9189 (tcp->tcp_family == AF_INET6 &&
9190 (tcp->tcp_ipversion == IPV4_VERSION ||
9191 tcp->tcp_ipversion == IPV6_VERSION)));
9192
9193 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
9194 if (!mp)
9195 return (mp);
9196 mp->b_datap->db_type = M_PROTO;
9197 tbr = (struct T_bind_req *)mp->b_rptr;
9198 tbr->PRIM_type = bind_prim;
9199 tbr->ADDR_offset = sizeof (*tbr);
9200 tbr->CONIND_number = 0;
9201 tbr->ADDR_length = addr_length;
9202 cp = (char *)&tbr[1];
9203 switch (addr_length) {
9204 case sizeof (ipa_conn_t):
9205 ASSERT(tcp->tcp_family == AF_INET);
9206 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
9207
9208 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
9209 if (mp->b_cont == NULL) {
9210 freemsg(mp);
9211 return (NULL);
9212 }
9213 mp->b_cont->b_wptr += sizeof (ire_t);
9214 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
9215
9216 /* cp known to be 32 bit aligned */
9217 ac = (ipa_conn_t *)cp;
9218 ac->ac_laddr = tcp->tcp_ipha->ipha_src;
9219 ac->ac_faddr = tcp->tcp_remote;
9220 ac->ac_fport = tcp->tcp_fport;
9221 ac->ac_lport = tcp->tcp_lport;
9222 tcp->tcp_hard_binding = 1;
9223 break;
9224
9225 case sizeof (ipa6_conn_t):
9226 ASSERT(tcp->tcp_family == AF_INET6);
9227
9228 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
9229 if (mp->b_cont == NULL) {
9230 freemsg(mp);
9231 return (NULL);
9232 }
9233 mp->b_cont->b_wptr += sizeof (ire_t);
9234 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
9235
9236 /* cp known to be 32 bit aligned */
9237 ac6 = (ipa6_conn_t *)cp;
9238 if (tcp->tcp_ipversion == IPV4_VERSION) {
9239 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
9240 &ac6->ac6_laddr);
9241 } else {
9242 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src;
9243 }
9244 ac6->ac6_faddr = tcp->tcp_remote_v6;
9245 ac6->ac6_fport = tcp->tcp_fport;
9246 ac6->ac6_lport = tcp->tcp_lport;
9247 tcp->tcp_hard_binding = 1;
9248 break;
9249
9250 case sizeof (sin_t):
9251 /*
9252 * NOTE: IPV6_ADDR_LEN also has same size.
9253 * Use family to discriminate.
9254 */
9255 if (tcp->tcp_family == AF_INET) {
9256 sin = (sin_t *)cp;
9257
9258 *sin = sin_null;
9259 sin->sin_family = AF_INET;
9260 sin->sin_addr.s_addr = tcp->tcp_bound_source;
9261 sin->sin_port = tcp->tcp_lport;
9262 break;
9263 } else {
9264 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6;
9265 }
9266 break;
9267
9268 case sizeof (sin6_t):
9269 ASSERT(tcp->tcp_family == AF_INET6);
9270 sin6 = (sin6_t *)cp;
9271
9272 *sin6 = sin6_null;
9273 sin6->sin6_family = AF_INET6;
9274 sin6->sin6_addr = tcp->tcp_bound_source_v6;
9275 sin6->sin6_port = tcp->tcp_lport;
9276 break;
9277
9278 case IP_ADDR_LEN:
9279 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
9280 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src;
9281 break;
9282
9283 }
9284 /* Add protocol number to end */
9285 cp[addr_length] = (char)IPPROTO_TCP;
9286 mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
9287 return (mp);
9288 }
9289
9290 /*
9291 * Notify IP that we are having trouble with this connection. IP should
9292 * blow the IRE away and start over.
9293 */
9294 static void
9295 tcp_ip_notify(tcp_t *tcp)
9296 {
9297 struct iocblk *iocp;
9298 ipid_t *ipid;
9299 mblk_t *mp;
9300
9301 /* IPv6 has NUD thus notification to delete the IRE is not needed */
9302 if (tcp->tcp_ipversion == IPV6_VERSION)
9303 return;
9304
9305 mp = mkiocb(IP_IOCTL);
9306 if (mp == NULL)
9307 return;
9308
9309 iocp = (struct iocblk *)mp->b_rptr;
9310 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst);
9311
9312 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI);
9313 if (!mp->b_cont) {
9314 freeb(mp);
9315 return;
9316 }
9317
9318 ipid = (ipid_t *)mp->b_cont->b_rptr;
9319 mp->b_cont->b_wptr += iocp->ioc_count;
9320 bzero(ipid, sizeof (*ipid));
9321 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY;
9322 ipid->ipid_ire_type = IRE_CACHE;
9323 ipid->ipid_addr_offset = sizeof (ipid_t);
9324 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst);
9325 /*
9326 * Note: in the case of source routing we want to blow away the
9327 * route to the first source route hop.
9328 */
9329 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1],
9330 sizeof (tcp->tcp_ipha->ipha_dst));
9331
9332 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
9333 }
9334
9335 /* Unlink and return any mblk that looks like it contains an ire */
9336 static mblk_t *
9337 tcp_ire_mp(mblk_t *mp)
9338 {
9339 mblk_t *prev_mp;
9340
9341 for (;;) {
9342 prev_mp = mp;
9343 mp = mp->b_cont;
9344 if (mp == NULL)
9345 break;
9346 switch (DB_TYPE(mp)) {
9347 case IRE_DB_TYPE:
9348 case IRE_DB_REQ_TYPE:
9349 if (prev_mp != NULL)
9350 prev_mp->b_cont = mp->b_cont;
9351 mp->b_cont = NULL;
9352 return (mp);
9353 default:
9354 break;
9355 }
9356 }
9357 return (mp);
9358 }
9359
9360 /*
9361 * Timer callback routine for keepalive probe. We do a fake resend of
9362 * last ACKed byte. Then set a timer using RTO. When the timer expires,
9363 * check to see if we have heard anything from the other end for the last
9364 * RTO period. If we have, set the timer to expire for another
9365 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
9366 * RTO << 1 and check again when it expires. Keep exponentially increasing
9367 * the timeout if we have not heard from the other side. If for more than
9368 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
9369 * kill the connection unless the keepalive abort threshold is 0. In
9370 * that case, we will probe "forever."
9371 */
9372 static void
9373 tcp_keepalive_killer(void *arg)
9374 {
9375 mblk_t *mp;
9376 conn_t *connp = (conn_t *)arg;
9377 tcp_t *tcp = connp->conn_tcp;
9378 int32_t firetime;
9379 int32_t idletime;
9380 int32_t ka_intrvl;
9381 tcp_stack_t *tcps = tcp->tcp_tcps;
9382
9383 tcp->tcp_ka_tid = 0;
9384
9385 if (tcp->tcp_fused)
9386 return;
9387
9388 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive);
9389 ka_intrvl = tcp->tcp_ka_interval;
9390
9391 /*
9392 * Keepalive probe should only be sent if the application has not
9393 * done a close on the connection.
9394 */
9395 if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
9396 return;
9397 }
9398 /* Timer fired too early, restart it. */
9399 if (tcp->tcp_state < TCPS_ESTABLISHED) {
9400 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9401 MSEC_TO_TICK(ka_intrvl));
9402 return;
9403 }
9404
9405 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time);
9406 /*
9407 * If we have not heard from the other side for a long
9408 * time, kill the connection unless the keepalive abort
9409 * threshold is 0. In that case, we will probe "forever."
9410 */
9411 if (tcp->tcp_ka_abort_thres != 0 &&
9412 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
9413 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop);
9414 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
9415 tcp->tcp_client_errno : ETIMEDOUT, 11);
9416 return;
9417 }
9418
9419 if (tcp->tcp_snxt == tcp->tcp_suna &&
9420 idletime >= ka_intrvl) {
9421 /* Fake resend of last ACKed byte. */
9422 mblk_t *mp1 = allocb(1, BPRI_LO);
9423
9424 if (mp1 != NULL) {
9425 *mp1->b_wptr++ = '\0';
9426 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
9427 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
9428 freeb(mp1);
9429 /*
9430 * if allocation failed, fall through to start the
9431 * timer back.
9432 */
9433 if (mp != NULL) {
9434 TCP_RECORD_TRACE(tcp, mp,
9435 TCP_TRACE_SEND_PKT);
9436 tcp_send_data(tcp, tcp->tcp_wq, mp);
9437 BUMP_MIB(&tcps->tcps_mib,
9438 tcpTimKeepaliveProbe);
9439 if (tcp->tcp_ka_last_intrvl != 0) {
9440 int max;
9441 /*
9442 * We should probe again at least
9443 * in ka_intrvl, but not more than
9444 * tcp_rexmit_interval_max.
9445 */
9446 max = tcps->tcps_rexmit_interval_max;
9447 firetime = MIN(ka_intrvl - 1,
9448 tcp->tcp_ka_last_intrvl << 1);
9449 if (firetime > max)
9450 firetime = max;
9451 } else {
9452 firetime = tcp->tcp_rto;
9453 }
9454 tcp->tcp_ka_tid = TCP_TIMER(tcp,
9455 tcp_keepalive_killer,
9456 MSEC_TO_TICK(firetime));
9457 tcp->tcp_ka_last_intrvl = firetime;
9458 return;
9459 }
9460 }
9461 } else {
9462 tcp->tcp_ka_last_intrvl = 0;
9463 }
9464
9465 /* firetime can be negative if (mp1 == NULL || mp == NULL) */
9466 if ((firetime = ka_intrvl - idletime) < 0) {
9467 firetime = ka_intrvl;
9468 }
9469 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9470 MSEC_TO_TICK(firetime));
9471 }
9472
9473 int
9474 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
9475 {
9476 queue_t *q = tcp->tcp_rq;
9477 int32_t mss = tcp->tcp_mss;
9478 int maxpsz;
9479
9480 if (TCP_IS_DETACHED(tcp))
9481 return (mss);
9482
9483 if (tcp->tcp_fused) {
9484 maxpsz = tcp_fuse_maxpsz_set(tcp);
9485 mss = INFPSZ;
9486 } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) {
9487 /*
9488 * Set the sd_qn_maxpsz according to the socket send buffer
9489 * size, and sd_maxblk to INFPSZ (-1). This will essentially
9490 * instruct the stream head to copyin user data into contiguous
9491 * kernel-allocated buffers without breaking it up into smaller
9492 * chunks. We round up the buffer size to the nearest SMSS.
9493 */
9494 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss);
9495 if (tcp->tcp_kssl_ctx == NULL)
9496 mss = INFPSZ;
9497 else
9498 mss = SSL3_MAX_RECORD_LEN;
9499 } else {
9500 /*
9501 * Set sd_qn_maxpsz to approx half the (receivers) buffer
9502 * (and a multiple of the mss). This instructs the stream
9503 * head to break down larger than SMSS writes into SMSS-
9504 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
9505 */
9506 maxpsz = tcp->tcp_maxpsz * mss;
9507 if (maxpsz > tcp->tcp_xmit_hiwater/2) {
9508 maxpsz = tcp->tcp_xmit_hiwater/2;
9509 /* Round up to nearest mss */
9510 maxpsz = MSS_ROUNDUP(maxpsz, mss);
9511 }
9512 }
9513 (void) setmaxps(q, maxpsz);
9514 tcp->tcp_wq->q_maxpsz = maxpsz;
9515
9516 if (set_maxblk)
9517 (void) mi_set_sth_maxblk(q, mss);
9518
9519 return (mss);
9520 }
9521
9522 /*
9523 * Extract option values from a tcp header. We put any found values into the
9524 * tcpopt struct and return a bitmask saying which options were found.
9525 */
9526 static int
9527 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
9528 {
9529 uchar_t *endp;
9530 int len;
9531 uint32_t mss;
9532 uchar_t *up = (uchar_t *)tcph;
9533 int found = 0;
9534 int32_t sack_len;
9535 tcp_seq sack_begin, sack_end;
9536 tcp_t *tcp;
9537
9538 endp = up + TCP_HDR_LENGTH(tcph);
9539 up += TCP_MIN_HEADER_LENGTH;
9540 while (up < endp) {
9541 len = endp - up;
9542 switch (*up) {
9543 case TCPOPT_EOL:
9544 break;
9545
9546 case TCPOPT_NOP:
9547 up++;
9548 continue;
9549
9550 case TCPOPT_MAXSEG:
9551 if (len < TCPOPT_MAXSEG_LEN ||
9552 up[1] != TCPOPT_MAXSEG_LEN)
9553 break;
9554
9555 mss = BE16_TO_U16(up+2);
9556 /* Caller must handle tcp_mss_min and tcp_mss_max_* */
9557 tcpopt->tcp_opt_mss = mss;
9558 found |= TCP_OPT_MSS_PRESENT;
9559
9560 up += TCPOPT_MAXSEG_LEN;
9561 continue;
9562
9563 case TCPOPT_WSCALE:
9564 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
9565 break;
9566
9567 if (up[2] > TCP_MAX_WINSHIFT)
9568 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
9569 else
9570 tcpopt->tcp_opt_wscale = up[2];
9571 found |= TCP_OPT_WSCALE_PRESENT;
9572
9573 up += TCPOPT_WS_LEN;
9574 continue;
9575
9576 case TCPOPT_SACK_PERMITTED:
9577 if (len < TCPOPT_SACK_OK_LEN ||
9578 up[1] != TCPOPT_SACK_OK_LEN)
9579 break;
9580 found |= TCP_OPT_SACK_OK_PRESENT;
9581 up += TCPOPT_SACK_OK_LEN;
9582 continue;
9583
9584 case TCPOPT_SACK:
9585 if (len <= 2 || up[1] <= 2 || len < up[1])
9586 break;
9587
9588 /* If TCP is not interested in SACK blks... */
9589 if ((tcp = tcpopt->tcp) == NULL) {
9590 up += up[1];
9591 continue;
9592 }
9593 sack_len = up[1] - TCPOPT_HEADER_LEN;
9594 up += TCPOPT_HEADER_LEN;
9595
9596 /*
9597 * If the list is empty, allocate one and assume
9598 * nothing is sack'ed.
9599 */
9600 ASSERT(tcp->tcp_sack_info != NULL);
9601 if (tcp->tcp_notsack_list == NULL) {
9602 tcp_notsack_update(&(tcp->tcp_notsack_list),
9603 tcp->tcp_suna, tcp->tcp_snxt,
9604 &(tcp->tcp_num_notsack_blk),
9605 &(tcp->tcp_cnt_notsack_list));
9606
9607 /*
9608 * Make sure tcp_notsack_list is not NULL.
9609 * This happens when kmem_alloc(KM_NOSLEEP)
9610 * returns NULL.
9611 */
9612 if (tcp->tcp_notsack_list == NULL) {
9613 up += sack_len;
9614 continue;
9615 }
9616 tcp->tcp_fack = tcp->tcp_suna;
9617 }
9618
9619 while (sack_len > 0) {
9620 if (up + 8 > endp) {
9621 up = endp;
9622 break;
9623 }
9624 sack_begin = BE32_TO_U32(up);
9625 up += 4;
9626 sack_end = BE32_TO_U32(up);
9627 up += 4;
9628 sack_len -= 8;
9629 /*
9630 * Bounds checking. Make sure the SACK
9631 * info is within tcp_suna and tcp_snxt.
9632 * If this SACK blk is out of bound, ignore
9633 * it but continue to parse the following
9634 * blks.
9635 */
9636 if (SEQ_LEQ(sack_end, sack_begin) ||
9637 SEQ_LT(sack_begin, tcp->tcp_suna) ||
9638 SEQ_GT(sack_end, tcp->tcp_snxt)) {
9639 continue;
9640 }
9641 tcp_notsack_insert(&(tcp->tcp_notsack_list),
9642 sack_begin, sack_end,
9643 &(tcp->tcp_num_notsack_blk),
9644 &(tcp->tcp_cnt_notsack_list));
9645 if (SEQ_GT(sack_end, tcp->tcp_fack)) {
9646 tcp->tcp_fack = sack_end;
9647 }
9648 }
9649 found |= TCP_OPT_SACK_PRESENT;
9650 continue;
9651
9652 case TCPOPT_TSTAMP:
9653 if (len < TCPOPT_TSTAMP_LEN ||
9654 up[1] != TCPOPT_TSTAMP_LEN)
9655 break;
9656
9657 tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2);
9658 tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6);
9659
9660 found |= TCP_OPT_TSTAMP_PRESENT;
9661
9662 up += TCPOPT_TSTAMP_LEN;
9663 continue;
9664
9665 default:
9666 if (len <= 1 || len < (int)up[1] || up[1] == 0)
9667 break;
9668 up += up[1];
9669 continue;
9670 }
9671 break;
9672 }
9673 return (found);
9674 }
9675
9676 /*
9677 * Set the mss associated with a particular tcp based on its current value,
9678 * and a new one passed in. Observe minimums and maximums, and reset
9679 * other state variables that we want to view as multiples of mss.
9680 *
9681 * This function is called mainly because values like tcp_mss, tcp_cwnd,
9682 * highwater marks etc. need to be initialized or adjusted.
9683 * 1) From tcp_process_options() when the other side's SYN/SYN-ACK
9684 * packet arrives.
9685 * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or
9686 * ICMP6_PACKET_TOO_BIG arrives.
9687 * 3) From tcp_paws_check() if the other side stops sending the timestamp,
9688 * to increase the MSS to use the extra bytes available.
9689 *
9690 * Callers except tcp_paws_check() ensure that they only reduce mss.
9691 */
9692 static void
9693 tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss)
9694 {
9695 uint32_t mss_max;
9696 tcp_stack_t *tcps = tcp->tcp_tcps;
9697
9698 if (tcp->tcp_ipversion == IPV4_VERSION)
9699 mss_max = tcps->tcps_mss_max_ipv4;
9700 else
9701 mss_max = tcps->tcps_mss_max_ipv6;
9702
9703 if (mss < tcps->tcps_mss_min)
9704 mss = tcps->tcps_mss_min;
9705 if (mss > mss_max)
9706 mss = mss_max;
9707 /*
9708 * Unless naglim has been set by our client to
9709 * a non-mss value, force naglim to track mss.
9710 * This can help to aggregate small writes.
9711 */
9712 if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim)
9713 tcp->tcp_naglim = mss;
9714 /*
9715 * TCP should be able to buffer at least 4 MSS data for obvious
9716 * performance reason.
9717 */
9718 if ((mss << 2) > tcp->tcp_xmit_hiwater)
9719 tcp->tcp_xmit_hiwater = mss << 2;
9720
9721 if (do_ss) {
9722 /*
9723 * Either the tcp_cwnd is as yet uninitialized, or mss is
9724 * changing due to a reduction in MTU, presumably as a
9725 * result of a new path component, reset cwnd to its
9726 * "initial" value, as a multiple of the new mss.
9727 */
9728 SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial);
9729 } else {
9730 /*
9731 * Called by tcp_paws_check(), the mss increased
9732 * marginally to allow use of space previously taken
9733 * by the timestamp option. It would be inappropriate
9734 * to apply slow start or tcp_init_cwnd values to
9735 * tcp_cwnd, simply adjust to a multiple of the new mss.
9736 */
9737 tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss;
9738 tcp->tcp_cwnd_cnt = 0;
9739 }
9740 tcp->tcp_mss = mss;
9741 (void) tcp_maxpsz_set(tcp, B_TRUE);
9742 }
9743
9744 /* For /dev/tcp aka AF_INET open */
9745 static int
9746 tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
9747 {
9748 return (tcp_open(q, devp, flag, sflag, credp, B_FALSE));
9749 }
9750
9751 /* For /dev/tcp6 aka AF_INET6 open */
9752 static int
9753 tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
9754 {
9755 return (tcp_open(q, devp, flag, sflag, credp, B_TRUE));
9756 }
9757
9758 static int
9759 tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp,
9760 boolean_t isv6)
9761 {
9762 tcp_t *tcp = NULL;
9763 conn_t *connp;
9764 int err;
9765 vmem_t *minor_arena = NULL;
9766 dev_t conn_dev;
9767 zoneid_t zoneid;
9768 tcp_stack_t *tcps = NULL;
9769
9770 if (q->q_ptr != NULL)
9771 return (0);
9772
9773 if (sflag == MODOPEN)
9774 return (EINVAL);
9775
9776 if (!(flag & SO_ACCEPTOR)) {
9777 /*
9778 * Special case for install: miniroot needs to be able to
9779 * access files via NFS as though it were always in the
9780 * global zone.
9781 */
9782 if (credp == kcred && nfs_global_client_only != 0) {
9783 zoneid = GLOBAL_ZONEID;
9784 tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->
9785 netstack_tcp;
9786 ASSERT(tcps != NULL);
9787 } else {
9788 netstack_t *ns;
9789
9790 ns = netstack_find_by_cred(credp);
9791 ASSERT(ns != NULL);
9792 tcps = ns->netstack_tcp;
9793 ASSERT(tcps != NULL);
9794
9795 /*
9796 * For exclusive stacks we set the zoneid to zero
9797 * to make TCP operate as if in the global zone.
9798 */
9799 if (tcps->tcps_netstack->netstack_stackid !=
9800 GLOBAL_NETSTACKID)
9801 zoneid = GLOBAL_ZONEID;
9802 else
9803 zoneid = crgetzoneid(credp);
9804 }
9805 /*
9806 * For stackid zero this is done from strplumb.c, but
9807 * non-zero stackids are handled here.
9808 */
9809 if (tcps->tcps_g_q == NULL &&
9810 tcps->tcps_netstack->netstack_stackid !=
9811 GLOBAL_NETSTACKID) {
9812 tcp_g_q_setup(tcps);
9813 }
9814 }
9815
9816 if ((ip_minor_arena_la != NULL) && (flag & SO_SOCKSTR) &&
9817 ((conn_dev = inet_minor_alloc(ip_minor_arena_la)) != 0)) {
9818 minor_arena = ip_minor_arena_la;
9819 } else {
9820 /*
9821 * Either minor numbers in the large arena were exhausted
9822 * or a non socket application is doing the open.
9823 * Try to allocate from the small arena.
9824 */
9825 if ((conn_dev = inet_minor_alloc(ip_minor_arena_sa)) == 0) {
9826 if (tcps != NULL)
9827 netstack_rele(tcps->tcps_netstack);
9828 return (EBUSY);
9829 }
9830 minor_arena = ip_minor_arena_sa;
9831 }
9832 ASSERT(minor_arena != NULL);
9833
9834 *devp = makedevice(getemajor(*devp), (minor_t)conn_dev);
9835
9836 if (flag & SO_ACCEPTOR) {
9837 /* No netstack_find_by_cred, hence no netstack_rele needed */
9838 ASSERT(tcps == NULL);
9839 q->q_qinfo = &tcp_acceptor_rinit;
9840 /*
9841 * the conn_dev and minor_arena will be subsequently used by
9842 * tcp_wput_accept() and tcpclose_accept() to figure out the
9843 * minor device number for this connection from the q_ptr.
9844 */
9845 RD(q)->q_ptr = (void *)conn_dev;
9846 WR(q)->q_qinfo = &tcp_acceptor_winit;
9847 WR(q)->q_ptr = (void *)minor_arena;
9848 qprocson(q);
9849 return (0);
9850 }
9851
9852 connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps);
9853 /*
9854 * Both tcp_get_conn and netstack_find_by_cred incremented refcnt,
9855 * so we drop it by one.
9856 */
9857 netstack_rele(tcps->tcps_netstack);
9858 if (connp == NULL) {
9859 inet_minor_free(minor_arena, conn_dev);
9860 q->q_ptr = NULL;
9861 return (ENOSR);
9862 }
9863 connp->conn_sqp = IP_SQUEUE_GET(lbolt);
9864 tcp = connp->conn_tcp;
9865
9866 q->q_ptr = WR(q)->q_ptr = connp;
9867 if (isv6) {
9868 connp->conn_flags |= (IPCL_TCP6|IPCL_ISV6);
9869 connp->conn_send = ip_output_v6;
9870 connp->conn_af_isv6 = B_TRUE;
9871 connp->conn_pkt_isv6 = B_TRUE;
9872 connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT;
9873 tcp->tcp_ipversion = IPV6_VERSION;
9874 tcp->tcp_family = AF_INET6;
9875 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
9876 } else {
9877 connp->conn_flags |= IPCL_TCP4;
9878 connp->conn_send = ip_output;
9879 connp->conn_af_isv6 = B_FALSE;
9880 connp->conn_pkt_isv6 = B_FALSE;
9881 tcp->tcp_ipversion = IPV4_VERSION;
9882 tcp->tcp_family = AF_INET;
9883 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
9884 }
9885
9886 /*
9887 * TCP keeps a copy of cred for cache locality reasons but
9888 * we put a reference only once. If connp->conn_cred
9889 * becomes invalid, tcp_cred should also be set to NULL.
9890 */
9891 tcp->tcp_cred = connp->conn_cred = credp;
9892 crhold(connp->conn_cred);
9893 tcp->tcp_cpid = curproc->p_pid;
9894 tcp->tcp_open_time = lbolt64;
9895 connp->conn_zoneid = zoneid;
9896 connp->conn_mlp_type = mlptSingle;
9897 connp->conn_ulp_labeled = !is_system_labeled();
9898 ASSERT(connp->conn_netstack == tcps->tcps_netstack);
9899 ASSERT(tcp->tcp_tcps == tcps);
9900
9901 /*
9902 * If the caller has the process-wide flag set, then default to MAC
9903 * exempt mode. This allows read-down to unlabeled hosts.
9904 */
9905 if (getpflags(NET_MAC_AWARE, credp) != 0)
9906 connp->conn_mac_exempt = B_TRUE;
9907
9908 connp->conn_dev = conn_dev;
9909 connp->conn_minor_arena = minor_arena;
9910
9911 ASSERT(q->q_qinfo == &tcp_rinitv4 || q->q_qinfo == &tcp_rinitv6);
9912 ASSERT(WR(q)->q_qinfo == &tcp_winit);
9913
9914 if (flag & SO_SOCKSTR) {
9915 /*
9916 * No need to insert a socket in tcp acceptor hash.
9917 * If it was a socket acceptor stream, we dealt with
9918 * it above. A socket listener can never accept a
9919 * connection and doesn't need acceptor_id.
9920 */
9921 connp->conn_flags |= IPCL_SOCKET;
9922 tcp->tcp_issocket = 1;
9923 WR(q)->q_qinfo = &tcp_sock_winit;
9924 } else {
9925 #ifdef _ILP32
9926 tcp->tcp_acceptor_id = (t_uscalar_t)RD(q);
9927 #else
9928 tcp->tcp_acceptor_id = conn_dev;
9929 #endif /* _ILP32 */
9930 tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
9931 }
9932
9933 if (tcps->tcps_trace)
9934 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP);
9935
9936 err = tcp_init(tcp, q);
9937 if (err != 0) {
9938 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
9939 tcp_acceptor_hash_remove(tcp);
9940 CONN_DEC_REF(connp);
9941 q->q_ptr = WR(q)->q_ptr = NULL;
9942 return (err);
9943 }
9944
9945 RD(q)->q_hiwat = tcps->tcps_recv_hiwat;
9946 tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
9947
9948 /* Non-zero default values */
9949 connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP;
9950 /*
9951 * Put the ref for TCP. Ref for IP was already put
9952 * by ipcl_conn_create. Also Make the conn_t globally
9953 * visible to walkers
9954 */
9955 mutex_enter(&connp->conn_lock);
9956 CONN_INC_REF_LOCKED(connp);
9957 ASSERT(connp->conn_ref == 2);
9958 connp->conn_state_flags &= ~CONN_INCIPIENT;
9959 mutex_exit(&connp->conn_lock);
9960
9961 qprocson(q);
9962 return (0);
9963 }
9964
9965 /*
9966 * Some TCP options can be "set" by requesting them in the option
9967 * buffer. This is needed for XTI feature test though we do not
9968 * allow it in general. We interpret that this mechanism is more
9969 * applicable to OSI protocols and need not be allowed in general.
9970 * This routine filters out options for which it is not allowed (most)
9971 * and lets through those (few) for which it is. [ The XTI interface
9972 * test suite specifics will imply that any XTI_GENERIC level XTI_* if
9973 * ever implemented will have to be allowed here ].
9974 */
9975 static boolean_t
9976 tcp_allow_connopt_set(int level, int name)
9977 {
9978
9979 switch (level) {
9980 case IPPROTO_TCP:
9981 switch (name) {
9982 case TCP_NODELAY:
9983 return (B_TRUE);
9984 default:
9985 return (B_FALSE);
9986 }
9987 /*NOTREACHED*/
9988 default:
9989 return (B_FALSE);
9990 }
9991 /*NOTREACHED*/
9992 }
9993
9994 /*
9995 * This routine gets default values of certain options whose default
9996 * values are maintained by protocol specific code
9997 */
9998 /* ARGSUSED */
9999 int
10000 tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
10001 {
10002 int32_t *i1 = (int32_t *)ptr;
10003 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
10004
10005 switch (level) {
10006 case IPPROTO_TCP:
10007 switch (name) {
10008 case TCP_NOTIFY_THRESHOLD:
10009 *i1 = tcps->tcps_ip_notify_interval;
10010 break;
10011 case TCP_ABORT_THRESHOLD:
10012 *i1 = tcps->tcps_ip_abort_interval;
10013 break;
10014 case TCP_CONN_NOTIFY_THRESHOLD:
10015 *i1 = tcps->tcps_ip_notify_cinterval;
10016 break;
10017 case TCP_CONN_ABORT_THRESHOLD:
10018 *i1 = tcps->tcps_ip_abort_cinterval;
10019 break;
10020 default:
10021 return (-1);
10022 }
10023 break;
10024 case IPPROTO_IP:
10025 switch (name) {
10026 case IP_TTL:
10027 *i1 = tcps->tcps_ipv4_ttl;
10028 break;
10029 default:
10030 return (-1);
10031 }
10032 break;
10033 case IPPROTO_IPV6:
10034 switch (name) {
10035 case IPV6_UNICAST_HOPS:
10036 *i1 = tcps->tcps_ipv6_hoplimit;
10037 break;
10038 default:
10039 return (-1);
10040 }
10041 break;
10042 default:
10043 return (-1);
10044 }
10045 return (sizeof (int));
10046 }
10047
10048
10049 /*
10050 * TCP routine to get the values of options.
10051 */
10052 int
10053 tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
10054 {
10055 int *i1 = (int *)ptr;
10056 conn_t *connp = Q_TO_CONN(q);
10057 tcp_t *tcp = connp->conn_tcp;
10058 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
10059
10060 switch (level) {
10061 case SOL_SOCKET:
10062 switch (name) {
10063 case SO_LINGER: {
10064 struct linger *lgr = (struct linger *)ptr;
10065
10066 lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0;
10067 lgr->l_linger = tcp->tcp_lingertime;
10068 }
10069 return (sizeof (struct linger));
10070 case SO_DEBUG:
10071 *i1 = tcp->tcp_debug ? SO_DEBUG : 0;
10072 break;
10073 case SO_KEEPALIVE:
10074 *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0;
10075 break;
10076 case SO_DONTROUTE:
10077 *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0;
10078 break;
10079 case SO_USELOOPBACK:
10080 *i1 = tcp->tcp_useloopback ? SO_USELOOPBACK : 0;
10081 break;
10082 case SO_BROADCAST:
10083 *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0;
10084 break;
10085 case SO_REUSEADDR:
10086 *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0;
10087 break;
10088 case SO_OOBINLINE:
10089 *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0;
10090 break;
10091 case SO_DGRAM_ERRIND:
10092 *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0;
10093 break;
10094 case SO_TYPE:
10095 *i1 = SOCK_STREAM;
10096 break;
10097 case SO_SNDBUF:
10098 *i1 = tcp->tcp_xmit_hiwater;
10099 break;
10100 case SO_RCVBUF:
10101 *i1 = RD(q)->q_hiwat;
10102 break;
10103 case SO_SND_COPYAVOID:
10104 *i1 = tcp->tcp_snd_zcopy_on ?
10105 SO_SND_COPYAVOID : 0;
10106 break;
10107 case SO_ALLZONES:
10108 *i1 = connp->conn_allzones ? 1 : 0;
10109 break;
10110 case SO_ANON_MLP:
10111 *i1 = connp->conn_anon_mlp;
10112 break;
10113 case SO_MAC_EXEMPT:
10114 *i1 = connp->conn_mac_exempt;
10115 break;
10116 case SO_EXCLBIND:
10117 *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0;
10118 break;
10119 case SO_PROTOTYPE:
10120 *i1 = IPPROTO_TCP;
10121 break;
10122 case SO_DOMAIN:
10123 *i1 = tcp->tcp_family;
10124 break;
10125 default:
10126 return (-1);
10127 }
10128 break;
10129 case IPPROTO_TCP:
10130 switch (name) {
10131 case TCP_NODELAY:
10132 *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
10133 break;
10134 case TCP_MAXSEG:
10135 *i1 = tcp->tcp_mss;
10136 break;
10137 case TCP_NOTIFY_THRESHOLD:
10138 *i1 = (int)tcp->tcp_first_timer_threshold;
10139 break;
10140 case TCP_ABORT_THRESHOLD:
10141 *i1 = tcp->tcp_second_timer_threshold;
10142 break;
10143 case TCP_CONN_NOTIFY_THRESHOLD:
10144 *i1 = tcp->tcp_first_ctimer_threshold;
10145 break;
10146 case TCP_CONN_ABORT_THRESHOLD:
10147 *i1 = tcp->tcp_second_ctimer_threshold;
10148 break;
10149 case TCP_RECVDSTADDR:
10150 *i1 = tcp->tcp_recvdstaddr;
10151 break;
10152 case TCP_ANONPRIVBIND:
10153 *i1 = tcp->tcp_anon_priv_bind;
10154 break;
10155 case TCP_EXCLBIND:
10156 *i1 = tcp->tcp_exclbind ? TCP_EXCLBIND : 0;
10157 break;
10158 case TCP_INIT_CWND:
10159 *i1 = tcp->tcp_init_cwnd;
10160 break;
10161 case TCP_KEEPALIVE_THRESHOLD:
10162 *i1 = tcp->tcp_ka_interval;
10163 break;
10164 case TCP_KEEPALIVE_ABORT_THRESHOLD:
10165 *i1 = tcp->tcp_ka_abort_thres;
10166 break;
10167 case TCP_CORK:
10168 *i1 = tcp->tcp_cork;
10169 break;
10170 default:
10171 return (-1);
10172 }
10173 break;
10174 case IPPROTO_IP:
10175 if (tcp->tcp_family != AF_INET)
10176 return (-1);
10177 switch (name) {
10178 case IP_OPTIONS:
10179 case T_IP_OPTIONS: {
10180 /*
10181 * This is compatible with BSD in that in only return
10182 * the reverse source route with the final destination
10183 * as the last entry. The first 4 bytes of the option
10184 * will contain the final destination.
10185 */
10186 int opt_len;
10187
10188 opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha;
10189 opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH;
10190 ASSERT(opt_len >= 0);
10191 /* Caller ensures enough space */
10192 if (opt_len > 0) {
10193 /*
10194 * TODO: Do we have to handle getsockopt on an
10195 * initiator as well?
10196 */
10197 return (ip_opt_get_user(tcp->tcp_ipha, ptr));
10198 }
10199 return (0);
10200 }
10201 case IP_TOS:
10202 case T_IP_TOS:
10203 *i1 = (int)tcp->tcp_ipha->ipha_type_of_service;
10204 break;
10205 case IP_TTL:
10206 *i1 = (int)tcp->tcp_ipha->ipha_ttl;
10207 break;
10208 case IP_NEXTHOP:
10209 /* Handled at IP level */
10210 return (-EINVAL);
10211 default:
10212 return (-1);
10213 }
10214 break;
10215 case IPPROTO_IPV6:
10216 /*
10217 * IPPROTO_IPV6 options are only supported for sockets
10218 * that are using IPv6 on the wire.
10219 */
10220 if (tcp->tcp_ipversion != IPV6_VERSION) {
10221 return (-1);
10222 }
10223 switch (name) {
10224 case IPV6_UNICAST_HOPS:
10225 *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops;
10226 break; /* goto sizeof (int) option return */
10227 case IPV6_BOUND_IF:
10228 /* Zero if not set */
10229 *i1 = tcp->tcp_bound_if;
10230 break; /* goto sizeof (int) option return */
10231 case IPV6_RECVPKTINFO:
10232 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)
10233 *i1 = 1;
10234 else
10235 *i1 = 0;
10236 break; /* goto sizeof (int) option return */
10237 case IPV6_RECVTCLASS:
10238 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)
10239 *i1 = 1;
10240 else
10241 *i1 = 0;
10242 break; /* goto sizeof (int) option return */
10243 case IPV6_RECVHOPLIMIT:
10244 if (tcp->tcp_ipv6_recvancillary &
10245 TCP_IPV6_RECVHOPLIMIT)
10246 *i1 = 1;
10247 else
10248 *i1 = 0;
10249 break; /* goto sizeof (int) option return */
10250 case IPV6_RECVHOPOPTS:
10251 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS)
10252 *i1 = 1;
10253 else
10254 *i1 = 0;
10255 break; /* goto sizeof (int) option return */
10256 case IPV6_RECVDSTOPTS:
10257 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS)
10258 *i1 = 1;
10259 else
10260 *i1 = 0;
10261 break; /* goto sizeof (int) option return */
10262 case _OLD_IPV6_RECVDSTOPTS:
10263 if (tcp->tcp_ipv6_recvancillary &
10264 TCP_OLD_IPV6_RECVDSTOPTS)
10265 *i1 = 1;
10266 else
10267 *i1 = 0;
10268 break; /* goto sizeof (int) option return */
10269 case IPV6_RECVRTHDR:
10270 if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR)
10271 *i1 = 1;
10272 else
10273 *i1 = 0;
10274 break; /* goto sizeof (int) option return */
10275 case IPV6_RECVRTHDRDSTOPTS:
10276 if (tcp->tcp_ipv6_recvancillary &
10277 TCP_IPV6_RECVRTDSTOPTS)
10278 *i1 = 1;
10279 else
10280 *i1 = 0;
10281 break; /* goto sizeof (int) option return */
10282 case IPV6_PKTINFO: {
10283 /* XXX assumes that caller has room for max size! */
10284 struct in6_pktinfo *pkti;
10285
10286 pkti = (struct in6_pktinfo *)ptr;
10287 if (ipp->ipp_fields & IPPF_IFINDEX)
10288 pkti->ipi6_ifindex = ipp->ipp_ifindex;
10289 else
10290 pkti->ipi6_ifindex = 0;
10291 if (ipp->ipp_fields & IPPF_ADDR)
10292 pkti->ipi6_addr = ipp->ipp_addr;
10293 else
10294 pkti->ipi6_addr = ipv6_all_zeros;
10295 return (sizeof (struct in6_pktinfo));
10296 }
10297 case IPV6_TCLASS:
10298 if (ipp->ipp_fields & IPPF_TCLASS)
10299 *i1 = ipp->ipp_tclass;
10300 else
10301 *i1 = IPV6_FLOW_TCLASS(
10302 IPV6_DEFAULT_VERS_AND_FLOW);
10303 break; /* goto sizeof (int) option return */
10304 case IPV6_NEXTHOP: {
10305 sin6_t *sin6 = (sin6_t *)ptr;
10306
10307 if (!(ipp->ipp_fields & IPPF_NEXTHOP))
10308 return (0);
10309 *sin6 = sin6_null;
10310 sin6->sin6_family = AF_INET6;
10311 sin6->sin6_addr = ipp->ipp_nexthop;
10312 return (sizeof (sin6_t));
10313 }
10314 case IPV6_HOPOPTS:
10315 if (!(ipp->ipp_fields & IPPF_HOPOPTS))
10316 return (0);
10317 if (ipp->ipp_hopoptslen <= tcp->tcp_label_len)
10318 return (0);
10319 bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len,
10320 ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len);
10321 if (tcp->tcp_label_len > 0) {
10322 ptr[0] = ((char *)ipp->ipp_hopopts)[0];
10323 ptr[1] = (ipp->ipp_hopoptslen -
10324 tcp->tcp_label_len + 7) / 8 - 1;
10325 }
10326 return (ipp->ipp_hopoptslen - tcp->tcp_label_len);
10327 case IPV6_RTHDRDSTOPTS:
10328 if (!(ipp->ipp_fields & IPPF_RTDSTOPTS))
10329 return (0);
10330 bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen);
10331 return (ipp->ipp_rtdstoptslen);
10332 case IPV6_RTHDR:
10333 if (!(ipp->ipp_fields & IPPF_RTHDR))
10334 return (0);
10335 bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen);
10336 return (ipp->ipp_rthdrlen);
10337 case IPV6_DSTOPTS:
10338 if (!(ipp->ipp_fields & IPPF_DSTOPTS))
10339 return (0);
10340 bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen);
10341 return (ipp->ipp_dstoptslen);
10342 case IPV6_SRC_PREFERENCES:
10343 return (ip6_get_src_preferences(connp,
10344 (uint32_t *)ptr));
10345 case IPV6_PATHMTU: {
10346 struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr;
10347
10348 if (tcp->tcp_state < TCPS_ESTABLISHED)
10349 return (-1);
10350
10351 return (ip_fill_mtuinfo(&connp->conn_remv6,
10352 connp->conn_fport, mtuinfo,
10353 connp->conn_netstack));
10354 }
10355 default:
10356 return (-1);
10357 }
10358 break;
10359 default:
10360 return (-1);
10361 }
10362 return (sizeof (int));
10363 }
10364
10365 /*
10366 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
10367 * Parameters are assumed to be verified by the caller.
10368 */
10369 /* ARGSUSED */
10370 int
10371 tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
10372 uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
10373 void *thisdg_attrs, cred_t *cr, mblk_t *mblk)
10374 {
10375 conn_t *connp = Q_TO_CONN(q);
10376 tcp_t *tcp = connp->conn_tcp;
10377 int *i1 = (int *)invalp;
10378 boolean_t onoff = (*i1 == 0) ? 0 : 1;
10379 boolean_t checkonly;
10380 int reterr;
10381 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
10382
10383 switch (optset_context) {
10384 case SETFN_OPTCOM_CHECKONLY:
10385 checkonly = B_TRUE;
10386 /*
10387 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
10388 * inlen != 0 implies value supplied and
10389 * we have to "pretend" to set it.
10390 * inlen == 0 implies that there is no
10391 * value part in T_CHECK request and just validation
10392 * done elsewhere should be enough, we just return here.
10393 */
10394 if (inlen == 0) {
10395 *outlenp = 0;
10396 return (0);
10397 }
10398 break;
10399 case SETFN_OPTCOM_NEGOTIATE:
10400 checkonly = B_FALSE;
10401 break;
10402 case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
10403 case SETFN_CONN_NEGOTIATE:
10404 checkonly = B_FALSE;
10405 /*
10406 * Negotiating local and "association-related" options
10407 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
10408 * primitives is allowed by XTI, but we choose
10409 * to not implement this style negotiation for Internet
10410 * protocols (We interpret it is a must for OSI world but
10411 * optional for Internet protocols) for all options.
10412 * [ Will do only for the few options that enable test
10413 * suites that our XTI implementation of this feature
10414 * works for transports that do allow it ]
10415 */
10416 if (!tcp_allow_connopt_set(level, name)) {
10417 *outlenp = 0;
10418 return (EINVAL);
10419 }
10420 break;
10421 default:
10422 /*
10423 * We should never get here
10424 */
10425 *outlenp = 0;
10426 return (EINVAL);
10427 }
10428
10429 ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
10430 (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));
10431
10432 /*
10433 * For TCP, we should have no ancillary data sent down
10434 * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
10435 * has to be zero.
10436 */
10437 ASSERT(thisdg_attrs == NULL);
10438
10439 /*
10440 * For fixed length options, no sanity check
10441 * of passed in length is done. It is assumed *_optcom_req()
10442 * routines do the right thing.
10443 */
10444
10445 switch (level) {
10446 case SOL_SOCKET:
10447 switch (name) {
10448 case SO_LINGER: {
10449 struct linger *lgr = (struct linger *)invalp;
10450
10451 if (!checkonly) {
10452 if (lgr->l_onoff) {
10453 tcp->tcp_linger = 1;
10454 tcp->tcp_lingertime = lgr->l_linger;
10455 } else {
10456 tcp->tcp_linger = 0;
10457 tcp->tcp_lingertime = 0;
10458 }
10459 /* struct copy */
10460 *(struct linger *)outvalp = *lgr;
10461 } else {
10462 if (!lgr->l_onoff) {
10463 ((struct linger *)
10464 outvalp)->l_onoff = 0;
10465 ((struct linger *)
10466 outvalp)->l_linger = 0;
10467 } else {
10468 /* struct copy */
10469 *(struct linger *)outvalp = *lgr;
10470 }
10471 }
10472 *outlenp = sizeof (struct linger);
10473 return (0);
10474 }
10475 case SO_DEBUG:
10476 if (!checkonly)
10477 tcp->tcp_debug = onoff;
10478 break;
10479 case SO_KEEPALIVE:
10480 if (checkonly) {
10481 /* T_CHECK case */
10482 break;
10483 }
10484
10485 if (!onoff) {
10486 if (tcp->tcp_ka_enabled) {
10487 if (tcp->tcp_ka_tid != 0) {
10488 (void) TCP_TIMER_CANCEL(tcp,
10489 tcp->tcp_ka_tid);
10490 tcp->tcp_ka_tid = 0;
10491 }
10492 tcp->tcp_ka_enabled = 0;
10493 }
10494 break;
10495 }
10496 if (!tcp->tcp_ka_enabled) {
10497 /* Crank up the keepalive timer */
10498 tcp->tcp_ka_last_intrvl = 0;
10499 tcp->tcp_ka_tid = TCP_TIMER(tcp,
10500 tcp_keepalive_killer,
10501 MSEC_TO_TICK(tcp->tcp_ka_interval));
10502 tcp->tcp_ka_enabled = 1;
10503 }
10504 break;
10505 case SO_DONTROUTE:
10506 /*
10507 * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are
10508 * only of interest to IP. We track them here only so
10509 * that we can report their current value.
10510 */
10511 if (!checkonly) {
10512 tcp->tcp_dontroute = onoff;
10513 tcp->tcp_connp->conn_dontroute = onoff;
10514 }
10515 break;
10516 case SO_USELOOPBACK:
10517 if (!checkonly) {
10518 tcp->tcp_useloopback = onoff;
10519 tcp->tcp_connp->conn_loopback = onoff;
10520 }
10521 break;
10522 case SO_BROADCAST:
10523 if (!checkonly) {
10524 tcp->tcp_broadcast = onoff;
10525 tcp->tcp_connp->conn_broadcast = onoff;
10526 }
10527 break;
10528 case SO_REUSEADDR:
10529 if (!checkonly) {
10530 tcp->tcp_reuseaddr = onoff;
10531 tcp->tcp_connp->conn_reuseaddr = onoff;
10532 }
10533 break;
10534 case SO_OOBINLINE:
10535 if (!checkonly)
10536 tcp->tcp_oobinline = onoff;
10537 break;
10538 case SO_DGRAM_ERRIND:
10539 if (!checkonly)
10540 tcp->tcp_dgram_errind = onoff;
10541 break;
10542 case SO_SNDBUF: {
10543 if (*i1 > tcps->tcps_max_buf) {
10544 *outlenp = 0;
10545 return (ENOBUFS);
10546 }
10547 if (checkonly)
10548 break;
10549
10550 tcp->tcp_xmit_hiwater = *i1;
10551 if (tcps->tcps_snd_lowat_fraction != 0)
10552 tcp->tcp_xmit_lowater =
10553 tcp->tcp_xmit_hiwater /
10554 tcps->tcps_snd_lowat_fraction;
10555 (void) tcp_maxpsz_set(tcp, B_TRUE);
10556 /*
10557 * If we are flow-controlled, recheck the condition.
10558 * There are apps that increase SO_SNDBUF size when
10559 * flow-controlled (EWOULDBLOCK), and expect the flow
10560 * control condition to be lifted right away.
10561 */
10562 mutex_enter(&tcp->tcp_non_sq_lock);
10563 if (tcp->tcp_flow_stopped &&
10564 TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) {
10565 tcp_clrqfull(tcp);
10566 }
10567 mutex_exit(&tcp->tcp_non_sq_lock);
10568 break;
10569 }
10570 case SO_RCVBUF:
10571 if (*i1 > tcps->tcps_max_buf) {
10572 *outlenp = 0;
10573 return (ENOBUFS);
10574 }
10575 /* Silently ignore zero */
10576 if (!checkonly && *i1 != 0) {
10577 *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
10578 (void) tcp_rwnd_set(tcp, *i1);
10579 }
10580 /*
10581 * XXX should we return the rwnd here
10582 * and tcp_opt_get ?
10583 */
10584 break;
10585 case SO_SND_COPYAVOID:
10586 if (!checkonly) {
10587 /* we only allow enable at most once for now */
10588 if (tcp->tcp_loopback ||
10589 (tcp->tcp_kssl_ctx != NULL) ||
10590 (!tcp->tcp_snd_zcopy_aware &&
10591 (onoff != 1 || !tcp_zcopy_check(tcp)))) {
10592 *outlenp = 0;
10593 return (EOPNOTSUPP);
10594 }
10595 tcp->tcp_snd_zcopy_aware = 1;
10596 }
10597 break;
10598 case SO_ALLZONES:
10599 /* Pass option along to IP level for handling */
10600 return (-EINVAL);
10601 case SO_ANON_MLP:
10602 /* Pass option along to IP level for handling */
10603 return (-EINVAL);
10604 case SO_MAC_EXEMPT:
10605 /* Pass option along to IP level for handling */
10606 return (-EINVAL);
10607 case SO_EXCLBIND:
10608 if (!checkonly)
10609 tcp->tcp_exclbind = onoff;
10610 break;
10611 default:
10612 *outlenp = 0;
10613 return (EINVAL);
10614 }
10615 break;
10616 case IPPROTO_TCP:
10617 switch (name) {
10618 case TCP_NODELAY:
10619 if (!checkonly)
10620 tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
10621 break;
10622 case TCP_NOTIFY_THRESHOLD:
10623 if (!checkonly)
10624 tcp->tcp_first_timer_threshold = *i1;
10625 break;
10626 case TCP_ABORT_THRESHOLD:
10627 if (!checkonly)
10628 tcp->tcp_second_timer_threshold = *i1;
10629 break;
10630 case TCP_CONN_NOTIFY_THRESHOLD:
10631 if (!checkonly)
10632 tcp->tcp_first_ctimer_threshold = *i1;
10633 break;
10634 case TCP_CONN_ABORT_THRESHOLD:
10635 if (!checkonly)
10636 tcp->tcp_second_ctimer_threshold = *i1;
10637 break;
10638 case TCP_RECVDSTADDR:
10639 if (tcp->tcp_state > TCPS_LISTEN)
10640 return (EOPNOTSUPP);
10641 if (!checkonly)
10642 tcp->tcp_recvdstaddr = onoff;
10643 break;
10644 case TCP_ANONPRIVBIND:
10645 if ((reterr = secpolicy_net_privaddr(cr, 0,
10646 IPPROTO_TCP)) != 0) {
10647 *outlenp = 0;
10648 return (reterr);
10649 }
10650 if (!checkonly) {
10651 tcp->tcp_anon_priv_bind = onoff;
10652 }
10653 break;
10654 case TCP_EXCLBIND:
10655 if (!checkonly)
10656 tcp->tcp_exclbind = onoff;
10657 break; /* goto sizeof (int) option return */
10658 case TCP_INIT_CWND: {
10659 uint32_t init_cwnd = *((uint32_t *)invalp);
10660
10661 if (checkonly)
10662 break;
10663
10664 /*
10665 * Only allow socket with network configuration
10666 * privilege to set the initial cwnd to be larger
10667 * than allowed by RFC 3390.
10668 */
10669 if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
10670 tcp->tcp_init_cwnd = init_cwnd;
10671 break;
10672 }
10673 if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
10674 *outlenp = 0;
10675 return (reterr);
10676 }
10677 if (init_cwnd > TCP_MAX_INIT_CWND) {
10678 *outlenp = 0;
10679 return (EINVAL);
10680 }
10681 tcp->tcp_init_cwnd = init_cwnd;
10682 break;
10683 }
10684 case TCP_KEEPALIVE_THRESHOLD:
10685 if (checkonly)
10686 break;
10687
10688 if (*i1 < tcps->tcps_keepalive_interval_low ||
10689 *i1 > tcps->tcps_keepalive_interval_high) {
10690 *outlenp = 0;
10691 return (EINVAL);
10692 }
10693 if (*i1 != tcp->tcp_ka_interval) {
10694 tcp->tcp_ka_interval = *i1;
10695 /*
10696 * Check if we need to restart the
10697 * keepalive timer.
10698 */
10699 if (tcp->tcp_ka_tid != 0) {
10700 ASSERT(tcp->tcp_ka_enabled);
10701 (void) TCP_TIMER_CANCEL(tcp,
10702 tcp->tcp_ka_tid);
10703 tcp->tcp_ka_last_intrvl = 0;
10704 tcp->tcp_ka_tid = TCP_TIMER(tcp,
10705 tcp_keepalive_killer,
10706 MSEC_TO_TICK(tcp->tcp_ka_interval));
10707 }
10708 }
10709 break;
10710 case TCP_KEEPALIVE_ABORT_THRESHOLD:
10711 if (!checkonly) {
10712 if (*i1 <
10713 tcps->tcps_keepalive_abort_interval_low ||
10714 *i1 >
10715 tcps->tcps_keepalive_abort_interval_high) {
10716 *outlenp = 0;
10717 return (EINVAL);
10718 }
10719 tcp->tcp_ka_abort_thres = *i1;
10720 }
10721 break;
10722 case TCP_CORK:
10723 if (!checkonly) {
10724 /*
10725 * if tcp->tcp_cork was set and is now
10726 * being unset, we have to make sure that
10727 * the remaining data gets sent out. Also
10728 * unset tcp->tcp_cork so that tcp_wput_data()
10729 * can send data even if it is less than mss
10730 */
10731 if (tcp->tcp_cork && onoff == 0 &&
10732 tcp->tcp_unsent > 0) {
10733 tcp->tcp_cork = B_FALSE;
10734 tcp_wput_data(tcp, NULL, B_FALSE);
10735 }
10736 tcp->tcp_cork = onoff;
10737 }
10738 break;
10739 default:
10740 *outlenp = 0;
10741 return (EINVAL);
10742 }
10743 break;
10744 case IPPROTO_IP:
10745 if (tcp->tcp_family != AF_INET) {
10746 *outlenp = 0;
10747 return (ENOPROTOOPT);
10748 }
10749 switch (name) {
10750 case IP_OPTIONS:
10751 case T_IP_OPTIONS:
10752 reterr = tcp_opt_set_header(tcp, checkonly,
10753 invalp, inlen);
10754 if (reterr) {
10755 *outlenp = 0;
10756 return (reterr);
10757 }
10758 /* OK return - copy input buffer into output buffer */
10759 if (invalp != outvalp) {
10760 /* don't trust bcopy for identical src/dst */
10761 bcopy(invalp, outvalp, inlen);
10762 }
10763 *outlenp = inlen;
10764 return (0);
10765 case IP_TOS:
10766 case T_IP_TOS:
10767 if (!checkonly) {
10768 tcp->tcp_ipha->ipha_type_of_service =
10769 (uchar_t)*i1;
10770 tcp->tcp_tos = (uchar_t)*i1;
10771 }
10772 break;
10773 case IP_TTL:
10774 if (!checkonly) {
10775 tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1;
10776 tcp->tcp_ttl = (uchar_t)*i1;
10777 }
10778 break;
10779 case IP_BOUND_IF:
10780 case IP_NEXTHOP:
10781 /* Handled at the IP level */
10782 return (-EINVAL);
10783 case IP_SEC_OPT:
10784 /*
10785 * We should not allow policy setting after
10786 * we start listening for connections.
10787 */
10788 if (tcp->tcp_state == TCPS_LISTEN) {
10789 return (EINVAL);
10790 } else {
10791 /* Handled at the IP level */
10792 return (-EINVAL);
10793 }
10794 default:
10795 *outlenp = 0;
10796 return (EINVAL);
10797 }
10798 break;
10799 case IPPROTO_IPV6: {
10800 ip6_pkt_t *ipp;
10801
10802 /*
10803 * IPPROTO_IPV6 options are only supported for sockets
10804 * that are using IPv6 on the wire.
10805 */
10806 if (tcp->tcp_ipversion != IPV6_VERSION) {
10807 *outlenp = 0;
10808 return (ENOPROTOOPT);
10809 }
10810 /*
10811 * Only sticky options; no ancillary data
10812 */
10813 ASSERT(thisdg_attrs == NULL);
10814 ipp = &tcp->tcp_sticky_ipp;
10815
10816 switch (name) {
10817 case IPV6_UNICAST_HOPS:
10818 /* -1 means use default */
10819 if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) {
10820 *outlenp = 0;
10821 return (EINVAL);
10822 }
10823 if (!checkonly) {
10824 if (*i1 == -1) {
10825 tcp->tcp_ip6h->ip6_hops =
10826 ipp->ipp_unicast_hops =
10827 (uint8_t)tcps->tcps_ipv6_hoplimit;
10828 ipp->ipp_fields &= ~IPPF_UNICAST_HOPS;
10829 /* Pass modified value to IP. */
10830 *i1 = tcp->tcp_ip6h->ip6_hops;
10831 } else {
10832 tcp->tcp_ip6h->ip6_hops =
10833 ipp->ipp_unicast_hops =
10834 (uint8_t)*i1;
10835 ipp->ipp_fields |= IPPF_UNICAST_HOPS;
10836 }
10837 reterr = tcp_build_hdrs(q, tcp);
10838 if (reterr != 0)
10839 return (reterr);
10840 }
10841 break;
10842 case IPV6_BOUND_IF:
10843 if (!checkonly) {
10844 int error = 0;
10845
10846 tcp->tcp_bound_if = *i1;
10847 error = ip_opt_set_ill(tcp->tcp_connp, *i1,
10848 B_TRUE, checkonly, level, name, mblk);
10849 if (error != 0) {
10850 *outlenp = 0;
10851 return (error);
10852 }
10853 }
10854 break;
10855 /*
10856 * Set boolean switches for ancillary data delivery
10857 */
10858 case IPV6_RECVPKTINFO:
10859 if (!checkonly) {
10860 if (onoff)
10861 tcp->tcp_ipv6_recvancillary |=
10862 TCP_IPV6_RECVPKTINFO;
10863 else
10864 tcp->tcp_ipv6_recvancillary &=
10865 ~TCP_IPV6_RECVPKTINFO;
10866 /* Force it to be sent up with the next msg */
10867 tcp->tcp_recvifindex = 0;
10868 }
10869 break;
10870 case IPV6_RECVTCLASS:
10871 if (!checkonly) {
10872 if (onoff)
10873 tcp->tcp_ipv6_recvancillary |=
10874 TCP_IPV6_RECVTCLASS;
10875 else
10876 tcp->tcp_ipv6_recvancillary &=
10877 ~TCP_IPV6_RECVTCLASS;
10878 }
10879 break;
10880 case IPV6_RECVHOPLIMIT:
10881 if (!checkonly) {
10882 if (onoff)
10883 tcp->tcp_ipv6_recvancillary |=
10884 TCP_IPV6_RECVHOPLIMIT;
10885 else
10886 tcp->tcp_ipv6_recvancillary &=
10887 ~TCP_IPV6_RECVHOPLIMIT;
10888 /* Force it to be sent up with the next msg */
10889 tcp->tcp_recvhops = 0xffffffffU;
10890 }
10891 break;
10892 case IPV6_RECVHOPOPTS:
10893 if (!checkonly) {
10894 if (onoff)
10895 tcp->tcp_ipv6_recvancillary |=
10896 TCP_IPV6_RECVHOPOPTS;
10897 else
10898 tcp->tcp_ipv6_recvancillary &=
10899 ~TCP_IPV6_RECVHOPOPTS;
10900 }
10901 break;
10902 case IPV6_RECVDSTOPTS:
10903 if (!checkonly) {
10904 if (onoff)
10905 tcp->tcp_ipv6_recvancillary |=
10906 TCP_IPV6_RECVDSTOPTS;
10907 else
10908 tcp->tcp_ipv6_recvancillary &=
10909 ~TCP_IPV6_RECVDSTOPTS;
10910 }
10911 break;
10912 case _OLD_IPV6_RECVDSTOPTS:
10913 if (!checkonly) {
10914 if (onoff)
10915 tcp->tcp_ipv6_recvancillary |=
10916 TCP_OLD_IPV6_RECVDSTOPTS;
10917 else
10918 tcp->tcp_ipv6_recvancillary &=
10919 ~TCP_OLD_IPV6_RECVDSTOPTS;
10920 }
10921 break;
10922 case IPV6_RECVRTHDR:
10923 if (!checkonly) {
10924 if (onoff)
10925 tcp->tcp_ipv6_recvancillary |=
10926 TCP_IPV6_RECVRTHDR;
10927 else
10928 tcp->tcp_ipv6_recvancillary &=
10929 ~TCP_IPV6_RECVRTHDR;
10930 }
10931 break;
10932 case IPV6_RECVRTHDRDSTOPTS:
10933 if (!checkonly) {
10934 if (onoff)
10935 tcp->tcp_ipv6_recvancillary |=
10936 TCP_IPV6_RECVRTDSTOPTS;
10937 else
10938 tcp->tcp_ipv6_recvancillary &=
10939 ~TCP_IPV6_RECVRTDSTOPTS;
10940 }
10941 break;
10942 case IPV6_PKTINFO:
10943 if (inlen != 0 && inlen != sizeof (struct in6_pktinfo))
10944 return (EINVAL);
10945 if (checkonly)
10946 break;
10947
10948 if (inlen == 0) {
10949 ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR);
10950 } else {
10951 struct in6_pktinfo *pkti;
10952
10953 pkti = (struct in6_pktinfo *)invalp;
10954 /*
10955 * RFC 3542 states that ipi6_addr must be
10956 * the unspecified address when setting the
10957 * IPV6_PKTINFO sticky socket option on a
10958 * TCP socket.
10959 */
10960 if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
10961 return (EINVAL);
10962 /*
10963 * ip6_set_pktinfo() validates the source
10964 * address and interface index.
10965 */
10966 reterr = ip6_set_pktinfo(cr, tcp->tcp_connp,
10967 pkti, mblk);
10968 if (reterr != 0)
10969 return (reterr);
10970 ipp->ipp_ifindex = pkti->ipi6_ifindex;
10971 ipp->ipp_addr = pkti->ipi6_addr;
10972 if (ipp->ipp_ifindex != 0)
10973 ipp->ipp_fields |= IPPF_IFINDEX;
10974 else
10975 ipp->ipp_fields &= ~IPPF_IFINDEX;
10976 if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr))
10977 ipp->ipp_fields |= IPPF_ADDR;
10978 else
10979 ipp->ipp_fields &= ~IPPF_ADDR;
10980 }
10981 reterr = tcp_build_hdrs(q, tcp);
10982