Print this page
*** NO COMMENTS ***
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/inet/tcp/tcp.c
+++ new/usr/src/uts/common/inet/tcp/tcp.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 24 * Use is subject to license terms.
25 25 */
26 26 /* Copyright (c) 1990 Mentat Inc. */
27 27
28 28 #pragma ident "%Z%%M% %I% %E% SMI"
29 29 const char tcp_version[] = "%Z%%M% %I% %E% SMI";
30 30
31 31
32 32 #include <sys/types.h>
33 33 #include <sys/stream.h>
34 34 #include <sys/strsun.h>
35 35 #include <sys/strsubr.h>
36 36 #include <sys/stropts.h>
37 37 #include <sys/strlog.h>
38 38 #include <sys/strsun.h>
39 39 #define _SUN_TPI_VERSION 2
40 40 #include <sys/tihdr.h>
41 41 #include <sys/timod.h>
42 42 #include <sys/ddi.h>
43 43 #include <sys/sunddi.h>
44 44 #include <sys/suntpi.h>
45 45 #include <sys/xti_inet.h>
46 46 #include <sys/cmn_err.h>
47 47 #include <sys/debug.h>
48 48 #include <sys/sdt.h>
49 49 #include <sys/vtrace.h>
50 50 #include <sys/kmem.h>
51 51 #include <sys/ethernet.h>
52 52 #include <sys/cpuvar.h>
53 53 #include <sys/dlpi.h>
54 54 #include <sys/multidata.h>
55 55 #include <sys/multidata_impl.h>
56 56 #include <sys/pattr.h>
57 57 #include <sys/policy.h>
58 58 #include <sys/priv.h>
59 59 #include <sys/zone.h>
60 60 #include <sys/sunldi.h>
61 61
62 62 #include <sys/errno.h>
63 63 #include <sys/signal.h>
64 64 #include <sys/socket.h>
65 65 #include <sys/sockio.h>
66 66 #include <sys/isa_defs.h>
67 67 #include <sys/md5.h>
68 68 #include <sys/random.h>
69 69 #include <sys/sodirect.h>
70 70 #include <sys/uio.h>
71 71 #include <netinet/in.h>
72 72 #include <netinet/tcp.h>
73 73 #include <netinet/ip6.h>
74 74 #include <netinet/icmp6.h>
75 75 #include <net/if.h>
76 76 #include <net/route.h>
77 77 #include <inet/ipsec_impl.h>
78 78
79 79 #include <inet/common.h>
80 80 #include <inet/ip.h>
81 81 #include <inet/ip_impl.h>
82 82 #include <inet/ip6.h>
83 83 #include <inet/ip_ndp.h>
84 84 #include <inet/mi.h>
85 85 #include <inet/mib2.h>
86 86 #include <inet/nd.h>
87 87 #include <inet/optcom.h>
88 88 #include <inet/snmpcom.h>
89 89 #include <inet/kstatcom.h>
90 90 #include <inet/tcp.h>
91 91 #include <inet/tcp_impl.h>
92 92 #include <net/pfkeyv2.h>
93 93 #include <inet/ipsec_info.h>
94 94 #include <inet/ipdrop.h>
95 95 #include <inet/tcp_trace.h>
96 96
97 97 #include <inet/ipclassifier.h>
98 98 #include <inet/ip_ire.h>
99 99 #include <inet/ip_ftable.h>
100 100 #include <inet/ip_if.h>
101 101 #include <inet/ipp_common.h>
102 102 #include <inet/ip_netinfo.h>
103 103 #include <sys/squeue.h>
104 104 #include <inet/kssl/ksslapi.h>
105 105 #include <sys/tsol/label.h>
106 106 #include <sys/tsol/tnet.h>
107 107 #include <rpc/pmap_prot.h>
108 108
109 109 /*
110 110 * TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
111 111 *
112 112 * (Read the detailed design doc in PSARC case directory)
113 113 *
114 114 * The entire tcp state is contained in tcp_t and conn_t structure
115 115 * which are allocated in tandem using ipcl_conn_create() and passing
116 116 * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
117 117 * the references on the tcp_t. The tcp_t structure is never compressed
118 118 * and packets always land on the correct TCP perimeter from the time
119 119 * eager is created till the time tcp_t dies (as such the old mentat
120 120 * TCP global queue is not used for detached state and no IPSEC checking
121 121 * is required). The global queue is still allocated to send out resets
122 122 * for connection which have no listeners and IP directly calls
123 123 * tcp_xmit_listeners_reset() which does any policy check.
124 124 *
125 125 * Protection and Synchronisation mechanism:
126 126 *
127 127 * The tcp data structure does not use any kind of lock for protecting
128 128 * its state but instead uses 'squeues' for mutual exclusion from various
129 129 * read and write side threads. To access a tcp member, the thread should
130 130 * always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
131 131 * squeue_fill). Since the squeues allow a direct function call, caller
132 132 * can pass any tcp function having prototype of edesc_t as argument
133 133 * (different from traditional STREAMs model where packets come in only
134 134 * designated entry points). The list of functions that can be directly
135 135 * called via squeue are listed before the usual function prototype.
136 136 *
137 137 * Referencing:
138 138 *
139 139 * TCP is MT-Hot and we use a reference based scheme to make sure that the
140 140 * tcp structure doesn't disappear when it's needed. When the application
141 141 * creates an outgoing connection or accepts an incoming connection, we
142 142 * start out with 2 references on 'conn_ref'. One for TCP and one for IP.
143 143 * The IP reference is just a symbolic reference since ip_tcpclose()
144 144 * looks at tcp structure after tcp_close_output() returns which could
145 145 * have dropped the last TCP reference. So as long as the connection is
146 146 * in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
147 147 * conn_t. The classifier puts its own reference when the connection is
148 148 * inserted in listen or connected hash. Anytime a thread needs to enter
149 149 * the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
150 150 * on write side or by doing a classify on read side and then puts a
151 151 * reference on the conn before doing squeue_enter/tryenter/fill. For
152 152 * read side, the classifier itself puts the reference under fanout lock
153 153 * to make sure that tcp can't disappear before it gets processed. The
154 154 * squeue will drop this reference automatically so the called function
155 155 * doesn't have to do a DEC_REF.
156 156 *
157 157 * Opening a new connection:
158 158 *
159 159 * The outgoing connection open is pretty simple. tcp_open() does the
160 160 * work in creating the conn/tcp structure and initializing it. The
161 161 * squeue assignment is done based on the CPU the application
162 162 * is running on. So for outbound connections, processing is always done
163 163 * on application CPU which might be different from the incoming CPU
164 164 * being interrupted by the NIC. An optimal way would be to figure out
165 165 * the NIC <-> CPU binding at listen time, and assign the outgoing
166 166 * connection to the squeue attached to the CPU that will be interrupted
167 167 * for incoming packets (we know the NIC based on the bind IP address).
168 168 * This might seem like a problem if more data is going out but the
169 169 * fact is that in most cases the transmit is ACK driven transmit where
170 170 * the outgoing data normally sits on TCP's xmit queue waiting to be
171 171 * transmitted.
172 172 *
173 173 * Accepting a connection:
174 174 *
175 175 * This is a more interesting case because of various races involved in
176 176 * establishing an eager in its own perimeter. Read the meta comment on
177 177 * top of tcp_conn_request(). But briefly, the squeue is picked by
178 178 * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
179 179 *
180 180 * Closing a connection:
181 181 *
182 182 * The close is fairly straight forward. tcp_close() calls tcp_close_output()
183 183 * via squeue to do the close and mark the tcp as detached if the connection
184 184 * was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
185 185 * reference but tcp_close() always drops IP's reference. So if tcp was
186 186 * not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
187 187 * and 1 because it is in classifier's connected hash. This is the condition
188 188 * we use to determine that its OK to clean up the tcp outside of squeue
189 189 * when time wait expires (check the ref under fanout and conn_lock and
190 190 * if it is 2, remove it from fanout hash and kill it).
191 191 *
192 192 * Although close just drops the necessary references and marks the
193 193 * tcp_detached state, tcp_close needs to know the tcp_detached has been
194 194 * set (under squeue) before letting the STREAM go away (because a
195 195 * inbound packet might attempt to go up the STREAM while the close
196 196 * has happened and tcp_detached is not set). So a special lock and
197 197 * flag is used along with a condition variable (tcp_closelock, tcp_closed,
198 198 * and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
199 199 * tcp_detached.
200 200 *
201 201 * Special provisions and fast paths:
202 202 *
203 203 * We make special provision for (AF_INET, SOCK_STREAM) sockets which
204 204 * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
205 205 * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
206 206 * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
207 207 * check to send packets directly to tcp_rput_data via squeue. Everyone
208 208 * else comes through tcp_input() on the read side.
209 209 *
210 210 * We also make special provisions for sockfs by marking tcp_issocket
211 211 * whenever we have only sockfs on top of TCP. This allows us to skip
212 212 * putting the tcp in acceptor hash since a sockfs listener can never
213 213 * become acceptor and also avoid allocating a tcp_t for acceptor STREAM
214 214 * since eager has already been allocated and the accept now happens
215 215 * on acceptor STREAM. There is a big blob of comment on top of
216 216 * tcp_conn_request explaining the new accept. When socket is POP'd,
217 217 * sockfs sends us an ioctl to mark the fact and we go back to old
218 218 * behaviour. Once tcp_issocket is unset, it is never set again for the
219 219 * life of that connection.
220 220 *
221 221 * In support of on-board asynchronous DMA hardware (e.g. Intel I/OAT)
222 222 * two consolidation private KAPIs are used to enqueue M_DATA mblk_t's
223 223 * directly to the socket (sodirect) and start an asynchronous copyout
224 224 * to a user-land receive-side buffer (uioa) when a blocking socket read
225 225 * (e.g. read, recv, ...) is pending.
226 226 *
227 227 * This is accomplished when tcp_issocket is set and tcp_sodirect is not
228 228 * NULL so points to an sodirect_t and if marked enabled then we enqueue
229 229 * all mblk_t's directly to the socket.
230 230 *
231 231 * Further, if the sodirect_t sod_uioa and if marked enabled (due to a
232 232 * blocking socket read, e.g. user-land read, recv, ...) then an asynchronous
233 233 * copyout will be started directly to the user-land uio buffer. Also, as we
234 234 * have a pending read, TCP's push logic can take into account the number of
235 235 * bytes to be received and only awake the blocked read()er when the uioa_t
236 236 * byte count has been satisfied.
237 237 *
238 238 * IPsec notes :
239 239 *
240 240 * Since a packet is always executed on the correct TCP perimeter
241 241 * all IPsec processing is deferred to IP including checking new
242 242 * connections and setting IPSEC policies for new connection. The
243 243 * only exception is tcp_xmit_listeners_reset() which is called
244 244 * directly from IP and needs to policy check to see if TH_RST
245 245 * can be sent out.
246 246 *
247 247 * PFHooks notes :
248 248 *
249 249 * For mdt case, one meta buffer contains multiple packets. Mblks for every
250 250 * packet are assembled and passed to the hooks. When packets are blocked,
251 251 * or boundary of any packet is changed, the mdt processing is stopped, and
252 252 * packets of the meta buffer are send to the IP path one by one.
253 253 */
254 254
255 255 /*
256 256 * Values for squeue switch:
257 257 * 1: squeue_enter_nodrain
258 258 * 2: squeue_enter
259 259 * 3: squeue_fill
260 260 */
261 261 int tcp_squeue_close = 2; /* Setable in /etc/system */
262 262 int tcp_squeue_wput = 2;
263 263
264 264 squeue_func_t tcp_squeue_close_proc;
265 265 squeue_func_t tcp_squeue_wput_proc;
266 266
267 267 /*
268 268 * Macros for sodirect:
269 269 *
270 270 * SOD_PTR_ENTER(tcp, sodp) - for the tcp_t pointer "tcp" set the
271 271 * sodirect_t pointer "sodp" to the socket/tcp shared sodirect_t
272 272 * if it exists and is enabled, else to NULL. Note, in the current
273 273 * sodirect implementation the sod_lock must not be held across any
274 274 * STREAMS call (e.g. putnext) else a "recursive mutex_enter" PANIC
275 275 * will result as sod_lock is the streamhead stdata.sd_lock.
276 276 *
277 277 * SOD_NOT_ENABLED(tcp) - return true if not a sodirect tcp_t or the
278 278 * sodirect_t isn't enabled, useful for ASSERT()ing that a receive
279 279 * side tcp code path dealing with a tcp_rcv_list or putnext() isn't
280 280 * being used when sodirect code paths should be.
281 281 */
282 282
283 283 #define SOD_PTR_ENTER(tcp, sodp) { \
284 284 (sodp) = (tcp)->tcp_sodirect; \
285 285 if ((sodp) != NULL) { \
286 286 mutex_enter((sodp)->sod_lock); \
287 287 if (!((sodp)->sod_state & SOD_ENABLED)) { \
288 288 mutex_exit((sodp)->sod_lock); /* not enabled: unlock */ \
289 289 (sodp) = NULL; \
290 290 } \
291 291 } /* on exit sod_lock is held iff (sodp) != NULL */ \
292 292 }
293 293
294 294 #define SOD_NOT_ENABLED(tcp) \
295 295 ((tcp)->tcp_sodirect == NULL || \
296 296 !((tcp)->tcp_sodirect->sod_state & SOD_ENABLED))
297 297
298 298 /*
299 299 * This controls how tiny a write must be before we try to copy it
300 300 * into the mblk on the tail of the transmit queue. Not much
301 301 * speedup is observed for values larger than sixteen. Zero will
302 302 * disable the optimisation.
303 303 */
304 304 int tcp_tx_pull_len = 16;
305 305
306 306 /*
307 307 * TCP Statistics.
308 308 *
309 309 * How TCP statistics work.
310 310 *
311 311 * There are two types of statistics invoked by two macros.
312 312 *
313 313 * TCP_STAT(name) does non-atomic increment of a named stat counter. It is
314 314 * supposed to be used in non MT-hot paths of the code.
315 315 *
316 316 * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
317 317 * supposed to be used for DEBUG purposes and may be used on a hot path.
318 318 *
319 319 * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
320 320 * (use "kstat tcp" to get them).
321 321 *
322 322 * There is also additional debugging facility that marks tcp_clean_death()
323 323 * instances and saves them in tcp_t structure. It is triggered by
324 324 * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
325 325 * tcp_clean_death() calls that counts the number of times each tag was hit. It
326 326 * is triggered by TCP_CLD_COUNTERS define.
327 327 *
328 328 * How to add new counters.
329 329 *
330 330 * 1) Add a field in the tcp_stat structure describing your counter.
331 331 * 2) Add a line in the template in tcp_kstat2_init() with the name
332 332 * of the counter.
333 333 *
334 334 * IMPORTANT!! - make sure that both are in sync !!
335 335 * 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
336 336 *
337 337 * Please avoid using private counters which are not kstat-exported.
338 338 *
339 339 * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
340 340 * in tcp_t structure.
341 341 *
342 342 * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
343 343 */
344 344
345 345 #ifndef TCP_DEBUG_COUNTER
346 346 #ifdef DEBUG
347 347 #define TCP_DEBUG_COUNTER 1
348 348 #else
349 349 #define TCP_DEBUG_COUNTER 0
350 350 #endif
351 351 #endif
352 352
353 353 #define TCP_CLD_COUNTERS 0
354 354
355 355 #define TCP_TAG_CLEAN_DEATH 1
356 356 #define TCP_MAX_CLEAN_DEATH_TAG 32
357 357
358 358 #ifdef lint
359 359 static int _lint_dummy_;
360 360 #endif
361 361
362 362 #if TCP_CLD_COUNTERS /* count tcp_clean_death() hits per tag */
363 363 static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
364 364 #define TCP_CLD_STAT(x) (tcp_clean_death_stat[(x)]++)
365 365 #elif defined(lint) /* lint build: only references _lint_dummy_ */
366 366 #define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0)
367 367 #else /* counters compiled out: expands to nothing */
368 368 #define TCP_CLD_STAT(x)
369 369 #endif
370 370
371 371 #if TCP_DEBUG_COUNTER /* atomic 64-bit increment of the named counter */
372 372 #define TCP_DBGSTAT(tcps, x) \
373 373 atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
374 374 #define TCP_G_DBGSTAT(x) \
375 375 atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
376 376 #elif defined(lint) /* lint build: only references _lint_dummy_ */
377 377 #define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0)
378 378 #define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0)
379 379 #else /* debug counters compiled out: both expand to nothing */
380 380 #define TCP_DBGSTAT(tcps, x)
381 381 #define TCP_G_DBGSTAT(x)
382 382 #endif
383 383
384 384 #define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++)
385 385
386 386 tcp_g_stat_t tcp_g_statistics;
387 387 kstat_t *tcp_g_kstat;
388 388
389 389 /*
390 390 * Call either ip_output or ip_output_v6 (through the conn's conn_send). This
391 391 * replaces putnext() on the tcp write side. connp is evaluated more than once.
392 392 */
393 393 #define CALL_IP_WPUT(connp, q, mp) { \
394 394 tcp_stack_t *tcps; /* only consumed by TCP_DBGSTAT */ \
395 395 \
396 396 tcps = (connp)->conn_netstack->netstack_tcp; \
397 397 ASSERT(((q)->q_flag & QREADR) == 0); /* q must not be a read queue */ \
398 398 TCP_DBGSTAT(tcps, tcp_ip_output); \
399 399 (connp)->conn_send((connp), (mp), (q), IP_WPUT); \
400 400 }
401 401
402 402 /* Macros for timestamp comparisons */
403 403 #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
404 404 #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0)
405 405
406 406 /*
407 407 * Parameters for TCP Initial Send Sequence number (ISS) generation. When
408 408 * tcp_strong_iss is set to 1, which is the default, the ISS is calculated
409 409 * by adding three components: a time component which grows by 1 every 4096
410 410 * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
411 411 * a per-connection component which grows by 125000 for every new connection;
412 412 * and an "extra" component that grows by a random amount centered
413 413 * approximately on 64000. This causes the ISS generator to cycle every
414 414 * 4.89 hours if no TCP connections are made, and faster if connections are
415 415 * made.
416 416 *
417 417 * When tcp_strong_iss is set to 0, ISS is calculated by adding two
418 418 * components: a time component which grows by 250000 every second; and
419 419 * a per-connection component which grows by 125000 for every new connections.
420 420 *
421 421 * A third method, when tcp_strong_iss is set to 2, for generating ISS is
422 422 * prescribed by Steve Bellovin. This involves adding time, the 125000 per
423 423 * connection, and a one-way hash (MD5) of the connection ID <sport, dport,
424 424 * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
425 425 * password.
426 426 */
427 427 #define ISS_INCR 250000
428 428 #define ISS_NSEC_SHT 12
429 429
430 430 static sin_t sin_null; /* Zero address for quick clears */
431 431 static sin6_t sin6_null; /* Zero address for quick clears */
432 432
433 433 /*
434 434 * This implementation follows the 4.3BSD interpretation of the urgent
435 435 * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
436 436 * incompatible changes in protocols like telnet and rlogin.
437 437 */
438 438 #define TCP_OLD_URP_INTERPRETATION 1
439 439
440 440 #define TCP_IS_DETACHED_NONEAGER(tcp) \
441 441 (TCP_IS_DETACHED(tcp) && \
442 442 (!(tcp)->tcp_hard_binding))
443 443
444 444 /*
445 445 * TCP reassembly macros. We hide starting and ending sequence numbers in
446 446 * b_next and b_prev of messages on the reassembly queue. The messages are
447 447 * chained using b_cont. These macros are used in tcp_reass() so we don't
448 448 * have to see the ugly casts and assignments.
449 449 */
450 450 #define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next))
451 451 #define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \
452 452 (mblk_t *)(uintptr_t)(u))
453 453 #define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev))
454 454 #define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \
455 455 (mblk_t *)(uintptr_t)(u))
456 456
457 457 /*
458 458 * Implementation of TCP Timers.
459 459 * =============================
460 460 *
461 461 * INTERFACE:
462 462 *
463 463 * There are two basic functions dealing with tcp timers:
464 464 *
465 465 * timeout_id_t tcp_timeout(connp, func, time)
466 466 * clock_t tcp_timeout_cancel(connp, timeout_id)
467 467 * TCP_TIMER_RESTART(tcp, intvl)
468 468 *
469 469 * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
470 470 * after 'time' ticks passed. The function called by timeout() must adhere to
471 471 * the same restrictions as a driver soft interrupt handler - it must not sleep
472 472 * or call other functions that might sleep. The value returned is the opaque
473 473 * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
474 474 * cancel the request. The call to tcp_timeout() may fail in which case it
475 475 * returns zero. This is different from the timeout(9F) function which never
476 476 * fails.
477 477 *
478 478 * The call-back function 'func' always receives 'connp' as its single
479 479 * argument. It is always executed in the squeue corresponding to the tcp
480 480 * structure. The tcp structure is guaranteed to be present at the time the
481 481 * call-back is called.
482 482 *
483 483 * NOTE: The call-back function 'func' is never called if tcp is in
484 484 * the TCPS_CLOSED state.
485 485 *
486 486 * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
487 487 * request. locks acquired by the call-back routine should not be held across
488 488 * the call to tcp_timeout_cancel() or a deadlock may result.
489 489 *
490 490 * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
491 491 * Otherwise, it returns an integer value greater than or equal to 0. In
492 492 * particular, if the call-back function is already placed on the squeue, it can
493 493 * not be canceled.
494 494 *
495 495 * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
496 496 * within squeue context corresponding to the tcp instance. Since the
497 497 * call-back is also called via the same squeue, there are no race
498 498 * conditions described in untimeout(9F) manual page since all calls are
499 499 * strictly serialized.
500 500 *
501 501 * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
502 502 * stored in tcp_timer_tid and starts a new one using
503 503 * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
504 504 * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
505 505 * field.
506 506 *
507 507 * NOTE: since the timeout cancellation is not guaranteed, the cancelled
508 508 * call-back may still be called, so it is possible tcp_timer() will be
509 509 * called several times. This should not be a problem since tcp_timer()
510 510 * should always check the tcp instance state.
511 511 *
512 512 *
513 513 * IMPLEMENTATION:
514 514 *
515 515 * TCP timers are implemented using three-stage process. The call to
516 516 * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
517 517 * when the timer expires. The tcp_timer_callback() arranges the call of the
518 518 * tcp_timer_handler() function via squeue corresponding to the tcp
519 519 * instance. The tcp_timer_handler() calls actual requested timeout call-back
520 520 * and passes tcp instance as an argument to it. Information is passed between
521 521 * stages using the tcp_timer_t structure which contains the connp pointer, the
522 522 * tcp call-back to call and the timeout id returned by the timeout(9F).
523 523 *
524 524 * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
525 525 * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
526 526 * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
527 527 * returns the pointer to this mblk.
528 528 *
529 529 * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
530 530 * looks like a normal mblk without actual dblk attached to it.
531 531 *
532 532 * To optimize performance each tcp instance holds a small cache of timer
533 533 * mblocks. In the current implementation it caches up to two timer mblocks per
534 534 * tcp instance. The cache is preserved over tcp frees and is only freed when
535 535 * the whole tcp structure is destroyed by its kmem destructor. Since all tcp
536 536 * timer processing happens on a corresponding squeue, the cache manipulation
537 537 * does not require any locks. Experiments show that majority of timer mblocks
538 538 * allocations are satisfied from the tcp cache and do not involve kmem calls.
539 539 *
540 540 * The tcp_timeout() places a refhold on the connp instance which guarantees
541 541 * that it will be present at the time the call-back function fires. The
542 542 * tcp_timer_handler() drops the reference after calling the call-back, so the
543 543 * call-back function does not need to manipulate the references explicitly.
544 544 */
545 545
546 546 typedef struct tcp_timer_s { /* found at b_rptr of a timer pseudo mblk */
547 547 conn_t *connp; /* connection passed to the call-back */
548 548 void (*tcpt_proc)(void *); /* tcp call-back to call on expiry */
549 549 timeout_id_t tcpt_tid; /* id returned by timeout(9F) */
550 550 } tcp_timer_t;
551 551
552 552 static kmem_cache_t *tcp_timercache;
553 553 kmem_cache_t *tcp_sack_info_cache;
554 554 kmem_cache_t *tcp_iphc_cache;
555 555
556 556 /*
557 557 * For scalability, we must not run a timer for every TCP connection
558 558 * in TIME_WAIT state. To see why, consider (for time wait interval of
559 559 * 4 minutes):
560 560 * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
561 561 *
562 562 * This list is ordered by time, so you need only delete from the head
563 563 * until you get to entries which aren't old enough to delete yet.
564 564 * The list consists of only the detached TIME_WAIT connections.
565 565 *
566 566 * Note that the timer (tcp_time_wait_expire) is started when the tcp_t
567 567 * becomes detached TIME_WAIT (either by changing the state and already
568 568 * being detached or the other way around). This means that the TIME_WAIT
569 569 * state can be extended (up to doubled) if the connection doesn't become
570 570 * detached for a long time.
571 571 *
572 572 * The list manipulations (including tcp_time_wait_next/prev)
573 573 * are protected by the tcp_time_wait_lock. The content of the
574 574 * detached TIME_WAIT connections is protected by the normal perimeters.
575 575 *
576 576 * This list is per squeue and squeues are shared across the tcp_stack_t's.
577 577 * Things on tcp_time_wait_head remain associated with the tcp_stack_t
578 578 * and conn_netstack.
579 579 * The tcp_t's that are added to tcp_free_list are disassociated and
580 580 * have NULL tcp_tcps and conn_netstack pointers.
581 581 */
582 582 typedef struct tcp_squeue_priv_s { /* per-squeue TIME_WAIT/free-list state */
583 583 kmutex_t tcp_time_wait_lock; /* protects the time-wait list links */
584 584 timeout_id_t tcp_time_wait_tid; /* presumably the collector's timeout id */
585 585 tcp_t *tcp_time_wait_head; /* list is time ordered; reap from here */
586 586 tcp_t *tcp_time_wait_tail; /* presumably newest entry - TODO confirm */
587 587 tcp_t *tcp_free_list; /* tcp_t's with NULL tcp_tcps/conn_netstack */
588 588 uint_t tcp_free_list_cnt; /* presumably entries on tcp_free_list */
589 589 } tcp_squeue_priv_t;
590 590
591 591 /*
592 592 * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
593 593 * Running it every 5 seconds seems to give the best results.
594 594 */
595 595 #define TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
596 596
597 597 /*
598 598 * To prevent memory hog, limit the number of entries in tcp_free_list
599 599 * to 1% of available memory / number of cpus
600 600 */
601 601 uint_t tcp_free_list_max_cnt = 0;
602 602
603 603 #define TCP_XMIT_LOWATER 4096
604 604 #define TCP_XMIT_HIWATER 49152
605 605 #define TCP_RECV_LOWATER 2048
606 606 #define TCP_RECV_HIWATER 49152
607 607
608 608 /*
609 609 * PAWS needs a 24-day timer: ticks in 24 days, computed in clock_t (not int).
610 610 */
611 611 #define PAWS_TIMEOUT ((clock_t)24 * 24 * 60 * 60 * hz)
612 612
613 613 #define TIDUSZ 4096 /* transport interface data unit size */
614 614
615 615 /*
616 616 * Bind hash list size and hash function. It has to be a power of 2 for
617 617 * hashing.
618 618 */
619 619 #define TCP_BIND_FANOUT_SIZE 512
620 620 #define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
621 621 /*
622 622 * Size of listen and acceptor hash list. It has to be a power of 2 for
623 623 * hashing.
624 624 */
625 625 #define TCP_FANOUT_SIZE 256
626 626
627 627 #ifdef _ILP32
628 628 #define TCP_ACCEPTOR_HASH(accid) \
629 629 (((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
630 630 #else
631 631 #define TCP_ACCEPTOR_HASH(accid) \
632 632 ((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
633 633 #endif /* _ILP32 */
634 634
635 635 #define IP_ADDR_CACHE_SIZE 2048
636 636 #define IP_ADDR_CACHE_HASH(faddr) \
637 637 (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
638 638
639 639 /* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */
640 640 #define TCP_HSP_HASH_SIZE 256
641 641
642 642 #define TCP_HSP_HASH(addr) /* XOR of addr >> 24, 16, 8, 0, mod table size */ \
643 643 ((((addr) >> 24) ^ ((addr) >> 16) ^ \
644 644 ((addr) >> 8) ^ (addr)) % TCP_HSP_HASH_SIZE)
645 645
646 646 /*
647 647 * TCP options struct returned from tcp_parse_options.
648 648 */
649 649 typedef struct tcp_opt_s { /* filled in by tcp_parse_options */
650 650 uint32_t tcp_opt_mss; /* presumably the MSS option value */
651 651 uint32_t tcp_opt_wscale; /* presumably the window scale option value */
652 652 uint32_t tcp_opt_ts_val; /* presumably timestamp value - TODO confirm */
653 653 uint32_t tcp_opt_ts_ecr; /* presumably timestamp echo - TODO confirm */
654 654 tcp_t *tcp; /* NOTE(review): role not visible here; check parser */
655 655 } tcp_opt_t;
656 656
657 657 /*
658 658 * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
659 659 */
660 660
661 661 #ifdef _BIG_ENDIAN
662 662 #define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
663 663 (TCPOPT_TSTAMP << 8) | 10)
664 664 #else
665 665 #define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
666 666 (TCPOPT_NOP << 8) | TCPOPT_NOP)
667 667 #endif
668 668
669 669 /*
670 670 * Flags returned from tcp_parse_options.
671 671 */
672 672 #define TCP_OPT_MSS_PRESENT 1
673 673 #define TCP_OPT_WSCALE_PRESENT 2
674 674 #define TCP_OPT_TSTAMP_PRESENT 4
675 675 #define TCP_OPT_SACK_OK_PRESENT 8
676 676 #define TCP_OPT_SACK_PRESENT 16
677 677
678 678 /* TCP option length */
679 679 #define TCPOPT_NOP_LEN 1
680 680 #define TCPOPT_MAXSEG_LEN 4
681 681 #define TCPOPT_WS_LEN 3
682 682 #define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1)
683 683 #define TCPOPT_TSTAMP_LEN 10
684 684 #define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2)
685 685 #define TCPOPT_SACK_OK_LEN 2
686 686 #define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2)
687 687 #define TCPOPT_REAL_SACK_LEN 4
688 688 #define TCPOPT_MAX_SACK_LEN 36
689 689 #define TCPOPT_HEADER_LEN 2
690 690
691 691 /* TCP cwnd burst factor. */
692 692 #define TCP_CWND_INFINITE 65535
693 693 #define TCP_CWND_SS 3
694 694 #define TCP_CWND_NORMAL 5
695 695
696 696 /* Maximum TCP initial cwin (start/restart). */
697 697 #define TCP_MAX_INIT_CWND 8
698 698
699 699 /*
700 700 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
701 701 * either tcp_slow_start_initial or tcp_slow_start_after_idle
702 702 * depending on the caller. If the upper layer has not used the
703 703 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
704 704 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
705 705 * If the upper layer has set tcp_init_cwnd, just use it to calculate
706 706 * tcp_cwnd. tcp_cwnd_cnt is reset to 0; mss is a divisor, so non-zero.
707 707 */
708 708 #define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \
709 709 { \
710 710 if ((tcp)->tcp_init_cwnd == 0) { \
711 711 (tcp)->tcp_cwnd = MIN((def_max_init_cwnd) * (mss), \
712 712 MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
713 713 } else { \
714 714 (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \
715 715 } \
716 716 (tcp)->tcp_cwnd_cnt = 0; \
717 717 }
718 718
719 719 /* TCP Timer control structure */
720 720 typedef struct tcpt_s {
721 721 pfv_t tcpt_pfv; /* The routine we are to call */
722 722 tcp_t *tcpt_tcp; /* The parameter we are to pass in */
723 723 } tcpt_t;
724 724
725 725 /* Host Specific Parameter structure */
726 726 typedef struct tcp_hsp {
727 727 struct tcp_hsp *tcp_hsp_next;
728 728 in6_addr_t tcp_hsp_addr_v6;
729 729 in6_addr_t tcp_hsp_subnet_v6;
730 730 uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */
731 731 int32_t tcp_hsp_sendspace;
732 732 int32_t tcp_hsp_recvspace;
733 733 int32_t tcp_hsp_tstamp;
734 734 } tcp_hsp_t;
735 735 #define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6)
736 736 #define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6)
737 737
738 738 /*
739 739 * Functions called directly via squeue having a prototype of edesc_t.
740 740 */
741 741 void tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
742 742 static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
743 743 void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
744 744 static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
745 745 static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
746 746 void tcp_input(void *arg, mblk_t *mp, void *arg2);
747 747 void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
748 748 static void tcp_close_output(void *arg, mblk_t *mp, void *arg2);
749 749 void tcp_output(void *arg, mblk_t *mp, void *arg2);
750 750 static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
751 751 static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
752 752 static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
753 753
754 754
755 755 /* Prototype for TCP functions */
756 756 static void tcp_random_init(void);
757 757 int tcp_random(void);
758 758 static void tcp_accept(tcp_t *tcp, mblk_t *mp);
759 759 static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
760 760 tcp_t *eager);
761 761 static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
762 762 static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
763 763 int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
764 764 boolean_t user_specified);
765 765 static void tcp_closei_local(tcp_t *tcp);
766 766 static void tcp_close_detached(tcp_t *tcp);
767 767 static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
768 768 mblk_t *idmp, mblk_t **defermp);
769 769 static void tcp_connect(tcp_t *tcp, mblk_t *mp);
770 770 static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
771 771 in_port_t dstport, uint_t srcid);
772 772 static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
773 773 in_port_t dstport, uint32_t flowinfo, uint_t srcid,
774 774 uint32_t scope_id);
775 775 static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
776 776 static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
777 777 static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
778 778 static char *tcp_display(tcp_t *tcp, char *, char);
779 779 static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
780 780 static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
781 781 static void tcp_eager_unlink(tcp_t *tcp);
782 782 static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
783 783 int unixerr);
784 784 static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
785 785 int tlierr, int unixerr);
786 786 static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
787 787 cred_t *cr);
788 788 static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
789 789 char *value, caddr_t cp, cred_t *cr);
790 790 static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
791 791 char *value, caddr_t cp, cred_t *cr);
792 792 static int tcp_tpistate(tcp_t *tcp);
793 793 static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
794 794 int caller_holds_lock);
795 795 static void tcp_bind_hash_remove(tcp_t *tcp);
796 796 static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
797 797 void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
798 798 static void tcp_acceptor_hash_remove(tcp_t *tcp);
799 799 static void tcp_capability_req(tcp_t *tcp, mblk_t *mp);
800 800 static void tcp_info_req(tcp_t *tcp, mblk_t *mp);
801 801 static void tcp_addr_req(tcp_t *tcp, mblk_t *mp);
802 802 static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
803 803 void tcp_g_q_setup(tcp_stack_t *);
804 804 void tcp_g_q_create(tcp_stack_t *);
805 805 void tcp_g_q_destroy(tcp_stack_t *);
806 806 static int tcp_header_init_ipv4(tcp_t *tcp);
807 807 static int tcp_header_init_ipv6(tcp_t *tcp);
808 808 int tcp_init(tcp_t *tcp, queue_t *q);
809 809 static int tcp_init_values(tcp_t *tcp);
810 810 static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
811 811 static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
812 812 t_scalar_t addr_length);
813 813 static void tcp_ip_ire_mark_advice(tcp_t *tcp);
814 814 static void tcp_ip_notify(tcp_t *tcp);
815 815 static mblk_t *tcp_ire_mp(mblk_t *mp);
816 816 static void tcp_iss_init(tcp_t *tcp);
817 817 static void tcp_keepalive_killer(void *arg);
818 818 static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
819 819 static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
820 820 static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
821 821 int *do_disconnectp, int *t_errorp, int *sys_errorp);
822 822 static boolean_t tcp_allow_connopt_set(int level, int name);
823 823 int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
824 824 int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
825 825 int tcp_opt_set(queue_t *q, uint_t optset_context, int level,
826 826 int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
827 827 uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
828 828 mblk_t *mblk);
829 829 static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
830 830 static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
831 831 uchar_t *ptr, uint_t len);
832 832 static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
833 833 static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
834 834 tcp_stack_t *);
835 835 static int tcp_param_set(queue_t *q, mblk_t *mp, char *value,
836 836 caddr_t cp, cred_t *cr);
837 837 static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
838 838 caddr_t cp, cred_t *cr);
839 839 static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
840 840 static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
841 841 caddr_t cp, cred_t *cr);
842 842 static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
843 843 static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
844 844 static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
845 845 static void tcp_reinit(tcp_t *tcp);
846 846 static void tcp_reinit_values(tcp_t *tcp);
847 847 static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
848 848 tcp_t *thisstream, cred_t *cr);
849 849
850 850 static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp);
851 851 static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
852 852 static boolean_t tcp_send_rst_chk(tcp_stack_t *);
853 853 static void tcp_ss_rexmit(tcp_t *tcp);
854 854 static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
855 855 static void tcp_process_options(tcp_t *, tcph_t *);
856 856 static void tcp_rput_common(tcp_t *tcp, mblk_t *mp);
857 857 static void tcp_rsrv(queue_t *q);
858 858 static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
859 859 static int tcp_snmp_state(tcp_t *tcp);
860 860 static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
861 861 cred_t *cr);
862 862 static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
863 863 cred_t *cr);
864 864 static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
865 865 cred_t *cr);
866 866 static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
867 867 cred_t *cr);
868 868 static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
869 869 cred_t *cr);
870 870 static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
871 871 caddr_t cp, cred_t *cr);
872 872 static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
873 873 caddr_t cp, cred_t *cr);
874 874 static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
875 875 cred_t *cr);
876 876 static void tcp_timer(void *arg);
877 877 static void tcp_timer_callback(void *);
878 878 static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
879 879 boolean_t random);
880 880 static in_port_t tcp_get_next_priv_port(const tcp_t *);
881 881 static void tcp_wput_sock(queue_t *q, mblk_t *mp);
882 882 void tcp_wput_accept(queue_t *q, mblk_t *mp);
883 883 static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
884 884 static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
885 885 static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
886 886 static int tcp_send(queue_t *q, tcp_t *tcp, const int mss,
887 887 const int tcp_hdr_len, const int tcp_tcp_hdr_len,
888 888 const int num_sack_blk, int *usable, uint_t *snxt,
889 889 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
890 890 const int mdt_thres);
891 891 static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
892 892 const int tcp_hdr_len, const int tcp_tcp_hdr_len,
893 893 const int num_sack_blk, int *usable, uint_t *snxt,
894 894 int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
895 895 const int mdt_thres);
896 896 static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
897 897 int num_sack_blk);
898 898 static void tcp_wsrv(queue_t *q);
899 899 static int tcp_xmit_end(tcp_t *tcp);
900 900 static void tcp_ack_timer(void *arg);
901 901 static mblk_t *tcp_ack_mp(tcp_t *tcp);
902 902 static void tcp_xmit_early_reset(char *str, mblk_t *mp,
903 903 uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
904 904 zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
905 905 static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
906 906 uint32_t ack, int ctl);
907 907 static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *);
908 908 static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *);
909 909 static int setmaxps(queue_t *q, int maxpsz);
910 910 static void tcp_set_rto(tcp_t *, time_t);
911 911 static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
912 912 boolean_t, boolean_t);
913 913 static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
914 914 boolean_t ipsec_mctl);
915 915 static mblk_t *tcp_setsockopt_mp(int level, int cmd,
916 916 char *opt, int optlen);
917 917 static int tcp_build_hdrs(queue_t *, tcp_t *);
918 918 static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
919 919 uint32_t seg_seq, uint32_t seg_ack, int seg_len,
920 920 tcph_t *tcph);
921 921 boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
922 922 boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *);
923 923 boolean_t tcp_reserved_port_del(in_port_t, in_port_t);
924 924 boolean_t tcp_reserved_port_check(in_port_t, tcp_stack_t *);
925 925 static tcp_t *tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *);
926 926 static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
927 927 static mblk_t *tcp_mdt_info_mp(mblk_t *);
928 928 static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
929 929 static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
930 930 const boolean_t, const uint32_t, const uint32_t,
931 931 const uint32_t, const uint32_t, tcp_stack_t *);
932 932 static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
933 933 const uint_t, const uint_t, boolean_t *);
934 934 static mblk_t *tcp_lso_info_mp(mblk_t *);
935 935 static void tcp_lso_update(tcp_t *, ill_lso_capab_t *);
936 936 static void tcp_send_data(tcp_t *, queue_t *, mblk_t *);
937 937 extern mblk_t *tcp_timermp_alloc(int);
938 938 extern void tcp_timermp_free(tcp_t *);
939 939 static void tcp_timer_free(tcp_t *tcp, mblk_t *mp);
940 940 static void tcp_stop_lingering(tcp_t *tcp);
941 941 static void tcp_close_linger_timeout(void *arg);
942 942 static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns);
943 943 static void tcp_stack_shutdown(netstackid_t stackid, void *arg);
944 944 static void tcp_stack_fini(netstackid_t stackid, void *arg);
945 945 static void *tcp_g_kstat_init(tcp_g_stat_t *);
946 946 static void tcp_g_kstat_fini(kstat_t *);
947 947 static void *tcp_kstat_init(netstackid_t, tcp_stack_t *);
948 948 static void tcp_kstat_fini(netstackid_t, kstat_t *);
949 949 static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *);
950 950 static void tcp_kstat2_fini(netstackid_t, kstat_t *);
951 951 static int tcp_kstat_update(kstat_t *kp, int rw);
952 952 void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
953 953 static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
954 954 tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
955 955 static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
956 956 tcph_t *tcph, mblk_t *idmp);
957 957 static squeue_func_t tcp_squeue_switch(int);
958 958
959 959 static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
960 960 static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
961 961 static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
962 962 static int tcp_close(queue_t *, int);
963 963 static int tcpclose_accept(queue_t *);
964 964
965 965 static void tcp_squeue_add(squeue_t *);
966 966 static boolean_t tcp_zcopy_check(tcp_t *);
967 967 static void tcp_zcopy_notify(tcp_t *);
968 968 static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *);
969 969 static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
970 970 static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
971 971
972 972 extern void tcp_kssl_input(tcp_t *, mblk_t *);
973 973
974 974 void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
975 975 void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
976 976
977 977 /*
978 978 * Routines related to the TCP_IOC_ABORT_CONN ioctl command.
979 979 *
980 980 * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
981 981 * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
982 982 * (defined in tcp.h) needs to be filled in and passed into the kernel
983 983 * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
984 984 * structure contains the four-tuple of a TCP connection and a range of TCP
985 985 * states (specified by ac_start and ac_end). The use of wildcard addresses
986 986 * and ports is allowed. Connections with a matching four tuple and a state
987 987 * within the specified range will be aborted. The valid states for the
988 988 * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
989 989 * inclusive.
990 990 *
991 991 * An application which has its connection aborted by this ioctl will receive
992 992 * an error that is dependent on the connection state at the time of the abort.
993 993 * If the connection state is < TCPS_TIME_WAIT, an application should behave as
994 994 * though a RST packet has been received. If the connection state is equal to
995 995 * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
996 996 * and all resources associated with the connection will be freed.
997 997 */
998 998 static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
999 999 static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
1000 1000 static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
1001 1001 static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
1002 1002 static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
1003 1003 static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
1004 1004 boolean_t, tcp_stack_t *);
1005 1005
1006 1006 static struct module_info tcp_rinfo = {
1007 1007 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
1008 1008 };
1009 1009
1010 1010 static struct module_info tcp_winfo = {
1011 1011 TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
1012 1012 };
1013 1013
1014 1014 /*
1015 1015 * Entry points for TCP as a device. The normal case which supports
1016 1016 * the TCP functionality.
1017 1017 * We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
1018 1018 */
1019 1019 struct qinit tcp_rinitv4 = {
1020 1020 NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo
1021 1021 };
1022 1022
1023 1023 struct qinit tcp_rinitv6 = {
1024 1024 NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo
1025 1025 };
1026 1026
1027 1027 struct qinit tcp_winit = {
1028 1028 (pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1029 1029 };
1030 1030
1031 1031 /* Initial entry point for TCP in socket mode. */
1032 1032 struct qinit tcp_sock_winit = {
1033 1033 (pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
1034 1034 };
1035 1035
1036 1036 /*
1037 1037 * Entry points for TCP as a acceptor STREAM opened by sockfs when doing
1038 1038 * an accept. Avoid allocating data structures since eager has already
1039 1039 * been created.
1040 1040 */
1041 1041 struct qinit tcp_acceptor_rinit = {
1042 1042 NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
1043 1043 };
1044 1044
1045 1045 struct qinit tcp_acceptor_winit = {
1046 1046 (pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
1047 1047 };
1048 1048
1049 1049 /*
1050 1050 * Entry points for TCP loopback (read side only)
1051 1051 * The open routine is only used for reopens, thus no need to
1052 1052 * have a separate one for tcp_openv6.
1053 1053 */
1054 1054 struct qinit tcp_loopback_rinit = {
1055 1055 (pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0,
1056 1056 &tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
1057 1057 };
1058 1058
1059 1059 /* For AF_INET aka /dev/tcp */
1060 1060 struct streamtab tcpinfov4 = {
1061 1061 &tcp_rinitv4, &tcp_winit
1062 1062 };
1063 1063
1064 1064 /* For AF_INET6 aka /dev/tcp6 */
1065 1065 struct streamtab tcpinfov6 = {
1066 1066 &tcp_rinitv6, &tcp_winit
1067 1067 };
1068 1068
1069 1069 /*
1070 1070 * Have to ensure that tcp_g_q_close is not done by an
1071 1071 * interrupt thread.
1072 1072 */
1073 1073 static taskq_t *tcp_taskq;
1074 1074
1075 1075 /*
1076 1076 * TCP has a private interface for other kernel modules to reserve a
1077 1077 * port range for them to use. Once reserved, TCP will not use any ports
1078 1078 * in the range. This interface relies on the TCP_EXCLBIND feature. If
1079 1079 * the semantics of TCP_EXCLBIND is changed, implementation of this interface
1080 1080 * has to be verified.
1081 1081 *
1082 1082 * There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port
1083 1083 * range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port
1084 1084 * range is [port a, port b] inclusive. And each port range is between
1085 1085 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
1086 1086 *
1087 1087 * Note that the default anonymous port range starts from 32768. There is
1088 1088 * no port "collision" between that and the reserved port range. If there
1089 1089 * is port collision (because the default smallest anonymous port is lowered
1090 1090 * or some apps specifically bind to ports in the reserved port range), the
1091 1091 * system may not be able to reserve a port range even if there are enough
1092 1092 * unbound ports, as a reserved port range contains consecutive ports.
1093 1093 */
/* Limits on the number, size and placement of reserved port ranges. */
#define	TCP_RESERVED_PORTS_ARRAY_MAX_SIZE	5
#define	TCP_RESERVED_PORTS_RANGE_MAX		1000
#define	TCP_SMALLEST_RESERVED_PORT		10240
#define	TCP_LARGEST_RESERVED_PORT		20480
1098 1098
1099 1099 /* Structure to represent those reserved port ranges. */
1100 1100 typedef struct tcp_rport_s {
1101 1101 in_port_t lo_port;
1102 1102 in_port_t hi_port;
1103 1103 tcp_t **temp_tcp_array;
1104 1104 } tcp_rport_t;
1105 1105
1106 1106 /* Setable only in /etc/system. Move to ndd? */
1107 1107 boolean_t tcp_icmp_source_quench = B_FALSE;
1108 1108
/*
 * Round x up to a multiple of sizeof (int32_t).  The following assumes
 * TPI alignment requirements stay along 32 bit boundaries.
 */
#define	ROUNDUP32(x)							\
	(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
1115 1115
1116 1116 /* Template for response to info request. */
1117 1117 static struct T_info_ack tcp_g_t_info_ack = {
1118 1118 T_INFO_ACK, /* PRIM_type */
1119 1119 0, /* TSDU_size */
1120 1120 T_INFINITE, /* ETSDU_size */
1121 1121 T_INVALID, /* CDATA_size */
1122 1122 T_INVALID, /* DDATA_size */
1123 1123 sizeof (sin_t), /* ADDR_size */
1124 1124 0, /* OPT_size - not initialized here */
1125 1125 TIDUSZ, /* TIDU_size */
1126 1126 T_COTS_ORD, /* SERV_type */
1127 1127 TCPS_IDLE, /* CURRENT_state */
1128 1128 (XPG4_1|EXPINLINE) /* PROVIDER_flag */
1129 1129 };
1130 1130
1131 1131 static struct T_info_ack tcp_g_t_info_ack_v6 = {
1132 1132 T_INFO_ACK, /* PRIM_type */
1133 1133 0, /* TSDU_size */
1134 1134 T_INFINITE, /* ETSDU_size */
1135 1135 T_INVALID, /* CDATA_size */
1136 1136 T_INVALID, /* DDATA_size */
1137 1137 sizeof (sin6_t), /* ADDR_size */
1138 1138 0, /* OPT_size - not initialized here */
1139 1139 TIDUSZ, /* TIDU_size */
1140 1140 T_COTS_ORD, /* SERV_type */
1141 1141 TCPS_IDLE, /* CURRENT_state */
1142 1142 (XPG4_1|EXPINLINE) /* PROVIDER_flag */
1143 1143 };
1144 1144
/* Time units, expressed in milliseconds. */
#define	MS		1L
#define	SECONDS		(1000 * MS)
#define	MINUTES		(60 * SECONDS)
#define	HOURS		(60 * MINUTES)
#define	DAYS		(24 * HOURS)

/* Largest value representable in a 32-bit parameter. */
#define	PARAM_MAX	(~(uint32_t)0)

/* Max size IP datagram is 64k - 1 */
#define	TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define	TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define	TCP_MSS_MAX	TCP_MSS_MAX_IPV4

/* Largest TCP port number */
#define	TCP_MAX_PORT	(64 * 1024 - 1)
1161 1161
1162 1162 /*
1163 1163 * tcp_wroff_xtra is the extra space in front of TCP/IP header for link
1164 1164 * layer header. It has to be a multiple of 4.
1165 1165 */
1166 1166 static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
1167 1167 #define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val
1168 1168
1169 1169 /*
1170 1170 * All of these are alterable, within the min/max values given, at run time.
1171 1171 * Note that the default value of "tcp_time_wait_interval" is four minutes,
1172 1172 * per the TCP spec.
1173 1173 */
1174 1174 /* BEGIN CSTYLED */
1175 1175 static tcpparam_t lcl_tcp_param_arr[] = {
1176 1176 /*min max value name */
1177 1177 { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"},
1178 1178 { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" },
1179 1179 { 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" },
1180 1180 { 1, 1024, 1, "tcp_conn_req_min" },
1181 1181 { 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" },
1182 1182 { 128, (1<<30), 1024*1024, "tcp_cwnd_max" },
1183 1183 { 0, 10, 0, "tcp_debug" },
1184 1184 { 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"},
1185 1185 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"},
1186 1186 { 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"},
1187 1187 { 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"},
1188 1188 { 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"},
1189 1189 { 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"},
1190 1190 { 1, 255, 64, "tcp_ipv4_ttl"},
1191 1191 { 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"},
1192 1192 { 0, 100, 10, "tcp_maxpsz_multiplier" },
1193 1193 { 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"},
1194 1194 { 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
1195 1195 { 1, TCP_MSS_MAX, 108, "tcp_mss_min"},
1196 1196 { 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"},
1197 1197 { 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"},
1198 1198 { 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"},
1199 1199 { 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"},
1200 1200 { 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" },
1201 1201 { 0, 16, 0, "tcp_snd_lowat_fraction" },
1202 1202 { 0, 128000, 0, "tcp_sth_rcv_hiwat" },
1203 1203 { 0, 128000, 0, "tcp_sth_rcv_lowat" },
1204 1204 { 1, 10000, 3, "tcp_dupack_fast_retransmit" },
1205 1205 { 0, 1, 0, "tcp_ignore_path_mtu" },
1206 1206 { 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"},
1207 1207 { 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"},
1208 1208 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
1209 1209 { TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
1210 1210 { TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
1211 1211 { 1, 65536, 4, "tcp_recv_hiwat_minmss"},
1212 1212 { 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"},
1213 1213 { 0, TCP_MSS_MAX, 64, "tcp_co_min"},
1214 1214 { 8192, (1<<30), 1024*1024, "tcp_max_buf"},
1215 1215 /*
1216 1216 * Question: What default value should I set for tcp_strong_iss?
1217 1217 */
1218 1218 { 0, 2, 1, "tcp_strong_iss"},
1219 1219 { 0, 65536, 20, "tcp_rtt_updates"},
1220 1220 { 0, 1, 1, "tcp_wscale_always"},
1221 1221 { 0, 1, 0, "tcp_tstamp_always"},
1222 1222 { 0, 1, 1, "tcp_tstamp_if_wscale"},
1223 1223 { 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"},
1224 1224 { 0, 16, 2, "tcp_deferred_acks_max"},
1225 1225 { 1, 16384, 4, "tcp_slow_start_after_idle"},
1226 1226 { 1, 4, 4, "tcp_slow_start_initial"},
1227 1227 { 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"},
1228 1228 { 0, 2, 2, "tcp_sack_permitted"},
1229 1229 { 0, 1, 0, "tcp_trace"},
1230 1230 { 0, 1, 1, "tcp_compression_enabled"},
1231 1231 { 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
1232 1232 { 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"},
1233 1233 { 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
1234 1234 { 0, 1, 0, "tcp_rev_src_routes"},
1235 1235 { 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"},
1236 1236 { 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"},
1237 1237 { 0, 16, 8, "tcp_local_dacks_max"},
1238 1238 { 0, 2, 1, "tcp_ecn_permitted"},
1239 1239 { 0, 1, 1, "tcp_rst_sent_rate_enabled"},
1240 1240 { 0, PARAM_MAX, 40, "tcp_rst_sent_rate"},
1241 1241 { 0, 100*MS, 50*MS, "tcp_push_timer_interval"},
1242 1242 { 0, 1, 0, "tcp_use_smss_as_mss_opt"},
1243 1243 { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"},
1244 1244 };
1245 1245 /* END CSTYLED */
1246 1246
1247 1247 /*
1248 1248 * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
1249 1249 * each header fragment in the header buffer. Each parameter value has
1250 1250 * to be a multiple of 4 (32-bit aligned).
1251 1251 */
1252 1252 static tcpparam_t lcl_tcp_mdt_head_param =
1253 1253 { 32, 256, 32, "tcp_mdt_hdr_head_min" };
1254 1254 static tcpparam_t lcl_tcp_mdt_tail_param =
1255 1255 { 0, 256, 32, "tcp_mdt_hdr_tail_min" };
1256 1256 #define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val
1257 1257 #define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val
1258 1258
1259 1259 /*
1260 1260 * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
1261 1261 * the maximum number of payload buffers associated per Multidata.
1262 1262 */
1263 1263 static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
1264 1264 { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
1265 1265 #define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val
1266 1266
/* Round value up to the nearest multiple of mss. */
#define	MSS_ROUNDUP(value, mss)						\
	((((value) - 1) / (mss) + 1) * (mss))
1269 1269
/*
 * Set ECN capable transport (ECT) code point in IP header.
 *
 * Note that there are 2 ECT code points '01' and '10', which are called
 * ECT(1) and ECT(0) respectively.  Here we follow the original ECT code
 * point ECT(0) for TCP as described in RFC 2481.
 *
 * tcp selects the header layout through tcp_ipversion; iph points at the
 * IP header and is treated as an ipha_t for IPV4_VERSION and as an ip6_t
 * otherwise.  In both cases the two ECN bits are cleared before ECT(0) is
 * set.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and can be used as the body of an unbraced if/else without
 * capturing the caller's "else".
 */
#define	SET_ECT(tcp, iph)						\
do {									\
	if ((tcp)->tcp_ipversion == IPV4_VERSION) {			\
		/* We need to clear the code point first. */		\
		((ipha_t *)(iph))->ipha_type_of_service &= 0xFC;	\
		((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
	} else {							\
		((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF);		\
		((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20);	\
	}								\
} while (0)
1286 1286
/*
 * The format argument to pass to tcp_display().
 * DISP_PORT_ONLY means that the returned string has only port info.
 * DISP_ADDR_AND_PORT means that the returned string also contains the
 * remote and local IP address.
 */
#define	DISP_PORT_ONLY		1
#define	DISP_ADDR_AND_PORT	2

/* Canned ndd reply texts. */
#define	NDD_TOO_QUICK_MSG \
	"ndd get info rate too high for non-privileged users, try again " \
	"later.\n"
#define	NDD_OUT_OF_BUF_MSG	"<< Out of buffer >>\n"

/* True when the mblk's data block has STRUIO_ZC set in db_struioflag. */
#define	IS_VMLOANED_MBLK(mp) \
	(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
1303 1303
1304 1304
1305 1305 /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
1306 1306 boolean_t tcp_mdt_chain = B_TRUE;
1307 1307
1308 1308 /*
1309 1309 * MDT threshold in the form of effective send MSS multiplier; we take
1310 1310 * the MDT path if the amount of unsent data exceeds the threshold value
1311 1311 * (default threshold is 1*SMSS).
1312 1312 */
1313 1313 uint_t tcp_mdt_smss_threshold = 1;
1314 1314
1315 1315 uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */
1316 1316
1317 1317 /*
1318 1318 * Forces all connections to obey the value of the tcps_maxpsz_multiplier
1319 1319 * tunable settable via NDD. Otherwise, the per-connection behavior is
1320 1320 * determined dynamically during tcp_adapt_ire(), which is the default.
1321 1321 */
1322 1322 boolean_t tcp_static_maxpsz = B_FALSE;
1323 1323
1324 1324 /* Setable in /etc/system */
1325 1325 /* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
1326 1326 uint32_t tcp_random_anon_port = 1;
1327 1327
1328 1328 /*
1329 1329 * To reach to an eager in Q0 which can be dropped due to an incoming
1330 1330 * new SYN request when Q0 is full, a new doubly linked list is
1331 1331 * introduced. This list allows to select an eager from Q0 in O(1) time.
1332 1332 * This is needed to avoid spending too much time walking through the
1333 1333 * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
1334 1334 * this new list has to be a member of Q0.
1335 1335 * This list is headed by listener's tcp_t. When the list is empty,
1336 1336 * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
1337 1337 * of listener's tcp_t point to listener's tcp_t itself.
1338 1338 *
1339 1339 * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
1340 1340 * in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
1341 1341 * These macros do not affect the eager's membership to Q0.
1342 1342 */
1343 1343
1344 1344
/*
 * Link eager in directly after listener on the doubly linked drop-Q0 list.
 * Nothing is done if eager's tcp_eager_next_drop_q0 is already non-NULL.
 * listener's next pointer must be non-NULL (an empty list points back at
 * the listener itself), since it is dereferenced unconditionally.
 *
 * Wrapped in do { } while (0) so the macro is a single statement and does
 * not capture an "else" that follows it in the caller.  Arguments are
 * evaluated more than once and must be free of side effects.
 */
#define	MAKE_DROPPABLE(listener, eager)					\
do {									\
	if ((eager)->tcp_eager_next_drop_q0 == NULL) {			\
		(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \
		    = (eager);						\
		(eager)->tcp_eager_prev_drop_q0 = (listener);		\
		(eager)->tcp_eager_next_drop_q0 =			\
		    (listener)->tcp_eager_next_drop_q0;			\
		(listener)->tcp_eager_next_drop_q0 = (eager);		\
	}								\
} while (0)
1354 1354
/*
 * Unlink an eager from the circular doubly linked drop_q0 list it is on
 * and clear both of its drop_q0 link pointers.  Does nothing when the
 * eager's tcp_eager_next_drop_q0 is already NULL (i.e. it is not linked).
 *
 * The argument is evaluated more than once and must be free of side
 * effects.  The body is wrapped in do { } while (0) so the macro behaves
 * as a single statement; callers must terminate it with a semicolon.
 */
#define	MAKE_UNDROPPABLE(eager)						\
	do {								\
		if ((eager)->tcp_eager_next_drop_q0 != NULL) {		\
			(eager)->tcp_eager_next_drop_q0->		\
			    tcp_eager_prev_drop_q0 =			\
			    (eager)->tcp_eager_prev_drop_q0;		\
			(eager)->tcp_eager_prev_drop_q0->		\
			    tcp_eager_next_drop_q0 =			\
			    (eager)->tcp_eager_next_drop_q0;		\
			(eager)->tcp_eager_prev_drop_q0 = NULL;		\
			(eager)->tcp_eager_next_drop_q0 = NULL;		\
		}							\
	} while (0)
1364 1364
1365 1365 /*
1366 1366 * If tcp_drop_ack_unsent_cnt is greater than 0, then once TCP has received
1367 1367 * more than tcp_drop_ack_unsent_cnt ACKs that acknowledge unsent data,
1368 1368 * TCP will no longer respond with an ACK. RFC 793 requires that
1369 1369 * TCP respond with an ACK to such a bogus ACK. By not following
1370 1370 * the RFC, we prevent TCP from getting into an ACK storm if somehow
1371 1371 * an attacker successfully spoofs an acceptable segment to our
1372 1372 * peer, or when our peer is "confused."
1373 1373 */
1374 1374 uint32_t tcp_drop_ack_unsent_cnt = 10;
1375 1375
1376 1376 /*
1377 1377 * Hook functions to enable cluster networking.
1378 1378 * On non-clustered systems these vectors must always be NULL.
 *
 * Each vector is defined here and initialized to NULL. The listen and
 * unlisten hooks take a protocol, address family, local address pointer
 * and local port; the connect and disconnect hooks additionally take a
 * foreign address pointer and foreign port.
1379 1379 */
1380 1380
1381 1381 void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family,
1382 1382 uint8_t *laddrp, in_port_t lport) = NULL;
1383 1383 void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family,
1384 1384 uint8_t *laddrp, in_port_t lport) = NULL;
1385 1385 void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family,
1386 1386 uint8_t *laddrp, in_port_t lport,
1387 1387 uint8_t *faddrp, in_port_t fport) = NULL;
1388 1388 void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family,
1389 1389 uint8_t *laddrp, in_port_t lport,
1390 1390 uint8_t *faddrp, in_port_t fport) = NULL;
1391 1391
1392 1392 /*
1393 1393 * The following cluster hook vectors are defined in ip.c.
1394 1394 */
1395 1395 extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family,
1396 1396 uint8_t *laddrp);
1397 1397 extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
1398 1398 uint8_t *laddrp, uint8_t *faddrp);
1399 1399
/*
 * If the cl_inet_connect hook is installed (cluster mode), register the
 * active connection described by (tcp) with it.
 *
 * IPv4: local address is tcp_ipha->ipha_src, foreign is tcp_ipha->ipha_dst;
 *	 the hook is not called when the local address is 0.
 * IPv6: local address is tcp_ip6h->ip6_src, foreign is tcp_ip6h->ip6_dst;
 *	 the hook is not called when the local address is unspecified.
 *
 * (tcp) is evaluated several times and must have no side effects.
 */
#define	CL_INET_CONNECT(tcp)	{					\
	if (cl_inet_connect != NULL) {					\
		if ((tcp)->tcp_ipversion != IPV4_VERSION) {		\
			/* IPv6 connection */				\
			if (!IN6_IS_ADDR_UNSPECIFIED(			\
			    &(tcp)->tcp_ip6h->ip6_src)) {		\
				(*cl_inet_connect)(IPPROTO_TCP,		\
				    AF_INET6,				\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ip6h->ip6_src)),	\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ip6h->ip6_dst)),	\
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		} else if ((tcp)->tcp_ipha->ipha_src != 0) {		\
			/* IPv4 connection with a bound source */	\
			(*cl_inet_connect)(IPPROTO_TCP, AF_INET,	\
			    (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),	\
			    (in_port_t)(tcp)->tcp_lport,		\
			    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),	\
			    (in_port_t)(tcp)->tcp_fport);		\
		}							\
	}								\
}
1426 1426
/*
 * If the cl_inet_disconnect hook is installed (cluster mode), deregister
 * the active connection described by (tcp) from it.
 *
 * IPv4: local address is tcp_ip_src, foreign is tcp_ipha->ipha_dst;
 *	 the hook is not called when tcp_ip_src is 0.
 * IPv6: local address is tcp_ip_src_v6, foreign is tcp_ip6h->ip6_dst;
 *	 the hook is not called when tcp_ip_src_v6 is unspecified.
 *
 * (tcp) is evaluated several times and must have no side effects.
 */
#define	CL_INET_DISCONNECT(tcp)	{					\
	if (cl_inet_disconnect != NULL) {				\
		if ((tcp)->tcp_ipversion != IPV4_VERSION) {		\
			/* IPv6 connection */				\
			if (!IN6_IS_ADDR_UNSPECIFIED(			\
			    &(tcp)->tcp_ip_src_v6)) {			\
				(*cl_inet_disconnect)(IPPROTO_TCP,	\
				    AF_INET6,				\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ip_src_v6)),		\
				    (in_port_t)(tcp)->tcp_lport,	\
				    (uint8_t *)				\
				    (&((tcp)->tcp_ip6h->ip6_dst)),	\
				    (in_port_t)(tcp)->tcp_fport);	\
			}						\
		} else if ((tcp)->tcp_ip_src != 0) {			\
			/* IPv4 connection with a bound source */	\
			(*cl_inet_disconnect)(IPPROTO_TCP, AF_INET,	\
			    (uint8_t *)(&((tcp)->tcp_ip_src)),		\
			    (in_port_t)(tcp)->tcp_lport,		\
			    (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),	\
			    (in_port_t)(tcp)->tcp_fport);		\
		}							\
	}								\
}
1456 1456
1457 1457 /*
1458 1458 * Cluster networking hook for traversing current connection list.
1459 1459 * This routine is used to extract the current list of live connections
1460 1460 * which must continue to be dispatched to this node.
1461 1461 */
1462 1462 int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
1463 1463
/* Variant of cl_tcp_walk_list() that additionally takes a tcp_stack_t. */
1464 1464 static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
1465 1465 void *arg, tcp_stack_t *tcps);
1466 1466
/*
 * Expands to a DTRACE_IP7() "send" probe invocation carrying the given
 * mblk, IP header pointers and ill, with a NULL conn_t and a final int
 * argument of 0.  Note that the expansion already ends in a semicolon.
 */
#define	DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h)			\
	DTRACE_IP7(send,						\
	    mblk_t *, mp,						\
	    conn_t *, NULL,						\
	    void_ip_t *, iph,						\
	    __dtrace_ipsr_ill_t *, ill,					\
	    ipha_t *, ipha,						\
	    ip6_t *, ip6h,						\
	    int, 0);
1471 1471
1472 1472 /*
1473 1473 * Figure out the value of window scale opton. Note that the rwnd is
1474 1474 * ASSUMED to be rounded up to the nearest MSS before the calculation.
1475 1475 * We cannot find the scale value and then do a round up of tcp_rwnd
1476 1476 * because the scale value may not be correct after that.
1477 1477 *
1478 1478 * Set the compiler flag to make this function inline.
1479 1479 */
1480 1480 static void
1481 1481 tcp_set_ws_value(tcp_t *tcp)
1482 1482 {
1483 1483 int i;
1484 1484 uint32_t rwnd = tcp->tcp_rwnd;
1485 1485
1486 1486 for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
1487 1487 i++, rwnd >>= 1)
1488 1488 ;
1489 1489 tcp->tcp_rcv_ws = i;
1490 1490 }
1491 1491
1492 1492 /*
1493 1493 * Remove a connection from the list of detached TIME_WAIT connections.
1494 1494 * It returns B_FALSE if it can't remove the connection from the list
1495 1495 * as the connection has already been removed from the list due to an
1496 1496 * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
1497 1497 */
1498 1498 static boolean_t
1499 1499 tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
1500 1500 {
1501 1501 boolean_t locked = B_FALSE;
1502 1502
1503 1503 if (tcp_time_wait == NULL) {
1504 1504 tcp_time_wait = *((tcp_squeue_priv_t **)
1505 1505 squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
1506 1506 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1507 1507 locked = B_TRUE;
1508 1508 } else {
1509 1509 ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
1510 1510 }
1511 1511
1512 1512 if (tcp->tcp_time_wait_expire == 0) {
1513 1513 ASSERT(tcp->tcp_time_wait_next == NULL);
1514 1514 ASSERT(tcp->tcp_time_wait_prev == NULL);
1515 1515 if (locked)
1516 1516 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1517 1517 return (B_FALSE);
1518 1518 }
1519 1519 ASSERT(TCP_IS_DETACHED(tcp));
1520 1520 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1521 1521
1522 1522 if (tcp == tcp_time_wait->tcp_time_wait_head) {
1523 1523 ASSERT(tcp->tcp_time_wait_prev == NULL);
1524 1524 tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
1525 1525 if (tcp_time_wait->tcp_time_wait_head != NULL) {
1526 1526 tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
1527 1527 NULL;
1528 1528 } else {
1529 1529 tcp_time_wait->tcp_time_wait_tail = NULL;
1530 1530 }
1531 1531 } else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
1532 1532 ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
1533 1533 ASSERT(tcp->tcp_time_wait_next == NULL);
1534 1534 tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
1535 1535 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1536 1536 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
1537 1537 } else {
1538 1538 ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
1539 1539 ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
1540 1540 tcp->tcp_time_wait_prev->tcp_time_wait_next =
1541 1541 tcp->tcp_time_wait_next;
1542 1542 tcp->tcp_time_wait_next->tcp_time_wait_prev =
1543 1543 tcp->tcp_time_wait_prev;
1544 1544 }
1545 1545 tcp->tcp_time_wait_next = NULL;
1546 1546 tcp->tcp_time_wait_prev = NULL;
1547 1547 tcp->tcp_time_wait_expire = 0;
1548 1548
1549 1549 if (locked)
1550 1550 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1551 1551 return (B_TRUE);
1552 1552 }
1553 1553
1554 1554 /*
1555 1555 * Add a connection to the list of detached TIME_WAIT connections
1556 1556 * and set its time to expire.
1557 1557 */
1558 1558 static void
1559 1559 tcp_time_wait_append(tcp_t *tcp)
1560 1560 {
1561 1561 tcp_stack_t *tcps = tcp->tcp_tcps;
1562 1562 tcp_squeue_priv_t *tcp_time_wait =
1563 1563 *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
1564 1564 SQPRIVATE_TCP));
1565 1565
1566 1566 tcp_timers_stop(tcp);
1567 1567
1568 1568 /* Freed above */
1569 1569 ASSERT(tcp->tcp_timer_tid == 0);
1570 1570 ASSERT(tcp->tcp_ack_tid == 0);
1571 1571
1572 1572 /* must have happened at the time of detaching the tcp */
1573 1573 ASSERT(tcp->tcp_ptpahn == NULL);
1574 1574 ASSERT(tcp->tcp_flow_stopped == 0);
1575 1575 ASSERT(tcp->tcp_time_wait_next == NULL);
1576 1576 ASSERT(tcp->tcp_time_wait_prev == NULL);
1577 1577 ASSERT(tcp->tcp_time_wait_expire == NULL);
1578 1578 ASSERT(tcp->tcp_listener == NULL);
1579 1579
1580 1580 tcp->tcp_time_wait_expire = ddi_get_lbolt();
1581 1581 /*
1582 1582 * The value computed below in tcp->tcp_time_wait_expire may
1583 1583 * appear negative or wrap around. That is ok since our
1584 1584 * interest is only in the difference between the current lbolt
1585 1585 * value and tcp->tcp_time_wait_expire. But the value should not
1586 1586 * be zero, since it means the tcp is not in the TIME_WAIT list.
1587 1587 * The corresponding comparison in tcp_time_wait_collector() uses
1588 1588 * modular arithmetic.
1589 1589 */
1590 1590 tcp->tcp_time_wait_expire +=
1591 1591 drv_usectohz(tcps->tcps_time_wait_interval * 1000);
1592 1592 if (tcp->tcp_time_wait_expire == 0)
1593 1593 tcp->tcp_time_wait_expire = 1;
1594 1594
1595 1595 ASSERT(TCP_IS_DETACHED(tcp));
1596 1596 ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
1597 1597 ASSERT(tcp->tcp_time_wait_next == NULL);
1598 1598 ASSERT(tcp->tcp_time_wait_prev == NULL);
1599 1599 TCP_DBGSTAT(tcps, tcp_time_wait);
1600 1600
1601 1601 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1602 1602 if (tcp_time_wait->tcp_time_wait_head == NULL) {
1603 1603 ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
1604 1604 tcp_time_wait->tcp_time_wait_head = tcp;
1605 1605 } else {
1606 1606 ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
1607 1607 ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
1608 1608 TCPS_TIME_WAIT);
1609 1609 tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
1610 1610 tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
1611 1611 }
1612 1612 tcp_time_wait->tcp_time_wait_tail = tcp;
1613 1613 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1614 1614 }
1615 1615
1616 1616 /* ARGSUSED */
1617 1617 void
1618 1618 tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
1619 1619 {
1620 1620 conn_t *connp = (conn_t *)arg;
1621 1621 tcp_t *tcp = connp->conn_tcp;
1622 1622 tcp_stack_t *tcps = tcp->tcp_tcps;
1623 1623
1624 1624 ASSERT(tcp != NULL);
1625 1625 if (tcp->tcp_state == TCPS_CLOSED) {
1626 1626 return;
1627 1627 }
1628 1628
1629 1629 ASSERT((tcp->tcp_family == AF_INET &&
1630 1630 tcp->tcp_ipversion == IPV4_VERSION) ||
1631 1631 (tcp->tcp_family == AF_INET6 &&
1632 1632 (tcp->tcp_ipversion == IPV4_VERSION ||
1633 1633 tcp->tcp_ipversion == IPV6_VERSION)));
1634 1634 ASSERT(!tcp->tcp_listener);
1635 1635
1636 1636 TCP_STAT(tcps, tcp_time_wait_reap);
1637 1637 ASSERT(TCP_IS_DETACHED(tcp));
1638 1638
1639 1639 /*
1640 1640 * Because they have no upstream client to rebind or tcp_close()
1641 1641 * them later, we axe the connection here and now.
1642 1642 */
1643 1643 tcp_close_detached(tcp);
1644 1644 }
1645 1645
1646 1646 /*
1647 1647 * Remove cached/latched IPsec references.
1648 1648 */
1649 1649 void
1650 1650 tcp_ipsec_cleanup(tcp_t *tcp)
1651 1651 {
1652 1652 conn_t *connp = tcp->tcp_connp;
1653 1653
1654 1654 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1655 1655
1656 1656 if (connp->conn_latch != NULL) {
1657 1657 IPLATCH_REFRELE(connp->conn_latch,
1658 1658 connp->conn_netstack);
1659 1659 connp->conn_latch = NULL;
1660 1660 }
1661 1661 if (connp->conn_policy != NULL) {
1662 1662 IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
1663 1663 connp->conn_policy = NULL;
1664 1664 }
1665 1665 }
1666 1666
1667 1667 /*
1668 1668 * Cleaup before placing on free list.
1669 1669 * Disassociate from the netstack/tcp_stack_t since the freelist
1670 1670 * is per squeue and not per netstack.
1671 1671 */
1672 1672 void
1673 1673 tcp_cleanup(tcp_t *tcp)
1674 1674 {
1675 1675 mblk_t *mp;
1676 1676 char *tcp_iphc;
1677 1677 int tcp_iphc_len;
1678 1678 int tcp_hdr_grown;
1679 1679 tcp_sack_info_t *tcp_sack_info;
1680 1680 conn_t *connp = tcp->tcp_connp;
1681 1681 tcp_stack_t *tcps = tcp->tcp_tcps;
1682 1682 netstack_t *ns = tcps->tcps_netstack;
1683 1683
1684 1684 tcp_bind_hash_remove(tcp);
1685 1685
1686 1686 /* Cleanup that which needs the netstack first */
1687 1687 tcp_ipsec_cleanup(tcp);
1688 1688
1689 1689 tcp_free(tcp);
1690 1690
1691 1691 /* Release any SSL context */
1692 1692 if (tcp->tcp_kssl_ent != NULL) {
1693 1693 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
1694 1694 tcp->tcp_kssl_ent = NULL;
1695 1695 }
1696 1696
1697 1697 if (tcp->tcp_kssl_ctx != NULL) {
1698 1698 kssl_release_ctx(tcp->tcp_kssl_ctx);
1699 1699 tcp->tcp_kssl_ctx = NULL;
1700 1700 }
1701 1701 tcp->tcp_kssl_pending = B_FALSE;
1702 1702
1703 1703 conn_delete_ire(connp, NULL);
1704 1704
1705 1705 /*
1706 1706 * Since we will bzero the entire structure, we need to
1707 1707 * remove it and reinsert it in global hash list. We
1708 1708 * know the walkers can't get to this conn because we
1709 1709 * had set CONDEMNED flag earlier and checked reference
1710 1710 * under conn_lock so walker won't pick it and when we
1711 1711 * go the ipcl_globalhash_remove() below, no walker
1712 1712 * can get to it.
1713 1713 */
1714 1714 ipcl_globalhash_remove(connp);
1715 1715
1716 1716 /*
1717 1717 * Now it is safe to decrement the reference counts.
1718 1718 * This might be the last reference on the netstack and TCPS
1719 1719 * in which case it will cause the tcp_g_q_close and
1720 1720 * the freeing of the IP Instance.
1721 1721 */
1722 1722 connp->conn_netstack = NULL;
1723 1723 netstack_rele(ns);
1724 1724 ASSERT(tcps != NULL);
1725 1725 tcp->tcp_tcps = NULL;
1726 1726 TCPS_REFRELE(tcps);
1727 1727
1728 1728 /* Save some state */
1729 1729 mp = tcp->tcp_timercache;
1730 1730
1731 1731 tcp_sack_info = tcp->tcp_sack_info;
1732 1732 tcp_iphc = tcp->tcp_iphc;
1733 1733 tcp_iphc_len = tcp->tcp_iphc_len;
1734 1734 tcp_hdr_grown = tcp->tcp_hdr_grown;
1735 1735
1736 1736 if (connp->conn_cred != NULL) {
1737 1737 crfree(connp->conn_cred);
1738 1738 connp->conn_cred = NULL;
1739 1739 }
1740 1740 if (connp->conn_peercred != NULL) {
1741 1741 crfree(connp->conn_peercred);
1742 1742 connp->conn_peercred = NULL;
1743 1743 }
1744 1744 ipcl_conn_cleanup(connp);
1745 1745 connp->conn_flags = IPCL_TCPCONN;
1746 1746 bzero(tcp, sizeof (tcp_t));
1747 1747
1748 1748 /* restore the state */
1749 1749 tcp->tcp_timercache = mp;
1750 1750
1751 1751 tcp->tcp_sack_info = tcp_sack_info;
1752 1752 tcp->tcp_iphc = tcp_iphc;
1753 1753 tcp->tcp_iphc_len = tcp_iphc_len;
1754 1754 tcp->tcp_hdr_grown = tcp_hdr_grown;
1755 1755
1756 1756 tcp->tcp_connp = connp;
1757 1757
1758 1758 ASSERT(connp->conn_tcp == tcp);
1759 1759 ASSERT(connp->conn_flags & IPCL_TCPCONN);
1760 1760 connp->conn_state_flags = CONN_INCIPIENT;
1761 1761 ASSERT(connp->conn_ulp == IPPROTO_TCP);
1762 1762 ASSERT(connp->conn_ref == 1);
1763 1763 }
1764 1764
1765 1765 /*
1766 1766 * Blows away all tcps whose TIME_WAIT has expired. List traversal
1767 1767 * is done forwards from the head.
1768 1768 * This walks all stack instances since
1769 1769 * tcp_time_wait remains global across all stacks.
1770 1770 */
1771 1771 /* ARGSUSED */
1772 1772 void
1773 1773 tcp_time_wait_collector(void *arg)
1774 1774 {
1775 1775 tcp_t *tcp;
1776 1776 clock_t now;
1777 1777 mblk_t *mp;
1778 1778 conn_t *connp;
1779 1779 kmutex_t *lock;
1780 1780 boolean_t removed;
1781 1781
1782 1782 squeue_t *sqp = (squeue_t *)arg;
1783 1783 tcp_squeue_priv_t *tcp_time_wait =
1784 1784 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
1785 1785
1786 1786 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1787 1787 tcp_time_wait->tcp_time_wait_tid = 0;
1788 1788
1789 1789 if (tcp_time_wait->tcp_free_list != NULL &&
1790 1790 tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
1791 1791 TCP_G_STAT(tcp_freelist_cleanup);
1792 1792 while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
1793 1793 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
1794 1794 tcp->tcp_time_wait_next = NULL;
1795 1795 tcp_time_wait->tcp_free_list_cnt--;
1796 1796 ASSERT(tcp->tcp_tcps == NULL);
1797 1797 CONN_DEC_REF(tcp->tcp_connp);
1798 1798 }
1799 1799 ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
1800 1800 }
1801 1801
1802 1802 /*
1803 1803 * In order to reap time waits reliably, we should use a
1804 1804 * source of time that is not adjustable by the user -- hence
1805 1805 * the call to ddi_get_lbolt().
1806 1806 */
1807 1807 now = ddi_get_lbolt();
1808 1808 while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
1809 1809 /*
1810 1810 * Compare times using modular arithmetic, since
1811 1811 * lbolt can wrapover.
1812 1812 */
1813 1813 if ((now - tcp->tcp_time_wait_expire) < 0) {
1814 1814 break;
1815 1815 }
1816 1816
1817 1817 removed = tcp_time_wait_remove(tcp, tcp_time_wait);
1818 1818 ASSERT(removed);
1819 1819
1820 1820 connp = tcp->tcp_connp;
1821 1821 ASSERT(connp->conn_fanout != NULL);
1822 1822 lock = &connp->conn_fanout->connf_lock;
1823 1823 /*
1824 1824 * This is essentially a TW reclaim fast path optimization for
1825 1825 * performance where the timewait collector checks under the
1826 1826 * fanout lock (so that no one else can get access to the
1827 1827 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
1828 1828 * the classifier hash list. If ref count is indeed 2, we can
1829 1829 * just remove the conn under the fanout lock and avoid
1830 1830 * cleaning up the conn under the squeue, provided that
1831 1831 * clustering callbacks are not enabled. If clustering is
1832 1832 * enabled, we need to make the clustering callback before
1833 1833 * setting the CONDEMNED flag and after dropping all locks and
1834 1834 * so we forego this optimization and fall back to the slow
1835 1835 * path. Also please see the comments in tcp_closei_local
1836 1836 * regarding the refcnt logic.
1837 1837 *
1838 1838 * Since we are holding the tcp_time_wait_lock, its better
1839 1839 * not to block on the fanout_lock because other connections
1840 1840 * can't add themselves to time_wait list. So we do a
1841 1841 * tryenter instead of mutex_enter.
1842 1842 */
1843 1843 if (mutex_tryenter(lock)) {
1844 1844 mutex_enter(&connp->conn_lock);
1845 1845 if ((connp->conn_ref == 2) &&
1846 1846 (cl_inet_disconnect == NULL)) {
1847 1847 ipcl_hash_remove_locked(connp,
1848 1848 connp->conn_fanout);
1849 1849 /*
1850 1850 * Set the CONDEMNED flag now itself so that
1851 1851 * the refcnt cannot increase due to any
1852 1852 * walker. But we have still not cleaned up
1853 1853 * conn_ire_cache. This is still ok since
1854 1854 * we are going to clean it up in tcp_cleanup
1855 1855 * immediately and any interface unplumb
1856 1856 * thread will wait till the ire is blown away
1857 1857 */
1858 1858 connp->conn_state_flags |= CONN_CONDEMNED;
1859 1859 mutex_exit(lock);
1860 1860 mutex_exit(&connp->conn_lock);
1861 1861 if (tcp_time_wait->tcp_free_list_cnt <
1862 1862 tcp_free_list_max_cnt) {
1863 1863 /* Add to head of tcp_free_list */
1864 1864 mutex_exit(
1865 1865 &tcp_time_wait->tcp_time_wait_lock);
1866 1866 tcp_cleanup(tcp);
1867 1867 ASSERT(connp->conn_latch == NULL);
1868 1868 ASSERT(connp->conn_policy == NULL);
1869 1869 ASSERT(tcp->tcp_tcps == NULL);
1870 1870 ASSERT(connp->conn_netstack == NULL);
1871 1871
1872 1872 mutex_enter(
1873 1873 &tcp_time_wait->tcp_time_wait_lock);
1874 1874 tcp->tcp_time_wait_next =
1875 1875 tcp_time_wait->tcp_free_list;
1876 1876 tcp_time_wait->tcp_free_list = tcp;
1877 1877 tcp_time_wait->tcp_free_list_cnt++;
1878 1878 continue;
1879 1879 } else {
1880 1880 /* Do not add to tcp_free_list */
1881 1881 mutex_exit(
1882 1882 &tcp_time_wait->tcp_time_wait_lock);
1883 1883 tcp_bind_hash_remove(tcp);
1884 1884 conn_delete_ire(tcp->tcp_connp, NULL);
1885 1885 tcp_ipsec_cleanup(tcp);
1886 1886 CONN_DEC_REF(tcp->tcp_connp);
1887 1887 }
1888 1888 } else {
1889 1889 CONN_INC_REF_LOCKED(connp);
1890 1890 mutex_exit(lock);
1891 1891 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1892 1892 mutex_exit(&connp->conn_lock);
1893 1893 /*
1894 1894 * We can reuse the closemp here since conn has
1895 1895 * detached (otherwise we wouldn't even be in
1896 1896 * time_wait list). tcp_closemp_used can safely
1897 1897 * be changed without taking a lock as no other
1898 1898 * thread can concurrently access it at this
1899 1899 * point in the connection lifecycle.
1900 1900 */
1901 1901
1902 1902 if (tcp->tcp_closemp.b_prev == NULL)
1903 1903 tcp->tcp_closemp_used = B_TRUE;
1904 1904 else
1905 1905 cmn_err(CE_PANIC,
1906 1906 "tcp_timewait_collector: "
1907 1907 "concurrent use of tcp_closemp: "
1908 1908 "connp %p tcp %p\n", (void *)connp,
1909 1909 (void *)tcp);
1910 1910
1911 1911 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1912 1912 mp = &tcp->tcp_closemp;
1913 1913 squeue_fill(connp->conn_sqp, mp,
1914 1914 tcp_timewait_output, connp,
1915 1915 SQTAG_TCP_TIMEWAIT);
1916 1916 }
1917 1917 } else {
1918 1918 mutex_enter(&connp->conn_lock);
1919 1919 CONN_INC_REF_LOCKED(connp);
1920 1920 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1921 1921 mutex_exit(&connp->conn_lock);
1922 1922 /*
1923 1923 * We can reuse the closemp here since conn has
1924 1924 * detached (otherwise we wouldn't even be in
1925 1925 * time_wait list). tcp_closemp_used can safely
1926 1926 * be changed without taking a lock as no other
1927 1927 * thread can concurrently access it at this
1928 1928 * point in the connection lifecycle.
1929 1929 */
1930 1930
1931 1931 if (tcp->tcp_closemp.b_prev == NULL)
1932 1932 tcp->tcp_closemp_used = B_TRUE;
1933 1933 else
1934 1934 cmn_err(CE_PANIC, "tcp_timewait_collector: "
1935 1935 "concurrent use of tcp_closemp: "
1936 1936 "connp %p tcp %p\n", (void *)connp,
1937 1937 (void *)tcp);
1938 1938
1939 1939 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
1940 1940 mp = &tcp->tcp_closemp;
1941 1941 squeue_fill(connp->conn_sqp, mp,
1942 1942 tcp_timewait_output, connp, 0);
1943 1943 }
1944 1944 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
1945 1945 }
1946 1946
1947 1947 if (tcp_time_wait->tcp_free_list != NULL)
1948 1948 tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
1949 1949
1950 1950 tcp_time_wait->tcp_time_wait_tid =
1951 1951 timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY);
1952 1952 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
1953 1953 }
1954 1954 /*
1955 1955 * Reply to a client's T_CONN_RES TPI message. This function
1956 1956 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
1957 1957 * on the acceptor STREAM and processed in tcp_wput_accept().
1958 1958 * Read the block comment on top of tcp_conn_request().
1959 1959 */
1960 1960 static void
1961 1961 tcp_accept(tcp_t *listener, mblk_t *mp)
1962 1962 {
1963 1963 tcp_t *acceptor;
1964 1964 tcp_t *eager;
1965 1965 tcp_t *tcp;
1966 1966 struct T_conn_res *tcr;
1967 1967 t_uscalar_t acceptor_id;
1968 1968 t_scalar_t seqnum;
1969 1969 mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */
1970 1970 mblk_t *ok_mp;
1971 1971 mblk_t *mp1;
1972 1972 tcp_stack_t *tcps = listener->tcp_tcps;
1973 1973
1974 1974 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
1975 1975 tcp_err_ack(listener, mp, TPROTO, 0);
1976 1976 return;
1977 1977 }
1978 1978 tcr = (struct T_conn_res *)mp->b_rptr;
1979 1979
1980 1980 /*
1981 1981 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
1982 1982 * read side queue of the streams device underneath us i.e. the
1983 1983 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
1984 1984 * look it up in the queue_hash. Under LP64 it sends down the
1985 1985 * minor_t of the accepting endpoint.
1986 1986 *
1987 1987 * Once the acceptor/eager are modified (in tcp_accept_swap) the
1988 1988 * fanout hash lock is held.
1989 1989 * This prevents any thread from entering the acceptor queue from
1990 1990 * below (since it has not been hard bound yet i.e. any inbound
1991 1991 * packets will arrive on the listener or default tcp queue and
1992 1992 * go through tcp_lookup).
1993 1993 * The CONN_INC_REF will prevent the acceptor from closing.
1994 1994 *
1995 1995 * XXX It is still possible for a tli application to send down data
1996 1996 * on the accepting stream while another thread calls t_accept.
1997 1997 * This should not be a problem for well-behaved applications since
1998 1998 * the T_OK_ACK is sent after the queue swapping is completed.
1999 1999 *
2000 2000 * If the accepting fd is the same as the listening fd, avoid
2001 2001 * queue hash lookup since that will return an eager listener in a
2002 2002 * already established state.
2003 2003 */
2004 2004 acceptor_id = tcr->ACCEPTOR_id;
2005 2005 mutex_enter(&listener->tcp_eager_lock);
2006 2006 if (listener->tcp_acceptor_id == acceptor_id) {
2007 2007 eager = listener->tcp_eager_next_q;
2008 2008 /* only count how many T_CONN_INDs so don't count q0 */
2009 2009 if ((listener->tcp_conn_req_cnt_q != 1) ||
2010 2010 (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
2011 2011 mutex_exit(&listener->tcp_eager_lock);
2012 2012 tcp_err_ack(listener, mp, TBADF, 0);
2013 2013 return;
2014 2014 }
2015 2015 if (listener->tcp_conn_req_cnt_q0 != 0) {
2016 2016 /* Throw away all the eagers on q0. */
2017 2017 tcp_eager_cleanup(listener, 1);
2018 2018 }
2019 2019 if (listener->tcp_syn_defense) {
2020 2020 listener->tcp_syn_defense = B_FALSE;
2021 2021 if (listener->tcp_ip_addr_cache != NULL) {
2022 2022 kmem_free(listener->tcp_ip_addr_cache,
2023 2023 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
2024 2024 listener->tcp_ip_addr_cache = NULL;
2025 2025 }
2026 2026 }
2027 2027 /*
2028 2028 * Transfer tcp_conn_req_max to the eager so that when
2029 2029 * a disconnect occurs we can revert the endpoint to the
2030 2030 * listen state.
2031 2031 */
2032 2032 eager->tcp_conn_req_max = listener->tcp_conn_req_max;
2033 2033 ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
2034 2034 /*
2035 2035 * Get a reference on the acceptor just like the
2036 2036 * tcp_acceptor_hash_lookup below.
2037 2037 */
2038 2038 acceptor = listener;
2039 2039 CONN_INC_REF(acceptor->tcp_connp);
2040 2040 } else {
2041 2041 acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
2042 2042 if (acceptor == NULL) {
2043 2043 if (listener->tcp_debug) {
2044 2044 (void) strlog(TCP_MOD_ID, 0, 1,
2045 2045 SL_ERROR|SL_TRACE,
2046 2046 "tcp_accept: did not find acceptor 0x%x\n",
2047 2047 acceptor_id);
2048 2048 }
2049 2049 mutex_exit(&listener->tcp_eager_lock);
2050 2050 tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
2051 2051 return;
2052 2052 }
2053 2053 /*
2054 2054 * Verify acceptor state. The acceptable states for an acceptor
2055 2055 * include TCPS_IDLE and TCPS_BOUND.
2056 2056 */
2057 2057 switch (acceptor->tcp_state) {
2058 2058 case TCPS_IDLE:
2059 2059 /* FALLTHRU */
2060 2060 case TCPS_BOUND:
2061 2061 break;
2062 2062 default:
2063 2063 CONN_DEC_REF(acceptor->tcp_connp);
2064 2064 mutex_exit(&listener->tcp_eager_lock);
2065 2065 tcp_err_ack(listener, mp, TOUTSTATE, 0);
2066 2066 return;
2067 2067 }
2068 2068 }
2069 2069
2070 2070 /* The listener must be in TCPS_LISTEN */
2071 2071 if (listener->tcp_state != TCPS_LISTEN) {
2072 2072 CONN_DEC_REF(acceptor->tcp_connp);
2073 2073 mutex_exit(&listener->tcp_eager_lock);
2074 2074 tcp_err_ack(listener, mp, TOUTSTATE, 0);
2075 2075 return;
2076 2076 }
2077 2077
2078 2078 /*
2079 2079 * Rendezvous with an eager connection request packet hanging off
2080 2080 * 'tcp' that has the 'seqnum' tag. We tagged the detached open
2081 2081 * tcp structure when the connection packet arrived in
2082 2082 * tcp_conn_request().
2083 2083 */
2084 2084 seqnum = tcr->SEQ_number;
2085 2085 eager = listener;
2086 2086 do {
2087 2087 eager = eager->tcp_eager_next_q;
2088 2088 if (eager == NULL) {
2089 2089 CONN_DEC_REF(acceptor->tcp_connp);
2090 2090 mutex_exit(&listener->tcp_eager_lock);
2091 2091 tcp_err_ack(listener, mp, TBADSEQ, 0);
2092 2092 return;
2093 2093 }
2094 2094 } while (eager->tcp_conn_req_seqnum != seqnum);
2095 2095 mutex_exit(&listener->tcp_eager_lock);
2096 2096
2097 2097 /*
2098 2098 * At this point, both acceptor and listener have 2 ref
2099 2099 * that they begin with. Acceptor has one additional ref
2100 2100 * we placed in lookup while listener has 3 additional
2101 2101 * ref for being behind the squeue (tcp_accept() is
2102 2102 * done on listener's squeue); being in classifier hash;
2103 2103 * and eager's ref on listener.
2104 2104 */
2105 2105 ASSERT(listener->tcp_connp->conn_ref >= 5);
2106 2106 ASSERT(acceptor->tcp_connp->conn_ref >= 3);
2107 2107
2108 2108 /*
2109 2109 * The eager at this point is set in its own squeue and
2110 2110 * could easily have been killed (tcp_accept_finish will
2111 2111 * deal with that) because of a TH_RST so we can only
2112 2112 * ASSERT for a single ref.
2113 2113 */
2114 2114 ASSERT(eager->tcp_connp->conn_ref >= 1);
2115 2115
2116 2116 /* Pre allocate the stroptions mblk also */
2117 2117 opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
2118 2118 if (opt_mp == NULL) {
2119 2119 CONN_DEC_REF(acceptor->tcp_connp);
2120 2120 CONN_DEC_REF(eager->tcp_connp);
2121 2121 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2122 2122 return;
2123 2123 }
2124 2124 DB_TYPE(opt_mp) = M_SETOPTS;
2125 2125 opt_mp->b_wptr += sizeof (struct stroptions);
2126 2126
2127 2127 /*
2128 2128 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
2129 2129 * from listener to acceptor. The message is chained on opt_mp
2130 2130 * which will be sent onto eager's squeue.
2131 2131 */
2132 2132 if (listener->tcp_bound_if != 0) {
2133 2133 /* allocate optmgmt req */
2134 2134 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2135 2135 IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
2136 2136 sizeof (int));
2137 2137 if (mp1 != NULL)
2138 2138 linkb(opt_mp, mp1);
2139 2139 }
2140 2140 if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
2141 2141 uint_t on = 1;
2142 2142
2143 2143 /* allocate optmgmt req */
2144 2144 mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
2145 2145 IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
2146 2146 if (mp1 != NULL)
2147 2147 linkb(opt_mp, mp1);
2148 2148 }
2149 2149
2150 2150 /* Re-use mp1 to hold a copy of mp, in case reallocb fails */
2151 2151 if ((mp1 = copymsg(mp)) == NULL) {
2152 2152 CONN_DEC_REF(acceptor->tcp_connp);
2153 2153 CONN_DEC_REF(eager->tcp_connp);
2154 2154 freemsg(opt_mp);
2155 2155 tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
2156 2156 return;
2157 2157 }
2158 2158
2159 2159 tcr = (struct T_conn_res *)mp1->b_rptr;
2160 2160
2161 2161 /*
2162 2162 * This is an expanded version of mi_tpi_ok_ack_alloc()
2163 2163 * which allocates a larger mblk and appends the new
2164 2164 * local address to the ok_ack. The address is copied by
2165 2165 * soaccept() for getsockname().
2166 2166 */
2167 2167 {
2168 2168 int extra;
2169 2169
2170 2170 extra = (eager->tcp_family == AF_INET) ?
2171 2171 sizeof (sin_t) : sizeof (sin6_t);
2172 2172
2173 2173 /*
2174 2174 * Try to re-use mp, if possible. Otherwise, allocate
2175 2175 * an mblk and return it as ok_mp. In any case, mp
2176 2176 * is no longer usable upon return.
2177 2177 */
2178 2178 if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
2179 2179 CONN_DEC_REF(acceptor->tcp_connp);
2180 2180 CONN_DEC_REF(eager->tcp_connp);
2181 2181 freemsg(opt_mp);
2182 2182 /* Original mp has been freed by now, so use mp1 */
2183 2183 tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
2184 2184 return;
2185 2185 }
2186 2186
2187 2187 mp = NULL; /* We should never use mp after this point */
2188 2188
2189 2189 switch (extra) {
2190 2190 case sizeof (sin_t): {
2191 2191 sin_t *sin = (sin_t *)ok_mp->b_wptr;
2192 2192
2193 2193 ok_mp->b_wptr += extra;
2194 2194 sin->sin_family = AF_INET;
2195 2195 sin->sin_port = eager->tcp_lport;
2196 2196 sin->sin_addr.s_addr =
2197 2197 eager->tcp_ipha->ipha_src;
2198 2198 break;
2199 2199 }
2200 2200 case sizeof (sin6_t): {
2201 2201 sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
2202 2202
2203 2203 ok_mp->b_wptr += extra;
2204 2204 sin6->sin6_family = AF_INET6;
2205 2205 sin6->sin6_port = eager->tcp_lport;
2206 2206 if (eager->tcp_ipversion == IPV4_VERSION) {
2207 2207 sin6->sin6_flowinfo = 0;
2208 2208 IN6_IPADDR_TO_V4MAPPED(
2209 2209 eager->tcp_ipha->ipha_src,
2210 2210 &sin6->sin6_addr);
2211 2211 } else {
2212 2212 ASSERT(eager->tcp_ip6h != NULL);
2213 2213 sin6->sin6_flowinfo =
2214 2214 eager->tcp_ip6h->ip6_vcf &
2215 2215 ~IPV6_VERS_AND_FLOW_MASK;
2216 2216 sin6->sin6_addr =
2217 2217 eager->tcp_ip6h->ip6_src;
2218 2218 }
2219 2219 sin6->sin6_scope_id = 0;
2220 2220 sin6->__sin6_src_id = 0;
2221 2221 break;
2222 2222 }
2223 2223 default:
2224 2224 break;
2225 2225 }
2226 2226 ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
2227 2227 }
2228 2228
2229 2229 /*
2230 2230 * If there are no options we know that the T_CONN_RES will
2231 2231 * succeed. However, we can't send the T_OK_ACK upstream until
2232 2232 * the tcp_accept_swap is done since it would be dangerous to
2233 2233 * let the application start using the new fd prior to the swap.
2234 2234 */
2235 2235 tcp_accept_swap(listener, acceptor, eager);
2236 2236
2237 2237 /*
2238 2238 * tcp_accept_swap unlinks eager from listener but does not drop
2239 2239 * the eager's reference on the listener.
2240 2240 */
2241 2241 ASSERT(eager->tcp_listener == NULL);
2242 2242 ASSERT(listener->tcp_connp->conn_ref >= 5);
2243 2243
2244 2244 /*
2245 2245 * The eager is now associated with its own queue. Insert in
2246 2246 * the hash so that the connection can be reused for a future
2247 2247 * T_CONN_RES.
2248 2248 */
2249 2249 tcp_acceptor_hash_insert(acceptor_id, eager);
2250 2250
2251 2251 /*
2252 2252 * We now do the processing of options with T_CONN_RES.
2253 2253 * We delay till now since we wanted to have queue to pass to
2254 2254 * option processing routines that points back to the right
2255 2255 * instance structure which does not happen until after
2256 2256 * tcp_accept_swap().
2257 2257 *
2258 2258 * Note:
2259 2259 * The sanity of the logic here assumes that whatever options
2260 2260 * are appropriate to inherit from listener=>eager are done
2261 2261 * before this point, and whatever were to be overridden (or not)
2262 2262 * in transfer logic from eager=>acceptor in tcp_accept_swap().
2263 2263 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
2264 2264 * before its ACCEPTOR_id comes down in T_CONN_RES ]
2265 2265 * This may not be true at this point in time but can be fixed
2266 2266 * independently. This option processing code starts with
2267 2267 * the instantiated acceptor instance and the final queue at
2268 2268 * this point.
2269 2269 */
2270 2270
2271 2271 if (tcr->OPT_length != 0) {
2272 2272 /* Options to process */
2273 2273 int t_error = 0;
2274 2274 int sys_error = 0;
2275 2275 int do_disconnect = 0;
2276 2276
2277 2277 if (tcp_conprim_opt_process(eager, mp1,
2278 2278 &do_disconnect, &t_error, &sys_error) < 0) {
2279 2279 eager->tcp_accept_error = 1;
2280 2280 if (do_disconnect) {
2281 2281 /*
2282 2282 * An option failed which does not allow
2283 2283 * connection to be accepted.
2284 2284 *
2285 2285 * We allow T_CONN_RES to succeed and
2286 2286 * put a T_DISCON_IND on the eager queue.
2287 2287 */
2288 2288 ASSERT(t_error == 0 && sys_error == 0);
2289 2289 eager->tcp_send_discon_ind = 1;
2290 2290 } else {
2291 2291 ASSERT(t_error != 0);
2292 2292 freemsg(ok_mp);
2293 2293 /*
2294 2294 * Original mp was either freed or set
2295 2295 * to ok_mp above, so use mp1 instead.
2296 2296 */
2297 2297 tcp_err_ack(listener, mp1, t_error, sys_error);
2298 2298 goto finish;
2299 2299 }
2300 2300 }
2301 2301 /*
2302 2302 * Most likely success in setting options (except if
2303 2303 * eager->tcp_send_discon_ind set).
2304 2304 * mp1 option buffer represented by OPT_length/offset
2305 2305 * potentially modified and contains results of setting
2306 2306 * options at this point
2307 2307 */
2308 2308 }
2309 2309
2310 2310 /* We no longer need mp1, since all options processing has passed */
2311 2311 freemsg(mp1);
2312 2312
2313 2313 putnext(listener->tcp_rq, ok_mp);
2314 2314
2315 2315 mutex_enter(&listener->tcp_eager_lock);
2316 2316 if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
2317 2317 tcp_t *tail;
2318 2318 mblk_t *conn_ind;
2319 2319
2320 2320 /*
2321 2321 * This path should not be executed if listener and
2322 2322 * acceptor streams are the same.
2323 2323 */
2324 2324 ASSERT(listener != acceptor);
2325 2325
2326 2326 tcp = listener->tcp_eager_prev_q0;
2327 2327 /*
2328 2328 * listener->tcp_eager_prev_q0 points to the TAIL of the
2329 2329 * deferred T_conn_ind queue. We need to get to the head of
2330 2330 * the queue in order to send up T_conn_ind the same order as
2331 2331 * how the 3WHS is completed.
2332 2332 */
2333 2333 while (tcp != listener) {
2334 2334 if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
2335 2335 break;
2336 2336 else
2337 2337 tcp = tcp->tcp_eager_prev_q0;
2338 2338 }
2339 2339 ASSERT(tcp != listener);
2340 2340 conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
2341 2341 ASSERT(conn_ind != NULL);
2342 2342 tcp->tcp_conn.tcp_eager_conn_ind = NULL;
2343 2343
2344 2344 /* Move from q0 to q */
2345 2345 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
2346 2346 listener->tcp_conn_req_cnt_q0--;
2347 2347 listener->tcp_conn_req_cnt_q++;
2348 2348 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
2349 2349 tcp->tcp_eager_prev_q0;
2350 2350 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
2351 2351 tcp->tcp_eager_next_q0;
2352 2352 tcp->tcp_eager_prev_q0 = NULL;
2353 2353 tcp->tcp_eager_next_q0 = NULL;
2354 2354 tcp->tcp_conn_def_q0 = B_FALSE;
2355 2355
2356 2356 /* Make sure the tcp isn't in the list of droppables */
2357 2357 ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
2358 2358 tcp->tcp_eager_prev_drop_q0 == NULL);
2359 2359
2360 2360 /*
2361 2361 * Insert at end of the queue because sockfs sends
2362 2362 * down T_CONN_RES in chronological order. Leaving
2363 2363 * the older conn indications at front of the queue
2364 2364 * helps reducing search time.
2365 2365 */
2366 2366 tail = listener->tcp_eager_last_q;
2367 2367 if (tail != NULL)
2368 2368 tail->tcp_eager_next_q = tcp;
2369 2369 else
2370 2370 listener->tcp_eager_next_q = tcp;
2371 2371 listener->tcp_eager_last_q = tcp;
2372 2372 tcp->tcp_eager_next_q = NULL;
2373 2373 mutex_exit(&listener->tcp_eager_lock);
2374 2374 putnext(tcp->tcp_rq, conn_ind);
2375 2375 } else {
2376 2376 mutex_exit(&listener->tcp_eager_lock);
2377 2377 }
2378 2378
2379 2379 /*
2380 2380 * Done with the acceptor - free it
2381 2381 *
2382 2382 * Note: from this point on, no access to listener should be made
2383 2383 * as listener can be equal to acceptor.
2384 2384 */
2385 2385 finish:
2386 2386 ASSERT(acceptor->tcp_detached);
2387 2387 ASSERT(tcps->tcps_g_q != NULL);
2388 2388 acceptor->tcp_rq = tcps->tcps_g_q;
2389 2389 acceptor->tcp_wq = WR(tcps->tcps_g_q);
2390 2390 (void) tcp_clean_death(acceptor, 0, 2);
2391 2391 CONN_DEC_REF(acceptor->tcp_connp);
2392 2392
2393 2393 /*
2394 2394 * In case we already received a FIN we have to make tcp_rput send
2395 2395 * the ordrel_ind. This will also send up a window update if the window
2396 2396 * has opened up.
2397 2397 *
2398 2398 * In the normal case of a successful connection acceptance
2399 2399 * we give the O_T_BIND_REQ to the read side put procedure as an
2400 2400 * indication that this was just accepted. This tells tcp_rput to
2401 2401 * pass up any data queued in tcp_rcv_list.
2402 2402 *
2403 2403 * In the fringe case where options sent with T_CONN_RES failed and
2404 2404 * we required, we would be indicating a T_DISCON_IND to blow
2405 2405 * away this connection.
2406 2406 */
2407 2407
2408 2408 /*
2409 2409 * XXX: we currently have a problem if XTI application closes the
2410 2410 * acceptor stream in between. This problem exists in on10-gate also
2411 2411 * and is well known but nothing can be done short of major rewrite
2411 2411 * and is well know but nothing can be done short of major rewrite
2412 2412 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
2413 2413 * eager same squeue as listener (we can distinguish non socket
2414 2414 * listeners at the time of handling a SYN in tcp_conn_request)
2415 2415 * and do most of the work that tcp_accept_finish does here itself
2416 2416 * and then get behind the acceptor squeue to access the acceptor
2417 2417 * queue.
2418 2418 */
2419 2419 /*
2420 2420 * We already have a ref on tcp so no need to do one before squeue_fill
2421 2421 */
2422 2422 squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
2423 2423 tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
2424 2424 }
2425 2425
2426 2426 /*
2427 2427 * Swap information between the eager and acceptor for a TLI/XTI client.
2428 2428 * The sockfs accept is done on the acceptor stream and control goes
2429 2429 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
2430 2430 * called. In either case, both the eager and listener are in their own
2431 2431 * perimeter (squeue) and the code has to deal with potential race.
2432 2432 *
2433 2433 * See the block comment on top of tcp_accept() and tcp_wput_accept().
2434 2434 */
2435 2435 static void
2436 2436 tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
2437 2437 {
2438 2438 conn_t *econnp, *aconnp;
2439 2439
2440 2440 ASSERT(eager->tcp_rq == listener->tcp_rq);
2441 2441 ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
2442 2442 ASSERT(!eager->tcp_hard_bound);
2443 2443 ASSERT(!TCP_IS_SOCKET(acceptor));
2444 2444 ASSERT(!TCP_IS_SOCKET(eager));
2445 2445 ASSERT(!TCP_IS_SOCKET(listener));
2446 2446
2447 2447 acceptor->tcp_detached = B_TRUE;
2448 2448 /*
2449 2449 * To permit stream re-use by TLI/XTI, the eager needs a copy of
2450 2450 * the acceptor id.
2451 2451 */
2452 2452 eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
2453 2453
2454 2454 /* remove eager from listen list... */
2455 2455 mutex_enter(&listener->tcp_eager_lock);
2456 2456 tcp_eager_unlink(eager);
2457 2457 ASSERT(eager->tcp_eager_next_q == NULL &&
2458 2458 eager->tcp_eager_last_q == NULL);
2459 2459 ASSERT(eager->tcp_eager_next_q0 == NULL &&
2460 2460 eager->tcp_eager_prev_q0 == NULL);
2461 2461 mutex_exit(&listener->tcp_eager_lock);
2462 2462 eager->tcp_rq = acceptor->tcp_rq;
2463 2463 eager->tcp_wq = acceptor->tcp_wq;
2464 2464
2465 2465 econnp = eager->tcp_connp;
2466 2466 aconnp = acceptor->tcp_connp;
2467 2467
2468 2468 eager->tcp_rq->q_ptr = econnp;
2469 2469 eager->tcp_wq->q_ptr = econnp;
2470 2470
2471 2471 /*
2472 2472 * In the TLI/XTI loopback case, we are inside the listener's squeue,
2473 2473 * which might be a different squeue from our peer TCP instance.
2474 2474 * For TCP Fusion, the peer expects that whenever tcp_detached is
2475 2475 * clear, our TCP queues point to the acceptor's queues. Thus, use
2476 2476 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
2477 2477 * above reach global visibility prior to the clearing of tcp_detached.
2478 2478 */
2479 2479 membar_producer();
2480 2480 eager->tcp_detached = B_FALSE;
2481 2481
2482 2482 ASSERT(eager->tcp_ack_tid == 0);
2483 2483
2484 2484 econnp->conn_dev = aconnp->conn_dev;
2485 2485 econnp->conn_minor_arena = aconnp->conn_minor_arena;
2486 2486 ASSERT(econnp->conn_minor_arena != NULL);
2487 2487 if (eager->tcp_cred != NULL)
2488 2488 crfree(eager->tcp_cred);
2489 2489 eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
2490 2490 ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
2491 2491 ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
2492 2492
2493 2493 aconnp->conn_cred = NULL;
2494 2494
2495 2495 econnp->conn_zoneid = aconnp->conn_zoneid;
2496 2496 econnp->conn_allzones = aconnp->conn_allzones;
2497 2497
2498 2498 econnp->conn_mac_exempt = aconnp->conn_mac_exempt;
2499 2499 aconnp->conn_mac_exempt = B_FALSE;
2500 2500
2501 2501 ASSERT(aconnp->conn_peercred == NULL);
2502 2502
2503 2503 /* Do the IPC initialization */
2504 2504 CONN_INC_REF(econnp);
2505 2505
2506 2506 econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
2507 2507 econnp->conn_af_isv6 = aconnp->conn_af_isv6;
2508 2508 econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;
2509 2509
2510 2510 /* Done with old IPC. Drop its ref on its connp */
2511 2511 CONN_DEC_REF(aconnp);
2512 2512 }
2513 2513
2514 2514
2515 2515 /*
2516 2516 * Adapt to the information, such as rtt and rtt_sd, provided from the
2517 2517 * ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
2518 2518 *
2519 2519 * Checks for multicast and broadcast destination address.
2520 2520 * Returns zero on failure; non-zero if ok.
2521 2521 *
2522 2522 * Note that the MSS calculation here is based on the info given in
2523 2523 * the IRE. We do not do any calculation based on TCP options. They
2524 2524 * will be handled in tcp_rput_other() and tcp_rput_data() when TCP
2525 2525 * knows which options to use.
2526 2526 *
2527 2527 * Note on how TCP gets its parameters for a connection.
2528 2528 *
2529 2529 * When a tcp_t structure is allocated, it gets all the default parameters.
2530 2530 * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
2531 2531 * spipe, rpipe, ... from the route metrics. Route metric overrides the
2532 2532 * default. But if there is an associated tcp_host_param, it will override
2533 2533 * the metrics.
2534 2534 *
2535 2535 * An incoming SYN with a multicast or broadcast destination address, is dropped
2536 2536 * in 1 of 2 places.
2537 2537 *
2538 2538 * 1. If the packet was received over the wire it is dropped in
2539 2539 * ip_rput_process_broadcast()
2540 2540 *
2541 2541 * 2. If the packet was received through internal IP loopback, i.e. the packet
2542 2542 * was generated and received on the same machine, it is dropped in
2543 2543 * ip_wput_local()
2544 2544 *
2545 2545 * An incoming SYN with a multicast or broadcast source address is always
2546 2546 * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
2547 2547 * reject an attempt to connect to a broadcast or multicast (destination)
2548 2548 * address.
2549 2549 */
2550 2550 static int
2551 2551 tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
2552 2552 {
2553 2553 tcp_hsp_t *hsp;
2554 2554 ire_t *ire;
2555 2555 ire_t *sire = NULL;
2556 2556 iulp_t *ire_uinfo = NULL;
2557 2557 uint32_t mss_max;
2558 2558 uint32_t mss;
2559 2559 boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
2560 2560 conn_t *connp = tcp->tcp_connp;
2561 2561 boolean_t ire_cacheable = B_FALSE;
2562 2562 zoneid_t zoneid = connp->conn_zoneid;
2563 2563 int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
2564 2564 MATCH_IRE_SECATTR;
2565 2565 ts_label_t *tsl = crgetlabel(CONN_CRED(connp));
2566 2566 ill_t *ill = NULL;
2567 2567 boolean_t incoming = (ire_mp == NULL);
2568 2568 tcp_stack_t *tcps = tcp->tcp_tcps;
2569 2569 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
2570 2570
2571 2571 ASSERT(connp->conn_ire_cache == NULL);
2572 2572
2573 2573 if (tcp->tcp_ipversion == IPV4_VERSION) {
2574 2574
2575 2575 if (CLASSD(tcp->tcp_connp->conn_rem)) {
2576 2576 BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
2577 2577 return (0);
2578 2578 }
2579 2579 /*
2580 2580 * If IP_NEXTHOP is set, then look for an IRE_CACHE
2581 2581 * for the destination with the nexthop as gateway.
2582 2582 * ire_ctable_lookup() is used because this particular
2583 2583 * ire, if it exists, will be marked private.
2584 2584 * If that is not available, use the interface ire
2585 2585 * for the nexthop.
2586 2586 *
2587 2587 * TSol: tcp_update_label will detect label mismatches based
2588 2588 * only on the destination's label, but that would not
2589 2589 * detect label mismatches based on the security attributes
2590 2590 * of routes or next hop gateway. Hence we need to pass the
2591 2591 * label to ire_ftable_lookup below in order to locate the
2592 2592 * right prefix (and/or) ire cache. Similarly we also need
2593 2593 * pass the label to the ire_cache_lookup below to locate
2594 2594 * the right ire that also matches on the label.
2595 2595 */
2596 2596 if (tcp->tcp_connp->conn_nexthop_set) {
2597 2597 ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
2598 2598 tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
2599 2599 tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW,
2600 2600 ipst);
2601 2601 if (ire == NULL) {
2602 2602 ire = ire_ftable_lookup(
2603 2603 tcp->tcp_connp->conn_nexthop_v4,
2604 2604 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
2605 2605 tsl, match_flags, ipst);
2606 2606 if (ire == NULL)
2607 2607 return (0);
2608 2608 } else {
2609 2609 ire_uinfo = &ire->ire_uinfo;
2610 2610 }
2611 2611 } else {
2612 2612 ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
2613 2613 zoneid, tsl, ipst);
2614 2614 if (ire != NULL) {
2615 2615 ire_cacheable = B_TRUE;
2616 2616 ire_uinfo = (ire_mp != NULL) ?
2617 2617 &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2618 2618 &ire->ire_uinfo;
2619 2619
2620 2620 } else {
2621 2621 if (ire_mp == NULL) {
2622 2622 ire = ire_ftable_lookup(
2623 2623 tcp->tcp_connp->conn_rem,
2624 2624 0, 0, 0, NULL, &sire, zoneid, 0,
2625 2625 tsl, (MATCH_IRE_RECURSIVE |
2626 2626 MATCH_IRE_DEFAULT), ipst);
2627 2627 if (ire == NULL)
2628 2628 return (0);
2629 2629 ire_uinfo = (sire != NULL) ?
2630 2630 &sire->ire_uinfo :
2631 2631 &ire->ire_uinfo;
2632 2632 } else {
2633 2633 ire = (ire_t *)ire_mp->b_rptr;
2634 2634 ire_uinfo =
2635 2635 &((ire_t *)
2636 2636 ire_mp->b_rptr)->ire_uinfo;
2637 2637 }
2638 2638 }
2639 2639 }
2640 2640 ASSERT(ire != NULL);
2641 2641
2642 2642 if ((ire->ire_src_addr == INADDR_ANY) ||
2643 2643 (ire->ire_type & IRE_BROADCAST)) {
2644 2644 /*
2645 2645 * ire->ire_mp is non null when ire_mp passed in is used
2646 2646 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2647 2647 */
2648 2648 if (ire->ire_mp == NULL)
2649 2649 ire_refrele(ire);
2650 2650 if (sire != NULL)
2651 2651 ire_refrele(sire);
2652 2652 return (0);
2653 2653 }
2654 2654
2655 2655 if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
2656 2656 ipaddr_t src_addr;
2657 2657
2658 2658 /*
2659 2659 * ip_bind_connected() has stored the correct source
2660 2660 * address in conn_src.
2661 2661 */
2662 2662 src_addr = tcp->tcp_connp->conn_src;
2663 2663 tcp->tcp_ipha->ipha_src = src_addr;
2664 2664 /*
2665 2665 * Copy of the src addr. in tcp_t is needed
2666 2666 * for the lookup funcs.
2667 2667 */
2668 2668 IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
2669 2669 }
2670 2670 /*
2671 2671 * Set the fragment bit so that IP will tell us if the MTU
2672 2672 * should change. IP tells us the latest setting of
2673 2673 * ip_path_mtu_discovery through ire_frag_flag.
2674 2674 */
2675 2675 if (ipst->ips_ip_path_mtu_discovery) {
2676 2676 tcp->tcp_ipha->ipha_fragment_offset_and_flags =
2677 2677 htons(IPH_DF);
2678 2678 }
2679 2679 /*
2680 2680 * If ire_uinfo is NULL, this is the IRE_INTERFACE case
2681 2681 * for IP_NEXTHOP. No cache ire has been found for the
2682 2682 * destination and we are working with the nexthop's
2683 2683 * interface ire. Since we need to forward all packets
2684 2684 * to the nexthop first, we "blindly" set tcp_localnet
2685 2685 * to false, eventhough the destination may also be
2686 2686 * onlink.
2687 2687 */
2688 2688 if (ire_uinfo == NULL)
2689 2689 tcp->tcp_localnet = 0;
2690 2690 else
2691 2691 tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
2692 2692 } else {
2693 2693 /*
2694 2694 * For incoming connection ire_mp = NULL
2695 2695 * For outgoing connection ire_mp != NULL
2696 2696 * Technically we should check conn_incoming_ill
2697 2697 * when ire_mp is NULL and conn_outgoing_ill when
2698 2698 * ire_mp is non-NULL. But this is performance
2699 2699 * critical path and for IPV*_BOUND_IF, outgoing
2700 2700 * and incoming ill are always set to the same value.
2701 2701 */
2702 2702 ill_t *dst_ill = NULL;
2703 2703 ipif_t *dst_ipif = NULL;
2704 2704
2705 2705 ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
2706 2706
2707 2707 if (connp->conn_outgoing_ill != NULL) {
2708 2708 /* Outgoing or incoming path */
2709 2709 int err;
2710 2710
2711 2711 dst_ill = conn_get_held_ill(connp,
2712 2712 &connp->conn_outgoing_ill, &err);
2713 2713 if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
2714 2714 ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
2715 2715 return (0);
2716 2716 }
2717 2717 match_flags |= MATCH_IRE_ILL;
2718 2718 dst_ipif = dst_ill->ill_ipif;
2719 2719 }
2720 2720 ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
2721 2721 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst);
2722 2722
2723 2723 if (ire != NULL) {
2724 2724 ire_cacheable = B_TRUE;
2725 2725 ire_uinfo = (ire_mp != NULL) ?
2726 2726 &((ire_t *)ire_mp->b_rptr)->ire_uinfo:
2727 2727 &ire->ire_uinfo;
2728 2728 } else {
2729 2729 if (ire_mp == NULL) {
2730 2730 ire = ire_ftable_lookup_v6(
2731 2731 &tcp->tcp_connp->conn_remv6,
2732 2732 0, 0, 0, dst_ipif, &sire, zoneid,
2733 2733 0, tsl, match_flags, ipst);
2734 2734 if (ire == NULL) {
2735 2735 if (dst_ill != NULL)
2736 2736 ill_refrele(dst_ill);
2737 2737 return (0);
2738 2738 }
2739 2739 ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
2740 2740 &ire->ire_uinfo;
2741 2741 } else {
2742 2742 ire = (ire_t *)ire_mp->b_rptr;
2743 2743 ire_uinfo =
2744 2744 &((ire_t *)ire_mp->b_rptr)->ire_uinfo;
2745 2745 }
2746 2746 }
2747 2747 if (dst_ill != NULL)
2748 2748 ill_refrele(dst_ill);
2749 2749
2750 2750 ASSERT(ire != NULL);
2751 2751 ASSERT(ire_uinfo != NULL);
2752 2752
2753 2753 if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
2754 2754 IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
2755 2755 /*
2756 2756 * ire->ire_mp is non null when ire_mp passed in is used
2757 2757 * ire->ire_mp is set in ip_bind_insert_ire[_v6]().
2758 2758 */
2759 2759 if (ire->ire_mp == NULL)
2760 2760 ire_refrele(ire);
2761 2761 if (sire != NULL)
2762 2762 ire_refrele(sire);
2763 2763 return (0);
2764 2764 }
2765 2765
2766 2766 if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
2767 2767 in6_addr_t src_addr;
2768 2768
2769 2769 /*
2770 2770 * ip_bind_connected_v6() has stored the correct source
2771 2771 * address per IPv6 addr. selection policy in
2772 2772 * conn_src_v6.
2773 2773 */
2774 2774 src_addr = tcp->tcp_connp->conn_srcv6;
2775 2775
2776 2776 tcp->tcp_ip6h->ip6_src = src_addr;
2777 2777 /*
2778 2778 * Copy of the src addr. in tcp_t is needed
2779 2779 * for the lookup funcs.
2780 2780 */
2781 2781 tcp->tcp_ip_src_v6 = src_addr;
2782 2782 ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src,
2783 2783 &connp->conn_srcv6));
2784 2784 }
2785 2785 tcp->tcp_localnet =
2786 2786 IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6);
2787 2787 }
2788 2788
2789 2789 /*
2790 2790 * This allows applications to fail quickly when connections are made
2791 2791 * to dead hosts. Hosts can be labeled dead by adding a reject route
2792 2792 * with both the RTF_REJECT and RTF_PRIVATE flags set.
2793 2793 */
2794 2794 if ((ire->ire_flags & RTF_REJECT) &&
2795 2795 (ire->ire_flags & RTF_PRIVATE))
2796 2796 goto error;
2797 2797
2798 2798 /*
2799 2799 * Make use of the cached rtt and rtt_sd values to calculate the
2800 2800 * initial RTO. Note that they are already initialized in
2801 2801 * tcp_init_values().
2802 2802 * If ire_uinfo is NULL, i.e., we do not have a cache ire for
2803 2803 * IP_NEXTHOP, but instead are using the interface ire for the
2804 2804 * nexthop, then we do not use the ire_uinfo from that ire to
2805 2805 * do any initializations.
2806 2806 */
2807 2807 if (ire_uinfo != NULL) {
2808 2808 if (ire_uinfo->iulp_rtt != 0) {
2809 2809 clock_t rto;
2810 2810
2811 2811 tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt;
2812 2812 tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd;
2813 2813 rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
2814 2814 tcps->tcps_rexmit_interval_extra +
2815 2815 (tcp->tcp_rtt_sa >> 5);
2816 2816
2817 2817 if (rto > tcps->tcps_rexmit_interval_max) {
2818 2818 tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
2819 2819 } else if (rto < tcps->tcps_rexmit_interval_min) {
2820 2820 tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
2821 2821 } else {
2822 2822 tcp->tcp_rto = rto;
2823 2823 }
2824 2824 }
2825 2825 if (ire_uinfo->iulp_ssthresh != 0)
2826 2826 tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh;
2827 2827 else
2828 2828 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
2829 2829 if (ire_uinfo->iulp_spipe > 0) {
2830 2830 tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe,
2831 2831 tcps->tcps_max_buf);
2832 2832 if (tcps->tcps_snd_lowat_fraction != 0)
2833 2833 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2834 2834 tcps->tcps_snd_lowat_fraction;
2835 2835 (void) tcp_maxpsz_set(tcp, B_TRUE);
2836 2836 }
2837 2837 /*
2838 2838 * Note that up till now, acceptor always inherits receive
2839 2839 * window from the listener. But if there is a metrics
2840 2840 * associated with a host, we should use that instead of
2841 2841 * inheriting it from listener. Thus we need to pass this
2842 2842 * info back to the caller.
2843 2843 */
2844 2844 if (ire_uinfo->iulp_rpipe > 0) {
2845 2845 tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe,
2846 2846 tcps->tcps_max_buf);
2847 2847 }
2848 2848
2849 2849 if (ire_uinfo->iulp_rtomax > 0) {
2850 2850 tcp->tcp_second_timer_threshold =
2851 2851 ire_uinfo->iulp_rtomax;
2852 2852 }
2853 2853
2854 2854 /*
2855 2855 * Use the metric option settings, iulp_tstamp_ok and
2856 2856 * iulp_wscale_ok, only for active open. What this means
2857 2857 * is that if the other side uses timestamp or window
2858 2858 * scale option, TCP will also use those options. That
2859 2859 * is for passive open. If the application sets a
2860 2860 * large window, window scale is enabled regardless of
2861 2861 * the value in iulp_wscale_ok. This is the behavior
2862 2862 * since 2.6. So we keep it.
2863 2863 * The only case left in passive open processing is the
2864 2864 * check for SACK.
2865 2865 * For ECN, it should probably be like SACK. But the
2866 2866 * current value is binary, so we treat it like the other
2867 2867 * cases. The metric only controls active open.For passive
2868 2868 * open, the ndd param, tcp_ecn_permitted, controls the
2869 2869 * behavior.
2870 2870 */
2871 2871 if (!tcp_detached) {
2872 2872 /*
2873 2873 * The if check means that the following can only
2874 2874 * be turned on by the metrics only IRE, but not off.
2875 2875 */
2876 2876 if (ire_uinfo->iulp_tstamp_ok)
2877 2877 tcp->tcp_snd_ts_ok = B_TRUE;
2878 2878 if (ire_uinfo->iulp_wscale_ok)
2879 2879 tcp->tcp_snd_ws_ok = B_TRUE;
2880 2880 if (ire_uinfo->iulp_sack == 2)
2881 2881 tcp->tcp_snd_sack_ok = B_TRUE;
2882 2882 if (ire_uinfo->iulp_ecn_ok)
2883 2883 tcp->tcp_ecn_ok = B_TRUE;
2884 2884 } else {
2885 2885 /*
2886 2886 * Passive open.
2887 2887 *
2888 2888 * As above, the if check means that SACK can only be
2889 2889 * turned on by the metric only IRE.
2890 2890 */
2891 2891 if (ire_uinfo->iulp_sack > 0) {
2892 2892 tcp->tcp_snd_sack_ok = B_TRUE;
2893 2893 }
2894 2894 }
2895 2895 }
2896 2896
2897 2897
2898 2898 /*
2899 2899 * XXX: Note that currently, ire_max_frag can be as small as 68
2900 2900 * because of PMTUd. So tcp_mss may go to negative if combined
2901 2901 * length of all those options exceeds 28 bytes. But because
2902 2902 * of the tcp_mss_min check below, we may not have a problem if
2903 2903 * tcp_mss_min is of a reasonable value. The default is 1 so
2904 2904 * the negative problem still exists. And the check defeats PMTUd.
2905 2905 * In fact, if PMTUd finds that the MSS should be smaller than
2906 2906 * tcp_mss_min, TCP should turn off PMUTd and use the tcp_mss_min
2907 2907 * value.
2908 2908 *
2909 2909 * We do not deal with that now. All those problems related to
2910 2910 * PMTUd will be fixed later.
2911 2911 */
2912 2912 ASSERT(ire->ire_max_frag != 0);
2913 2913 mss = tcp->tcp_if_mtu = ire->ire_max_frag;
2914 2914 if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) {
2915 2915 if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) {
2916 2916 mss = MIN(mss, IPV6_MIN_MTU);
2917 2917 }
2918 2918 }
2919 2919
2920 2920 /* Sanity check for MSS value. */
2921 2921 if (tcp->tcp_ipversion == IPV4_VERSION)
2922 2922 mss_max = tcps->tcps_mss_max_ipv4;
2923 2923 else
2924 2924 mss_max = tcps->tcps_mss_max_ipv6;
2925 2925
2926 2926 if (tcp->tcp_ipversion == IPV6_VERSION &&
2927 2927 (ire->ire_frag_flag & IPH_FRAG_HDR)) {
2928 2928 /*
2929 2929 * After receiving an ICMPv6 "packet too big" message with a
2930 2930 * MTU < 1280, and for multirouted IPv6 packets, the IP layer
2931 2931 * will insert a 8-byte fragment header in every packet; we
2932 2932 * reduce the MSS by that amount here.
2933 2933 */
2934 2934 mss -= sizeof (ip6_frag_t);
2935 2935 }
2936 2936
2937 2937 if (tcp->tcp_ipsec_overhead == 0)
2938 2938 tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
2939 2939
2940 2940 mss -= tcp->tcp_ipsec_overhead;
2941 2941
2942 2942 if (mss < tcps->tcps_mss_min)
2943 2943 mss = tcps->tcps_mss_min;
2944 2944 if (mss > mss_max)
2945 2945 mss = mss_max;
2946 2946
2947 2947 /* Note that this is the maximum MSS, excluding all options. */
2948 2948 tcp->tcp_mss = mss;
2949 2949
2950 2950 /*
2951 2951 * Initialize the ISS here now that we have the full connection ID.
2952 2952 * The RFC 1948 method of initial sequence number generation requires
2953 2953 * knowledge of the full connection ID before setting the ISS.
2954 2954 */
2955 2955
2956 2956 tcp_iss_init(tcp);
2957 2957
2958 2958 if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL))
2959 2959 tcp->tcp_loopback = B_TRUE;
2960 2960
2961 2961 if (tcp->tcp_ipversion == IPV4_VERSION) {
2962 2962 hsp = tcp_hsp_lookup(tcp->tcp_remote, tcps);
2963 2963 } else {
2964 2964 hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6, tcps);
2965 2965 }
2966 2966
2967 2967 if (hsp != NULL) {
2968 2968 /* Only modify if we're going to make them bigger */
2969 2969 if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) {
2970 2970 tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace;
2971 2971 if (tcps->tcps_snd_lowat_fraction != 0)
2972 2972 tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater /
2973 2973 tcps->tcps_snd_lowat_fraction;
2974 2974 }
2975 2975
2976 2976 if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) {
2977 2977 tcp->tcp_rwnd = hsp->tcp_hsp_recvspace;
2978 2978 }
2979 2979
2980 2980 /* Copy timestamp flag only for active open */
2981 2981 if (!tcp_detached)
2982 2982 tcp->tcp_snd_ts_ok = hsp->tcp_hsp_tstamp;
2983 2983 }
2984 2984
2985 2985 if (sire != NULL)
2986 2986 IRE_REFRELE(sire);
2987 2987
2988 2988 /*
2989 2989 * If we got an IRE_CACHE and an ILL, go through their properties;
2990 2990 * otherwise, this is deferred until later when we have an IRE_CACHE.
2991 2991 */
2992 2992 if (tcp->tcp_loopback ||
2993 2993 (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) {
2994 2994 /*
2995 2995 * For incoming, see if this tcp may be MDT-capable. For
2996 2996 * outgoing, this process has been taken care of through
2997 2997 * tcp_rput_other.
2998 2998 */
2999 2999 tcp_ire_ill_check(tcp, ire, ill, incoming);
3000 3000 tcp->tcp_ire_ill_check_done = B_TRUE;
3001 3001 }
3002 3002
3003 3003 mutex_enter(&connp->conn_lock);
3004 3004 /*
3005 3005 * Make sure that conn is not marked incipient
3006 3006 * for incoming connections. A blind
3007 3007 * removal of incipient flag is cheaper than
3008 3008 * check and removal.
3009 3009 */
3010 3010 connp->conn_state_flags &= ~CONN_INCIPIENT;
3011 3011
3012 3012 /*
3013 3013 * Must not cache forwarding table routes
3014 3014 * or recache an IRE after the conn_t has
3015 3015 * had conn_ire_cache cleared and is flagged
3016 3016 * unusable, (see the CONN_CACHE_IRE() macro).
3017 3017 */
3018 3018 if (ire_cacheable && CONN_CACHE_IRE(connp)) {
3019 3019 rw_enter(&ire->ire_bucket->irb_lock, RW_READER);
3020 3020 if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) {
3021 3021 connp->conn_ire_cache = ire;
3022 3022 IRE_UNTRACE_REF(ire);
3023 3023 rw_exit(&ire->ire_bucket->irb_lock);
3024 3024 mutex_exit(&connp->conn_lock);
3025 3025 return (1);
3026 3026 }
3027 3027 rw_exit(&ire->ire_bucket->irb_lock);
3028 3028 }
3029 3029 mutex_exit(&connp->conn_lock);
3030 3030
3031 3031 if (ire->ire_mp == NULL)
3032 3032 ire_refrele(ire);
3033 3033 return (1);
3034 3034
3035 3035 error:
3036 3036 if (ire->ire_mp == NULL)
3037 3037 ire_refrele(ire);
3038 3038 if (sire != NULL)
3039 3039 ire_refrele(sire);
3040 3040 return (0);
3041 3041 }
3042 3042
3043 3043 /*
3044 3044 * tcp_bind is called (holding the writer lock) by tcp_wput_proto to process a
3045 3045 * O_T_BIND_REQ/T_BIND_REQ message.
 *
 * The request in mp is validated, a local port is chosen or verified
 * through tcp_bindi(), and the message is then rewritten in place as a
 * T_BIND_ACK, chained behind the mblk returned by tcp_ip_bind_mp() and
 * handed to ip_bind_v4()/ip_bind_v6().  Failures detected here are
 * reported with tcp_err_ack() and the function returns.
 *
 * A request that arrives while the endpoint is already in TCPS_BOUND or
 * TCPS_LISTEN with a different, non-zero CONIND_number is treated as a
 * backlog update: it jumps straight to do_bind and skips the address,
 * port and MLP processing.
3046 3046 */
3047 3047 static void
3048 3048 tcp_bind(tcp_t *tcp, mblk_t *mp)
3049 3049 {
3050 3050 sin_t *sin;
3051 3051 sin6_t *sin6;
3052 3052 mblk_t *mp1;
3053 3053 in_port_t requested_port;
3054 3054 in_port_t allocated_port;
3055 3055 struct T_bind_req *tbr;
3056 3056 boolean_t bind_to_req_port_only;
3057 3057 boolean_t backlog_update = B_FALSE;
3058 3058 boolean_t user_specified;
3059 3059 in6_addr_t v6addr;
3060 3060 ipaddr_t v4addr;
3061 3061 uint_t origipversion;
3062 3062 int err;
3063 3063 queue_t *q = tcp->tcp_wq;
3064 3064 conn_t *connp = tcp->tcp_connp;
3065 3065 mlp_type_t addrtype, mlptype;
3066 3066 zone_t *zone;
3067 3067 cred_t *cr;
3068 3068 in_port_t mlp_port;
3069 3069 tcp_stack_t *tcps = tcp->tcp_tcps;
3070 3070
/* Reject a message too short to hold a struct T_bind_req. */
3071 3071 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
3072 3072 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
3073 3073 if (tcp->tcp_debug) {
3074 3074 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3075 3075 "tcp_bind: bad req, len %u",
3076 3076 (uint_t)(mp->b_wptr - mp->b_rptr));
3077 3077 }
3078 3078 tcp_err_ack(tcp, mp, TPROTO, 0);
3079 3079 return;
3080 3080 }
3081 3081 /* Make sure the largest address fits */
3082 3082 mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1);
3083 3083 if (mp1 == NULL) {
3084 3084 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3085 3085 return;
3086 3086 }
3087 3087 mp = mp1;
3088 3088 tbr = (struct T_bind_req *)mp->b_rptr;
3089 3089 if (tcp->tcp_state >= TCPS_BOUND) {
3090 3090 if ((tcp->tcp_state == TCPS_BOUND ||
3091 3091 tcp->tcp_state == TCPS_LISTEN) &&
3092 3092 tcp->tcp_conn_req_max != tbr->CONIND_number &&
3093 3093 tbr->CONIND_number > 0) {
3094 3094 /*
3095 3095 * Handle listen() increasing CONIND_number.
3096 3096 * This is more "liberal" than what the TPI spec
3097 3097 * requires but is needed to avoid a t_unbind
3098 3098 * when handling listen() since the port number
3099 3099 * might be "stolen" between the unbind and bind.
3100 3100 */
3101 3101 backlog_update = B_TRUE;
3102 3102 goto do_bind;
3103 3103 }
3104 3104 if (tcp->tcp_debug) {
3105 3105 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3106 3106 "tcp_bind: bad state, %d", tcp->tcp_state);
3107 3107 }
3108 3108 tcp_err_ack(tcp, mp, TOUTSTATE, 0);
3109 3109 return;
3110 3110 }
3111 3111 origipversion = tcp->tcp_ipversion;
3112 3112
/*
 * Decode the address by its length: 0 (no address supplied), a sin_t
 * or a sin6_t.  Each accepted case sets requested_port, v6addr and
 * tcp_ipversion; any other length is answered with TBADADDR.
 */
3113 3113 switch (tbr->ADDR_length) {
3114 3114 case 0: /* request for a generic port */
3115 3115 tbr->ADDR_offset = sizeof (struct T_bind_req);
3116 3116 if (tcp->tcp_family == AF_INET) {
3117 3117 tbr->ADDR_length = sizeof (sin_t);
3118 3118 sin = (sin_t *)&tbr[1];
3119 3119 *sin = sin_null;
3120 3120 sin->sin_family = AF_INET;
3121 3121 mp->b_wptr = (uchar_t *)&sin[1];
3122 3122 tcp->tcp_ipversion = IPV4_VERSION;
3123 3123 IN6_IPADDR_TO_V4MAPPED(INADDR_ANY, &v6addr);
3124 3124 } else {
3125 3125 ASSERT(tcp->tcp_family == AF_INET6);
3126 3126 tbr->ADDR_length = sizeof (sin6_t);
3127 3127 sin6 = (sin6_t *)&tbr[1];
3128 3128 *sin6 = sin6_null;
3129 3129 sin6->sin6_family = AF_INET6;
3130 3130 mp->b_wptr = (uchar_t *)&sin6[1];
3131 3131 tcp->tcp_ipversion = IPV6_VERSION;
3132 3132 V6_SET_ZERO(v6addr);
3133 3133 }
3134 3134 requested_port = 0;
3135 3135 break;
3136 3136
3137 3137 case sizeof (sin_t): /* Complete IPv4 address */
3138 3138 sin = (sin_t *)mi_offset_param(mp, tbr->ADDR_offset,
3139 3139 sizeof (sin_t));
3140 3140 if (sin == NULL || !OK_32PTR((char *)sin)) {
3141 3141 if (tcp->tcp_debug) {
3142 3142 (void) strlog(TCP_MOD_ID, 0, 1,
3143 3143 SL_ERROR|SL_TRACE,
3144 3144 "tcp_bind: bad address parameter, "
3145 3145 "offset %d, len %d",
3146 3146 tbr->ADDR_offset, tbr->ADDR_length);
3147 3147 }
3148 3148 tcp_err_ack(tcp, mp, TPROTO, 0);
3149 3149 return;
3150 3150 }
3151 3151 /*
3152 3152 * With sockets sockfs will accept bogus sin_family in
3153 3153 * bind() and replace it with the family used in the socket
3154 3154 * call.
3155 3155 */
3156 3156 if (sin->sin_family != AF_INET ||
3157 3157 tcp->tcp_family != AF_INET) {
3158 3158 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3159 3159 return;
3160 3160 }
3161 3161 requested_port = ntohs(sin->sin_port);
3162 3162 tcp->tcp_ipversion = IPV4_VERSION;
3163 3163 v4addr = sin->sin_addr.s_addr;
3164 3164 IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr);
3165 3165 break;
3166 3166
3167 3167 case sizeof (sin6_t): /* Complete IPv6 address */
3168 3168 sin6 = (sin6_t *)mi_offset_param(mp,
3169 3169 tbr->ADDR_offset, sizeof (sin6_t));
3170 3170 if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
3171 3171 if (tcp->tcp_debug) {
3172 3172 (void) strlog(TCP_MOD_ID, 0, 1,
3173 3173 SL_ERROR|SL_TRACE,
3174 3174 "tcp_bind: bad IPv6 address parameter, "
3175 3175 "offset %d, len %d", tbr->ADDR_offset,
3176 3176 tbr->ADDR_length);
3177 3177 }
3178 3178 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
3179 3179 return;
3180 3180 }
3181 3181 if (sin6->sin6_family != AF_INET6 ||
3182 3182 tcp->tcp_family != AF_INET6) {
3183 3183 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
3184 3184 return;
3185 3185 }
3186 3186 requested_port = ntohs(sin6->sin6_port);
3187 3187 tcp->tcp_ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ?
3188 3188 IPV4_VERSION : IPV6_VERSION;
3189 3189 v6addr = sin6->sin6_addr;
3190 3190 break;
3191 3191
3192 3192 default:
3193 3193 if (tcp->tcp_debug) {
3194 3194 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
3195 3195 "tcp_bind: bad address length, %d",
3196 3196 tbr->ADDR_length);
3197 3197 }
3198 3198 tcp_err_ack(tcp, mp, TBADADDR, 0);
3199 3199 return;
3200 3200 }
3201 3201 tcp->tcp_bound_source_v6 = v6addr;
3202 3202
3203 3203 /* Check for change in ipversion */
3204 3204 if (origipversion != tcp->tcp_ipversion) {
3205 3205 ASSERT(tcp->tcp_family == AF_INET6);
3206 3206 err = tcp->tcp_ipversion == IPV6_VERSION ?
3207 3207 tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp);
3208 3208 if (err) {
3209 3209 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3210 3210 return;
3211 3211 }
3212 3212 }
3213 3213
3214 3214 /*
3215 3215 * Initialize family specific fields. Copy of the src addr.
3216 3216 * in tcp_t is needed for the lookup funcs.
3217 3217 */
3218 3218 if (tcp->tcp_ipversion == IPV6_VERSION) {
3219 3219 tcp->tcp_ip6h->ip6_src = v6addr;
3220 3220 } else {
3221 3221 IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src);
3222 3222 }
3223 3223 tcp->tcp_ip_src_v6 = v6addr;
3224 3224
3225 3225 /*
3226 3226 * For O_T_BIND_REQ:
3227 3227 * Verify that the target port/addr is available, or choose
3228 3228 * another.
3229 3229 * For T_BIND_REQ:
3230 3230 * Verify that the target port/addr is available or fail.
3231 3231 * In both cases when it succeeds the tcp is inserted in the
3232 3232 * bind hash table. This ensures that the operation is atomic
3233 3233 * under the lock on the hash bucket.
3234 3234 */
3235 3235 bind_to_req_port_only = requested_port != 0 &&
3236 3236 tbr->PRIM_type != O_T_BIND_REQ;
3237 3237 /*
3238 3238 * Get a valid port (within the anonymous range and should not
3239 3239 * be a privileged one) to use if the user has not given a port.
3240 3240 * If multiple threads are here, they may all start with
3241 3241 * the same initial port. But, it should be fine as long as
3242 3242 * tcp_bindi will ensure that no two threads will be assigned
3243 3243 * the same port.
3244 3244 *
3245 3245 * NOTE: XXX If a privileged process asks for an anonymous port, we
3246 3246 * still check for ports only in the range > tcp_smallest_non_priv_port,
3247 3247 * unless TCP_ANONPRIVBIND option is set.
3248 3248 */
3249 3249 mlptype = mlptSingle;
3250 3250 mlp_port = requested_port;
3251 3251 if (requested_port == 0) {
3252 3252 requested_port = tcp->tcp_anon_priv_bind ?
3253 3253 tcp_get_next_priv_port(tcp) :
3254 3254 tcp_update_next_port(tcps->tcps_next_port_to_try,
3255 3255 tcp, B_TRUE);
3256 3256 if (requested_port == 0) {
3257 3257 tcp_err_ack(tcp, mp, TNOADDR, 0);
3258 3258 return;
3259 3259 }
3260 3260 user_specified = B_FALSE;
3261 3261
3262 3262 /*
3263 3263 * If the user went through one of the RPC interfaces to create
3264 3264 * this socket and RPC is MLP in this zone, then give him an
3265 3265 * anonymous MLP.
3266 3266 */
3267 3267 cr = DB_CREDDEF(mp, tcp->tcp_cred);
3268 3268 if (connp->conn_anon_mlp && is_system_labeled()) {
3269 3269 zone = crgetzone(cr);
3270 3270 addrtype = tsol_mlp_addr_type(zone->zone_id,
3271 3271 IPV6_VERSION, &v6addr,
3272 3272 tcps->tcps_netstack->netstack_ip);
3273 3273 if (addrtype == mlptSingle) {
3274 3274 tcp_err_ack(tcp, mp, TNOADDR, 0);
3275 3275 return;
3276 3276 }
3277 3277 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3278 3278 PMAPPORT, addrtype);
3279 3279 mlp_port = PMAPPORT;
3280 3280 }
3281 3281 } else {
3282 3282 int i;
3283 3283 boolean_t priv = B_FALSE;
3284 3284
3285 3285 /*
3286 3286 * If the requested_port is in the well-known privileged range,
3287 3287 * verify that the stream was opened by a privileged user.
3288 3288 * Note: No locks are held when inspecting tcp_g_*epriv_ports
3289 3289 * but instead the code relies on:
3290 3290 * - the fact that the address of the array and its size never
3291 3291 * changes
3292 3292 * - the atomic assignment of the elements of the array
3293 3293 */
3294 3294 cr = DB_CREDDEF(mp, tcp->tcp_cred);
3295 3295 if (requested_port < tcps->tcps_smallest_nonpriv_port) {
3296 3296 priv = B_TRUE;
3297 3297 } else {
3298 3298 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
3299 3299 if (requested_port ==
3300 3300 tcps->tcps_g_epriv_ports[i]) {
3301 3301 priv = B_TRUE;
3302 3302 break;
3303 3303 }
3304 3304 }
3305 3305 }
3306 3306 if (priv) {
3307 3307 if (secpolicy_net_privaddr(cr, requested_port,
3308 3308 IPPROTO_TCP) != 0) {
3309 3309 if (tcp->tcp_debug) {
3310 3310 (void) strlog(TCP_MOD_ID, 0, 1,
3311 3311 SL_ERROR|SL_TRACE,
3312 3312 "tcp_bind: no priv for port %d",
3313 3313 requested_port);
3314 3314 }
3315 3315 tcp_err_ack(tcp, mp, TACCES, 0);
3316 3316 return;
3317 3317 }
3318 3318 }
3319 3319 user_specified = B_TRUE;
3320 3320
3321 3321 if (is_system_labeled()) {
3322 3322 zone = crgetzone(cr);
3323 3323 addrtype = tsol_mlp_addr_type(zone->zone_id,
3324 3324 IPV6_VERSION, &v6addr,
3325 3325 tcps->tcps_netstack->netstack_ip);
3326 3326 if (addrtype == mlptSingle) {
3327 3327 tcp_err_ack(tcp, mp, TNOADDR, 0);
3328 3328 return;
3329 3329 }
3330 3330 mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP,
3331 3331 requested_port, addrtype);
3332 3332 }
3333 3333 }
3334 3334
/*
 * mlptype differs from mlptSingle only when one of the labeled-system
 * branches above assigned it, so zone and addrtype are set here.
 */
3335 3335 if (mlptype != mlptSingle) {
3336 3336 if (secpolicy_net_bindmlp(cr) != 0) {
3337 3337 if (tcp->tcp_debug) {
3338 3338 (void) strlog(TCP_MOD_ID, 0, 1,
3339 3339 SL_ERROR|SL_TRACE,
3340 3340 "tcp_bind: no priv for multilevel port %d",
3341 3341 requested_port);
3342 3342 }
3343 3343 tcp_err_ack(tcp, mp, TACCES, 0);
3344 3344 return;
3345 3345 }
3346 3346
3347 3347 /*
3348 3348 * If we're specifically binding a shared IP address and the
3349 3349 * port is MLP on shared addresses, then check to see if this
3350 3350 * zone actually owns the MLP. Reject if not.
3351 3351 */
3352 3352 if (mlptype == mlptShared && addrtype == mlptShared) {
3353 3353 /*
3354 3354 * No need to handle exclusive-stack zones since
3355 3355 * ALL_ZONES only applies to the shared stack.
3356 3356 */
3357 3357 zoneid_t mlpzone;
3358 3358
3359 3359 mlpzone = tsol_mlp_findzone(IPPROTO_TCP,
3360 3360 htons(mlp_port));
3361 3361 if (connp->conn_zoneid != mlpzone) {
3362 3362 if (tcp->tcp_debug) {
3363 3363 (void) strlog(TCP_MOD_ID, 0, 1,
3364 3364 SL_ERROR|SL_TRACE,
3365 3365 "tcp_bind: attempt to bind port "
3366 3366 "%d on shared addr in zone %d "
3367 3367 "(should be %d)",
3368 3368 mlp_port, connp->conn_zoneid,
3369 3369 mlpzone);
3370 3370 }
3371 3371 tcp_err_ack(tcp, mp, TACCES, 0);
3372 3372 return;
3373 3373 }
3374 3374 }
3375 3375
3376 3376 if (!user_specified) {
3377 3377 err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3378 3378 requested_port, B_TRUE);
3379 3379 if (err != 0) {
3380 3380 if (tcp->tcp_debug) {
3381 3381 (void) strlog(TCP_MOD_ID, 0, 1,
3382 3382 SL_ERROR|SL_TRACE,
3383 3383 "tcp_bind: cannot establish anon "
3384 3384 "MLP for port %d",
3385 3385 requested_port);
3386 3386 }
3387 3387 tcp_err_ack(tcp, mp, TSYSERR, err);
3388 3388 return;
3389 3389 }
3390 3390 connp->conn_anon_port = B_TRUE;
3391 3391 }
3392 3392 connp->conn_mlp_type = mlptype;
3393 3393 }
3394 3394
3395 3395 allocated_port = tcp_bindi(tcp, requested_port, &v6addr,
3396 3396 tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified);
3397 3397
/* No port obtained: undo the MLP state set above and report the error. */
3398 3398 if (allocated_port == 0) {
3399 3399 connp->conn_mlp_type = mlptSingle;
3400 3400 if (connp->conn_anon_port) {
3401 3401 connp->conn_anon_port = B_FALSE;
3402 3402 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3403 3403 requested_port, B_FALSE);
3404 3404 }
3405 3405 if (bind_to_req_port_only) {
3406 3406 if (tcp->tcp_debug) {
3407 3407 (void) strlog(TCP_MOD_ID, 0, 1,
3408 3408 SL_ERROR|SL_TRACE,
3409 3409 "tcp_bind: requested addr busy");
3410 3410 }
3411 3411 tcp_err_ack(tcp, mp, TADDRBUSY, 0);
3412 3412 } else {
3413 3413 /* If we are out of ports, fail the bind. */
3414 3414 if (tcp->tcp_debug) {
3415 3415 (void) strlog(TCP_MOD_ID, 0, 1,
3416 3416 SL_ERROR|SL_TRACE,
3417 3417 "tcp_bind: out of ports?");
3418 3418 }
3419 3419 tcp_err_ack(tcp, mp, TNOADDR, 0);
3420 3420 }
3421 3421 return;
3422 3422 }
3423 3423 ASSERT(tcp->tcp_state == TCPS_BOUND);
3424 3424 do_bind:
/*
 * On the backlog_update path sin, sin6 and allocated_port were never
 * assigned, hence the guard before the port is stored in the address.
 */
3425 3425 if (!backlog_update) {
3426 3426 if (tcp->tcp_family == AF_INET)
3427 3427 sin->sin_port = htons(allocated_port);
3428 3428 else
3429 3429 sin6->sin6_port = htons(allocated_port);
3430 3430 }
3431 3431 if (tcp->tcp_family == AF_INET) {
3432 3432 if (tbr->CONIND_number != 0) {
3433 3433 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3434 3434 sizeof (sin_t));
3435 3435 } else {
3436 3436 /* Just verify the local IP address */
3437 3437 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type, IP_ADDR_LEN);
3438 3438 }
3439 3439 } else {
3440 3440 if (tbr->CONIND_number != 0) {
3441 3441 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3442 3442 sizeof (sin6_t));
3443 3443 } else {
3444 3444 /* Just verify the local IP address */
3445 3445 mp1 = tcp_ip_bind_mp(tcp, tbr->PRIM_type,
3446 3446 IPV6_ADDR_LEN);
3447 3447 }
3448 3448 }
3449 3449 if (mp1 == NULL) {
/*
 * NOTE(review): when this point is reached through the backlog_update
 * goto, zone, mlptype and requested_port have not been assigned in
 * this call.  Confirm that conn_anon_port cannot be set on that path,
 * otherwise tsol_mlp_anon() below receives indeterminate arguments.
 */
3450 3450 if (connp->conn_anon_port) {
3451 3451 connp->conn_anon_port = B_FALSE;
3452 3452 (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp,
3453 3453 requested_port, B_FALSE);
3454 3454 }
3455 3455 connp->conn_mlp_type = mlptSingle;
3456 3456 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
3457 3457 return;
3458 3458 }
3459 3459
/* Turn the request mblk into the T_BIND_ACK reply. */
3460 3460 tbr->PRIM_type = T_BIND_ACK;
3461 3461 mp->b_datap->db_type = M_PCPROTO;
3462 3462
3463 3463 /* Chain in the reply mp for tcp_rput() */
3464 3464 mp1->b_cont = mp;
3465 3465 mp = mp1;
3466 3466
/* A non-zero backlog is clamped to [tcps_conn_req_min, tcps_conn_req_max_q]. */
3467 3467 tcp->tcp_conn_req_max = tbr->CONIND_number;
3468 3468 if (tcp->tcp_conn_req_max) {
3469 3469 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
|
↓ open down ↓ |
3469 lines elided |
↑ open up ↑ |
3470 3470 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471 3471 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472 3472 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473 3473 /*
3474 3474 * If this is a listener, do not reset the eager list
3475 3475 * and other state. Note that we don't check if the
3476 3476 * existing eager list meets the new tcp_conn_req_max
3477 3477 * requirement.
3478 3478 */
3479 3479 if (tcp->tcp_state != TCPS_LISTEN) {
3480 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3481 + tcp_t *, tcp, int32_t, TCPS_LISTEN);
3480 3482 tcp->tcp_state = TCPS_LISTEN;
3481 3483 /* Initialize the chain. Don't need the eager_lock */
3482 3484 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3483 3485 tcp->tcp_eager_next_drop_q0 = tcp;
3484 3486 tcp->tcp_eager_prev_drop_q0 = tcp;
3485 3487 tcp->tcp_second_ctimer_threshold =
3486 3488 tcps->tcps_ip_abort_linterval;
3487 3489 }
3488 3490 }
3489 3491
3490 3492 /*
3491 3493 * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3492 3494 * processing continues in tcp_rput_other().
3493 3495 *
3494 3496 * We need to make sure that the conn_recv is set to a non-null
3495 3497 * value before we insert the conn into the classifier table.
3496 3498 * This is to avoid a race with an incoming packet which does an
3497 3499 * ipcl_classify().
3498 3500 */
3499 3501 connp->conn_recv = tcp_conn_request;
3500 3502 if (tcp->tcp_family == AF_INET6) {
3501 3503 ASSERT(tcp->tcp_connp->conn_af_isv6);
3502 3504 mp = ip_bind_v6(q, mp, tcp->tcp_connp, &tcp->tcp_sticky_ipp);
3503 3505 } else {
3504 3506 ASSERT(!tcp->tcp_connp->conn_af_isv6);
3505 3507 mp = ip_bind_v4(q, mp, tcp->tcp_connp);
3506 3508 }
3507 3509 /*
3508 3510 * If the bind cannot complete immediately
3509 3511 * IP will arrange to call tcp_rput_other
3510 3512 * when the bind completes.
3511 3513 */
3512 3514 if (mp != NULL) {
3513 3515 tcp_rput_other(tcp, mp);
3514 3516 } else {
3515 3517 /*
3516 3518 * Bind will be resumed later. Need to ensure
3517 3519 * that conn doesn't disappear when that happens.
3518 3520 * This will be decremented in ip_resume_tcp_bind().
3519 3521 */
3520 3522 CONN_INC_REF(tcp->tcp_connp);
3521 3523 }
3522 3524 }
3523 3525
3524 3526
3525 3527 /*
3526 3528 * If the "bind_to_req_port_only" parameter is set, if the requested port
3527 3529 * number is available, return it. If not, return 0.
3528 3530 *
3529 3531 * If "bind_to_req_port_only" parameter is not set and
3530 3532 * If the requested port number is available, return it. If not, return
3531 3533 * the first anonymous port we happen across. If no anonymous ports are
3532 3534 * available, return 0. addr is the requested local address, if any.
3533 3535 *
3534 3536 * In either case, when succeeding update the tcp_t to record the port number
3535 3537 * and insert it in the bind hash table.
3536 3538 *
3537 3539 * Note that TCP over IPv4 and IPv6 sockets can use the same port number
3538 3540 * without setting SO_REUSEADDR. This is needed so that they
3539 3541 * can be viewed as two independent transport protocols.
 *
 * Parameters:
 *   tcp        - endpoint being bound; it is removed from the bind hash
 *                on every pass and re-inserted on success.
 *   port       - first port to try, in host byte order (htons() is
 *                applied before it is compared or stored).
 *   laddr      - requested local address.
 *   reuseaddr  - non-zero selects the SO_REUSEADDR conflict rules.
 *   quick_connect - when set, an existing endpoint past TCPS_LISTEN
 *                whose foreign port or remote address differs from those
 *                of tcp is not treated as a conflict.
 *   bind_to_req_port_only - limits the search to a single pass.
 *   user_specified - the port was supplied by the caller; on success
 *                tcps_next_port_to_try is then left untouched.
 *
 * Returns the bound port in host byte order, or 0 on failure.
3540 3542 */
3541 3543 static in_port_t
3542 3544 tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
3543 3545 int reuseaddr, boolean_t quick_connect,
3544 3546 boolean_t bind_to_req_port_only, boolean_t user_specified)
3545 3547 {
3546 3548 /* number of times we have run around the loop */
3547 3549 int count = 0;
3548 3550 /* maximum number of times to run around the loop */
3549 3551 int loopmax;
3550 3552 conn_t *connp = tcp->tcp_connp;
3551 3553 zoneid_t zoneid = connp->conn_zoneid;
3552 3554 tcp_stack_t *tcps = tcp->tcp_tcps;
3553 3555
3554 3556 /*
3555 3557 * Lookup for free addresses is done in a loop and "loopmax"
3556 3558 * influences how long we spin in the loop
3557 3559 */
3558 3560 if (bind_to_req_port_only) {
3559 3561 /*
3560 3562 * If the requested port is busy, don't bother to look
3561 3563 * for a new one. Setting loop maximum count to 1 has
3562 3564 * that effect.
3563 3565 */
3564 3566 loopmax = 1;
3565 3567 } else {
3566 3568 /*
3567 3569 * If the requested port is busy, look for a free one
3568 3570 * in the anonymous port range.
3569 3571 * Set loopmax appropriately so that one does not look
3570 3572 * forever in the case all of the anonymous ports are in use.
3571 3573 */
3572 3574 if (tcp->tcp_anon_priv_bind) {
3573 3575 /*
3574 3576 * loopmax =
3575 3577 * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
3576 3578 */
3577 3579 loopmax = IPPORT_RESERVED -
3578 3580 tcps->tcps_min_anonpriv_port;
3579 3581 } else {
3580 3582 loopmax = (tcps->tcps_largest_anon_port -
3581 3583 tcps->tcps_smallest_anon_port + 1);
3582 3584 }
3583 3585 }
3584 3586 do {
3585 3587 uint16_t lport;
3586 3588 tf_t *tbf;
3587 3589 tcp_t *ltcp;
3588 3590 conn_t *lconnp;
3589 3591
3590 3592 lport = htons(port);
3591 3593
3592 3594 /*
3593 3595 * Ensure that the tcp_t is not currently in the bind hash.
3594 3596 * Hold the lock on the hash bucket to ensure that
3595 3597 * the duplicate check plus the insertion is an atomic
3596 3598 * operation.
3597 3599 *
3598 3600 * This function does an inline lookup on the bind hash list
3599 3601 * Make sure that we access only members of tcp_t
3600 3602 * and that we don't look at tcp_tcp, since we are not
3601 3603 * doing a CONN_INC_REF.
3602 3604 */
3603 3605 tcp_bind_hash_remove(tcp);
3604 3606 tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
3605 3607 mutex_enter(&tbf->tf_lock);
/*
 * Walk the bucket.  Leaving the loop through break (ltcp != NULL)
 * means a conflicting endpoint was found; running off the end
 * (ltcp == NULL) means the port is free.
 */
3606 3608 for (ltcp = tbf->tf_tcp; ltcp != NULL;
3607 3609 ltcp = ltcp->tcp_bind_hash) {
3608 3610 boolean_t not_socket;
3609 3611 boolean_t exclbind;
3610 3612
3611 3613 if (lport != ltcp->tcp_lport)
3612 3614 continue;
3613 3615
3614 3616 lconnp = ltcp->tcp_connp;
3615 3617
3616 3618 /*
3617 3619 * On a labeled system, we must treat bindings to ports
3618 3620 * on shared IP addresses by sockets with MAC exemption
3619 3621 * privilege as being in all zones, as there's
3620 3622 * otherwise no way to identify the right receiver.
3621 3623 */
3622 3624 if (!(IPCL_ZONE_MATCH(ltcp->tcp_connp, zoneid) ||
3623 3625 IPCL_ZONE_MATCH(connp,
3624 3626 ltcp->tcp_connp->conn_zoneid)) &&
3625 3627 !lconnp->conn_mac_exempt &&
3626 3628 !connp->conn_mac_exempt)
3627 3629 continue;
3628 3630
3629 3631 /*
3630 3632 * If TCP_EXCLBIND is set for either the bound or
3631 3633 * binding endpoint, the semantics of bind
3632 3634 * is changed according to the following.
3633 3635 *
3634 3636 * spec = specified address (v4 or v6)
3635 3637 * unspec = unspecified address (v4 or v6)
3636 3638 * A = specified addresses are different for endpoints
3637 3639 *
3638 3640 * bound bind to allowed
3639 3641 * -------------------------------------
3640 3642 * unspec unspec no
3641 3643 * unspec spec no
3642 3644 * spec unspec no
3643 3645 * spec spec yes if A
3644 3646 *
3645 3647 * For labeled systems, SO_MAC_EXEMPT behaves the same
3646 3648 * as TCP_EXCLBIND, except that zoneid is ignored.
3647 3649 *
3648 3650 * Note:
3649 3651 *
3650 3652 * 1. Because of TLI semantics, an endpoint can go
3651 3653 * back from, say TCP_ESTABLISHED to TCPS_LISTEN or
3652 3654 * TCPS_BOUND, depending on whether it is originally
3653 3655 * a listener or not. That is why we need to check
3654 3656 * for states greater than or equal to TCPS_BOUND
3655 3657 * here.
3656 3658 *
3657 3659 * 2. Ideally, we should only check for state equals
3658 3660 * to TCPS_LISTEN. And the following check should be
3659 3661 * added.
3660 3662 *
3661 3663 * if (ltcp->tcp_state == TCPS_LISTEN ||
3662 3664 * !reuseaddr || !ltcp->tcp_reuseaddr) {
3663 3665 * ...
3664 3666 * }
3665 3667 *
3666 3668 * The semantics will be changed to this. If the
3667 3669 * endpoint on the list is in state not equal to
3668 3670 * TCPS_LISTEN and both endpoints have SO_REUSEADDR
3669 3671 * set, let the bind succeed.
3670 3672 *
3671 3673 * Because of (1), we cannot do that for TLI
3672 3674 * endpoints. But we can do that for socket endpoints.
3673 3675 * If in future, we can change this going back
3674 3676 * semantics, we can use the above check for TLI also.
3675 3677 */
3676 3678 not_socket = !(TCP_IS_SOCKET(ltcp) &&
3677 3679 TCP_IS_SOCKET(tcp));
3678 3680 exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind;
3679 3681
3680 3682 if (lconnp->conn_mac_exempt || connp->conn_mac_exempt ||
3681 3683 (exclbind && (not_socket ||
3682 3684 ltcp->tcp_state <= TCPS_ESTABLISHED))) {
3683 3685 if (V6_OR_V4_INADDR_ANY(
3684 3686 ltcp->tcp_bound_source_v6) ||
3685 3687 V6_OR_V4_INADDR_ANY(*laddr) ||
3686 3688 IN6_ARE_ADDR_EQUAL(laddr,
3687 3689 &ltcp->tcp_bound_source_v6)) {
3688 3690 break;
3689 3691 }
3690 3692 continue;
3691 3693 }
3692 3694
3693 3695 /*
3694 3696 * Check ipversion to allow IPv4 and IPv6 sockets to
3695 3697 * have disjoint port number spaces, if *_EXCLBIND
3696 3698 * is not set and only if the application binds to a
3697 3699 * specific port. We use the same autoassigned port
3698 3700 * number space for IPv4 and IPv6 sockets.
3699 3701 */
3700 3702 if (tcp->tcp_ipversion != ltcp->tcp_ipversion &&
3701 3703 bind_to_req_port_only)
3702 3704 continue;
3703 3705
3704 3706 /*
3705 3707 * Ideally, we should make sure that the source
3706 3708 * address, remote address, and remote port in the
3707 3709 * four tuple for this tcp-connection is unique.
3708 3710 * However, trying to find out the local source
3709 3711 * address would require too much code duplication
3710 3712 * with IP, since IP needs to have that code
3711 3713 * to support userland TCP implementations.
3712 3714 */
3713 3715 if (quick_connect &&
3714 3716 (ltcp->tcp_state > TCPS_LISTEN) &&
3715 3717 ((tcp->tcp_fport != ltcp->tcp_fport) ||
3716 3718 !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6,
3717 3719 &ltcp->tcp_remote_v6)))
3718 3720 continue;
3719 3721
3720 3722 if (!reuseaddr) {
3721 3723 /*
3722 3724 * No socket option SO_REUSEADDR.
3723 3725 * If existing port is bound to
3724 3726 * a non-wildcard IP address
3725 3727 * and the requesting stream is
3726 3728 * bound to a distinct
3727 3729 * different IP addresses
3728 3730 * (non-wildcard, also), keep
3729 3731 * going.
3730 3732 */
3731 3733 if (!V6_OR_V4_INADDR_ANY(*laddr) &&
3732 3734 !V6_OR_V4_INADDR_ANY(
3733 3735 ltcp->tcp_bound_source_v6) &&
3734 3736 !IN6_ARE_ADDR_EQUAL(laddr,
3735 3737 &ltcp->tcp_bound_source_v6))
3736 3738 continue;
3737 3739 if (ltcp->tcp_state >= TCPS_BOUND) {
3738 3740 /*
3739 3741 * This port is being used and
3740 3742 * its state is >= TCPS_BOUND,
3741 3743 * so we can't bind to it.
3742 3744 */
3743 3745 break;
3744 3746 }
3745 3747 } else {
3746 3748 /*
3747 3749 * socket option SO_REUSEADDR is set on the
3748 3750 * binding tcp_t.
3749 3751 *
3750 3752 * If two streams are bound to
3751 3753 * same IP address or both addr
3752 3754 * and bound source are wildcards
3753 3755 * (INADDR_ANY), we want to stop
3754 3756 * searching.
3755 3757 * We have found a match of IP source
3756 3758 * address and source port, which is
3757 3759 * refused regardless of the
3758 3760 * SO_REUSEADDR setting, so we break.
3759 3761 */
3760 3762 if (IN6_ARE_ADDR_EQUAL(laddr,
3761 3763 &ltcp->tcp_bound_source_v6) &&
3762 3764 (ltcp->tcp_state == TCPS_LISTEN ||
3763 3765 ltcp->tcp_state == TCPS_BOUND))
3764 3766 break;
3765 3767 }
|
↓ open down ↓ |
276 lines elided |
↑ open up ↑ |
3766 3768 }
3767 3769 if (ltcp != NULL) {
3768 3770 /* The port number is busy */
3769 3771 mutex_exit(&tbf->tf_lock);
3770 3772 } else {
3771 3773 /*
3772 3774 * This port is ours. Insert in fanout and mark as
3773 3775 * bound to prevent others from getting the port
3774 3776 * number.
3775 3777 */
3778 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3779 + tcp_t *, tcp, int32_t, TCPS_BOUND);
3776 3780 tcp->tcp_state = TCPS_BOUND;
3777 3781 tcp->tcp_lport = htons(port);
3778 3782 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3779 3783
3780 3784 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3781 3785 tcp->tcp_lport)] == tbf);
3782 3786 tcp_bind_hash_insert(tbf, tcp, 1);
3783 3787
3784 3788 mutex_exit(&tbf->tf_lock);
3785 3789
3786 3790 /*
3787 3791 * We don't want tcp_next_port_to_try to "inherit"
3788 3792 * a port number supplied by the user in a bind.
3789 3793 */
3790 3794 if (user_specified)
3791 3795 return (port);
3792 3796
3793 3797 /*
3794 3798 * This is the only place where tcp_next_port_to_try
3795 3799 * is updated. After the update, it may or may not
3796 3800 * be in the valid range.
3797 3801 */
3798 3802 if (!tcp->tcp_anon_priv_bind)
3799 3803 tcps->tcps_next_port_to_try = port + 1;
3800 3804 return (port);
3801 3805 }
3802 3806
/* The port was busy: pick the next candidate for another pass. */
3803 3807 if (tcp->tcp_anon_priv_bind) {
3804 3808 port = tcp_get_next_priv_port(tcp);
3805 3809 } else {
3806 3810 if (count == 0 && user_specified) {
3807 3811 /*
3808 3812 * We may have to return an anonymous port. So
3809 3813 * get one to start with.
3810 3814 */
3811 3815 port =
3812 3816 tcp_update_next_port(
3813 3817 tcps->tcps_next_port_to_try,
3814 3818 tcp, B_TRUE);
3815 3819 user_specified = B_FALSE;
3816 3820 } else {
3817 3821 port = tcp_update_next_port(port + 1, tcp,
3818 3822 B_FALSE);
3819 3823 }
3820 3824 }
3821 3825 if (port == 0)
3822 3826 break;
3823 3827
3824 3828 /*
3825 3829 * Don't let this loop run forever in the case where
3826 3830 * all of the anonymous ports are in use.
3827 3831 */
3828 3832 } while (++count < loopmax);
3829 3833 return (0);
3830 3834 }
3831 3835
3832 3836 /*
3833 3837 * tcp_clean_death / tcp_close_detached must not be called more than once
3834 3838 * on a tcp. Thus every function that potentially calls tcp_clean_death
3835 3839 * must check for the tcp state before calling tcp_clean_death.
3836 3840 * E.g. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper,
3837 3841 * tcp_timer_handler, all check for the tcp state.
 *
 * tcp_clean_death_wrapper takes the conn_t in arg, frees mp, and calls
 * tcp_clean_death() with ETIMEDOUT and tag 5 only when the tcp is past
 * TCPS_BOUND. arg2 is not used.
3838 3842 */
3839 3843 /* ARGSUSED */
3840 3844 void
3841 3845 tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2)
3842 3846 {
3843 3847 tcp_t *tcp = ((conn_t *)arg)->conn_tcp;
3844 3848
3845 3849 freemsg(mp);
/* The state check keeps tcp_clean_death from being called on a tcp at or below TCPS_BOUND. */
3846 3850 if (tcp->tcp_state > TCPS_BOUND)
3847 3851 (void) tcp_clean_death(((conn_t *)arg)->conn_tcp,
3848 3852 ETIMEDOUT, 5);
3849 3853 }
3850 3854
3851 3855 /*
3852 3856 * We are dying for some reason. Try to do it gracefully. (May be called
3853 3857 * as writer.)
3854 3858 *
3855 3859 * Return -1 if the structure was not cleaned up (if the cleanup had to be
3856 3860 * done by a service procedure).
3857 3861 * TBD - Should the return value distinguish between the tcp_t being
3858 3862 * freed and it being reinitialized?
3859 3863 */
3860 3864 static int
3861 3865 tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
3862 3866 {
3863 3867 mblk_t *mp;
3864 3868 queue_t *q;
3865 3869 tcp_stack_t *tcps = tcp->tcp_tcps;
3866 3870 sodirect_t *sodp;
3867 3871
3868 3872 TCP_CLD_STAT(tag);
3869 3873
3870 3874 #if TCP_TAG_CLEAN_DEATH
3871 3875 tcp->tcp_cleandeathtag = tag;
3872 3876 #endif
3873 3877
3874 3878 if (tcp->tcp_fused)
3875 3879 tcp_unfuse(tcp);
3876 3880
3877 3881 if (tcp->tcp_linger_tid != 0 &&
3878 3882 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
3879 3883 tcp_stop_lingering(tcp);
3880 3884 }
3881 3885
3882 3886 ASSERT(tcp != NULL);
3883 3887 ASSERT((tcp->tcp_family == AF_INET &&
3884 3888 tcp->tcp_ipversion == IPV4_VERSION) ||
3885 3889 (tcp->tcp_family == AF_INET6 &&
3886 3890 (tcp->tcp_ipversion == IPV4_VERSION ||
3887 3891 tcp->tcp_ipversion == IPV6_VERSION)));
3888 3892
3889 3893 if (TCP_IS_DETACHED(tcp)) {
3890 3894 if (tcp->tcp_hard_binding) {
3891 3895 /*
3892 3896 * Its an eager that we are dealing with. We close the
3893 3897 * eager but in case a conn_ind has already gone to the
3894 3898 * listener, let tcp_accept_finish() send a discon_ind
3895 3899 * to the listener and drop the last reference. If the
3896 3900 * listener doesn't even know about the eager i.e. the
|
↓ open down ↓ |
111 lines elided |
↑ open up ↑ |
3897 3901 * conn_ind hasn't gone up, blow away the eager and drop
3898 3902 * the last reference as well. If the conn_ind has gone
3899 3903 * up, state should be BOUND. tcp_accept_finish
3900 3904 * will figure out that the connection has received a
3901 3905 * RST and will send a DISCON_IND to the application.
3902 3906 */
3903 3907 tcp_closei_local(tcp);
3904 3908 if (!tcp->tcp_tconnind_started) {
3905 3909 CONN_DEC_REF(tcp->tcp_connp);
3906 3910 } else {
3911 + DTRACE_TCP4(state__change, void, NULL,
3912 + conn_t *, NULL, tcp_t *, tcp, int32_t,
3913 + TCPS_BOUND);
3907 3914 tcp->tcp_state = TCPS_BOUND;
3908 3915 }
3909 3916 } else {
3910 3917 tcp_close_detached(tcp);
3911 3918 }
3912 3919 return (0);
3913 3920 }
3914 3921
3915 3922 TCP_STAT(tcps, tcp_clean_death_nondetached);
3916 3923
3917 3924 /*
3918 3925 * If T_ORDREL_IND has not been sent yet (done when service routine
3919 3926 * is run) postpone cleaning up the endpoint until service routine
3920 3927 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3921 3928 * client_errno since tcp_close uses the client_errno field.
3922 3929 */
3923 3930 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3924 3931 if (err != 0)
3925 3932 tcp->tcp_client_errno = err;
3926 3933
3927 3934 tcp->tcp_deferred_clean_death = B_TRUE;
3928 3935 return (-1);
3929 3936 }
3930 3937
3931 3938 /* If sodirect, not anymore */
3932 3939 SOD_PTR_ENTER(tcp, sodp);
3933 3940 if (sodp != NULL) {
3934 3941 tcp->tcp_sodirect = NULL;
3935 3942 mutex_exit(sodp->sod_lock);
3936 3943 }
3937 3944
3938 3945 q = tcp->tcp_rq;
3939 3946
3940 3947 /* Trash all inbound data */
3941 3948 flushq(q, FLUSHALL);
3942 3949
3943 3950 /*
3944 3951 * If we are at least part way open and there is error
3945 3952 * (err==0 implies no error)
3946 3953 * notify our client by a T_DISCON_IND.
3947 3954 */
3948 3955 if ((tcp->tcp_state >= TCPS_SYN_SENT) && err) {
3949 3956 if (tcp->tcp_state >= TCPS_ESTABLISHED &&
3950 3957 !TCP_IS_SOCKET(tcp)) {
3951 3958 /*
3952 3959 * Send M_FLUSH according to TPI. Because sockets will
3953 3960 * (and must) ignore FLUSHR we do that only for TPI
3954 3961 * endpoints and sockets in STREAMS mode.
3955 3962 */
3956 3963 (void) putnextctl1(q, M_FLUSH, FLUSHR);
3957 3964 }
3958 3965 if (tcp->tcp_debug) {
3959 3966 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
3960 3967 "tcp_clean_death: discon err %d", err);
3961 3968 }
3962 3969 mp = mi_tpi_discon_ind(NULL, err, 0);
3963 3970 if (mp != NULL) {
3964 3971 putnext(q, mp);
3965 3972 } else {
3966 3973 if (tcp->tcp_debug) {
3967 3974 (void) strlog(TCP_MOD_ID, 0, 1,
3968 3975 SL_ERROR|SL_TRACE,
3969 3976 "tcp_clean_death, sending M_ERROR");
3970 3977 }
3971 3978 (void) putnextctl1(q, M_ERROR, EPROTO);
3972 3979 }
3973 3980 if (tcp->tcp_state <= TCPS_SYN_RCVD) {
3974 3981 /* SYN_SENT or SYN_RCVD */
3975 3982 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
3976 3983 } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) {
3977 3984 /* ESTABLISHED or CLOSE_WAIT */
3978 3985 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
3979 3986 }
3980 3987 }
3981 3988
3982 3989 tcp_reinit(tcp);
3983 3990 return (-1);
3984 3991 }
3985 3992
3986 3993 /*
3987 3994 * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout
3988 3995 * to expire, stop the wait and finish the close.
3989 3996 */
3990 3997 static void
3991 3998 tcp_stop_lingering(tcp_t *tcp)
3992 3999 {
3993 4000 clock_t delta = 0;
3994 4001 tcp_stack_t *tcps = tcp->tcp_tcps;
3995 4002
3996 4003 tcp->tcp_linger_tid = 0;
3997 4004 if (tcp->tcp_state > TCPS_LISTEN) {
3998 4005 tcp_acceptor_hash_remove(tcp);
3999 4006 mutex_enter(&tcp->tcp_non_sq_lock);
4000 4007 if (tcp->tcp_flow_stopped) {
4001 4008 tcp_clrqfull(tcp);
4002 4009 }
4003 4010 mutex_exit(&tcp->tcp_non_sq_lock);
4004 4011
4005 4012 if (tcp->tcp_timer_tid != 0) {
4006 4013 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4007 4014 tcp->tcp_timer_tid = 0;
4008 4015 }
4009 4016 /*
4010 4017 * Need to cancel those timers which will not be used when
4011 4018 * TCP is detached. This has to be done before the tcp_wq
4012 4019 * is set to the global queue.
4013 4020 */
4014 4021 tcp_timers_stop(tcp);
4015 4022
4016 4023
4017 4024 tcp->tcp_detached = B_TRUE;
4018 4025 ASSERT(tcps->tcps_g_q != NULL);
4019 4026 tcp->tcp_rq = tcps->tcps_g_q;
4020 4027 tcp->tcp_wq = WR(tcps->tcps_g_q);
4021 4028
4022 4029 if (tcp->tcp_state == TCPS_TIME_WAIT) {
4023 4030 tcp_time_wait_append(tcp);
4024 4031 TCP_DBGSTAT(tcps, tcp_detach_time_wait);
4025 4032 goto finish;
4026 4033 }
4027 4034
4028 4035 /*
4029 4036 * If delta is zero the timer event wasn't executed and was
4030 4037 * successfully canceled. In this case we need to restart it
4031 4038 * with the minimal delta possible.
4032 4039 */
4033 4040 if (delta >= 0) {
4034 4041 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4035 4042 delta ? delta : 1);
4036 4043 }
4037 4044 } else {
4038 4045 tcp_closei_local(tcp);
4039 4046 CONN_DEC_REF(tcp->tcp_connp);
4040 4047 }
4041 4048 finish:
4042 4049 /* Signal closing thread that it can complete close */
4043 4050 mutex_enter(&tcp->tcp_closelock);
4044 4051 tcp->tcp_detached = B_TRUE;
4045 4052 ASSERT(tcps->tcps_g_q != NULL);
4046 4053 tcp->tcp_rq = tcps->tcps_g_q;
4047 4054 tcp->tcp_wq = WR(tcps->tcps_g_q);
4048 4055 tcp->tcp_closed = 1;
4049 4056 cv_signal(&tcp->tcp_closecv);
4050 4057 mutex_exit(&tcp->tcp_closelock);
4051 4058 }
4052 4059
4053 4060 /*
4054 4061 * Handle lingering timeouts. This function is called when the SO_LINGER timeout
4055 4062 * expires.
4056 4063 */
4057 4064 static void
4058 4065 tcp_close_linger_timeout(void *arg)
4059 4066 {
4060 4067 conn_t *connp = (conn_t *)arg;
4061 4068 tcp_t *tcp = connp->conn_tcp;
4062 4069
4063 4070 tcp->tcp_client_errno = ETIMEDOUT;
4064 4071 tcp_stop_lingering(tcp);
4065 4072 }
4066 4073
/*
 * Close processing for the stream whose queue is q.
 *
 * The work is split in two.  This function marks the conn CONN_CLOSING,
 * takes an extra conn reference and hands the embedded tcp_closemp to the
 * squeue so that tcp_close_output() runs there.  It then sleeps on
 * tcp_closecv until tcp_closed is set.  Afterwards it waits for eagers
 * (only when tcp_wait_for_eagers is set), calls conn_ioctl_cleanup() if
 * conn_oper_pending_ill was non-NULL on entry, turns the queue procedures
 * off, frees the minor number, drops one conn reference and clears q_ptr
 * on both queues.
 *
 * flags is stored, truncated to uint8_t, in tcp_closeflags.
 * Always returns 0.
 */
4067 4074 static int
4068 4075 tcp_close(queue_t *q, int flags)
4069 4076 {
4070 4077 conn_t *connp = Q_TO_CONN(q);
4071 4078 tcp_t *tcp = connp->conn_tcp;
4072 4079 mblk_t *mp = &tcp->tcp_closemp;
4073 4080 boolean_t conn_ioctl_cleanup_reqd = B_FALSE;
4074 4081 mblk_t *bp;
4075 4082
4076 4083 ASSERT(WR(q)->q_next == NULL);
4077 4084 ASSERT(connp->conn_ref >= 2);
4078 4085
4079 4086 /*
4080 4087 * We are being closed as /dev/tcp or /dev/tcp6.
4081 4088 *
4082 4089 * Mark the conn as closing. ill_pending_mp_add will not
4083 4090 * add any mp to the pending mp list, after this conn has
4084 4091 * started closing. Same for sq_pending_mp_add
4085 4092 */
4086 4093 mutex_enter(&connp->conn_lock);
4087 4094 connp->conn_state_flags |= CONN_CLOSING;
4088 4095 if (connp->conn_oper_pending_ill != NULL)
4089 4096 conn_ioctl_cleanup_reqd = B_TRUE;
4090 4097 CONN_INC_REF_LOCKED(connp);
4091 4098 mutex_exit(&connp->conn_lock);
4092 4099 tcp->tcp_closeflags = (uint8_t)flags;
4093 4100 ASSERT(connp->conn_ref >= 3);
4094 4101
4095 4102 /*
4096 4103 * tcp_closemp_used is used below without any protection of a lock
4097 4104 * as we don't expect any one else to use it concurrently at this
4098 4105 * point otherwise it would be a major defect.
4099 4106 */
4100 4107
/*
 * A non-NULL b_prev on tcp_closemp is taken as evidence that the message
 * is already in use; that case panics.
 */
4101 4108 if (mp->b_prev == NULL)
4102 4109 tcp->tcp_closemp_used = B_TRUE;
4103 4110 else
4104 4111 cmn_err(CE_PANIC, "tcp_close: concurrent use of tcp_closemp: "
4105 4112 "connp %p tcp %p\n", (void *)connp, (void *)tcp);
4106 4113
4107 4114 TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
4108 4115
/* Run tcp_close_output() on this conn's squeue, carried by tcp_closemp. */
4109 4116 (*tcp_squeue_close_proc)(connp->conn_sqp, mp,
4110 4117 tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
4111 4118
4112 4119 mutex_enter(&tcp->tcp_closelock);
4113 4120 while (!tcp->tcp_closed) {
4114 4121 if (!cv_wait_sig(&tcp->tcp_closecv, &tcp->tcp_closelock)) {
4115 4122 /*
4116 4123 * The cv_wait_sig() was interrupted. We now do the
4117 4124 * following:
4118 4125 *
4119 4126 * 1) If the endpoint was lingering, we allow this
4120 4127 * to be interrupted by cancelling the linger timeout
4121 4128 * and closing normally.
4122 4129 *
4123 4130 * 2) Revert to calling cv_wait()
4124 4131 *
4125 4132 * We revert to using cv_wait() to avoid an
4126 4133 * infinite loop which can occur if the calling
4127 4134 * thread is higher priority than the squeue worker
4128 4135 * thread and is bound to the same cpu.
4129 4136 */
4130 4137 if (tcp->tcp_linger && tcp->tcp_lingertime > 0) {
4131 4138 mutex_exit(&tcp->tcp_closelock);
4132 4139 /* Entering squeue, bump ref count. */
4133 4140 CONN_INC_REF(connp);
/*
 * bp is a zero-length message that only carries the squeue request;
 * tcp_linger_interrupted() frees it.
 */
4134 4141 bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL);
4135 4142 squeue_enter(connp->conn_sqp, bp,
4136 4143 tcp_linger_interrupted, connp,
4137 4144 SQTAG_IP_TCP_CLOSE);
4138 4145 mutex_enter(&tcp->tcp_closelock);
4139 4146 }
/*
 * Leave the interruptible loop whether or not lingering applied; the
 * cv_wait() loop below completes the wait.
 */
4140 4147 break;
4141 4148 }
4142 4149 }
4143 4150 while (!tcp->tcp_closed)
4144 4151 cv_wait(&tcp->tcp_closecv, &tcp->tcp_closelock);
4145 4152 mutex_exit(&tcp->tcp_closelock);
4146 4153
4147 4154 /*
4148 4155 * In the case of listener streams that have eagers in the q or q0
4149 4156 * we wait for the eagers to drop their reference to us. tcp_rq and
4150 4157 * tcp_wq of the eagers point to our queues. By waiting for the
4151 4158 * refcnt to drop to 1, we are sure that the eagers have cleaned
4152 4159 * up their queue pointers and also dropped their references to us.
4153 4160 */
4154 4161 if (tcp->tcp_wait_for_eagers) {
4155 4162 mutex_enter(&connp->conn_lock);
4156 4163 while (connp->conn_ref != 1) {
4157 4164 cv_wait(&connp->conn_cv, &connp->conn_lock);
4158 4165 }
4159 4166 mutex_exit(&connp->conn_lock);
4160 4167 }
4161 4168 /*
4162 4169 * ioctl cleanup. The mp is queued in the
4163 4170 * ill_pending_mp or in the sq_pending_mp.
4164 4171 */
4165 4172 if (conn_ioctl_cleanup_reqd)
4166 4173 conn_ioctl_cleanup(connp);
4167 4174
4168 4175 qprocsoff(q);
4169 4176 inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
4170 4177
4171 4178 tcp->tcp_cpid = -1;
4172 4179
4173 4180 /*
4174 4181 * Drop IP's reference on the conn. This is the last reference
4175 4182 * on the connp if the state was less than established. If the
4176 4183 * connection has gone into timewait state, then we will have
4177 4184 * one ref for the TCP and one more ref (total of two) for the
4178 4185 * classifier connected hash list (a timewait connections stays
4179 4186 * in connected hash till closed).
4180 4187 *
4181 4188 * We can't assert the references because there might be other
4182 4189 * transient reference places because of some walkers or queued
4183 4190 * packets in squeue for the timewait state.
4184 4191 */
4185 4192 CONN_DEC_REF(connp);
4186 4193 q->q_ptr = WR(q)->q_ptr = NULL;
4187 4194 return (0);
4188 4195 }
4189 4196
4190 4197 static int
4191 4198 tcpclose_accept(queue_t *q)
4192 4199 {
4193 4200 vmem_t *minor_arena;
4194 4201 dev_t conn_dev;
4195 4202
4196 4203 ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
4197 4204
4198 4205 /*
4199 4206 * We had opened an acceptor STREAM for sockfs which is
4200 4207 * now being closed due to some error.
4201 4208 */
4202 4209 qprocsoff(q);
4203 4210
4204 4211 minor_arena = (vmem_t *)WR(q)->q_ptr;
4205 4212 conn_dev = (dev_t)RD(q)->q_ptr;
4206 4213 ASSERT(minor_arena != NULL);
4207 4214 ASSERT(conn_dev != 0);
4208 4215 inet_minor_free(minor_arena, conn_dev);
4209 4216 q->q_ptr = WR(q)->q_ptr = NULL;
4210 4217 return (0);
4211 4218 }
4212 4219
4213 4220 /*
4214 4221 * Called by tcp_close() routine via squeue when lingering is
4215 4222 * interrupted by a signal.
4216 4223 */
4217 4224
4218 4225 /* ARGSUSED */
4219 4226 static void
4220 4227 tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2)
4221 4228 {
4222 4229 conn_t *connp = (conn_t *)arg;
4223 4230 tcp_t *tcp = connp->conn_tcp;
4224 4231
4225 4232 freeb(mp);
4226 4233 if (tcp->tcp_linger_tid != 0 &&
4227 4234 TCP_TIMER_CANCEL(tcp, tcp->tcp_linger_tid) >= 0) {
4228 4235 tcp_stop_lingering(tcp);
4229 4236 tcp->tcp_client_errno = EINTR;
4230 4237 }
4231 4238 }
4232 4239
4233 4240 /*
4234 4241 * Called by streams close routine via squeues when our client blows off her
4235 4242 * descriptor, we take this to mean: "close the stream state NOW, close the tcp
4236 4243 * connection politely" When SO_LINGER is set (with a non-zero linger time and
4237 4244 * it is not a nonblocking socket) then this routine sleeps until the FIN is
4238 4245 * acked.
4239 4246 *
4240 4247 * NOTE: tcp_close potentially returns error when lingering.
4241 4248 * However, the stream head currently does not pass these errors
4242 4249 * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK
4243 4250 * errors to the application (from tsleep()) and not errors
4244 4251 * like ECONNRESET caused by receiving a reset packet.
4245 4252 */
4246 4253
/*
 * Squeue-side half of close.  arg is the conn_t; mp and arg2 are not
 * referenced in the body.
 *
 * Outline:
 *  - cancel a pending tcp_ordrelid timeout;
 *  - for a listener with queued eagers, run tcp_eager_cleanup() and set
 *    tcp_wait_for_eagers;
 *  - clear the MDT and LSO flags on conn and tcp;
 *  - switch on tcp_state.  Leaving the switch with "break" selects the
 *    local-close path (optionally preceded by a RST when msg is set);
 *    the detach path jumps straight to "finish";
 *  - at "finish", under tcp_closelock, move the queues to tcps_g_q unless
 *    eagers are being waited for, set tcp_closed and signal tcp_closecv.
 *
 * The one path that returns without signalling is the one that arms the
 * linger timer; tcp_close_linger_timeout() then finishes the close.
 */
4247 4254 /* ARGSUSED */
4248 4255 static void
4249 4256 tcp_close_output(void *arg, mblk_t *mp, void *arg2)
4250 4257 {
4251 4258 char *msg;
4252 4259 conn_t *connp = (conn_t *)arg;
4253 4260 tcp_t *tcp = connp->conn_tcp;
4254 4261 clock_t delta = 0;
4255 4262 tcp_stack_t *tcps = tcp->tcp_tcps;
4256 4263
4257 4264 ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
4258 4265 (connp->conn_fanout == NULL && connp->conn_ref >= 3));
4259 4266
4260 4267 /* Cancel any pending timeout */
4261 4268 if (tcp->tcp_ordrelid != 0) {
4262 4269 if (tcp->tcp_timeout) {
4263 4270 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ordrelid);
4264 4271 }
4265 4272 tcp->tcp_ordrelid = 0;
4266 4273 tcp->tcp_timeout = B_FALSE;
4267 4274 }
4268 4275
4269 4276 mutex_enter(&tcp->tcp_eager_lock);
4270 4277 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
4271 4278 /* Cleanup for listener */
4272 4279 tcp_eager_cleanup(tcp, 0);
4273 4280 tcp->tcp_wait_for_eagers = 1;
4274 4281 }
4275 4282 mutex_exit(&tcp->tcp_eager_lock);
4276 4283
4277 4284 connp->conn_mdt_ok = B_FALSE;
4278 4285 tcp->tcp_mdt = B_FALSE;
4279 4286
4280 4287 connp->conn_lso_ok = B_FALSE;
4281 4288 tcp->tcp_lso = B_FALSE;
4282 4289
/* msg doubles as the "send a RST" flag and as the text handed to tcp_xmit_ctl(). */
4283 4290 msg = NULL;
4284 4291 switch (tcp->tcp_state) {
4285 4292 case TCPS_CLOSED:
4286 4293 case TCPS_IDLE:
4287 4294 case TCPS_BOUND:
4288 4295 case TCPS_LISTEN:
4289 4296 break;
4290 4297 case TCPS_SYN_SENT:
4291 4298 msg = "tcp_close, during connect";
4292 4299 break;
4293 4300 case TCPS_SYN_RCVD:
4294 4301 /*
4295 4302 * Close during the connect 3-way handshake
4296 4303 * but here there may or may not be pending data
4297 4304 * already on queue. Process almost same as in
4298 4305 * the ESTABLISHED state.
4299 4306 */
4300 4307 /* FALLTHRU */
4301 4308 default:
4302 4309 if (tcp->tcp_sodirect != NULL) {
4303 4310 /* Ok, no more sodirect */
4304 4311 tcp->tcp_sodirect = NULL;
4305 4312 }
4306 4313
4307 4314 if (tcp->tcp_fused)
4308 4315 tcp_unfuse(tcp);
4309 4316
4310 4317 /*
4311 4318 * If SO_LINGER has set a zero linger time, abort the
4312 4319 * connection with a reset.
4313 4320 */
4314 4321 if (tcp->tcp_linger && tcp->tcp_lingertime == 0) {
4315 4322 msg = "tcp_close, zero lingertime";
4316 4323 break;
4317 4324 }
4318 4325
4319 4326 ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding);
4320 4327 /*
4321 4328 * Abort connection if there is unread data queued.
4322 4329 */
4323 4330 if (tcp->tcp_rcv_list || tcp->tcp_reass_head) {
4324 4331 msg = "tcp_close, unread data";
4325 4332 break;
4326 4333 }
/*
 * NOTE(review): the comment below mentions a qwait() "above", but no
 * qwait() call is visible in this function; the remark may be stale.
 */
4327 4334 /*
4328 4335 * tcp_hard_bound is now cleared thus all packets go through
4329 4336 * tcp_lookup. This fact is used by tcp_detach below.
4330 4337 *
4331 4338 * We have done a qwait() above which could have possibly
4332 4339 * drained more messages in turn causing transition to a
4333 4340 * different state. Check whether we have to do the rest
4334 4341 * of the processing or not.
4335 4342 */
4336 4343 if (tcp->tcp_state <= TCPS_LISTEN)
4337 4344 break;
4338 4345
4339 4346 /*
4340 4347 * Transmit the FIN before detaching the tcp_t.
4341 4348 * After tcp_detach returns this queue/perimeter
4342 4349 * no longer owns the tcp_t thus others can modify it.
4343 4350 */
4344 4351 (void) tcp_xmit_end(tcp);
4345 4352
4346 4353 /*
4347 4354 * If lingering on close then wait until the fin is acked,
4348 4355 * the SO_LINGER time passes, or a reset is sent/received.
4349 4356 */
4350 4357 if (tcp->tcp_linger && tcp->tcp_lingertime > 0 &&
4351 4358 !(tcp->tcp_fin_acked) &&
4352 4359 tcp->tcp_state >= TCPS_ESTABLISHED) {
4353 4360 if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) {
4354 4361 tcp->tcp_client_errno = EWOULDBLOCK;
4355 4362 } else if (tcp->tcp_client_errno == 0) {
4356 4363
4357 4364 ASSERT(tcp->tcp_linger_tid == 0);
4358 4365
4359 4366 tcp->tcp_linger_tid = TCP_TIMER(tcp,
4360 4367 tcp_close_linger_timeout,
4361 4368 tcp->tcp_lingertime * hz);
4362 4369
4363 4370 /* tcp_close_linger_timeout will finish close */
/* A zero timer id is treated as failure to arm the timer. */
4364 4371 if (tcp->tcp_linger_tid == 0)
4365 4372 tcp->tcp_client_errno = ENOSR;
4366 4373 else
4367 4374 return;
4368 4375 }
4369 4376
4370 4377 /*
4371 4378 * Check if we need to detach or just close
4372 4379 * the instance.
4373 4380 */
4374 4381 if (tcp->tcp_state <= TCPS_LISTEN)
4375 4382 break;
4376 4383 }
4377 4384
4378 4385 /*
4379 4386 * Make sure that no other thread will access the tcp_rq of
4380 4387 * this instance (through lookups etc.) as tcp_rq will go
4381 4388 * away shortly.
4382 4389 */
4383 4390 tcp_acceptor_hash_remove(tcp);
4384 4391
4385 4392 mutex_enter(&tcp->tcp_non_sq_lock);
4386 4393 if (tcp->tcp_flow_stopped) {
4387 4394 tcp_clrqfull(tcp);
4388 4395 }
4389 4396 mutex_exit(&tcp->tcp_non_sq_lock);
4390 4397
/* delta keeps the cancel result; it decides the timer restart further down. */
4391 4398 if (tcp->tcp_timer_tid != 0) {
4392 4399 delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4393 4400 tcp->tcp_timer_tid = 0;
4394 4401 }
4395 4402 /*
4396 4403 * Need to cancel those timers which will not be used when
4397 4404 * TCP is detached. This has to be done before the tcp_wq
4398 4405 * is set to the global queue.
4399 4406 */
4400 4407 tcp_timers_stop(tcp);
4401 4408
4402 4409 tcp->tcp_detached = B_TRUE;
4403 4410 if (tcp->tcp_state == TCPS_TIME_WAIT) {
4404 4411 tcp_time_wait_append(tcp);
4405 4412 TCP_DBGSTAT(tcps, tcp_detach_time_wait);
4406 4413 ASSERT(connp->conn_ref >= 3);
4407 4414 goto finish;
4408 4415 }
4409 4416
4410 4417 /*
4411 4418 * If delta is zero the timer event wasn't executed and was
4412 4419 * successfully canceled. In this case we need to restart it
4413 4420 * with the minimal delta possible.
4414 4421 */
4415 4422 if (delta >= 0)
4416 4423 tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer,
4417 4424 delta ? delta : 1);
4418 4425
4419 4426 ASSERT(connp->conn_ref >= 3);
4420 4427 goto finish;
4421 4428 }
4422 4429
/*
 * Reached only through a "break" out of the switch.  A non-NULL msg
 * means the connection is aborted with a RST before the local close.
 */
4423 4430 /* Detach did not complete. Still need to remove q from stream. */
4424 4431 if (msg) {
4425 4432 if (tcp->tcp_state == TCPS_ESTABLISHED ||
4426 4433 tcp->tcp_state == TCPS_CLOSE_WAIT)
4427 4434 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
4428 4435 if (tcp->tcp_state == TCPS_SYN_SENT ||
4429 4436 tcp->tcp_state == TCPS_SYN_RCVD)
4430 4437 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
4431 4438 tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST);
4432 4439 }
4433 4440
4434 4441 tcp_closei_local(tcp);
4435 4442 CONN_DEC_REF(connp);
4436 4443 ASSERT(connp->conn_ref >= 2);
4437 4444
4438 4445 finish:
4439 4446 /*
4440 4447 * Although packets are always processed on the correct
4441 4448 * tcp's perimeter and access is serialized via squeue's,
4442 4449 * IP still needs a queue when sending packets in time_wait
4443 4450 * state so use WR(tcps_g_q) till ip_output() can be
4444 4451 * changed to deal with just connp. For read side, we
4445 4452 * could have set tcp_rq to NULL but there are some cases
4446 4453 * in tcp_rput_data() from early days of this code which
4447 4454 * do a putnext without checking if tcp is closed. Those
4448 4455 * need to be identified before both tcp_rq and tcp_wq
4449 4456 * can be set to NULL and tcps_g_q can disappear forever.
4450 4457 */
4451 4458 mutex_enter(&tcp->tcp_closelock);
4452 4459 /*
4453 4460 * Don't change the queues in the case of a listener that has
4454 4461 * eagers in its q or q0. It could surprise the eagers.
4455 4462 * Instead wait for the eagers outside the squeue.
4456 4463 */
4457 4464 if (!tcp->tcp_wait_for_eagers) {
4458 4465 tcp->tcp_detached = B_TRUE;
4459 4466 /*
4460 4467 * When default queue is closing we set tcps_g_q to NULL
4461 4468 * after the close is done.
4462 4469 */
4463 4470 ASSERT(tcps->tcps_g_q != NULL);
4464 4471 tcp->tcp_rq = tcps->tcps_g_q;
4465 4472 tcp->tcp_wq = WR(tcps->tcps_g_q);
4466 4473 }
4467 4474
4468 4475 /* Signal tcp_close() to finish closing. */
4469 4476 tcp->tcp_closed = 1;
4470 4477 cv_signal(&tcp->tcp_closecv);
4471 4478 mutex_exit(&tcp->tcp_closelock);
4472 4479 }
4473 4480
4474 4481
4475 4482 /*
4476 4483 * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp.
4477 4484 * Some stream heads get upset if they see these later on as anything but NULL.
4478 4485 */
4479 4486 static void
4480 4487 tcp_close_mpp(mblk_t **mpp)
4481 4488 {
4482 4489 mblk_t *mp;
4483 4490
4484 4491 if ((mp = *mpp) != NULL) {
4485 4492 do {
4486 4493 mp->b_next = NULL;
4487 4494 mp->b_prev = NULL;
4488 4495 } while ((mp = mp->b_cont) != NULL);
4489 4496
4490 4497 mp = *mpp;
4491 4498 *mpp = NULL;
4492 4499 freemsg(mp);
4493 4500 }
4494 4501 }
4495 4502
4496 4503 /* Do detached close. */
4497 4504 static void
4498 4505 tcp_close_detached(tcp_t *tcp)
4499 4506 {
4500 4507 if (tcp->tcp_fused)
4501 4508 tcp_unfuse(tcp);
4502 4509
4503 4510 /*
4504 4511 * Clustering code serializes TCP disconnect callbacks and
4505 4512 * cluster tcp list walks by blocking a TCP disconnect callback
4506 4513 * if a cluster tcp list walk is in progress. This ensures
4507 4514 * accurate accounting of TCPs in the cluster code even though
4508 4515 * the TCP list walk itself is not atomic.
4509 4516 */
4510 4517 tcp_closei_local(tcp);
4511 4518 CONN_DEC_REF(tcp->tcp_connp);
4512 4519 }
4513 4520
4514 4521 /*
4515 4522 * Stop all TCP timers, and free the timer mblks if requested.
4516 4523 */
4517 4524 void
4518 4525 tcp_timers_stop(tcp_t *tcp)
4519 4526 {
4520 4527 if (tcp->tcp_timer_tid != 0) {
4521 4528 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
4522 4529 tcp->tcp_timer_tid = 0;
4523 4530 }
4524 4531 if (tcp->tcp_ka_tid != 0) {
4525 4532 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
4526 4533 tcp->tcp_ka_tid = 0;
4527 4534 }
4528 4535 if (tcp->tcp_ack_tid != 0) {
4529 4536 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
4530 4537 tcp->tcp_ack_tid = 0;
4531 4538 }
4532 4539 if (tcp->tcp_push_tid != 0) {
4533 4540 (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
4534 4541 tcp->tcp_push_tid = 0;
4535 4542 }
4536 4543 }
4537 4544
4538 4545 /*
4539 4546 * The tcp_t is going away. Remove it from all lists and set it
4540 4547 * to TCPS_CLOSED. The freeing up of memory is deferred until
4541 4548 * tcp_inactive. This is needed since a thread in tcp_rput might have
4542 4549 * done a CONN_INC_REF on this structure before it was removed from the
4543 4550 * hashes.
4544 4551 */
/*
 * Local teardown of tcp; nothing is freed here except the listener's
 * tcp_ip_addr_cache.  In order, this function:
 *  - removes the acceptor hash entry (non-socket endpoints only);
 *  - folds tcp_ibsegs/tcp_obsegs into the MIB and zeroes them;
 *  - if tcp has a listener and the conn_ind has not been started,
 *    unlinks the eager, points its queues at tcps_g_q and drops the
 *    reference on the listener's conn;
 *  - stops all timers and clears flow control;
 *  - removes tcp from the bind hash, the time-wait list (TIME_WAIT only)
 *    and the ipcl hash;
 *  - marks the conn CONN_CONDEMNED and releases its cached ire;
 *  - sets tcp_state to TCPS_CLOSED, releases kssl state and calls
 *    tcp_ipsec_cleanup().
 * The reference on tcp's own conn is not dropped here; the callers
 * visible in this file do that themselves where appropriate.
 */
4545 4552 static void
4546 4553 tcp_closei_local(tcp_t *tcp)
4547 4554 {
4548 4555 ire_t *ire;
4549 4556 conn_t *connp = tcp->tcp_connp;
4550 4557 tcp_stack_t *tcps = tcp->tcp_tcps;
4551 4558
4552 4559 if (!TCP_IS_SOCKET(tcp))
4553 4560 tcp_acceptor_hash_remove(tcp);
4554 4561
4555 4562 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
4556 4563 tcp->tcp_ibsegs = 0;
4557 4564 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
4558 4565 tcp->tcp_obsegs = 0;
4559 4566
4560 4567 /*
4561 4568 * If we are an eager connection hanging off a listener that
4562 4569 * hasn't formally accepted the connection yet, get off his
4563 4570 * list and blow off any data that we have accumulated.
4564 4571 */
4565 4572 if (tcp->tcp_listener != NULL) {
4566 4573 tcp_t *listener = tcp->tcp_listener;
4567 4574 mutex_enter(&listener->tcp_eager_lock);
4568 4575 /*
4569 4576 * tcp_tconnind_started == B_TRUE means that the
4570 4577 * conn_ind has already gone to listener. At
4571 4578 * this point, eager will be closed but we
4572 4579 * leave it in listeners eager list so that
4573 4580 * if listener decides to close without doing
4574 4581 * accept, we can clean this up. In tcp_wput_accept
4575 4582 * we take care of the case of accept on closed
4576 4583 * eager.
4577 4584 */
4578 4585 if (!tcp->tcp_tconnind_started) {
4579 4586 tcp_eager_unlink(tcp);
4580 4587 mutex_exit(&listener->tcp_eager_lock);
4581 4588 /*
4582 4589 * We don't want to have any pointers to the
4583 4590 * listener queue, after we have released our
4584 4591 * reference on the listener
4585 4592 */
4586 4593 ASSERT(tcps->tcps_g_q != NULL);
4587 4594 tcp->tcp_rq = tcps->tcps_g_q;
4588 4595 tcp->tcp_wq = WR(tcps->tcps_g_q);
4589 4596 CONN_DEC_REF(listener->tcp_connp);
4590 4597 } else {
4591 4598 mutex_exit(&listener->tcp_eager_lock);
4592 4599 }
4593 4600 }
4594 4601
4595 4602 /* Stop all the timers */
4596 4603 tcp_timers_stop(tcp);
4597 4604
/* Only a listener's IP address cache is released here. */
4598 4605 if (tcp->tcp_state == TCPS_LISTEN) {
4599 4606 if (tcp->tcp_ip_addr_cache) {
4600 4607 kmem_free((void *)tcp->tcp_ip_addr_cache,
4601 4608 IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
4602 4609 tcp->tcp_ip_addr_cache = NULL;
4603 4610 }
4604 4611 }
4605 4612 mutex_enter(&tcp->tcp_non_sq_lock);
4606 4613 if (tcp->tcp_flow_stopped)
4607 4614 tcp_clrqfull(tcp);
4608 4615 mutex_exit(&tcp->tcp_non_sq_lock);
4609 4616
4610 4617 tcp_bind_hash_remove(tcp);
4611 4618 /*
4612 4619 * If the tcp_time_wait_collector (which runs outside the squeue)
4613 4620 * is trying to remove this tcp from the time wait list, we will
4614 4621 * block in tcp_time_wait_remove while trying to acquire the
4615 4622 * tcp_time_wait_lock. The logic in tcp_time_wait_collector also
4616 4623 * requires the ipcl_hash_remove to be ordered after the
4617 4624 * tcp_time_wait_remove for the refcnt checks to work correctly.
4618 4625 */
4619 4626 if (tcp->tcp_state == TCPS_TIME_WAIT)
4620 4627 (void) tcp_time_wait_remove(tcp, NULL);
4621 4628 CL_INET_DISCONNECT(tcp);
4622 4629 ipcl_hash_remove(connp);
4623 4630
4624 4631 /*
4625 4632 * Delete the cached ire in conn_ire_cache and also mark
4626 4633 * the conn as CONDEMNED
4627 4634 */
4628 4635 mutex_enter(&connp->conn_lock);
4629 4636 connp->conn_state_flags |= CONN_CONDEMNED;
|
↓ open down ↓ |
713 lines elided |
↑ open up ↑ |
4630 4637 ire = connp->conn_ire_cache;
4631 4638 connp->conn_ire_cache = NULL;
4632 4639 mutex_exit(&connp->conn_lock);
/* The ire reference is released only after conn_lock has been dropped. */
4633 4640 if (ire != NULL)
4634 4641 IRE_REFRELE_NOTR(ire);
4635 4642
/*
 * NOTE(review): despite the comment below, no ioctl cleanup happens here;
 * the next three lines only assert that tcp is off the time-wait list.
 */
4636 4643 /* Need to cleanup any pending ioctls */
4637 4644 ASSERT(tcp->tcp_time_wait_next == NULL);
4638 4645 ASSERT(tcp->tcp_time_wait_prev == NULL);
4639 4646 ASSERT(tcp->tcp_time_wait_expire == 0);
/*
 * The state__change probe fires only when the conn is fully bound; the
 * state itself is set to TCPS_CLOSED unconditionally.
 */
4647 + if (connp->conn_fully_bound) {
4648 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
4649 + tcp_t *, tcp, int32_t, TCPS_CLOSED);
4650 + }
4640 4651 tcp->tcp_state = TCPS_CLOSED;
4641 4652
4642 4653 /* Release any SSL context */
4643 4654 if (tcp->tcp_kssl_ent != NULL) {
4644 4655 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4645 4656 tcp->tcp_kssl_ent = NULL;
4646 4657 }
4647 4658 if (tcp->tcp_kssl_ctx != NULL) {
4648 4659 kssl_release_ctx(tcp->tcp_kssl_ctx);
4649 4660 tcp->tcp_kssl_ctx = NULL;
4650 4661 }
4651 4662 tcp->tcp_kssl_pending = B_FALSE;
4652 4663
4653 4664 tcp_ipsec_cleanup(tcp);
4654 4665 }
4655 4666
4656 4667 /*
4657 4668 * tcp is dying (called from ipcl_conn_destroy and error cases).
4658 4669 * Free the tcp_t in either case.
4659 4670 */
4660 4671 void
4661 4672 tcp_free(tcp_t *tcp)
4662 4673 {
4663 4674 mblk_t *mp;
4664 4675 ip6_pkt_t *ipp;
4665 4676
4666 4677 ASSERT(tcp != NULL);
4667 4678 ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL);
4668 4679
4669 4680 tcp->tcp_rq = NULL;
4670 4681 tcp->tcp_wq = NULL;
4671 4682
4672 4683 tcp_close_mpp(&tcp->tcp_xmit_head);
4673 4684 tcp_close_mpp(&tcp->tcp_reass_head);
4674 4685 if (tcp->tcp_rcv_list != NULL) {
4675 4686 /* Free b_next chain */
4676 4687 tcp_close_mpp(&tcp->tcp_rcv_list);
4677 4688 }
4678 4689 if ((mp = tcp->tcp_urp_mp) != NULL) {
4679 4690 freemsg(mp);
4680 4691 }
4681 4692 if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
4682 4693 freemsg(mp);
4683 4694 }
4684 4695
4685 4696 if (tcp->tcp_fused_sigurg_mp != NULL) {
4686 4697 freeb(tcp->tcp_fused_sigurg_mp);
4687 4698 tcp->tcp_fused_sigurg_mp = NULL;
4688 4699 }
4689 4700
4690 4701 if (tcp->tcp_sack_info != NULL) {
4691 4702 if (tcp->tcp_notsack_list != NULL) {
4692 4703 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
4693 4704 }
4694 4705 bzero(tcp->tcp_sack_info, sizeof (tcp_sack_info_t));
4695 4706 }
4696 4707
4697 4708 if (tcp->tcp_hopopts != NULL) {
4698 4709 mi_free(tcp->tcp_hopopts);
4699 4710 tcp->tcp_hopopts = NULL;
4700 4711 tcp->tcp_hopoptslen = 0;
4701 4712 }
4702 4713 ASSERT(tcp->tcp_hopoptslen == 0);
4703 4714 if (tcp->tcp_dstopts != NULL) {
4704 4715 mi_free(tcp->tcp_dstopts);
4705 4716 tcp->tcp_dstopts = NULL;
4706 4717 tcp->tcp_dstoptslen = 0;
4707 4718 }
4708 4719 ASSERT(tcp->tcp_dstoptslen == 0);
4709 4720 if (tcp->tcp_rtdstopts != NULL) {
4710 4721 mi_free(tcp->tcp_rtdstopts);
4711 4722 tcp->tcp_rtdstopts = NULL;
4712 4723 tcp->tcp_rtdstoptslen = 0;
4713 4724 }
4714 4725 ASSERT(tcp->tcp_rtdstoptslen == 0);
4715 4726 if (tcp->tcp_rthdr != NULL) {
4716 4727 mi_free(tcp->tcp_rthdr);
4717 4728 tcp->tcp_rthdr = NULL;
4718 4729 tcp->tcp_rthdrlen = 0;
4719 4730 }
4720 4731 ASSERT(tcp->tcp_rthdrlen == 0);
4721 4732
4722 4733 ipp = &tcp->tcp_sticky_ipp;
4723 4734 if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS |
4724 4735 IPPF_RTHDR))
4725 4736 ip6_pkt_free(ipp);
4726 4737
4727 4738 /*
4728 4739 * Free memory associated with the tcp/ip header template.
4729 4740 */
4730 4741
4731 4742 if (tcp->tcp_iphc != NULL)
4732 4743 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
4733 4744
4734 4745 /*
4735 4746 * Following is really a blowing away a union.
4736 4747 * It happens to have exactly two members of identical size
4737 4748 * the following code is enough.
4738 4749 */
4739 4750 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
4740 4751
4741 4752 if (tcp->tcp_tracebuf != NULL) {
4742 4753 kmem_free(tcp->tcp_tracebuf, sizeof (tcptrch_t));
4743 4754 tcp->tcp_tracebuf = NULL;
4744 4755 }
4745 4756 }
4746 4757
4747 4758
4748 4759 /*
4749 4760 * Put a connection confirmation message upstream built from the
4750 4761 * address information within 'iph' and 'tcph'. Report our success or failure.
4751 4762 */
4752 4763 static boolean_t
4753 4764 tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
4754 4765 mblk_t **defermp)
4755 4766 {
4756 4767 sin_t sin;
4757 4768 sin6_t sin6;
4758 4769 mblk_t *mp;
4759 4770 char *optp = NULL;
4760 4771 int optlen = 0;
4761 4772 cred_t *cr;
4762 4773
4763 4774 if (defermp != NULL)
4764 4775 *defermp = NULL;
4765 4776
4766 4777 if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
4767 4778 /*
4768 4779 * Return in T_CONN_CON results of option negotiation through
4769 4780 * the T_CONN_REQ. Note: If there is an real end-to-end option
4770 4781 * negotiation, then what is received from remote end needs
4771 4782 * to be taken into account but there is no such thing (yet?)
4772 4783 * in our TCP/IP.
4773 4784 * Note: We do not use mi_offset_param() here as
4774 4785 * tcp_opts_conn_req contents do not directly come from
4775 4786 * an application and are either generated in kernel or
4776 4787 * from user input that was already verified.
4777 4788 */
4778 4789 mp = tcp->tcp_conn.tcp_opts_conn_req;
4779 4790 optp = (char *)(mp->b_rptr +
4780 4791 ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
4781 4792 optlen = (int)
4782 4793 ((struct T_conn_req *)mp->b_rptr)->OPT_length;
4783 4794 }
4784 4795
4785 4796 if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
4786 4797 ipha_t *ipha = (ipha_t *)iphdr;
4787 4798
4788 4799 /* packet is IPv4 */
4789 4800 if (tcp->tcp_family == AF_INET) {
4790 4801 sin = sin_null;
4791 4802 sin.sin_addr.s_addr = ipha->ipha_src;
4792 4803 sin.sin_port = *(uint16_t *)tcph->th_lport;
4793 4804 sin.sin_family = AF_INET;
4794 4805 mp = mi_tpi_conn_con(NULL, (char *)&sin,
4795 4806 (int)sizeof (sin_t), optp, optlen);
4796 4807 } else {
4797 4808 sin6 = sin6_null;
4798 4809 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4799 4810 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4800 4811 sin6.sin6_family = AF_INET6;
4801 4812 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4802 4813 (int)sizeof (sin6_t), optp, optlen);
4803 4814
4804 4815 }
4805 4816 } else {
4806 4817 ip6_t *ip6h = (ip6_t *)iphdr;
4807 4818
4808 4819 ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
4809 4820 ASSERT(tcp->tcp_family == AF_INET6);
4810 4821 sin6 = sin6_null;
4811 4822 sin6.sin6_addr = ip6h->ip6_src;
4812 4823 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4813 4824 sin6.sin6_family = AF_INET6;
4814 4825 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4815 4826 mp = mi_tpi_conn_con(NULL, (char *)&sin6,
4816 4827 (int)sizeof (sin6_t), optp, optlen);
4817 4828 }
4818 4829
4819 4830 if (!mp)
4820 4831 return (B_FALSE);
4821 4832
4822 4833 if ((cr = DB_CRED(idmp)) != NULL) {
4823 4834 mblk_setcred(mp, cr);
4824 4835 DB_CPID(mp) = DB_CPID(idmp);
4825 4836 }
4826 4837
4827 4838 if (defermp == NULL)
4828 4839 putnext(tcp->tcp_rq, mp);
4829 4840 else
4830 4841 *defermp = mp;
4831 4842
4832 4843 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
4833 4844 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
4834 4845 return (B_TRUE);
4835 4846 }
4836 4847
4837 4848 /*
4838 4849 * Defense for the SYN attack -
4839 4850 * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
4840 4851 * one from the list of droppable eagers. This list is a subset of q0.
4841 4852 * see comments before the definition of MAKE_DROPPABLE().
4842 4853 * 2. Don't drop a SYN request before its first timeout. This gives every
4843 4854 * request at least til the first timeout to complete its 3-way handshake.
4844 4855 * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
4845 4856 * requests currently on the queue that has timed out. This will be used
4846 4857 * as an indicator of whether an attack is under way, so that appropriate
4847 4858 * actions can be taken. (It's incremented in tcp_timer() and decremented
4848 4859 * either when eager goes into ESTABLISHED, or gets freed up.)
4849 4860 * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on
4850 4861 * # of timeout drops back to <= q0len/32 => SYN alert off
4851 4862 */
4852 4863 static boolean_t
4853 4864 tcp_drop_q0(tcp_t *tcp)
4854 4865 {
4855 4866 tcp_t *eager;
4856 4867 mblk_t *mp;
4857 4868 tcp_stack_t *tcps = tcp->tcp_tcps;
4858 4869
4859 4870 ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
4860 4871 ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
4861 4872
4862 4873 /* Pick oldest eager from the list of droppable eagers */
4863 4874 eager = tcp->tcp_eager_prev_drop_q0;
4864 4875
4865 4876 /* If list is empty. return B_FALSE */
4866 4877 if (eager == tcp) {
4867 4878 return (B_FALSE);
4868 4879 }
4869 4880
4870 4881 /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
4871 4882 if ((mp = allocb(0, BPRI_HI)) == NULL)
4872 4883 return (B_FALSE);
4873 4884
4874 4885 /*
4875 4886 * Take this eager out from the list of droppable eagers since we are
4876 4887 * going to drop it.
4877 4888 */
4878 4889 MAKE_UNDROPPABLE(eager);
4879 4890
4880 4891 if (tcp->tcp_debug) {
4881 4892 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
4882 4893 "tcp_drop_q0: listen half-open queue (max=%d) overflow"
4883 4894 " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0,
4884 4895 tcp->tcp_conn_req_cnt_q0,
4885 4896 tcp_display(tcp, NULL, DISP_PORT_ONLY));
4886 4897 }
4887 4898
4888 4899 BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop);
4889 4900
4890 4901 /* Put a reference on the conn as we are enqueueing it in the sqeue */
4891 4902 CONN_INC_REF(eager->tcp_connp);
4892 4903
4893 4904 /* Mark the IRE created for this SYN request temporary */
4894 4905 tcp_ip_ire_mark_advice(eager);
4895 4906 squeue_fill(eager->tcp_connp->conn_sqp, mp,
4896 4907 tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0);
4897 4908
4898 4909 return (B_TRUE);
4899 4910 }
4900 4911
4901 4912 int
4902 4913 tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
4903 4914 tcph_t *tcph, uint_t ipvers, mblk_t *idmp)
4904 4915 {
4905 4916 tcp_t *ltcp = lconnp->conn_tcp;
4906 4917 tcp_t *tcp = connp->conn_tcp;
4907 4918 mblk_t *tpi_mp;
4908 4919 ipha_t *ipha;
4909 4920 ip6_t *ip6h;
4910 4921 sin6_t sin6;
4911 4922 in6_addr_t v6dst;
4912 4923 int err;
4913 4924 int ifindex = 0;
4914 4925 cred_t *cr;
4915 4926 tcp_stack_t *tcps = tcp->tcp_tcps;
4916 4927
4917 4928 if (ipvers == IPV4_VERSION) {
4918 4929 ipha = (ipha_t *)mp->b_rptr;
4919 4930
4920 4931 connp->conn_send = ip_output;
4921 4932 connp->conn_recv = tcp_input;
4922 4933
4923 4934 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
4924 4935 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
4925 4936
4926 4937 sin6 = sin6_null;
4927 4938 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr);
4928 4939 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);
4929 4940 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4930 4941 sin6.sin6_family = AF_INET6;
4931 4942 sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst,
4932 4943 lconnp->conn_zoneid, tcps->tcps_netstack);
4933 4944 if (tcp->tcp_recvdstaddr) {
4934 4945 sin6_t sin6d;
4935 4946
4936 4947 sin6d = sin6_null;
4937 4948 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst,
4938 4949 &sin6d.sin6_addr);
4939 4950 sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4940 4951 sin6d.sin6_family = AF_INET;
4941 4952 tpi_mp = mi_tpi_extconn_ind(NULL,
4942 4953 (char *)&sin6d, sizeof (sin6_t),
4943 4954 (char *)&tcp,
4944 4955 (t_scalar_t)sizeof (intptr_t),
4945 4956 (char *)&sin6d, sizeof (sin6_t),
4946 4957 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4947 4958 } else {
4948 4959 tpi_mp = mi_tpi_conn_ind(NULL,
4949 4960 (char *)&sin6, sizeof (sin6_t),
4950 4961 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4951 4962 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4952 4963 }
4953 4964 } else {
4954 4965 ip6h = (ip6_t *)mp->b_rptr;
4955 4966
4956 4967 connp->conn_send = ip_output_v6;
4957 4968 connp->conn_recv = tcp_input;
4958 4969
4959 4970 connp->conn_srcv6 = ip6h->ip6_dst;
4960 4971 connp->conn_remv6 = ip6h->ip6_src;
4961 4972
4962 4973 /* db_cksumstuff is set at ip_fanout_tcp_v6 */
4963 4974 ifindex = (int)DB_CKSUMSTUFF(mp);
4964 4975 DB_CKSUMSTUFF(mp) = 0;
4965 4976
4966 4977 sin6 = sin6_null;
4967 4978 sin6.sin6_addr = ip6h->ip6_src;
4968 4979 sin6.sin6_port = *(uint16_t *)tcph->th_lport;
4969 4980 sin6.sin6_family = AF_INET6;
4970 4981 sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
4971 4982 sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst,
4972 4983 lconnp->conn_zoneid, tcps->tcps_netstack);
4973 4984
4974 4985 if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
4975 4986 /* Pass up the scope_id of remote addr */
4976 4987 sin6.sin6_scope_id = ifindex;
4977 4988 } else {
4978 4989 sin6.sin6_scope_id = 0;
4979 4990 }
4980 4991 if (tcp->tcp_recvdstaddr) {
4981 4992 sin6_t sin6d;
4982 4993
4983 4994 sin6d = sin6_null;
4984 4995 sin6.sin6_addr = ip6h->ip6_dst;
4985 4996 sin6d.sin6_port = *(uint16_t *)tcph->th_fport;
4986 4997 sin6d.sin6_family = AF_INET;
4987 4998 tpi_mp = mi_tpi_extconn_ind(NULL,
4988 4999 (char *)&sin6d, sizeof (sin6_t),
4989 5000 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4990 5001 (char *)&sin6d, sizeof (sin6_t),
4991 5002 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4992 5003 } else {
4993 5004 tpi_mp = mi_tpi_conn_ind(NULL,
4994 5005 (char *)&sin6, sizeof (sin6_t),
4995 5006 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
4996 5007 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
4997 5008 }
4998 5009 }
4999 5010
5000 5011 if (tpi_mp == NULL)
5001 5012 return (ENOMEM);
5002 5013
5003 5014 connp->conn_fport = *(uint16_t *)tcph->th_lport;
5004 5015 connp->conn_lport = *(uint16_t *)tcph->th_fport;
5005 5016 connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER);
5006 5017 connp->conn_fully_bound = B_FALSE;
5007 5018
5008 5019 if (tcps->tcps_trace)
5009 5020 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
5010 5021
5011 5022 /* Inherit information from the "parent" */
5012 5023 tcp->tcp_ipversion = ltcp->tcp_ipversion;
5013 5024 tcp->tcp_family = ltcp->tcp_family;
5014 5025 tcp->tcp_wq = ltcp->tcp_wq;
5015 5026 tcp->tcp_rq = ltcp->tcp_rq;
5016 5027 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
5017 5028 tcp->tcp_detached = B_TRUE;
5018 5029 if ((err = tcp_init_values(tcp)) != 0) {
5019 5030 freemsg(tpi_mp);
5020 5031 return (err);
5021 5032 }
5022 5033
5023 5034 if (ipvers == IPV4_VERSION) {
5024 5035 if ((err = tcp_header_init_ipv4(tcp)) != 0) {
5025 5036 freemsg(tpi_mp);
5026 5037 return (err);
5027 5038 }
5028 5039 ASSERT(tcp->tcp_ipha != NULL);
5029 5040 } else {
5030 5041 /* ifindex must be already set */
5031 5042 ASSERT(ifindex != 0);
5032 5043
5033 5044 if (ltcp->tcp_bound_if != 0) {
5034 5045 /*
5035 5046 * Set newtcp's bound_if equal to
5036 5047 * listener's value. If ifindex is
5037 5048 * not the same as ltcp->tcp_bound_if,
5038 5049 * it must be a packet for the ipmp group
5039 5050 * of interfaces
5040 5051 */
5041 5052 tcp->tcp_bound_if = ltcp->tcp_bound_if;
5042 5053 } else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) {
5043 5054 tcp->tcp_bound_if = ifindex;
5044 5055 }
5045 5056
5046 5057 tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary;
5047 5058 tcp->tcp_recvifindex = 0;
5048 5059 tcp->tcp_recvhops = 0xffffffffU;
5049 5060 ASSERT(tcp->tcp_ip6h != NULL);
5050 5061 }
5051 5062
5052 5063 tcp->tcp_lport = ltcp->tcp_lport;
5053 5064
5054 5065 if (ltcp->tcp_ipversion == tcp->tcp_ipversion) {
5055 5066 if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) {
5056 5067 /*
5057 5068 * Listener had options of some sort; eager inherits.
5058 5069 * Free up the eager template and allocate one
5059 5070 * of the right size.
5060 5071 */
5061 5072 if (tcp->tcp_hdr_grown) {
5062 5073 kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len);
5063 5074 } else {
5064 5075 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
5065 5076 kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc);
5066 5077 }
5067 5078 tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len,
5068 5079 KM_NOSLEEP);
5069 5080 if (tcp->tcp_iphc == NULL) {
5070 5081 tcp->tcp_iphc_len = 0;
5071 5082 freemsg(tpi_mp);
5072 5083 return (ENOMEM);
5073 5084 }
5074 5085 tcp->tcp_iphc_len = ltcp->tcp_iphc_len;
5075 5086 tcp->tcp_hdr_grown = B_TRUE;
5076 5087 }
5077 5088 tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5078 5089 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5079 5090 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5080 5091 tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops;
5081 5092 tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf;
5082 5093
5083 5094 /*
5084 5095 * Copy the IP+TCP header template from listener to eager
5085 5096 */
5086 5097 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5087 5098 if (tcp->tcp_ipversion == IPV6_VERSION) {
5088 5099 if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt ==
5089 5100 IPPROTO_RAW) {
5090 5101 tcp->tcp_ip6h =
5091 5102 (ip6_t *)(tcp->tcp_iphc +
5092 5103 sizeof (ip6i_t));
5093 5104 } else {
5094 5105 tcp->tcp_ip6h =
5095 5106 (ip6_t *)(tcp->tcp_iphc);
5096 5107 }
5097 5108 tcp->tcp_ipha = NULL;
5098 5109 } else {
5099 5110 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5100 5111 tcp->tcp_ip6h = NULL;
5101 5112 }
5102 5113 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5103 5114 tcp->tcp_ip_hdr_len);
5104 5115 } else {
5105 5116 /*
5106 5117 * only valid case when ipversion of listener and
5107 5118 * eager differ is when listener is IPv6 and
5108 5119 * eager is IPv4.
5109 5120 * Eager header template has been initialized to the
5110 5121 * maximum v4 header sizes, which includes space for
5111 5122 * TCP and IP options.
5112 5123 */
5113 5124 ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) &&
5114 5125 (tcp->tcp_ipversion == IPV4_VERSION));
5115 5126 ASSERT(tcp->tcp_iphc_len >=
5116 5127 TCP_MAX_COMBINED_HEADER_LENGTH);
5117 5128 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5118 5129 /* copy IP header fields individually */
5119 5130 tcp->tcp_ipha->ipha_ttl =
5120 5131 ltcp->tcp_ip6h->ip6_hops;
5121 5132 bcopy(ltcp->tcp_tcph->th_lport,
5122 5133 tcp->tcp_tcph->th_lport, sizeof (ushort_t));
5123 5134 }
5124 5135
5125 5136 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5126 5137 bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport,
5127 5138 sizeof (in_port_t));
5128 5139
5129 5140 if (ltcp->tcp_lport == 0) {
5130 5141 tcp->tcp_lport = *(in_port_t *)tcph->th_fport;
5131 5142 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport,
5132 5143 sizeof (in_port_t));
5133 5144 }
5134 5145
5135 5146 if (tcp->tcp_ipversion == IPV4_VERSION) {
5136 5147 ASSERT(ipha != NULL);
5137 5148 tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5138 5149 tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5139 5150
5140 5151 /* Source routing option copyover (reverse it) */
5141 5152 if (tcps->tcps_rev_src_routes)
5142 5153 tcp_opt_reverse(tcp, ipha);
5143 5154 } else {
5144 5155 ASSERT(ip6h != NULL);
5145 5156 tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src;
5146 5157 tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst;
5147 5158 }
5148 5159
5149 5160 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5150 5161 ASSERT(!tcp->tcp_tconnind_started);
5151 5162 /*
5152 5163 * If the SYN contains a credential, it's a loopback packet; attach
5153 5164 * the credential to the TPI message.
5154 5165 */
5155 5166 if ((cr = DB_CRED(idmp)) != NULL) {
5156 5167 mblk_setcred(tpi_mp, cr);
5157 5168 DB_CPID(tpi_mp) = DB_CPID(idmp);
5158 5169 }
5159 5170 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5160 5171
5161 5172 /* Inherit the listener's SSL protection state */
5162 5173
5163 5174 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5164 5175 kssl_hold_ent(tcp->tcp_kssl_ent);
5165 5176 tcp->tcp_kssl_pending = B_TRUE;
5166 5177 }
5167 5178
5168 5179 return (0);
5169 5180 }
5170 5181
5171 5182
5172 5183 int
5173 5184 tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
5174 5185 tcph_t *tcph, mblk_t *idmp)
5175 5186 {
5176 5187 tcp_t *ltcp = lconnp->conn_tcp;
5177 5188 tcp_t *tcp = connp->conn_tcp;
5178 5189 sin_t sin;
5179 5190 mblk_t *tpi_mp = NULL;
5180 5191 int err;
5181 5192 cred_t *cr;
5182 5193 tcp_stack_t *tcps = tcp->tcp_tcps;
5183 5194
5184 5195 sin = sin_null;
5185 5196 sin.sin_addr.s_addr = ipha->ipha_src;
5186 5197 sin.sin_port = *(uint16_t *)tcph->th_lport;
5187 5198 sin.sin_family = AF_INET;
5188 5199 if (ltcp->tcp_recvdstaddr) {
5189 5200 sin_t sind;
5190 5201
5191 5202 sind = sin_null;
5192 5203 sind.sin_addr.s_addr = ipha->ipha_dst;
5193 5204 sind.sin_port = *(uint16_t *)tcph->th_fport;
5194 5205 sind.sin_family = AF_INET;
5195 5206 tpi_mp = mi_tpi_extconn_ind(NULL,
5196 5207 (char *)&sind, sizeof (sin_t), (char *)&tcp,
5197 5208 (t_scalar_t)sizeof (intptr_t), (char *)&sind,
5198 5209 sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5199 5210 } else {
5200 5211 tpi_mp = mi_tpi_conn_ind(NULL,
5201 5212 (char *)&sin, sizeof (sin_t),
5202 5213 (char *)&tcp, (t_scalar_t)sizeof (intptr_t),
5203 5214 (t_scalar_t)ltcp->tcp_conn_req_seqnum);
5204 5215 }
5205 5216
5206 5217 if (tpi_mp == NULL) {
5207 5218 return (ENOMEM);
5208 5219 }
5209 5220
5210 5221 connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER);
5211 5222 connp->conn_send = ip_output;
5212 5223 connp->conn_recv = tcp_input;
5213 5224 connp->conn_fully_bound = B_FALSE;
5214 5225
5215 5226 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6);
5216 5227 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6);
5217 5228 connp->conn_fport = *(uint16_t *)tcph->th_lport;
5218 5229 connp->conn_lport = *(uint16_t *)tcph->th_fport;
5219 5230
5220 5231 if (tcps->tcps_trace) {
5221 5232 tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP);
5222 5233 }
5223 5234
5224 5235 /* Inherit information from the "parent" */
5225 5236 tcp->tcp_ipversion = ltcp->tcp_ipversion;
5226 5237 tcp->tcp_family = ltcp->tcp_family;
5227 5238 tcp->tcp_wq = ltcp->tcp_wq;
5228 5239 tcp->tcp_rq = ltcp->tcp_rq;
5229 5240 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
5230 5241 tcp->tcp_detached = B_TRUE;
5231 5242 if ((err = tcp_init_values(tcp)) != 0) {
5232 5243 freemsg(tpi_mp);
5233 5244 return (err);
5234 5245 }
5235 5246
5236 5247 /*
5237 5248 * Let's make sure that eager tcp template has enough space to
5238 5249 * copy IPv4 listener's tcp template. Since the conn_t structure is
5239 5250 * preserved and tcp_iphc_len is also preserved, an eager conn_t may
5240 5251 * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or
5241 5252 * more (in case of re-allocation of conn_t with tcp-IPv6 template with
5242 5253 * extension headers or with ip6i_t struct). Note that bcopy() below
5243 5254 * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_
5244 5255 * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener.
5245 5256 */
5246 5257 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
5247 5258 ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH);
5248 5259
5249 5260 tcp->tcp_hdr_len = ltcp->tcp_hdr_len;
5250 5261 tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len;
5251 5262 tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len;
5252 5263 tcp->tcp_ttl = ltcp->tcp_ttl;
5253 5264 tcp->tcp_tos = ltcp->tcp_tos;
5254 5265
5255 5266 /* Copy the IP+TCP header template from listener to eager */
5256 5267 bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len);
5257 5268 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
5258 5269 tcp->tcp_ip6h = NULL;
5259 5270 tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc +
5260 5271 tcp->tcp_ip_hdr_len);
5261 5272
5262 5273 /* Initialize the IP addresses and Ports */
5263 5274 tcp->tcp_ipha->ipha_dst = ipha->ipha_src;
5264 5275 tcp->tcp_ipha->ipha_src = ipha->ipha_dst;
5265 5276 bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t));
5266 5277 bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t));
5267 5278
5268 5279 /* Source routing option copyover (reverse it) */
5269 5280 if (tcps->tcps_rev_src_routes)
5270 5281 tcp_opt_reverse(tcp, ipha);
5271 5282
5272 5283 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
5273 5284 ASSERT(!tcp->tcp_tconnind_started);
5274 5285
5275 5286 /*
5276 5287 * If the SYN contains a credential, it's a loopback packet; attach
5277 5288 * the credential to the TPI message.
5278 5289 */
5279 5290 if ((cr = DB_CRED(idmp)) != NULL) {
5280 5291 mblk_setcred(tpi_mp, cr);
5281 5292 DB_CPID(tpi_mp) = DB_CPID(idmp);
5282 5293 }
5283 5294 tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp;
5284 5295
5285 5296 /* Inherit the listener's SSL protection state */
5286 5297 if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) {
5287 5298 kssl_hold_ent(tcp->tcp_kssl_ent);
5288 5299 tcp->tcp_kssl_pending = B_TRUE;
5289 5300 }
5290 5301
5291 5302 return (0);
5292 5303 }
5293 5304
5294 5305 /*
5295 5306 * sets up conn for ipsec.
5296 5307 * if the first mblk is M_CTL it is consumed and mpp is updated.
5297 5308 * in case of error mpp is freed.
5298 5309 */
5299 5310 conn_t *
5300 5311 tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp)
5301 5312 {
5302 5313 conn_t *connp = tcp->tcp_connp;
5303 5314 conn_t *econnp;
5304 5315 squeue_t *new_sqp;
5305 5316 mblk_t *first_mp = *mpp;
5306 5317 mblk_t *mp = *mpp;
5307 5318 boolean_t mctl_present = B_FALSE;
5308 5319 uint_t ipvers;
5309 5320
5310 5321 econnp = tcp_get_conn(sqp, tcp->tcp_tcps);
5311 5322 if (econnp == NULL) {
5312 5323 freemsg(first_mp);
5313 5324 return (NULL);
5314 5325 }
5315 5326 if (DB_TYPE(mp) == M_CTL) {
5316 5327 if (mp->b_cont == NULL ||
5317 5328 mp->b_cont->b_datap->db_type != M_DATA) {
5318 5329 freemsg(first_mp);
5319 5330 return (NULL);
5320 5331 }
5321 5332 mp = mp->b_cont;
5322 5333 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) {
5323 5334 freemsg(first_mp);
5324 5335 return (NULL);
5325 5336 }
5326 5337
5327 5338 mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
5328 5339 first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5329 5340 mctl_present = B_TRUE;
5330 5341 } else {
5331 5342 ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY);
5332 5343 mp->b_datap->db_struioflag &= ~STRUIO_POLICY;
5333 5344 }
5334 5345
5335 5346 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5336 5347 DB_CKSUMSTART(mp) = 0;
5337 5348
5338 5349 ASSERT(OK_32PTR(mp->b_rptr));
5339 5350 ipvers = IPH_HDR_VERSION(mp->b_rptr);
5340 5351 if (ipvers == IPV4_VERSION) {
5341 5352 uint16_t *up;
5342 5353 uint32_t ports;
5343 5354 ipha_t *ipha;
5344 5355
5345 5356 ipha = (ipha_t *)mp->b_rptr;
5346 5357 up = (uint16_t *)((uchar_t *)ipha +
5347 5358 IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET);
5348 5359 ports = *(uint32_t *)up;
5349 5360 IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP,
5350 5361 ipha->ipha_dst, ipha->ipha_src, ports);
5351 5362 } else {
5352 5363 uint16_t *up;
5353 5364 uint32_t ports;
5354 5365 uint16_t ip_hdr_len;
5355 5366 uint8_t *nexthdrp;
5356 5367 ip6_t *ip6h;
5357 5368 tcph_t *tcph;
5358 5369
5359 5370 ip6h = (ip6_t *)mp->b_rptr;
5360 5371 if (ip6h->ip6_nxt == IPPROTO_TCP) {
5361 5372 ip_hdr_len = IPV6_HDR_LEN;
5362 5373 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len,
5363 5374 &nexthdrp) || *nexthdrp != IPPROTO_TCP) {
5364 5375 CONN_DEC_REF(econnp);
5365 5376 freemsg(first_mp);
5366 5377 return (NULL);
5367 5378 }
5368 5379 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5369 5380 up = (uint16_t *)tcph->th_lport;
5370 5381 ports = *(uint32_t *)up;
5371 5382 IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP,
5372 5383 ip6h->ip6_dst, ip6h->ip6_src, ports);
5373 5384 }
5374 5385
5375 5386 /*
5376 5387 * The caller already ensured that there is a sqp present.
5377 5388 */
5378 5389 econnp->conn_sqp = new_sqp;
5379 5390
5380 5391 if (connp->conn_policy != NULL) {
5381 5392 ipsec_in_t *ii;
5382 5393 ii = (ipsec_in_t *)(first_mp->b_rptr);
5383 5394 ASSERT(ii->ipsec_in_policy == NULL);
5384 5395 IPPH_REFHOLD(connp->conn_policy);
5385 5396 ii->ipsec_in_policy = connp->conn_policy;
5386 5397
5387 5398 first_mp->b_datap->db_type = IPSEC_POLICY_SET;
5388 5399 if (!ip_bind_ipsec_policy_set(econnp, first_mp)) {
5389 5400 CONN_DEC_REF(econnp);
5390 5401 freemsg(first_mp);
5391 5402 return (NULL);
5392 5403 }
5393 5404 }
5394 5405
5395 5406 if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) {
5396 5407 CONN_DEC_REF(econnp);
5397 5408 freemsg(first_mp);
5398 5409 return (NULL);
5399 5410 }
5400 5411
5401 5412 /*
5402 5413 * If we know we have some policy, pass the "IPSEC"
5403 5414 * options size TCP uses this adjust the MSS.
5404 5415 */
5405 5416 econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp);
5406 5417 if (mctl_present) {
5407 5418 freeb(first_mp);
5408 5419 *mpp = mp;
5409 5420 }
5410 5421
5411 5422 return (econnp);
5412 5423 }
5413 5424
5414 5425 /*
5415 5426 * tcp_get_conn/tcp_free_conn
5416 5427 *
5417 5428 * tcp_get_conn is used to get a clean tcp connection structure.
5418 5429 * It tries to reuse the connections put on the freelist by the
5419 5430 * time_wait_collector failing which it goes to kmem_cache. This
5420 5431 * way has two benefits compared to just allocating from and
5421 5432 * freeing to kmem_cache.
5422 5433 * 1) The time_wait_collector can free (which includes the cleanup)
5423 5434 * outside the squeue. So when the interrupt comes, we have a clean
5424 5435 * connection sitting in the freelist. Obviously, this buys us
5425 5436 * performance.
5426 5437 *
5427 5438 * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request
5428 5439 * has multiple disadvantages - tying up the squeue during alloc, and the
5429 5440 * fact that IPSec policy initialization has to happen here which
5430 5441 * requires us sending a M_CTL and checking for it i.e. real ugliness.
5431 5442 * But allocating the conn/tcp in IP land is also not the best since
5432 5443 * we can't check the 'q' and 'q0' which are protected by squeue and
5433 5444 * blindly allocate memory which might have to be freed here if we are
5434 5445 * not allowed to accept the connection. By using the freelist and
5435 5446 * putting the conn/tcp back in freelist, we don't pay a penalty for
5436 5447 * allocating memory without checking 'q/q0' and freeing it if we can't
5437 5448 * accept the connection.
5438 5449 *
5439 5450 * Care should be taken to put the conn back in the same squeue's freelist
5440 5451 * from which it was allocated. Best results are obtained if conn is
5441 5452 * allocated from listener's squeue and freed to the same. Time wait
5442 5453 * collector will free up the freelist is the connection ends up sitting
5443 5454 * there for too long.
5444 5455 */
5445 5456 void *
5446 5457 tcp_get_conn(void *arg, tcp_stack_t *tcps)
5447 5458 {
5448 5459 tcp_t *tcp = NULL;
5449 5460 conn_t *connp = NULL;
5450 5461 squeue_t *sqp = (squeue_t *)arg;
5451 5462 tcp_squeue_priv_t *tcp_time_wait;
5452 5463 netstack_t *ns;
5453 5464
5454 5465 tcp_time_wait =
5455 5466 *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
5456 5467
5457 5468 mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
5458 5469 tcp = tcp_time_wait->tcp_free_list;
5459 5470 ASSERT((tcp != NULL) ^ (tcp_time_wait->tcp_free_list_cnt == 0));
5460 5471 if (tcp != NULL) {
5461 5472 tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
5462 5473 tcp_time_wait->tcp_free_list_cnt--;
5463 5474 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5464 5475 tcp->tcp_time_wait_next = NULL;
5465 5476 connp = tcp->tcp_connp;
5466 5477 connp->conn_flags |= IPCL_REUSED;
5467 5478
5468 5479 ASSERT(tcp->tcp_tcps == NULL);
5469 5480 ASSERT(connp->conn_netstack == NULL);
5470 5481 ns = tcps->tcps_netstack;
5471 5482 netstack_hold(ns);
5472 5483 connp->conn_netstack = ns;
5473 5484 tcp->tcp_tcps = tcps;
5474 5485 TCPS_REFHOLD(tcps);
5475 5486 ipcl_globalhash_insert(connp);
5476 5487 return ((void *)connp);
5477 5488 }
5478 5489 mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
5479 5490 if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP,
5480 5491 tcps->tcps_netstack)) == NULL)
5481 5492 return (NULL);
5482 5493 tcp = connp->conn_tcp;
5483 5494 tcp->tcp_tcps = tcps;
5484 5495 TCPS_REFHOLD(tcps);
5485 5496 return ((void *)connp);
5486 5497 }
5487 5498
5488 5499 /*
5489 5500 * Update the cached label for the given tcp_t. This should be called once per
5490 5501 * connection, and before any packets are sent or tcp_process_options is
5491 5502 * invoked. Returns B_FALSE if the correct label could not be constructed.
5492 5503 */
5493 5504 static boolean_t
5494 5505 tcp_update_label(tcp_t *tcp, const cred_t *cr)
5495 5506 {
5496 5507 conn_t *connp = tcp->tcp_connp;
5497 5508
5498 5509 if (tcp->tcp_ipversion == IPV4_VERSION) {
5499 5510 uchar_t optbuf[IP_MAX_OPT_LENGTH];
5500 5511 int added;
5501 5512
5502 5513 if (tsol_compute_label(cr, tcp->tcp_remote, optbuf,
5503 5514 connp->conn_mac_exempt,
5504 5515 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
5505 5516 return (B_FALSE);
5506 5517
5507 5518 added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len);
5508 5519 if (added == -1)
5509 5520 return (B_FALSE);
5510 5521 tcp->tcp_hdr_len += added;
5511 5522 tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added);
5512 5523 tcp->tcp_ip_hdr_len += added;
5513 5524 if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) {
5514 5525 tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3;
5515 5526 added = tsol_prepend_option(optbuf, tcp->tcp_ipha,
5516 5527 tcp->tcp_hdr_len);
5517 5528 if (added == -1)
5518 5529 return (B_FALSE);
5519 5530 tcp->tcp_hdr_len += added;
5520 5531 tcp->tcp_tcph = (tcph_t *)
5521 5532 ((uchar_t *)tcp->tcp_tcph + added);
5522 5533 tcp->tcp_ip_hdr_len += added;
5523 5534 }
5524 5535 } else {
5525 5536 uchar_t optbuf[TSOL_MAX_IPV6_OPTION];
5526 5537
5527 5538 if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf,
5528 5539 connp->conn_mac_exempt,
5529 5540 tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0)
5530 5541 return (B_FALSE);
5531 5542 if (tsol_update_sticky(&tcp->tcp_sticky_ipp,
5532 5543 &tcp->tcp_label_len, optbuf) != 0)
5533 5544 return (B_FALSE);
5534 5545 if (tcp_build_hdrs(tcp->tcp_rq, tcp) != 0)
5535 5546 return (B_FALSE);
5536 5547 }
5537 5548
5538 5549 connp->conn_ulp_labeled = 1;
5539 5550
5540 5551 return (B_TRUE);
5541 5552 }
5542 5553
5543 5554 /* BEGIN CSTYLED */
5544 5555 /*
5545 5556 *
5546 5557 * The sockfs ACCEPT path:
5547 5558 * =======================
5548 5559 *
5549 5560 * The eager is now established in its own perimeter as soon as SYN is
5550 5561 * received in tcp_conn_request(). When sockfs receives conn_ind, it
5551 5562 * completes the accept processing on the acceptor STREAM. The sending
5552 5563 * of conn_ind part is common for both sockfs listener and a TLI/XTI
5553 5564 * listener but a TLI/XTI listener completes the accept processing
5554 5565 * on the listener perimeter.
5555 5566 *
5556 5567 * Common control flow for 3 way handshake:
5557 5568 * ----------------------------------------
5558 5569 *
5559 5570 * incoming SYN (listener perimeter) -> tcp_rput_data()
5560 5571 * -> tcp_conn_request()
5561 5572 *
5562 5573 * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data()
5563 5574 * send T_CONN_IND (listener perim) -> tcp_send_conn_ind()
5564 5575 *
5565 5576 * Sockfs ACCEPT Path:
5566 5577 * -------------------
5567 5578 *
5568 5579 * open acceptor stream (tcp_open allocates tcp_wput_accept()
5569 5580 * as STREAM entry point)
5570 5581 *
5571 5582 * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept()
5572 5583 *
5573 5584 * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager
5574 5585 * association (we are not behind eager's squeue but sockfs is protecting us
5575 5586 * and no one knows about this stream yet). The STREAMS entry point q->q_info
5576 5587 * is changed to point at tcp_wput().
5577 5588 *
5578 5589 * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to
5579 5590 * listener (done on listener's perimeter).
5580 5591 *
5581 5592 * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish
5582 5593 * accept.
5583 5594 *
5584 5595 * TLI/XTI client ACCEPT path:
5585 5596 * ---------------------------
5586 5597 *
5587 5598 * soaccept() sends T_CONN_RES on the listener STREAM.
5588 5599 *
5589 5600 * tcp_accept() -> tcp_accept_swap() complete the processing and send
5590 5601 * the bind_mp to eager perimeter to finish accept (tcp_rput_other()).
5591 5602 *
5592 5603 * Locks:
5593 5604 * ======
5594 5605 *
5595 5606 * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and
5596 5607 * listeners->tcp_eager_next_q.
5597 5608 *
5598 5609 * Referencing:
5599 5610 * ============
5600 5611 *
5601 5612 * 1) We start out in tcp_conn_request by eager placing a ref on
5602 5613 * listener and listener adding eager to listeners->tcp_eager_next_q0.
5603 5614 *
5604 5615 * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before
5605 5616 * doing so we place a ref on the eager. This ref is finally dropped at the
5606 5617 * end of tcp_accept_finish() while unwinding from the squeue, i.e. the
5607 5618 * reference is dropped by the squeue framework.
5608 5619 *
5609 5620 * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish
5610 5621 *
5611 5622 * The reference must be released by the same entity that added the reference
5612 5623 * In the above scheme, the eager is the entity that adds and releases the
5613 5624 * references. Note that tcp_accept_finish executes in the squeue of the eager
5614 5625 * (albeit after it is attached to the acceptor stream). Though 1. executes
5615 5626 * in the listener's squeue, the eager is nascent at this point and the
5616 5627 * reference can be considered to have been added on behalf of the eager.
5617 5628 *
5618 5629 * Eager getting a Reset or listener closing:
5619 5630 * ==========================================
5620 5631 *
5621 5632 * Once the listener and eager are linked, the listener never does the unlink.
5622 5633 * If the listener needs to close, tcp_eager_cleanup() is called which queues
5623 5634 * a message on each eager's perimeter. The eager then does the unlink, clears
5624 5635 * any pointers to the listener's queue and drops the reference to the
5625 5636 * listener. The listener waits in tcp_close outside the squeue until its
5626 5637 * refcount has dropped to 1. This ensures that the listener has waited for
5627 5638 * all eagers to clear their association with the listener.
5628 5639 *
5629 5640 * Similarly, if eager decides to go away, it can unlink itself and close.
5630 5641 * When the T_CONN_RES comes down, we check if eager has closed. Note that
5631 5642 * the reference to eager is still valid because of the extra ref we put
5632 5643 * in tcp_send_conn_ind.
5633 5644 *
5634 5645 * Listener can always locate the eager under the protection
5635 5646 * of the listener->tcp_eager_lock, and then do a refhold
5636 5647 * on the eager during the accept processing.
5637 5648 *
5638 5649 * The acceptor stream accesses the eager in the accept processing
5639 5650 * based on the ref placed on eager before sending T_conn_ind.
5640 5651 * The only entity that can negate this refhold is a listener close
5641 5652 * which is mutually exclusive with an active acceptor stream.
5642 5653 *
5643 5654 * Eager's reference on the listener
5644 5655 * ===================================
5645 5656 *
5646 5657 * If the accept happens (even on a closed eager) the eager drops its
5647 5658 * reference on the listener at the start of tcp_accept_finish. If the
5648 5659 * eager is killed due to an incoming RST before the T_conn_ind is sent up,
5649 5660 * the reference is dropped in tcp_closei_local. If the listener closes,
5650 5661 * the reference is dropped in tcp_eager_kill. In all cases the reference
5651 5662 * is dropped while executing in the eager's context (squeue).
5652 5663 */
5653 5664 /* END CSTYLED */
5654 5665
5655 5666 /* Process the SYN packet, mp, directed at the listener 'tcp' */
5656 5667
5657 5668 /*
5658 5669 * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN.
5659 5670 * tcp_rput_data will not see any SYN packets.
5660 5671 */
5661 5672 /* ARGSUSED */
5662 5673 void
5663 5674 tcp_conn_request(void *arg, mblk_t *mp, void *arg2)
5664 5675 {
5665 5676 tcph_t *tcph;
5666 5677 uint32_t seg_seq;
5667 5678 tcp_t *eager;
5668 5679 uint_t ipvers;
5669 5680 ipha_t *ipha;
5670 5681 ip6_t *ip6h;
5671 5682 int err;
5672 5683 conn_t *econnp = NULL;
5673 5684 squeue_t *new_sqp;
5674 5685 mblk_t *mp1;
5675 5686 uint_t ip_hdr_len;
5676 5687 conn_t *connp = (conn_t *)arg;
5677 5688 tcp_t *tcp = connp->conn_tcp;
5678 5689 cred_t *credp;
5679 5690 tcp_stack_t *tcps = tcp->tcp_tcps;
5680 5691 ip_stack_t *ipst;
5681 5692
5682 5693 if (tcp->tcp_state != TCPS_LISTEN)
5683 5694 goto error2;
5684 5695
5685 5696 ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0);
5686 5697
5687 5698 mutex_enter(&tcp->tcp_eager_lock);
5688 5699 if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) {
5689 5700 mutex_exit(&tcp->tcp_eager_lock);
5690 5701 TCP_STAT(tcps, tcp_listendrop);
5691 5702 BUMP_MIB(&tcps->tcps_mib, tcpListenDrop);
5692 5703 if (tcp->tcp_debug) {
5693 5704 (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
5694 5705 "tcp_conn_request: listen backlog (max=%d) "
5695 5706 "overflow (%d pending) on %s",
5696 5707 tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q,
5697 5708 tcp_display(tcp, NULL, DISP_PORT_ONLY));
5698 5709 }
5699 5710 goto error2;
5700 5711 }
5701 5712
5702 5713 if (tcp->tcp_conn_req_cnt_q0 >=
5703 5714 tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) {
5704 5715 /*
5705 5716 * Q0 is full. Drop a pending half-open req from the queue
5706 5717 * to make room for the new SYN req. Also mark the time we
5707 5718 * drop a SYN.
5708 5719 *
5709 5720 * A more aggressive defense against SYN attack will
5710 5721 * be to set the "tcp_syn_defense" flag now.
5711 5722 */
5712 5723 TCP_STAT(tcps, tcp_listendropq0);
5713 5724 tcp->tcp_last_rcv_lbolt = lbolt64;
5714 5725 if (!tcp_drop_q0(tcp)) {
5715 5726 mutex_exit(&tcp->tcp_eager_lock);
5716 5727 BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0);
5717 5728 if (tcp->tcp_debug) {
5718 5729 (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
5719 5730 "tcp_conn_request: listen half-open queue "
5720 5731 "(max=%d) full (%d pending) on %s",
5721 5732 tcps->tcps_conn_req_max_q0,
5722 5733 tcp->tcp_conn_req_cnt_q0,
5723 5734 tcp_display(tcp, NULL,
5724 5735 DISP_PORT_ONLY));
5725 5736 }
5726 5737 goto error2;
5727 5738 }
5728 5739 }
5729 5740 mutex_exit(&tcp->tcp_eager_lock);
5730 5741
5731 5742 /*
5732 5743 * IP adds STRUIO_EAGER and ensures that the received packet is
5733 5744 * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6
5734 5745 * link local address. If IPSec is enabled, db_struioflag has
5735 5746 * STRUIO_POLICY set (mutually exclusive with STRUIO_EAGER);
5736 5747 * it is an error case if neither of them is set.
5737 5748 */
5738 5749 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
5739 5750 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
5740 5751 DB_CKSUMSTART(mp) = 0;
5741 5752 mp->b_datap->db_struioflag &= ~STRUIO_EAGER;
5742 5753 econnp = (conn_t *)tcp_get_conn(arg2, tcps);
5743 5754 if (econnp == NULL)
5744 5755 goto error2;
5745 5756 ASSERT(econnp->conn_netstack == connp->conn_netstack);
5746 5757 econnp->conn_sqp = new_sqp;
5747 5758 } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) {
5748 5759 /*
5749 5760 * mp is updated in tcp_get_ipsec_conn().
5750 5761 */
5751 5762 econnp = tcp_get_ipsec_conn(tcp, arg2, &mp);
5752 5763 if (econnp == NULL) {
5753 5764 /*
5754 5765 * mp freed by tcp_get_ipsec_conn.
5755 5766 */
5756 5767 return;
5757 5768 }
5758 5769 ASSERT(econnp->conn_netstack == connp->conn_netstack);
5759 5770 } else {
5760 5771 goto error2;
5761 5772 }
5762 5773
5763 5774 ASSERT(DB_TYPE(mp) == M_DATA);
5764 5775
5765 5776 ipvers = IPH_HDR_VERSION(mp->b_rptr);
5766 5777 ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION);
5767 5778 ASSERT(OK_32PTR(mp->b_rptr));
5768 5779 if (ipvers == IPV4_VERSION) {
5769 5780 ipha = (ipha_t *)mp->b_rptr;
5770 5781 ip_hdr_len = IPH_HDR_LENGTH(ipha);
5771 5782 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5772 5783 } else {
5773 5784 ip6h = (ip6_t *)mp->b_rptr;
5774 5785 ip_hdr_len = ip_hdr_length_v6(mp, ip6h);
5775 5786 tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len];
5776 5787 }
5777 5788
5778 5789 if (tcp->tcp_family == AF_INET) {
5779 5790 ASSERT(ipvers == IPV4_VERSION);
5780 5791 err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp);
5781 5792 } else {
5782 5793 err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp);
5783 5794 }
5784 5795
5785 5796 if (err)
5786 5797 goto error3;
5787 5798
5788 5799 eager = econnp->conn_tcp;
5789 5800
5790 5801 /* Inherit various TCP parameters from the listener */
5791 5802 eager->tcp_naglim = tcp->tcp_naglim;
5792 5803 eager->tcp_first_timer_threshold =
5793 5804 tcp->tcp_first_timer_threshold;
5794 5805 eager->tcp_second_timer_threshold =
5795 5806 tcp->tcp_second_timer_threshold;
5796 5807
5797 5808 eager->tcp_first_ctimer_threshold =
5798 5809 tcp->tcp_first_ctimer_threshold;
5799 5810 eager->tcp_second_ctimer_threshold =
5800 5811 tcp->tcp_second_ctimer_threshold;
5801 5812
5802 5813 /*
5803 5814 * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics.
5804 5815 * If it does not, the eager's receive window will be set to the
5805 5816 * listener's receive window later in this function.
5806 5817 */
5807 5818 eager->tcp_rwnd = 0;
5808 5819
5809 5820 /*
5810 5821 * Inherit listener's tcp_init_cwnd. Need to do this before
5811 5822 * calling tcp_process_options() where tcp_mss_set() is called
5812 5823 * to set the initial cwnd.
5813 5824 */
5814 5825 eager->tcp_init_cwnd = tcp->tcp_init_cwnd;
5815 5826
5816 5827 /*
5817 5828 * Zones: tcp_adapt_ire() and tcp_send_data() both need the
5818 5829 * zone id before the accept is completed in tcp_wput_accept().
5819 5830 */
5820 5831 econnp->conn_zoneid = connp->conn_zoneid;
5821 5832 econnp->conn_allzones = connp->conn_allzones;
5822 5833
5823 5834 /* Copy nexthop information from listener to eager */
5824 5835 if (connp->conn_nexthop_set) {
5825 5836 econnp->conn_nexthop_set = connp->conn_nexthop_set;
5826 5837 econnp->conn_nexthop_v4 = connp->conn_nexthop_v4;
5827 5838 }
5828 5839
5829 5840 /*
5830 5841 * TSOL: tsol_input_proc() needs the eager's cred before the
5831 5842 * eager is accepted
5832 5843 */
5833 5844 econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred;
5834 5845 crhold(credp);
5835 5846
5836 5847 /*
5837 5848 * If the caller has the process-wide flag set, then default to MAC
5838 5849 * exempt mode. This allows read-down to unlabeled hosts.
5839 5850 */
5840 5851 if (getpflags(NET_MAC_AWARE, credp) != 0)
5841 5852 econnp->conn_mac_exempt = B_TRUE;
5842 5853
5843 5854 if (is_system_labeled()) {
5844 5855 cred_t *cr;
5845 5856
5846 5857 if (connp->conn_mlp_type != mlptSingle) {
5847 5858 cr = econnp->conn_peercred = DB_CRED(mp);
5848 5859 if (cr != NULL)
5849 5860 crhold(cr);
5850 5861 else
5851 5862 cr = econnp->conn_cred;
5852 5863 DTRACE_PROBE2(mlp_syn_accept, conn_t *,
5853 5864 econnp, cred_t *, cr)
5854 5865 } else {
5855 5866 cr = econnp->conn_cred;
5856 5867 DTRACE_PROBE2(syn_accept, conn_t *,
5857 5868 econnp, cred_t *, cr)
5858 5869 }
5859 5870
5860 5871 if (!tcp_update_label(eager, cr)) {
5861 5872 DTRACE_PROBE3(
5862 5873 tx__ip__log__error__connrequest__tcp,
5863 5874 char *, "eager connp(1) label on SYN mp(2) failed",
5864 5875 conn_t *, econnp, mblk_t *, mp);
5865 5876 goto error3;
5866 5877 }
5867 5878 }
5868 5879
5869 5880 eager->tcp_hard_binding = B_TRUE;
5870 5881
|
↓ open down ↓ |
1221 lines elided |
↑ open up ↑ |
5871 5882 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
5872 5883 TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
5873 5884
5874 5885 CL_INET_CONNECT(eager);
5875 5886
5876 5887 /*
5877 5888 * No need to check for multicast destination since ip will only pass
5878 5889 * up multicasts to those that have expressed interest
5879 5890 * TODO: what about rejecting broadcasts?
5880 5891 * Also check that source is not a multicast or broadcast address.
5892 + *
5893 + * DTrace tcp:::state-change is probed a little further down,
5894 + * where tcp_state is set for the second time.
5881 5895 */
5882 5896 eager->tcp_state = TCPS_SYN_RCVD;
5883 5897
5884 5898
5885 5899 /*
5886 5900 * There should be no ire in the mp as we are being called after
5887 5901 * receiving the SYN.
5888 5902 */
5889 5903 ASSERT(tcp_ire_mp(mp) == NULL);
5890 5904
5891 5905 /*
5892 5906 * Adapt our mss, ttl, ... according to information provided in IRE.
5893 5907 */
5894 5908
5895 5909 if (tcp_adapt_ire(eager, NULL) == 0) {
5896 5910 /* Undo the bind_hash_insert */
5897 5911 tcp_bind_hash_remove(eager);
5898 5912 goto error3;
5899 5913 }
5900 5914
5915 + /*
5916 + * DTrace the first SYN as a tcp:::receive. This is placed after
5917 + * tcp_adapt_ire() so that tcp->tcp_loopback has been set.
5918 + */
5919 + DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
5920 + mp->b_rptr, tcp_t *, tcp, tcph_t *, tcph);
5921 +
5901 5922 /* Process all TCP options. */
5902 5923 tcp_process_options(eager, tcph);
5903 5924
5904 5925 /* Is the other end ECN capable? */
5905 5926 if (tcps->tcps_ecn_permitted >= 1 &&
5906 5927 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
5907 5928 eager->tcp_ecn_ok = B_TRUE;
5908 5929 }
5909 5930
5910 5931 /*
5911 5932 * listener->tcp_rq->q_hiwat should be the default window size or a
5912 5933 * window size changed via SO_RCVBUF option. First round up the
5913 5934 * eager's tcp_rwnd to the nearest MSS. Then find out the window
5914 5935 * scale option value if needed. Call tcp_rwnd_set() to finish the
5915 5936 * setting.
5916 5937 *
5917 5938 * Note if there is a rpipe metric associated with the remote host,
5918 5939 * we should not inherit receive window size from listener.
5919 5940 */
5920 5941 eager->tcp_rwnd = MSS_ROUNDUP(
5921 5942 (eager->tcp_rwnd == 0 ? tcp->tcp_rq->q_hiwat :
5922 5943 eager->tcp_rwnd), eager->tcp_mss);
5923 5944 if (eager->tcp_snd_ws_ok)
5924 5945 tcp_set_ws_value(eager);
5925 5946 /*
5926 5947 * Note that this is the only place tcp_rwnd_set() is called for
5927 5948 * accepting a connection. We need to call it here instead of
5928 5949 * after the 3-way handshake because we need to tell the other
5929 5950 * side our rwnd in the SYN-ACK segment.
5930 5951 */
5931 5952 (void) tcp_rwnd_set(eager, eager->tcp_rwnd);
5932 5953
5933 5954 /*
5934 5955 * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ
5935 5956 * via soaccept()->soinheritoptions() which essentially applies
5936 5957 * all the listener options to the new STREAM. The options that we
5937 5958 * need to take care of are:
5938 5959 * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST,
5939 5960 * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER,
5940 5961 * SO_SNDBUF, SO_RCVBUF.
5941 5962 *
5942 5963 * SO_RCVBUF: tcp_rwnd_set() above takes care of it.
5943 5964 * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When
5944 5965 * tcp_maxpsz_set() gets called later from
5945 5966 * tcp_accept_finish(), the option takes effect.
5946 5967 *
5947 5968 */
5948 5969 /* Set the TCP options */
5949 5970 eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater;
5950 5971 eager->tcp_dgram_errind = tcp->tcp_dgram_errind;
5951 5972 eager->tcp_oobinline = tcp->tcp_oobinline;
5952 5973 eager->tcp_reuseaddr = tcp->tcp_reuseaddr;
5953 5974 eager->tcp_broadcast = tcp->tcp_broadcast;
5954 5975 eager->tcp_useloopback = tcp->tcp_useloopback;
5955 5976 eager->tcp_dontroute = tcp->tcp_dontroute;
5956 5977 eager->tcp_linger = tcp->tcp_linger;
5957 5978 eager->tcp_lingertime = tcp->tcp_lingertime;
5958 5979 if (tcp->tcp_ka_enabled)
5959 5980 eager->tcp_ka_enabled = 1;
5960 5981
5961 5982 /* Set the IP options */
5962 5983 econnp->conn_broadcast = connp->conn_broadcast;
5963 5984 econnp->conn_loopback = connp->conn_loopback;
5964 5985 econnp->conn_dontroute = connp->conn_dontroute;
5965 5986 econnp->conn_reuseaddr = connp->conn_reuseaddr;
5966 5987
5967 5988 /* Put a ref on the listener for the eager. */
5968 5989 CONN_INC_REF(connp);
5969 5990 mutex_enter(&tcp->tcp_eager_lock);
5970 5991 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager;
5971 5992 eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0;
5972 5993 tcp->tcp_eager_next_q0 = eager;
5973 5994 eager->tcp_eager_prev_q0 = tcp;
5974 5995
5975 5996 /* Set tcp_listener before adding it to tcp_conn_fanout */
5976 5997 eager->tcp_listener = tcp;
5977 5998 eager->tcp_saved_listener = tcp;
5978 5999
5979 6000 /*
5980 6001 * Tag this detached tcp vector for later retrieval
5981 6002 * by our listener client in tcp_accept().
5982 6003 */
5983 6004 eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum;
5984 6005 tcp->tcp_conn_req_cnt_q0++;
5985 6006 if (++tcp->tcp_conn_req_seqnum == -1) {
5986 6007 /*
5987 6008 * -1 is "special" and defined in TPI as something
5988 6009 * that should never be used in T_CONN_IND
5989 6010 */
5990 6011 ++tcp->tcp_conn_req_seqnum;
5991 6012 }
5992 6013 mutex_exit(&tcp->tcp_eager_lock);
5993 6014
5994 6015 if (tcp->tcp_syn_defense) {
5995 6016 /* Don't drop the SYN that comes from a good IP source */
5996 6017 ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache);
5997 6018 if (addr_cache != NULL && eager->tcp_remote ==
5998 6019 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
5999 6020 eager->tcp_dontdrop = B_TRUE;
6000 6021 }
6001 6022 }
6002 6023
6003 6024 /*
6004 6025 * We need to insert the eager in its own perimeter but as soon
6005 6026 * as we do that, we expose the eager to the classifier and
6006 6027 * should not touch any field outside the eager's perimeter.
|
↓ open down ↓ |
96 lines elided |
↑ open up ↑ |
6007 6028 * So do all the work necessary before inserting the eager
6008 6029 * in its own perimeter. Be optimistic that ipcl_conn_insert()
6009 6030 * will succeed but undo everything if it fails.
6010 6031 */
6011 6032 seg_seq = ABE32_TO_U32(tcph->th_seq);
6012 6033 eager->tcp_irs = seg_seq;
6013 6034 eager->tcp_rack = seg_seq;
6014 6035 eager->tcp_rnxt = seg_seq + 1;
6015 6036 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
6016 6037 BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
6038 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, eager,
6039 + int32_t, TCPS_SYN_RCVD);
6017 6040 eager->tcp_state = TCPS_SYN_RCVD;
6018 6041 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
6019 6042 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
6020 6043 if (mp1 == NULL) {
6021 6044 /*
6022 6045 * Increment the ref count as we are going to
6023 6046 * enqueue an mp in the squeue.
6024 6047 */
6025 6048 CONN_INC_REF(econnp);
6026 6049 goto error;
6027 6050 }
6028 6051 DB_CPID(mp1) = tcp->tcp_cpid;
6029 6052 eager->tcp_cpid = tcp->tcp_cpid;
6030 6053 eager->tcp_open_time = lbolt64;
6031 6054
6032 6055 /*
6033 6056 * We need to start the rto timer. In normal case, we start
6034 6057 * the timer after sending the packet on the wire (or at
6035 6058 * least believing that packet was sent by waiting for
6036 6059 * CALL_IP_WPUT() to return). Since this is the first packet
6037 6060 * being sent on the wire for the eager, our initial tcp_rto
6038 6061 * is at least tcp_rexmit_interval_min which is a fairly
6039 6062 * large value to allow the algorithm to adjust slowly to large
6040 6063 * fluctuations of RTT during first few transmissions.
6041 6064 *
6042 6065 * Starting the timer first and then sending the packet in this
6043 6066 * case shouldn't make much difference since tcp_rexmit_interval_min
6044 6067 * is of the order of several 100ms and starting the timer
6045 6068 * first and then sending the packet will result in a difference
6046 6069 * of a few microseconds.
6047 6070 *
6048 6071 * Without this optimization, we are forced to hold the fanout
6049 6072 * lock across the ipcl_bind_insert() and sending the packet
6050 6073 * so that we don't race against an incoming packet (maybe RST)
6051 6074 * for this eager.
6052 6075 *
6053 6076 * It is necessary to acquire an extra reference on the eager
6054 6077 * at this point and hold it until after tcp_send_data() to
6055 6078 * ensure against an eager close race.
6056 6079 */
6057 6080
6058 6081 CONN_INC_REF(eager->tcp_connp);
6059 6082
6060 6083 TCP_RECORD_TRACE(eager, mp1, TCP_TRACE_SEND_PKT);
6061 6084 TCP_TIMER_RESTART(eager, eager->tcp_rto);
6062 6085
6063 6086
6064 6087 /*
6065 6088 * Insert the eager in its own perimeter now. We are ready to deal
6066 6089 * with any packets on eager.
6067 6090 */
6068 6091 if (eager->tcp_ipversion == IPV4_VERSION) {
6069 6092 if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) {
6070 6093 goto error;
6071 6094 }
6072 6095 } else {
6073 6096 if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) {
6074 6097 goto error;
6075 6098 }
6076 6099 }
6077 6100
6078 6101 /* mark conn as fully-bound */
6079 6102 econnp->conn_fully_bound = B_TRUE;
6080 6103
6081 6104 /* Send the SYN-ACK */
6082 6105 tcp_send_data(eager, eager->tcp_wq, mp1);
6083 6106 CONN_DEC_REF(eager->tcp_connp);
6084 6107 freemsg(mp);
6085 6108
6086 6109 return;
6087 6110 error:
6088 6111 freemsg(mp1);
6089 6112 eager->tcp_closemp_used = B_TRUE;
6090 6113 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
6091 6114 squeue_fill(econnp->conn_sqp, &eager->tcp_closemp, tcp_eager_kill,
6092 6115 econnp, SQTAG_TCP_CONN_REQ_2);
6093 6116
6094 6117 /*
6095 6118 * If a connection already exists, send the mp to that connection so
6096 6119 * that it can be appropriately dealt with.
6097 6120 */
6098 6121 ipst = tcps->tcps_netstack->netstack_ip;
6099 6122
6100 6123 if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) {
6101 6124 if (!IPCL_IS_CONNECTED(econnp)) {
6102 6125 /*
6103 6126 * Something bad happened. ipcl_conn_insert()
6104 6127 * failed because a connection already existed
6105 6128 * in connected hash but we can't find it
6106 6129 * anymore (someone blew it away). Just
6107 6130 * free this message and hopefully remote
6108 6131 * will retransmit at which time the SYN can be
6109 6132 * treated as a new connection or dealt with by
6110 6133 * a TH_RST if a connection already exists.
6111 6134 */
6112 6135 CONN_DEC_REF(econnp);
6113 6136 freemsg(mp);
6114 6137 } else {
6115 6138 squeue_fill(econnp->conn_sqp, mp, tcp_input,
|
↓ open down ↓ |
89 lines elided |
↑ open up ↑ |
6116 6139 econnp, SQTAG_TCP_CONN_REQ_1);
6117 6140 }
6118 6141 } else {
6119 6142 /* Nobody wants this packet */
6120 6143 freemsg(mp);
6121 6144 }
6122 6145 return;
6123 6146 error3:
6124 6147 CONN_DEC_REF(econnp);
6125 6148 error2:
6149 + /*
6150 + * DTrace this tcp:::receive event, as we skipped the previous receive
6151 + * probe. For DTrace only, we find the IP header length so that the
6152 + * TCP header can be found.
6153 + */
6154 + ipvers = IPH_HDR_VERSION(mp->b_rptr);
6155 + if (OK_32PTR(mp->b_rptr) &&
6156 + (ipvers == IPV4_VERSION || ipvers == IPV6_VERSION)) {
6157 + if (ipvers == IPV4_VERSION)
6158 + ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)mp->b_rptr);
6159 + else
6160 + ip_hdr_len = ip_hdr_length_v6(mp, (ip6_t *)mp->b_rptr);
6161 + DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL,
6162 + void_ip_t *, mp->b_rptr, tcp_t *, NULL, tcph_t *,
6163 + &mp->b_rptr[ip_hdr_len]);
6164 + }
6165 +
6126 6166 freemsg(mp);
6127 6167 }
6128 6168
6129 6169 /*
6130 6170 * In an ideal case of vertical partition in NUMA architecture, it's
6131 6171 * beneficial to have the listener and all the incoming connections
6132 6172 * tied to the same squeue. The other constraint is that incoming
6133 6173 * connections should be tied to the squeue attached to interrupted
6134 6174 * CPU for obvious locality reason so this leaves the listener to
6135 6175 * be tied to the same squeue. Our only problem is that when listener
6136 6176 * is binding, the CPU that will get interrupted by the NIC whose
6137 6177 * IP address the listener is binding to is not even known. So
6138 6178 * the code below allows us to change that binding at the time the
6139 6179 * CPU is interrupted by virtue of incoming connection's squeue.
6140 6180 *
6141 6181 * This is useful only in case of a listener bound to a specific IP
6142 6182 * address. Other kinds of listeners get bound the
6143 6183 * very first time and there is no attempt to rebind them.
6144 6184 */
6145 6185 void
6146 6186 tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
6147 6187 {
6148 6188 conn_t *connp = (conn_t *)arg;
6149 6189 squeue_t *sqp = (squeue_t *)arg2;
6150 6190 squeue_t *new_sqp;
6151 6191 uint32_t conn_flags;
6152 6192
6153 6193 if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) {
6154 6194 new_sqp = (squeue_t *)DB_CKSUMSTART(mp);
6155 6195 } else {
6156 6196 goto done;
6157 6197 }
6158 6198
6159 6199 if (connp->conn_fanout == NULL)
6160 6200 goto done;
6161 6201
6162 6202 if (!(connp->conn_flags & IPCL_FULLY_BOUND)) {
6163 6203 mutex_enter(&connp->conn_fanout->connf_lock);
6164 6204 mutex_enter(&connp->conn_lock);
6165 6205 /*
6166 6206 * No one from read or write side can access us now
6167 6207 * except for already queued packets on this squeue.
6168 6208 * But since we haven't changed the squeue yet, they
6169 6209 * can't execute. If they are processed after we have
6170 6210 * changed the squeue, they are sent back to the
6171 6211 * correct squeue down below.
6172 6212 * But a listener close can race with processing of
6173 6213 * incoming SYN. If incoming SYN processing changes
6174 6214 * the squeue then the listener close which is waiting
6175 6215 * to enter the squeue would operate on the wrong
6176 6216 * squeue. Hence we don't change the squeue here unless
6177 6217 * the refcount is exactly the minimum refcount. The
6178 6218 * minimum refcount of 4 is counted as - 1 each for
6179 6219 * TCP and IP, 1 for being in the classifier hash, and
6180 6220 * 1 for the mblk being processed.
6181 6221 */
6182 6222
6183 6223 if (connp->conn_ref != 4 ||
6184 6224 connp->conn_tcp->tcp_state != TCPS_LISTEN) {
6185 6225 mutex_exit(&connp->conn_lock);
6186 6226 mutex_exit(&connp->conn_fanout->connf_lock);
6187 6227 goto done;
6188 6228 }
6189 6229 if (connp->conn_sqp != new_sqp) {
6190 6230 while (connp->conn_sqp != new_sqp)
6191 6231 (void) casptr(&connp->conn_sqp, sqp, new_sqp);
6192 6232 }
6193 6233
6194 6234 do {
6195 6235 conn_flags = connp->conn_flags;
6196 6236 conn_flags |= IPCL_FULLY_BOUND;
6197 6237 (void) cas32(&connp->conn_flags, connp->conn_flags,
6198 6238 conn_flags);
6199 6239 } while (!(connp->conn_flags & IPCL_FULLY_BOUND));
6200 6240
6201 6241 mutex_exit(&connp->conn_fanout->connf_lock);
6202 6242 mutex_exit(&connp->conn_lock);
6203 6243 }
6204 6244
6205 6245 done:
6206 6246 if (connp->conn_sqp != sqp) {
6207 6247 CONN_INC_REF(connp);
6208 6248 squeue_fill(connp->conn_sqp, mp,
6209 6249 connp->conn_recv, connp, SQTAG_TCP_CONN_REQ_UNBOUND);
6210 6250 } else {
6211 6251 tcp_conn_request(connp, mp, sqp);
6212 6252 }
6213 6253 }
6214 6254
6215 6255 /*
6216 6256 * Successful connect request processing begins when our client passes
6217 6257 * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes
6218 6258 * our T_OK_ACK reply message upstream. The control flow looks like this:
6219 6259 * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_connect() -> IP
6220 6260 * upstream <- tcp_rput() <- IP
6221 6261 * After various error checks are completed, tcp_connect() lays
6222 6262 * the target address and port into the composite header template,
6223 6263 * preallocates the T_OK_ACK reply message, constructs a full 12 byte bind
6224 6264 * request followed by an IRE request, and passes the three mblk message
6225 6265 * down to IP looking like this:
6226 6266 * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client
6227 6267 * Processing continues in tcp_rput() when we receive the following message:
6228 6268 * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client
6229 6269 * After consuming the first two mblks, tcp_rput() calls tcp_timer(),
6230 6270 * to fire off the connection request, and then passes the T_OK_ACK mblk
6231 6271 * upstream that we filled in below. There are, of course, numerous
6232 6272 * error conditions along the way which truncate the processing described
6233 6273 * above.
6234 6274 */
6235 6275 static void
6236 6276 tcp_connect(tcp_t *tcp, mblk_t *mp)
6237 6277 {
6238 6278 sin_t *sin;
6239 6279 sin6_t *sin6;
6240 6280 queue_t *q = tcp->tcp_wq;
6241 6281 struct T_conn_req *tcr;
6242 6282 ipaddr_t *dstaddrp;
6243 6283 in_port_t dstport;
6244 6284 uint_t srcid;
6245 6285
6246 6286 tcr = (struct T_conn_req *)mp->b_rptr;
6247 6287
6248 6288 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
6249 6289 if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
6250 6290 tcp_err_ack(tcp, mp, TPROTO, 0);
6251 6291 return;
6252 6292 }
6253 6293
6254 6294 /*
6255 6295 * Determine packet type based on type of address passed in;
6256 6296 * the request should contain an IPv4 or IPv6 address.
6257 6297 * Make sure that the address family matches the
6258 6298 * family of the address passed down.
6259 6299 */
6260 6300 switch (tcr->DEST_length) {
6261 6301 default:
6262 6302 tcp_err_ack(tcp, mp, TBADADDR, 0);
6263 6303 return;
6264 6304
6265 6305 case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
6266 6306 /*
6267 6307 * XXX: The check for valid DEST_length was not there
6268 6308 * in earlier releases and some buggy
6269 6309 * TLI apps (e.g. Sybase) got away with not feeding
6270 6310 * in sin_zero part of address.
6271 6311 * We allow that bug to keep those buggy apps humming.
6272 6312 * Test suites require the check on DEST_length.
6273 6313 * We construct a new mblk with valid DEST_length and
6274 6314 * free the original so the rest of the code does
6275 6315 * not have to keep track of this special shorter
6276 6316 * length address case.
6277 6317 */
6278 6318 mblk_t *nmp;
6279 6319 struct T_conn_req *ntcr;
6280 6320 sin_t *nsin;
6281 6321
6282 6322 nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
6283 6323 tcr->OPT_length, BPRI_HI);
6284 6324 if (nmp == NULL) {
6285 6325 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
6286 6326 return;
6287 6327 }
6288 6328 ntcr = (struct T_conn_req *)nmp->b_rptr;
6289 6329 bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
6290 6330 ntcr->PRIM_type = T_CONN_REQ;
6291 6331 ntcr->DEST_length = sizeof (sin_t);
6292 6332 ntcr->DEST_offset = sizeof (struct T_conn_req);
6293 6333
6294 6334 nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
6295 6335 *nsin = sin_null;
6296 6336 /* Get pointer to shorter address to copy from original mp */
6297 6337 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6298 6338 tcr->DEST_length); /* extract DEST_length worth of sin_t */
6299 6339 if (sin == NULL || !OK_32PTR((char *)sin)) {
6300 6340 freemsg(nmp);
6301 6341 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6302 6342 return;
6303 6343 }
6304 6344 nsin->sin_family = sin->sin_family;
6305 6345 nsin->sin_port = sin->sin_port;
6306 6346 nsin->sin_addr = sin->sin_addr;
6307 6347 /* Note:nsin->sin_zero zero-fill with sin_null assign above */
6308 6348 nmp->b_wptr = (uchar_t *)&nsin[1];
6309 6349 if (tcr->OPT_length != 0) {
6310 6350 ntcr->OPT_length = tcr->OPT_length;
6311 6351 ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
6312 6352 bcopy((uchar_t *)tcr + tcr->OPT_offset,
6313 6353 (uchar_t *)ntcr + ntcr->OPT_offset,
6314 6354 tcr->OPT_length);
6315 6355 nmp->b_wptr += tcr->OPT_length;
6316 6356 }
6317 6357 freemsg(mp); /* original mp freed */
6318 6358 mp = nmp; /* re-initialize original variables */
6319 6359 tcr = ntcr;
6320 6360 }
6321 6361 /* FALLTHRU */
6322 6362
6323 6363 case sizeof (sin_t):
6324 6364 sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
6325 6365 sizeof (sin_t));
6326 6366 if (sin == NULL || !OK_32PTR((char *)sin)) {
6327 6367 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6328 6368 return;
6329 6369 }
6330 6370 if (tcp->tcp_family != AF_INET ||
6331 6371 sin->sin_family != AF_INET) {
6332 6372 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6333 6373 return;
6334 6374 }
6335 6375 if (sin->sin_port == 0) {
6336 6376 tcp_err_ack(tcp, mp, TBADADDR, 0);
6337 6377 return;
6338 6378 }
6339 6379 if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) {
6340 6380 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6341 6381 return;
6342 6382 }
6343 6383
6344 6384 break;
6345 6385
6346 6386 case sizeof (sin6_t):
6347 6387 sin6 = (sin6_t *)mi_offset_param(mp, tcr->DEST_offset,
6348 6388 sizeof (sin6_t));
6349 6389 if (sin6 == NULL || !OK_32PTR((char *)sin6)) {
6350 6390 tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
6351 6391 return;
6352 6392 }
6353 6393 if (tcp->tcp_family != AF_INET6 ||
6354 6394 sin6->sin6_family != AF_INET6) {
6355 6395 tcp_err_ack(tcp, mp, TSYSERR, EAFNOSUPPORT);
6356 6396 return;
6357 6397 }
6358 6398 if (sin6->sin6_port == 0) {
6359 6399 tcp_err_ack(tcp, mp, TBADADDR, 0);
6360 6400 return;
6361 6401 }
6362 6402 break;
6363 6403 }
6364 6404 /*
6365 6405 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
6366 6406 * should key on their sequence number and cut them loose.
6367 6407 */
6368 6408
6369 6409 /*
6370 6410 * If options passed in, feed it for verification and handling
6371 6411 */
6372 6412 if (tcr->OPT_length != 0) {
6373 6413 mblk_t *ok_mp;
6374 6414 mblk_t *discon_mp;
6375 6415 mblk_t *conn_opts_mp;
6376 6416 int t_error, sys_error, do_disconnect;
6377 6417
6378 6418 conn_opts_mp = NULL;
6379 6419
6380 6420 if (tcp_conprim_opt_process(tcp, mp,
6381 6421 &do_disconnect, &t_error, &sys_error) < 0) {
6382 6422 if (do_disconnect) {
6383 6423 ASSERT(t_error == 0 && sys_error == 0);
6384 6424 discon_mp = mi_tpi_discon_ind(NULL,
6385 6425 ECONNREFUSED, 0);
6386 6426 if (!discon_mp) {
6387 6427 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6388 6428 TSYSERR, ENOMEM);
6389 6429 return;
6390 6430 }
6391 6431 ok_mp = mi_tpi_ok_ack_alloc(mp);
6392 6432 if (!ok_mp) {
6393 6433 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6394 6434 TSYSERR, ENOMEM);
6395 6435 return;
6396 6436 }
6397 6437 qreply(q, ok_mp);
6398 6438 qreply(q, discon_mp); /* no flush! */
6399 6439 } else {
6400 6440 ASSERT(t_error != 0);
6401 6441 tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
6402 6442 sys_error);
6403 6443 }
6404 6444 return;
6405 6445 }
6406 6446 /*
6407 6447 * Success in setting options, the mp option buffer represented
6408 6448 * by OPT_length/offset has been potentially modified and
6409 6449 * contains results of option processing. We copy it in
6410 6450 * another mp to save it for potentially influencing returning
6411 6451 * it in T_CONN_CONN.
6412 6452 */
6413 6453 if (tcr->OPT_length != 0) { /* there are resulting options */
6414 6454 conn_opts_mp = copyb(mp);
6415 6455 if (!conn_opts_mp) {
6416 6456 tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
6417 6457 TSYSERR, ENOMEM);
6418 6458 return;
6419 6459 }
6420 6460 ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
6421 6461 tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
6422 6462 /*
6423 6463 * Note:
6424 6464 * These resulting option negotiation can include any
6425 6465 * end-to-end negotiation options but there is no such
6426 6466 * thing (yet?) in our TCP/IP.
6427 6467 */
6428 6468 }
6429 6469 }
6430 6470
6431 6471 /*
6432 6472 * If we're connecting to an IPv4-mapped IPv6 address, we need to
6433 6473 * make sure that the template IP header in the tcp structure is an
6434 6474 * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We
6435 6475 * need to do this before we call tcp_bindi() so that the port lookup
6436 6476 * code will look for ports in the correct port space (IPv4 and
6437 6477 * IPv6 have separate port spaces).
6438 6478 */
6439 6479 if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION &&
6440 6480 IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6441 6481 int err = 0;
6442 6482
6443 6483 err = tcp_header_init_ipv4(tcp);
6444 6484 if (err != 0) {
6445 6485 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6446 6486 goto connect_failed;
6447 6487 }
6448 6488 if (tcp->tcp_lport != 0)
6449 6489 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
6450 6490 }
6451 6491
6452 6492 if (tcp->tcp_issocket) {
6453 6493 /*
6454 6494 * TCP is _D_SODIRECT and sockfs is directly above so save
6455 6495 * the shared sonode sodirect_t pointer (if any) to enable
6456 6496 * TCP sodirect.
6457 6497 */
6458 6498 tcp->tcp_sodirect = SOD_QTOSODP(tcp->tcp_rq);
6459 6499 }
6460 6500
6461 6501 switch (tcp->tcp_state) {
6462 6502 case TCPS_IDLE:
6463 6503 /*
6464 6504 * We support quick connect, refer to comments in
6465 6505 * tcp_connect_*()
6466 6506 */
6467 6507 /* FALLTHRU */
6468 6508 case TCPS_BOUND:
6469 6509 case TCPS_LISTEN:
6470 6510 if (tcp->tcp_family == AF_INET6) {
6471 6511 if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) {
6472 6512 tcp_connect_ipv6(tcp, mp,
6473 6513 &sin6->sin6_addr,
6474 6514 sin6->sin6_port, sin6->sin6_flowinfo,
6475 6515 sin6->__sin6_src_id, sin6->sin6_scope_id);
6476 6516 return;
6477 6517 }
6478 6518 /*
6479 6519 * Destination address is mapped IPv6 address.
6480 6520 * Source bound address should be unspecified or
6481 6521 * IPv6 mapped address as well.
6482 6522 */
6483 6523 if (!IN6_IS_ADDR_UNSPECIFIED(
6484 6524 &tcp->tcp_bound_source_v6) &&
6485 6525 !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) {
6486 6526 mp = mi_tpi_err_ack_alloc(mp, TSYSERR,
6487 6527 EADDRNOTAVAIL);
6488 6528 break;
6489 6529 }
6490 6530 dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr));
6491 6531 dstport = sin6->sin6_port;
6492 6532 srcid = sin6->__sin6_src_id;
6493 6533 } else {
6494 6534 dstaddrp = &sin->sin_addr.s_addr;
6495 6535 dstport = sin->sin_port;
6496 6536 srcid = 0;
6497 6537 }
6498 6538
6499 6539 tcp_connect_ipv4(tcp, mp, dstaddrp, dstport, srcid);
6500 6540 return;
6501 6541 default:
6502 6542 mp = mi_tpi_err_ack_alloc(mp, TOUTSTATE, 0);
6503 6543 break;
6504 6544 }
6505 6545 /*
6506 6546 * Note: Code below is the "failure" case
6507 6547 */
6508 6548 /* return error ack and blow away saved option results if any */
6509 6549 connect_failed:
6510 6550 if (mp != NULL)
6511 6551 putnext(tcp->tcp_rq, mp);
6512 6552 else {
6513 6553 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6514 6554 TSYSERR, ENOMEM);
6515 6555 }
6516 6556 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6517 6557 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6518 6558 }
6519 6559
6520 6560 /*
6521 6561 * Handle connect to IPv4 destinations, including connections for AF_INET6
6522 6562 * sockets connecting to IPv4 mapped IPv6 destinations.
6523 6563 */
6524 6564 static void
6525 6565 tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport,
6526 6566 uint_t srcid)
6527 6567 {
6528 6568 tcph_t *tcph;
6529 6569 mblk_t *mp1;
6530 6570 ipaddr_t dstaddr = *dstaddrp;
6531 6571 int32_t oldstate;
6532 6572 uint16_t lport;
6533 6573 tcp_stack_t *tcps = tcp->tcp_tcps;
6534 6574
6535 6575 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
6536 6576
6537 6577 /* Check for attempt to connect to INADDR_ANY */
6538 6578 if (dstaddr == INADDR_ANY) {
6539 6579 /*
6540 6580 * SunOS 4.x and 4.3 BSD allow an application
6541 6581 * to connect a TCP socket to INADDR_ANY.
6542 6582 * When they do this, the kernel picks the
6543 6583 * address of one interface and uses it
6544 6584 * instead. The kernel usually ends up
6545 6585 * picking the address of the loopback
6546 6586 * interface. This is an undocumented feature.
6547 6587 * However, we provide the same thing here
6548 6588 * in order to have source and binary
6549 6589 * compatibility with SunOS 4.x.
6550 6590 * Update the T_CONN_REQ (sin/sin6) since it is used to
6551 6591 * generate the T_CONN_CON.
6552 6592 */
6553 6593 dstaddr = htonl(INADDR_LOOPBACK);
6554 6594 *dstaddrp = dstaddr;
6555 6595 }
6556 6596
6557 6597 /* Handle __sin6_src_id if socket not bound to an IP address */
6558 6598 if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) {
6559 6599 ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6,
6560 6600 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
6561 6601 IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6,
6562 6602 tcp->tcp_ipha->ipha_src);
6563 6603 }
6564 6604
6565 6605 /*
6566 6606 * Don't let an endpoint connect to itself. Note that
6567 6607 * the test here does not catch the case where the
6568 6608 * source IP addr was left unspecified by the user. In
6569 6609 * this case, the source addr is set in tcp_adapt_ire()
6570 6610 * using the reply to the T_BIND message that we send
6571 6611 * down to IP here and the check is repeated in tcp_rput_other.
6572 6612 */
6573 6613 if (dstaddr == tcp->tcp_ipha->ipha_src &&
6574 6614 dstport == tcp->tcp_lport) {
6575 6615 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6576 6616 goto failed;
6577 6617 }
6578 6618
6579 6619 tcp->tcp_ipha->ipha_dst = dstaddr;
6580 6620 IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6);
6581 6621
6582 6622 /*
6583 6623 * Massage a source route if any putting the first hop
6584 6624 * in iph_dst. Compute a starting value for the checksum which
6585 6625 * takes into account that the original iph_dst should be
6586 6626 * included in the checksum but that ip will include the
6587 6627 * first hop in the source route in the tcp checksum.
6588 6628 */
6589 6629 tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack);
6590 6630 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6591 6631 tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) +
6592 6632 (tcp->tcp_ipha->ipha_dst & 0xffff));
6593 6633 if ((int)tcp->tcp_sum < 0)
6594 6634 tcp->tcp_sum--;
6595 6635 tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16);
6596 6636 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6597 6637 (tcp->tcp_sum >> 16));
6598 6638 tcph = tcp->tcp_tcph;
6599 6639 *(uint16_t *)tcph->th_fport = dstport;
6600 6640 tcp->tcp_fport = dstport;
6601 6641
6602 6642 oldstate = tcp->tcp_state;
6603 6643 /*
6604 6644 * At this point the remote destination address and remote port fields
6605 6645 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6606 6646 * have to see which state tcp was in so we can take apropriate action.
6607 6647 */
6608 6648 if (oldstate == TCPS_IDLE) {
6609 6649 /*
6610 6650 * We support a quick connect capability here, allowing
6611 6651 * clients to transition directly from IDLE to SYN_SENT
6612 6652 * tcp_bindi will pick an unused port, insert the connection
6613 6653 * in the bind hash and transition to BOUND state.
|
↓ open down ↓ |
478 lines elided |
↑ open up ↑ |
6614 6654 */
6615 6655 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6616 6656 tcp, B_TRUE);
6617 6657 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6618 6658 B_FALSE, B_FALSE);
6619 6659 if (lport == 0) {
6620 6660 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6621 6661 goto failed;
6622 6662 }
6623 6663 }
6664 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6665 + int32_t, TCPS_SYN_SENT);
6624 6666 tcp->tcp_state = TCPS_SYN_SENT;
6625 6667
6626 6668 /*
6627 6669 * TODO: allow data with connect requests
6628 6670 * by unlinking M_DATA trailers here and
6629 6671 * linking them in behind the T_OK_ACK mblk.
6630 6672 * The tcp_rput() bind ack handler would then
6631 6673 * feed them to tcp_wput_data() rather than call
6632 6674 * tcp_timer().
6633 6675 */
6634 6676 mp = mi_tpi_ok_ack_alloc(mp);
6635 6677 if (!mp) {
6678 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6679 + tcp_t *, tcp, int32_t, oldstate);
6636 6680 tcp->tcp_state = oldstate;
6637 6681 goto failed;
6638 6682 }
6639 6683 if (tcp->tcp_family == AF_INET) {
6640 6684 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6641 6685 sizeof (ipa_conn_t));
6642 6686 } else {
6643 6687 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6644 6688 sizeof (ipa6_conn_t));
6645 6689 }
6646 6690 if (mp1) {
6647 6691 /*
6648 6692 * We need to make sure that the conn_recv is set to a non-null
6649 6693 * value before we insert the conn_t into the classifier table.
6650 6694 * This is to avoid a race with an incoming packet which does
6651 6695 * an ipcl_classify().
6652 6696 */
6653 6697 tcp->tcp_connp->conn_recv = tcp_input;
6654 6698
6655 6699 /* Hang onto the T_OK_ACK for later. */
6656 6700 linkb(mp1, mp);
6657 6701 mblk_setcred(mp1, tcp->tcp_cred);
6658 6702 if (tcp->tcp_family == AF_INET)
6659 6703 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6660 6704 else {
6661 6705 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6662 6706 &tcp->tcp_sticky_ipp);
6663 6707 }
6664 6708 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6665 6709 tcp->tcp_active_open = 1;
|
↓ open down ↓ |
20 lines elided |
↑ open up ↑ |
6666 6710 /*
6667 6711 * If the bind cannot complete immediately
6668 6712 * IP will arrange to call tcp_rput_other
6669 6713 * when the bind completes.
6670 6714 */
6671 6715 if (mp1 != NULL)
6672 6716 tcp_rput_other(tcp, mp1);
6673 6717 return;
6674 6718 }
6675 6719 /* Error case */
6720 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6721 + int32_t, oldstate);
6676 6722 tcp->tcp_state = oldstate;
6677 6723 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6678 6724
6679 6725 failed:
6680 6726 /* return error ack and blow away saved option results if any */
6681 6727 if (mp != NULL)
6682 6728 putnext(tcp->tcp_rq, mp);
6683 6729 else {
6684 6730 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6685 6731 TSYSERR, ENOMEM);
6686 6732 }
6687 6733 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6688 6734 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6689 6735
6690 6736 }
6691 6737
6692 6738 /*
6693 6739 * Handle connect to IPv6 destinations.
6694 6740 */
6695 6741 static void
6696 6742 tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
6697 6743 in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id)
6698 6744 {
6699 6745 tcph_t *tcph;
6700 6746 mblk_t *mp1;
6701 6747 ip6_rthdr_t *rth;
6702 6748 int32_t oldstate;
6703 6749 uint16_t lport;
6704 6750 tcp_stack_t *tcps = tcp->tcp_tcps;
6705 6751
6706 6752 ASSERT(tcp->tcp_family == AF_INET6);
6707 6753
6708 6754 /*
6709 6755 * If we're here, it means that the destination address is a native
6710 6756 * IPv6 address. Return an error if tcp_ipversion is not IPv6. A
6711 6757 * reason why it might not be IPv6 is if the socket was bound to an
6712 6758 * IPv4-mapped IPv6 address.
6713 6759 */
6714 6760 if (tcp->tcp_ipversion != IPV6_VERSION) {
6715 6761 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6716 6762 goto failed;
6717 6763 }
6718 6764
6719 6765 /*
6720 6766 * Interpret a zero destination to mean loopback.
6721 6767 * Update the T_CONN_REQ (sin/sin6) since it is used to
6722 6768 * generate the T_CONN_CON.
6723 6769 */
6724 6770 if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) {
6725 6771 *dstaddrp = ipv6_loopback;
6726 6772 }
6727 6773
6728 6774 /* Handle __sin6_src_id if socket not bound to an IP address */
6729 6775 if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
6730 6776 ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src,
6731 6777 tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack);
6732 6778 tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src;
6733 6779 }
6734 6780
6735 6781 /*
6736 6782 * Take care of the scope_id now and add ip6i_t
6737 6783 * if ip6i_t is not already allocated through TCP
6738 6784 * sticky options. At this point tcp_ip6h does not
6739 6785 * have dst info, thus use dstaddrp.
6740 6786 */
6741 6787 if (scope_id != 0 &&
6742 6788 IN6_IS_ADDR_LINKSCOPE(dstaddrp)) {
6743 6789 ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp;
6744 6790 ip6i_t *ip6i;
6745 6791
6746 6792 ipp->ipp_ifindex = scope_id;
6747 6793 ip6i = (ip6i_t *)tcp->tcp_iphc;
6748 6794
6749 6795 if ((ipp->ipp_fields & IPPF_HAS_IP6I) &&
6750 6796 ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) {
6751 6797 /* Already allocated */
6752 6798 ip6i->ip6i_flags |= IP6I_IFINDEX;
6753 6799 ip6i->ip6i_ifindex = ipp->ipp_ifindex;
6754 6800 ipp->ipp_fields |= IPPF_SCOPE_ID;
6755 6801 } else {
6756 6802 int reterr;
6757 6803
6758 6804 ipp->ipp_fields |= IPPF_SCOPE_ID;
6759 6805 if (ipp->ipp_fields & IPPF_HAS_IP6I)
6760 6806 ip2dbg(("tcp_connect_v6: SCOPE_ID set\n"));
6761 6807 reterr = tcp_build_hdrs(tcp->tcp_rq, tcp);
6762 6808 if (reterr != 0)
6763 6809 goto failed;
6764 6810 ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n"));
6765 6811 }
6766 6812 }
6767 6813
6768 6814 /*
6769 6815 * Don't let an endpoint connect to itself. Note that
6770 6816 * the test here does not catch the case where the
6771 6817 * source IP addr was left unspecified by the user. In
6772 6818 * this case, the source addr is set in tcp_adapt_ire()
6773 6819 * using the reply to the T_BIND message that we send
6774 6820 * down to IP here and the check is repeated in tcp_rput_other.
6775 6821 */
6776 6822 if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) &&
6777 6823 (dstport == tcp->tcp_lport)) {
6778 6824 mp = mi_tpi_err_ack_alloc(mp, TBADADDR, 0);
6779 6825 goto failed;
6780 6826 }
6781 6827
6782 6828 tcp->tcp_ip6h->ip6_dst = *dstaddrp;
6783 6829 tcp->tcp_remote_v6 = *dstaddrp;
6784 6830 tcp->tcp_ip6h->ip6_vcf =
6785 6831 (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) |
6786 6832 (flowinfo & ~IPV6_VERS_AND_FLOW_MASK);
6787 6833
6788 6834
6789 6835 /*
6790 6836 * Massage a routing header (if present) putting the first hop
6791 6837 * in ip6_dst. Compute a starting value for the checksum which
6792 6838 * takes into account that the original ip6_dst should be
6793 6839 * included in the checksum but that ip will include the
6794 6840 * first hop in the source route in the tcp checksum.
6795 6841 */
6796 6842 rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph);
6797 6843 if (rth != NULL) {
6798 6844 tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth,
6799 6845 tcps->tcps_netstack);
6800 6846 tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) +
6801 6847 (tcp->tcp_sum >> 16));
6802 6848 } else {
6803 6849 tcp->tcp_sum = 0;
6804 6850 }
6805 6851
6806 6852 tcph = tcp->tcp_tcph;
6807 6853 *(uint16_t *)tcph->th_fport = dstport;
6808 6854 tcp->tcp_fport = dstport;
6809 6855
6810 6856 oldstate = tcp->tcp_state;
6811 6857 /*
6812 6858 * At this point the remote destination address and remote port fields
6813 6859 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6814 6860 * have to see which state tcp was in so we can take apropriate action.
6815 6861 */
6816 6862 if (oldstate == TCPS_IDLE) {
6817 6863 /*
6818 6864 * We support a quick connect capability here, allowing
6819 6865 * clients to transition directly from IDLE to SYN_SENT
6820 6866 * tcp_bindi will pick an unused port, insert the connection
6821 6867 * in the bind hash and transition to BOUND state.
|
↓ open down ↓ |
136 lines elided |
↑ open up ↑ |
6822 6868 */
6823 6869 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6824 6870 tcp, B_TRUE);
6825 6871 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6826 6872 B_FALSE, B_FALSE);
6827 6873 if (lport == 0) {
6828 6874 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6829 6875 goto failed;
6830 6876 }
6831 6877 }
6878 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6879 + int32_t, TCPS_SYN_SENT);
6832 6880 tcp->tcp_state = TCPS_SYN_SENT;
6833 6881 /*
6834 6882 * TODO: allow data with connect requests
6835 6883 * by unlinking M_DATA trailers here and
6836 6884 * linking them in behind the T_OK_ACK mblk.
6837 6885 * The tcp_rput() bind ack handler would then
6838 6886 * feed them to tcp_wput_data() rather than call
6839 6887 * tcp_timer().
6840 6888 */
6841 6889 mp = mi_tpi_ok_ack_alloc(mp);
6842 6890 if (!mp) {
6891 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6892 + tcp_t *, tcp, int32_t, oldstate);
6843 6893 tcp->tcp_state = oldstate;
6844 6894 goto failed;
6845 6895 }
6846 6896 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6847 6897 if (mp1) {
6848 6898 /*
6849 6899 * We need to make sure that the conn_recv is set to a non-null
6850 6900 * value before we insert the conn_t into the classifier table.
6851 6901 * This is to avoid a race with an incoming packet which does
6852 6902 * an ipcl_classify().
6853 6903 */
6854 6904 tcp->tcp_connp->conn_recv = tcp_input;
6855 6905
6856 6906 /* Hang onto the T_OK_ACK for later. */
6857 6907 linkb(mp1, mp);
6858 6908 mblk_setcred(mp1, tcp->tcp_cred);
|
↓ open down ↓ |
6 lines elided |
↑ open up ↑ |
6859 6909 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6860 6910 &tcp->tcp_sticky_ipp);
6861 6911 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6862 6912 tcp->tcp_active_open = 1;
6863 6913 /* ip_bind_v6() may return ACK or ERROR */
6864 6914 if (mp1 != NULL)
6865 6915 tcp_rput_other(tcp, mp1);
6866 6916 return;
6867 6917 }
6868 6918 /* Error case */
6919 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6920 + int32_t, oldstate);
6869 6921 tcp->tcp_state = oldstate;
6870 6922 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6871 6923
6872 6924 failed:
6873 6925 /* return error ack and blow away saved option results if any */
6874 6926 if (mp != NULL)
6875 6927 putnext(tcp->tcp_rq, mp);
6876 6928 else {
6877 6929 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6878 6930 TSYSERR, ENOMEM);
6879 6931 }
6880 6932 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6881 6933 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6882 6934 }
6883 6935
6884 6936 /*
6885 6937 * We need a stream q for detached closing tcp connections
6886 6938 * to use. Our client hereby indicates that this q is the
6887 6939 * one to use.
6888 6940 */
6889 6941 static void
6890 6942 tcp_def_q_set(tcp_t *tcp, mblk_t *mp)
6891 6943 {
6892 6944 struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
6893 6945 queue_t *q = tcp->tcp_wq;
6894 6946 tcp_stack_t *tcps = tcp->tcp_tcps;
6895 6947
6896 6948 #ifdef NS_DEBUG
6897 6949 (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n",
6898 6950 tcps->tcps_netstack->netstack_stackid);
6899 6951 #endif
6900 6952 mp->b_datap->db_type = M_IOCACK;
6901 6953 iocp->ioc_count = 0;
6902 6954 mutex_enter(&tcps->tcps_g_q_lock);
6903 6955 if (tcps->tcps_g_q != NULL) {
6904 6956 mutex_exit(&tcps->tcps_g_q_lock);
6905 6957 iocp->ioc_error = EALREADY;
6906 6958 } else {
6907 6959 mblk_t *mp1;
6908 6960
6909 6961 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0);
6910 6962 if (mp1 == NULL) {
6911 6963 mutex_exit(&tcps->tcps_g_q_lock);
6912 6964 iocp->ioc_error = ENOMEM;
6913 6965 } else {
6914 6966 tcps->tcps_g_q = tcp->tcp_rq;
6915 6967 mutex_exit(&tcps->tcps_g_q_lock);
6916 6968 iocp->ioc_error = 0;
6917 6969 iocp->ioc_rval = 0;
6918 6970 /*
6919 6971 * We are passing tcp_sticky_ipp as NULL
6920 6972 * as it is not useful for tcp_default queue
6921 6973 *
6922 6974 * Set conn_recv just in case.
6923 6975 */
6924 6976 tcp->tcp_connp->conn_recv = tcp_conn_request;
6925 6977
6926 6978 mp1 = ip_bind_v6(q, mp1, tcp->tcp_connp, NULL);
6927 6979 if (mp1 != NULL)
6928 6980 tcp_rput_other(tcp, mp1);
6929 6981 }
6930 6982 }
6931 6983 qreply(q, mp);
6932 6984 }
6933 6985
6934 6986 /*
6935 6987 * Our client hereby directs us to reject the connection request
6936 6988 * that tcp_conn_request() marked with 'seqnum'. Rejection consists
6937 6989 * of sending the appropriate RST, not an ICMP error.
6938 6990 */
6939 6991 static void
6940 6992 tcp_disconnect(tcp_t *tcp, mblk_t *mp)
6941 6993 {
6942 6994 tcp_t *ltcp = NULL;
6943 6995 t_scalar_t seqnum;
6944 6996 conn_t *connp;
6945 6997 tcp_stack_t *tcps = tcp->tcp_tcps;
6946 6998
6947 6999 ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
6948 7000 if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) {
6949 7001 tcp_err_ack(tcp, mp, TPROTO, 0);
6950 7002 return;
6951 7003 }
6952 7004
6953 7005 /*
6954 7006 * Right now, upper modules pass down a T_DISCON_REQ to TCP,
6955 7007 * when the stream is in BOUND state. Do not send a reset,
6956 7008 * since the destination IP address is not valid, and it can
6957 7009 * be the initialized value of all zeros (broadcast address).
6958 7010 *
6959 7011 * If TCP has sent down a bind request to IP and has not
6960 7012 * received the reply, reject the request. Otherwise, TCP
6961 7013 * will be confused.
6962 7014 */
6963 7015 if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) {
6964 7016 if (tcp->tcp_debug) {
6965 7017 (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
6966 7018 "tcp_disconnect: bad state, %d", tcp->tcp_state);
6967 7019 }
6968 7020 tcp_err_ack(tcp, mp, TOUTSTATE, 0);
6969 7021 return;
6970 7022 }
6971 7023
6972 7024 seqnum = ((struct T_discon_req *)mp->b_rptr)->SEQ_number;
6973 7025
6974 7026 if (seqnum == -1 || tcp->tcp_conn_req_max == 0) {
6975 7027
6976 7028 /*
6977 7029 * According to TPI, for non-listeners, ignore seqnum
6978 7030 * and disconnect.
6979 7031 * Following interpretation of -1 seqnum is historical
6980 7032 * and implied TPI ? (TPI only states that for T_CONN_IND,
6981 7033 * a valid seqnum should not be -1).
6982 7034 *
6983 7035 * -1 means disconnect everything
6984 7036 * regardless even on a listener.
6985 7037 */
6986 7038
6987 7039 int old_state = tcp->tcp_state;
6988 7040 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
6989 7041
6990 7042 /*
6991 7043 * The connection can't be on the tcp_time_wait_head list
6992 7044 * since it is not detached.
6993 7045 */
6994 7046 ASSERT(tcp->tcp_time_wait_next == NULL);
6995 7047 ASSERT(tcp->tcp_time_wait_prev == NULL);
6996 7048 ASSERT(tcp->tcp_time_wait_expire == 0);
6997 7049 ltcp = NULL;
6998 7050 /*
6999 7051 * If it used to be a listener, check to make sure no one else
7000 7052 * has taken the port before switching back to LISTEN state.
7001 7053 */
7002 7054 if (tcp->tcp_ipversion == IPV4_VERSION) {
7003 7055 connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7004 7056 tcp->tcp_ipha->ipha_src,
7005 7057 tcp->tcp_connp->conn_zoneid, ipst);
7006 7058 if (connp != NULL)
|
↓ open down ↓ |
128 lines elided |
↑ open up ↑ |
7007 7059 ltcp = connp->conn_tcp;
7008 7060 } else {
7009 7061 /* Allow tcp_bound_if listeners? */
7010 7062 connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7011 7063 &tcp->tcp_ip6h->ip6_src, 0,
7012 7064 tcp->tcp_connp->conn_zoneid, ipst);
7013 7065 if (connp != NULL)
7014 7066 ltcp = connp->conn_tcp;
7015 7067 }
7016 7068 if (tcp->tcp_conn_req_max && ltcp == NULL) {
7069 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7070 + tcp_t *, tcp, int32_t, TCPS_LISTEN);
7017 7071 tcp->tcp_state = TCPS_LISTEN;
7018 7072 } else if (old_state > TCPS_BOUND) {
7019 7073 tcp->tcp_conn_req_max = 0;
7074 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7075 + tcp_t *, tcp, int32_t, TCPS_BOUND);
7020 7076 tcp->tcp_state = TCPS_BOUND;
7021 7077 }
7022 7078 if (ltcp != NULL)
7023 7079 CONN_DEC_REF(ltcp->tcp_connp);
7024 7080 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7025 7081 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7026 7082 } else if (old_state == TCPS_ESTABLISHED ||
7027 7083 old_state == TCPS_CLOSE_WAIT) {
7028 7084 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7029 7085 }
7030 7086
7031 7087 if (tcp->tcp_fused)
7032 7088 tcp_unfuse(tcp);
7033 7089
7034 7090 mutex_enter(&tcp->tcp_eager_lock);
7035 7091 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7036 7092 (tcp->tcp_conn_req_cnt_q != 0)) {
7037 7093 tcp_eager_cleanup(tcp, 0);
7038 7094 }
7039 7095 mutex_exit(&tcp->tcp_eager_lock);
7040 7096
7041 7097 tcp_xmit_ctl("tcp_disconnect", tcp, tcp->tcp_snxt,
7042 7098 tcp->tcp_rnxt, TH_RST | TH_ACK);
7043 7099
7044 7100 tcp_reinit(tcp);
7045 7101
7046 7102 if (old_state >= TCPS_ESTABLISHED) {
7047 7103 /* Send M_FLUSH according to TPI */
7048 7104 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
7049 7105 }
7050 7106 mp = mi_tpi_ok_ack_alloc(mp);
7051 7107 if (mp)
7052 7108 putnext(tcp->tcp_rq, mp);
7053 7109 return;
7054 7110 } else if (!tcp_eager_blowoff(tcp, seqnum)) {
7055 7111 tcp_err_ack(tcp, mp, TBADSEQ, 0);
7056 7112 return;
7057 7113 }
7058 7114 if (tcp->tcp_state >= TCPS_ESTABLISHED) {
7059 7115 /* Send M_FLUSH according to TPI */
7060 7116 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
7061 7117 }
7062 7118 mp = mi_tpi_ok_ack_alloc(mp);
7063 7119 if (mp)
7064 7120 putnext(tcp->tcp_rq, mp);
7065 7121 }
7066 7122
7067 7123 /*
7068 7124 * Diagnostic routine used to return a string associated with the tcp state.
7069 7125 * Note that if the caller does not supply a buffer, it will use an internal
7070 7126 * static string. This means that if multiple threads call this function at
7071 7127 * the same time, output can be corrupted... Note also that this function
7072 7128 * does not check the size of the supplied buffer. The caller has to make
7073 7129 * sure that it is big enough.
7074 7130 */
7075 7131 static char *
7076 7132 tcp_display(tcp_t *tcp, char *sup_buf, char format)
7077 7133 {
7078 7134 char buf1[30];
7079 7135 static char priv_buf[INET6_ADDRSTRLEN * 2 + 80];
7080 7136 char *buf;
7081 7137 char *cp;
7082 7138 in6_addr_t local, remote;
7083 7139 char local_addrbuf[INET6_ADDRSTRLEN];
7084 7140 char remote_addrbuf[INET6_ADDRSTRLEN];
7085 7141
7086 7142 if (sup_buf != NULL)
7087 7143 buf = sup_buf;
7088 7144 else
7089 7145 buf = priv_buf;
7090 7146
7091 7147 if (tcp == NULL)
7092 7148 return ("NULL_TCP");
7093 7149 switch (tcp->tcp_state) {
7094 7150 case TCPS_CLOSED:
7095 7151 cp = "TCP_CLOSED";
7096 7152 break;
7097 7153 case TCPS_IDLE:
7098 7154 cp = "TCP_IDLE";
7099 7155 break;
7100 7156 case TCPS_BOUND:
7101 7157 cp = "TCP_BOUND";
7102 7158 break;
7103 7159 case TCPS_LISTEN:
7104 7160 cp = "TCP_LISTEN";
7105 7161 break;
7106 7162 case TCPS_SYN_SENT:
7107 7163 cp = "TCP_SYN_SENT";
7108 7164 break;
7109 7165 case TCPS_SYN_RCVD:
7110 7166 cp = "TCP_SYN_RCVD";
7111 7167 break;
7112 7168 case TCPS_ESTABLISHED:
7113 7169 cp = "TCP_ESTABLISHED";
7114 7170 break;
7115 7171 case TCPS_CLOSE_WAIT:
7116 7172 cp = "TCP_CLOSE_WAIT";
7117 7173 break;
7118 7174 case TCPS_FIN_WAIT_1:
7119 7175 cp = "TCP_FIN_WAIT_1";
7120 7176 break;
7121 7177 case TCPS_CLOSING:
7122 7178 cp = "TCP_CLOSING";
7123 7179 break;
7124 7180 case TCPS_LAST_ACK:
7125 7181 cp = "TCP_LAST_ACK";
7126 7182 break;
7127 7183 case TCPS_FIN_WAIT_2:
7128 7184 cp = "TCP_FIN_WAIT_2";
7129 7185 break;
7130 7186 case TCPS_TIME_WAIT:
7131 7187 cp = "TCP_TIME_WAIT";
7132 7188 break;
7133 7189 default:
7134 7190 (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state);
7135 7191 cp = buf1;
7136 7192 break;
7137 7193 }
7138 7194 switch (format) {
7139 7195 case DISP_ADDR_AND_PORT:
7140 7196 if (tcp->tcp_ipversion == IPV4_VERSION) {
7141 7197 /*
7142 7198 * Note that we use the remote address in the tcp_b
7143 7199 * structure. This means that it will print out
7144 7200 * the real destination address, not the next hop's
7145 7201 * address if source routing is used.
7146 7202 */
7147 7203 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local);
7148 7204 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote);
7149 7205
7150 7206 } else {
7151 7207 local = tcp->tcp_ip_src_v6;
7152 7208 remote = tcp->tcp_remote_v6;
7153 7209 }
7154 7210 (void) inet_ntop(AF_INET6, &local, local_addrbuf,
7155 7211 sizeof (local_addrbuf));
7156 7212 (void) inet_ntop(AF_INET6, &remote, remote_addrbuf,
7157 7213 sizeof (remote_addrbuf));
7158 7214 (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s",
7159 7215 local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf,
7160 7216 ntohs(tcp->tcp_fport), cp);
7161 7217 break;
7162 7218 case DISP_PORT_ONLY:
7163 7219 default:
7164 7220 (void) mi_sprintf(buf, "[%u, %u] %s",
7165 7221 ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp);
7166 7222 break;
7167 7223 }
7168 7224
7169 7225 return (buf);
7170 7226 }
7171 7227
7172 7228 /*
7173 7229 * Called via squeue to get on to eager's perimeter. It sends a
7174 7230 * TH_RST if eager is in the fanout table. The listener wants the
7175 7231 * eager to disappear either by means of tcp_eager_blowoff() or
7176 7232 * tcp_eager_cleanup() being called. tcp_eager_kill() can also be
7177 7233 * called (via squeue) if the eager cannot be inserted in the
7178 7234 * fanout table in tcp_conn_request().
7179 7235 */
7180 7236 /* ARGSUSED */
7181 7237 void
7182 7238 tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
7183 7239 {
7184 7240 conn_t *econnp = (conn_t *)arg;
7185 7241 tcp_t *eager = econnp->conn_tcp;
7186 7242 tcp_t *listener = eager->tcp_listener;
7187 7243 tcp_stack_t *tcps = eager->tcp_tcps;
7188 7244
7189 7245 /*
7190 7246 * We could be called because listener is closing. Since
7191 7247 * the eager is using listener's queue's, its not safe.
7192 7248 * Better use the default queue just to send the TH_RST
7193 7249 * out.
7194 7250 */
7195 7251 ASSERT(tcps->tcps_g_q != NULL);
7196 7252 eager->tcp_rq = tcps->tcps_g_q;
7197 7253 eager->tcp_wq = WR(tcps->tcps_g_q);
7198 7254
7199 7255 /*
7200 7256 * An eager's conn_fanout will be NULL if it's a duplicate
7201 7257 * for an existing 4-tuples in the conn fanout table.
7202 7258 * We don't want to send an RST out in such case.
7203 7259 */
7204 7260 if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) {
7205 7261 tcp_xmit_ctl("tcp_eager_kill, can't wait",
7206 7262 eager, eager->tcp_snxt, 0, TH_RST);
7207 7263 }
7208 7264
7209 7265 /* We are here because listener wants this eager gone */
7210 7266 if (listener != NULL) {
7211 7267 mutex_enter(&listener->tcp_eager_lock);
7212 7268 tcp_eager_unlink(eager);
7213 7269 if (eager->tcp_tconnind_started) {
7214 7270 /*
7215 7271 * The eager has sent a conn_ind up to the
7216 7272 * listener but listener decides to close
7217 7273 * instead. We need to drop the extra ref
7218 7274 * placed on eager in tcp_rput_data() before
7219 7275 * sending the conn_ind to listener.
7220 7276 */
7221 7277 CONN_DEC_REF(econnp);
7222 7278 }
7223 7279 mutex_exit(&listener->tcp_eager_lock);
7224 7280 CONN_DEC_REF(listener->tcp_connp);
7225 7281 }
7226 7282
7227 7283 if (eager->tcp_state > TCPS_BOUND)
7228 7284 tcp_close_detached(eager);
7229 7285 }
7230 7286
7231 7287 /*
7232 7288 * Reset any eager connection hanging off this listener marked
7233 7289 * with 'seqnum' and then reclaim it's resources.
7234 7290 */
7235 7291 static boolean_t
7236 7292 tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
7237 7293 {
7238 7294 tcp_t *eager;
7239 7295 mblk_t *mp;
7240 7296 tcp_stack_t *tcps = listener->tcp_tcps;
7241 7297
7242 7298 TCP_STAT(tcps, tcp_eager_blowoff_calls);
7243 7299 eager = listener;
7244 7300 mutex_enter(&listener->tcp_eager_lock);
7245 7301 do {
7246 7302 eager = eager->tcp_eager_next_q;
7247 7303 if (eager == NULL) {
7248 7304 mutex_exit(&listener->tcp_eager_lock);
7249 7305 return (B_FALSE);
7250 7306 }
7251 7307 } while (eager->tcp_conn_req_seqnum != seqnum);
7252 7308
7253 7309 if (eager->tcp_closemp_used) {
7254 7310 mutex_exit(&listener->tcp_eager_lock);
7255 7311 return (B_TRUE);
7256 7312 }
7257 7313 eager->tcp_closemp_used = B_TRUE;
7258 7314 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7259 7315 CONN_INC_REF(eager->tcp_connp);
7260 7316 mutex_exit(&listener->tcp_eager_lock);
7261 7317 mp = &eager->tcp_closemp;
7262 7318 squeue_fill(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill,
7263 7319 eager->tcp_connp, SQTAG_TCP_EAGER_BLOWOFF);
7264 7320 return (B_TRUE);
7265 7321 }
7266 7322
7267 7323 /*
7268 7324 * Reset any eager connection hanging off this listener
7269 7325 * and then reclaim it's resources.
7270 7326 */
7271 7327 static void
7272 7328 tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
7273 7329 {
7274 7330 tcp_t *eager;
7275 7331 mblk_t *mp;
7276 7332 tcp_stack_t *tcps = listener->tcp_tcps;
7277 7333
7278 7334 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7279 7335
7280 7336 if (!q0_only) {
7281 7337 /* First cleanup q */
7282 7338 TCP_STAT(tcps, tcp_eager_blowoff_q);
7283 7339 eager = listener->tcp_eager_next_q;
7284 7340 while (eager != NULL) {
7285 7341 if (!eager->tcp_closemp_used) {
7286 7342 eager->tcp_closemp_used = B_TRUE;
7287 7343 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7288 7344 CONN_INC_REF(eager->tcp_connp);
7289 7345 mp = &eager->tcp_closemp;
7290 7346 squeue_fill(eager->tcp_connp->conn_sqp, mp,
7291 7347 tcp_eager_kill, eager->tcp_connp,
7292 7348 SQTAG_TCP_EAGER_CLEANUP);
7293 7349 }
7294 7350 eager = eager->tcp_eager_next_q;
7295 7351 }
7296 7352 }
7297 7353 /* Then cleanup q0 */
7298 7354 TCP_STAT(tcps, tcp_eager_blowoff_q0);
7299 7355 eager = listener->tcp_eager_next_q0;
7300 7356 while (eager != listener) {
7301 7357 if (!eager->tcp_closemp_used) {
7302 7358 eager->tcp_closemp_used = B_TRUE;
7303 7359 TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
7304 7360 CONN_INC_REF(eager->tcp_connp);
7305 7361 mp = &eager->tcp_closemp;
7306 7362 squeue_fill(eager->tcp_connp->conn_sqp, mp,
7307 7363 tcp_eager_kill, eager->tcp_connp,
7308 7364 SQTAG_TCP_EAGER_CLEANUP_Q0);
7309 7365 }
7310 7366 eager = eager->tcp_eager_next_q0;
7311 7367 }
7312 7368 }
7313 7369
7314 7370 /*
7315 7371 * If we are an eager connection hanging off a listener that hasn't
7316 7372 * formally accepted the connection yet, get off his list and blow off
7317 7373 * any data that we have accumulated.
7318 7374 */
7319 7375 static void
7320 7376 tcp_eager_unlink(tcp_t *tcp)
7321 7377 {
7322 7378 tcp_t *listener = tcp->tcp_listener;
7323 7379
7324 7380 ASSERT(MUTEX_HELD(&listener->tcp_eager_lock));
7325 7381 ASSERT(listener != NULL);
7326 7382 if (tcp->tcp_eager_next_q0 != NULL) {
7327 7383 ASSERT(tcp->tcp_eager_prev_q0 != NULL);
7328 7384
7329 7385 /* Remove the eager tcp from q0 */
7330 7386 tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
7331 7387 tcp->tcp_eager_prev_q0;
7332 7388 tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
7333 7389 tcp->tcp_eager_next_q0;
7334 7390 ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
7335 7391 listener->tcp_conn_req_cnt_q0--;
7336 7392
7337 7393 tcp->tcp_eager_next_q0 = NULL;
7338 7394 tcp->tcp_eager_prev_q0 = NULL;
7339 7395
7340 7396 /*
7341 7397 * Take the eager out, if it is in the list of droppable
7342 7398 * eagers.
7343 7399 */
7344 7400 MAKE_UNDROPPABLE(tcp);
7345 7401
7346 7402 if (tcp->tcp_syn_rcvd_timeout != 0) {
7347 7403 /* we have timed out before */
7348 7404 ASSERT(listener->tcp_syn_rcvd_timeout > 0);
7349 7405 listener->tcp_syn_rcvd_timeout--;
7350 7406 }
7351 7407 } else {
7352 7408 tcp_t **tcpp = &listener->tcp_eager_next_q;
7353 7409 tcp_t *prev = NULL;
7354 7410
7355 7411 for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) {
7356 7412 if (tcpp[0] == tcp) {
7357 7413 if (listener->tcp_eager_last_q == tcp) {
7358 7414 /*
7359 7415 * If we are unlinking the last
7360 7416 * element on the list, adjust
7361 7417 * tail pointer. Set tail pointer
7362 7418 * to nil when list is empty.
7363 7419 */
7364 7420 ASSERT(tcp->tcp_eager_next_q == NULL);
7365 7421 if (listener->tcp_eager_last_q ==
7366 7422 listener->tcp_eager_next_q) {
7367 7423 listener->tcp_eager_last_q =
7368 7424 NULL;
7369 7425 } else {
7370 7426 /*
7371 7427 * We won't get here if there
7372 7428 * is only one eager in the
7373 7429 * list.
7374 7430 */
7375 7431 ASSERT(prev != NULL);
7376 7432 listener->tcp_eager_last_q =
7377 7433 prev;
7378 7434 }
7379 7435 }
7380 7436 tcpp[0] = tcp->tcp_eager_next_q;
7381 7437 tcp->tcp_eager_next_q = NULL;
7382 7438 tcp->tcp_eager_last_q = NULL;
7383 7439 ASSERT(listener->tcp_conn_req_cnt_q > 0);
7384 7440 listener->tcp_conn_req_cnt_q--;
7385 7441 break;
7386 7442 }
7387 7443 prev = tcpp[0];
7388 7444 }
7389 7445 }
7390 7446 tcp->tcp_listener = NULL;
7391 7447 }
7392 7448
7393 7449 /* Shorthand to generate and send TPI error acks to our client */
7394 7450 static void
7395 7451 tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
7396 7452 {
7397 7453 if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL)
7398 7454 putnext(tcp->tcp_rq, mp);
7399 7455 }
7400 7456
7401 7457 /* Shorthand to generate and send TPI error acks to our client */
7402 7458 static void
7403 7459 tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
7404 7460 int t_error, int sys_error)
7405 7461 {
7406 7462 struct T_error_ack *teackp;
7407 7463
7408 7464 if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack),
7409 7465 M_PCPROTO, T_ERROR_ACK)) != NULL) {
7410 7466 teackp = (struct T_error_ack *)mp->b_rptr;
7411 7467 teackp->ERROR_prim = primitive;
7412 7468 teackp->TLI_error = t_error;
7413 7469 teackp->UNIX_error = sys_error;
7414 7470 putnext(tcp->tcp_rq, mp);
7415 7471 }
7416 7472 }
7417 7473
7418 7474 /*
7419 7475 * Note: No locks are held when inspecting tcp_g_*epriv_ports
7420 7476 * but instead the code relies on:
7421 7477 * - the fact that the address of the array and its size never changes
7422 7478 * - the atomic assignment of the elements of the array
7423 7479 */
7424 7480 /* ARGSUSED */
7425 7481 static int
7426 7482 tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
7427 7483 {
7428 7484 int i;
7429 7485 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7430 7486
7431 7487 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7432 7488 if (tcps->tcps_g_epriv_ports[i] != 0)
7433 7489 (void) mi_mpprintf(mp, "%d ",
7434 7490 tcps->tcps_g_epriv_ports[i]);
7435 7491 }
7436 7492 return (0);
7437 7493 }
7438 7494
7439 7495 /*
7440 7496 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7441 7497 * threads from changing it at the same time.
7442 7498 */
7443 7499 /* ARGSUSED */
7444 7500 static int
7445 7501 tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7446 7502 cred_t *cr)
7447 7503 {
7448 7504 long new_value;
7449 7505 int i;
7450 7506 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7451 7507
7452 7508 /*
7453 7509 * Fail the request if the new value does not lie within the
7454 7510 * port number limits.
7455 7511 */
7456 7512 if (ddi_strtol(value, NULL, 10, &new_value) != 0 ||
7457 7513 new_value <= 0 || new_value >= 65536) {
7458 7514 return (EINVAL);
7459 7515 }
7460 7516
7461 7517 mutex_enter(&tcps->tcps_epriv_port_lock);
7462 7518 /* Check if the value is already in the list */
7463 7519 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7464 7520 if (new_value == tcps->tcps_g_epriv_ports[i]) {
7465 7521 mutex_exit(&tcps->tcps_epriv_port_lock);
7466 7522 return (EEXIST);
7467 7523 }
7468 7524 }
7469 7525 /* Find an empty slot */
7470 7526 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7471 7527 if (tcps->tcps_g_epriv_ports[i] == 0)
7472 7528 break;
7473 7529 }
7474 7530 if (i == tcps->tcps_g_num_epriv_ports) {
7475 7531 mutex_exit(&tcps->tcps_epriv_port_lock);
7476 7532 return (EOVERFLOW);
7477 7533 }
7478 7534 /* Set the new value */
7479 7535 tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value;
7480 7536 mutex_exit(&tcps->tcps_epriv_port_lock);
7481 7537 return (0);
7482 7538 }
7483 7539
7484 7540 /*
7485 7541 * Hold a lock while changing tcp_g_epriv_ports to prevent multiple
7486 7542 * threads from changing it at the same time.
7487 7543 */
7488 7544 /* ARGSUSED */
7489 7545 static int
7490 7546 tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
7491 7547 cred_t *cr)
7492 7548 {
7493 7549 long new_value;
7494 7550 int i;
7495 7551 tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps;
7496 7552
7497 7553 /*
7498 7554 * Fail the request if the new value does not lie within the
7499 7555 * port number limits.
7500 7556 */
7501 7557 if (ddi_strtol(value, NULL, 10, &new_value) != 0 || new_value <= 0 ||
7502 7558 new_value >= 65536) {
7503 7559 return (EINVAL);
7504 7560 }
7505 7561
7506 7562 mutex_enter(&tcps->tcps_epriv_port_lock);
7507 7563 /* Check that the value is already in the list */
7508 7564 for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) {
7509 7565 if (tcps->tcps_g_epriv_ports[i] == new_value)
7510 7566 break;
7511 7567 }
7512 7568 if (i == tcps->tcps_g_num_epriv_ports) {
7513 7569 mutex_exit(&tcps->tcps_epriv_port_lock);
7514 7570 return (ESRCH);
7515 7571 }
7516 7572 /* Clear the value */
7517 7573 tcps->tcps_g_epriv_ports[i] = 0;
7518 7574 mutex_exit(&tcps->tcps_epriv_port_lock);
7519 7575 return (0);
7520 7576 }
7521 7577
7522 7578 /* Return the TPI/TLI equivalent of our current tcp_state */
7523 7579 static int
7524 7580 tcp_tpistate(tcp_t *tcp)
7525 7581 {
7526 7582 switch (tcp->tcp_state) {
7527 7583 case TCPS_IDLE:
7528 7584 return (TS_UNBND);
7529 7585 case TCPS_LISTEN:
7530 7586 /*
7531 7587 * Return whether there are outstanding T_CONN_IND waiting
7532 7588 * for the matching T_CONN_RES. Therefore don't count q0.
7533 7589 */
7534 7590 if (tcp->tcp_conn_req_cnt_q > 0)
7535 7591 return (TS_WRES_CIND);
7536 7592 else
7537 7593 return (TS_IDLE);
7538 7594 case TCPS_BOUND:
7539 7595 return (TS_IDLE);
7540 7596 case TCPS_SYN_SENT:
7541 7597 return (TS_WCON_CREQ);
7542 7598 case TCPS_SYN_RCVD:
7543 7599 /*
7544 7600 * Note: assumption: this has to the active open SYN_RCVD.
7545 7601 * The passive instance is detached in SYN_RCVD stage of
7546 7602 * incoming connection processing so we cannot get request
7547 7603 * for T_info_ack on it.
7548 7604 */
7549 7605 return (TS_WACK_CRES);
7550 7606 case TCPS_ESTABLISHED:
7551 7607 return (TS_DATA_XFER);
7552 7608 case TCPS_CLOSE_WAIT:
7553 7609 return (TS_WREQ_ORDREL);
7554 7610 case TCPS_FIN_WAIT_1:
7555 7611 return (TS_WIND_ORDREL);
7556 7612 case TCPS_FIN_WAIT_2:
7557 7613 return (TS_WIND_ORDREL);
7558 7614
7559 7615 case TCPS_CLOSING:
7560 7616 case TCPS_LAST_ACK:
7561 7617 case TCPS_TIME_WAIT:
7562 7618 case TCPS_CLOSED:
7563 7619 /*
7564 7620 * Following TS_WACK_DREQ7 is a rendition of "not
7565 7621 * yet TS_IDLE" TPI state. There is no best match to any
7566 7622 * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we
7567 7623 * choose a value chosen that will map to TLI/XTI level
7568 7624 * state of TSTATECHNG (state is process of changing) which
7569 7625 * captures what this dummy state represents.
7570 7626 */
7571 7627 return (TS_WACK_DREQ7);
7572 7628 default:
7573 7629 cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
7574 7630 tcp->tcp_state, tcp_display(tcp, NULL,
7575 7631 DISP_PORT_ONLY));
7576 7632 return (TS_UNBND);
7577 7633 }
7578 7634 }
7579 7635
7580 7636 static void
7581 7637 tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
7582 7638 {
7583 7639 tcp_stack_t *tcps = tcp->tcp_tcps;
7584 7640
7585 7641 if (tcp->tcp_family == AF_INET6)
7586 7642 *tia = tcp_g_t_info_ack_v6;
7587 7643 else
7588 7644 *tia = tcp_g_t_info_ack;
7589 7645 tia->CURRENT_state = tcp_tpistate(tcp);
7590 7646 tia->OPT_size = tcp_max_optsize;
7591 7647 if (tcp->tcp_mss == 0) {
7592 7648 /* Not yet set - tcp_open does not set mss */
7593 7649 if (tcp->tcp_ipversion == IPV4_VERSION)
7594 7650 tia->TIDU_size = tcps->tcps_mss_def_ipv4;
7595 7651 else
7596 7652 tia->TIDU_size = tcps->tcps_mss_def_ipv6;
7597 7653 } else {
7598 7654 tia->TIDU_size = tcp->tcp_mss;
7599 7655 }
7600 7656 /* TODO: Default ETSDU is 1. Is that correct for tcp? */
7601 7657 }
7602 7658
7603 7659 /*
7604 7660 * This routine responds to T_CAPABILITY_REQ messages. It is called by
7605 7661 * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from
7606 7662 * tcp_g_t_info_ack. The current state of the stream is copied from
7607 7663 * tcp_state.
7608 7664 */
7609 7665 static void
7610 7666 tcp_capability_req(tcp_t *tcp, mblk_t *mp)
7611 7667 {
7612 7668 t_uscalar_t cap_bits1;
7613 7669 struct T_capability_ack *tcap;
7614 7670
7615 7671 if (MBLKL(mp) < sizeof (struct T_capability_req)) {
7616 7672 freemsg(mp);
7617 7673 return;
7618 7674 }
7619 7675
7620 7676 cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1;
7621 7677
7622 7678 mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
7623 7679 mp->b_datap->db_type, T_CAPABILITY_ACK);
7624 7680 if (mp == NULL)
7625 7681 return;
7626 7682
7627 7683 tcap = (struct T_capability_ack *)mp->b_rptr;
7628 7684 tcap->CAP_bits1 = 0;
7629 7685
7630 7686 if (cap_bits1 & TC1_INFO) {
7631 7687 tcp_copy_info(&tcap->INFO_ack, tcp);
7632 7688 tcap->CAP_bits1 |= TC1_INFO;
7633 7689 }
7634 7690
7635 7691 if (cap_bits1 & TC1_ACCEPTOR_ID) {
7636 7692 tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
7637 7693 tcap->CAP_bits1 |= TC1_ACCEPTOR_ID;
7638 7694 }
7639 7695
7640 7696 putnext(tcp->tcp_rq, mp);
7641 7697 }
7642 7698
7643 7699 /*
7644 7700 * This routine responds to T_INFO_REQ messages. It is called by tcp_wput.
7645 7701 * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
7646 7702 * The current state of the stream is copied from tcp_state.
7647 7703 */
7648 7704 static void
7649 7705 tcp_info_req(tcp_t *tcp, mblk_t *mp)
7650 7706 {
7651 7707 mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
7652 7708 T_INFO_ACK);
7653 7709 if (!mp) {
7654 7710 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7655 7711 return;
7656 7712 }
7657 7713 tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
7658 7714 putnext(tcp->tcp_rq, mp);
7659 7715 }
7660 7716
7661 7717 /* Respond to the TPI addr request */
7662 7718 static void
7663 7719 tcp_addr_req(tcp_t *tcp, mblk_t *mp)
7664 7720 {
7665 7721 sin_t *sin;
7666 7722 mblk_t *ackmp;
7667 7723 struct T_addr_ack *taa;
7668 7724
7669 7725 /* Make it large enough for worst case */
7670 7726 ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
7671 7727 2 * sizeof (sin6_t), 1);
7672 7728 if (ackmp == NULL) {
7673 7729 tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
7674 7730 return;
7675 7731 }
7676 7732
7677 7733 if (tcp->tcp_ipversion == IPV6_VERSION) {
7678 7734 tcp_addr_req_ipv6(tcp, ackmp);
7679 7735 return;
7680 7736 }
7681 7737 taa = (struct T_addr_ack *)ackmp->b_rptr;
7682 7738
7683 7739 bzero(taa, sizeof (struct T_addr_ack));
7684 7740 ackmp->b_wptr = (uchar_t *)&taa[1];
7685 7741
7686 7742 taa->PRIM_type = T_ADDR_ACK;
7687 7743 ackmp->b_datap->db_type = M_PCPROTO;
7688 7744
7689 7745 /*
7690 7746 * Note: Following code assumes 32 bit alignment of basic
7691 7747 * data structures like sin_t and struct T_addr_ack.
7692 7748 */
7693 7749 if (tcp->tcp_state >= TCPS_BOUND) {
7694 7750 /*
7695 7751 * Fill in local address
7696 7752 */
7697 7753 taa->LOCADDR_length = sizeof (sin_t);
7698 7754 taa->LOCADDR_offset = sizeof (*taa);
7699 7755
7700 7756 sin = (sin_t *)&taa[1];
7701 7757
7702 7758 /* Fill zeroes and then intialize non-zero fields */
7703 7759 *sin = sin_null;
7704 7760
7705 7761 sin->sin_family = AF_INET;
7706 7762
7707 7763 sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src;
7708 7764 sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport;
7709 7765
7710 7766 ackmp->b_wptr = (uchar_t *)&sin[1];
7711 7767
7712 7768 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7713 7769 /*
7714 7770 * Fill in Remote address
7715 7771 */
7716 7772 taa->REMADDR_length = sizeof (sin_t);
7717 7773 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7718 7774 taa->LOCADDR_length);
7719 7775
7720 7776 sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7721 7777 *sin = sin_null;
7722 7778 sin->sin_family = AF_INET;
7723 7779 sin->sin_addr.s_addr = tcp->tcp_remote;
7724 7780 sin->sin_port = tcp->tcp_fport;
7725 7781
7726 7782 ackmp->b_wptr = (uchar_t *)&sin[1];
7727 7783 }
7728 7784 }
7729 7785 putnext(tcp->tcp_rq, ackmp);
7730 7786 }
7731 7787
7732 7788 /* Assumes that tcp_addr_req gets enough space and alignment */
7733 7789 static void
7734 7790 tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp)
7735 7791 {
7736 7792 sin6_t *sin6;
7737 7793 struct T_addr_ack *taa;
7738 7794
7739 7795 ASSERT(tcp->tcp_ipversion == IPV6_VERSION);
7740 7796 ASSERT(OK_32PTR(ackmp->b_rptr));
7741 7797 ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) +
7742 7798 2 * sizeof (sin6_t));
7743 7799
7744 7800 taa = (struct T_addr_ack *)ackmp->b_rptr;
7745 7801
7746 7802 bzero(taa, sizeof (struct T_addr_ack));
7747 7803 ackmp->b_wptr = (uchar_t *)&taa[1];
7748 7804
7749 7805 taa->PRIM_type = T_ADDR_ACK;
7750 7806 ackmp->b_datap->db_type = M_PCPROTO;
7751 7807
7752 7808 /*
7753 7809 * Note: Following code assumes 32 bit alignment of basic
7754 7810 * data structures like sin6_t and struct T_addr_ack.
7755 7811 */
7756 7812 if (tcp->tcp_state >= TCPS_BOUND) {
7757 7813 /*
7758 7814 * Fill in local address
7759 7815 */
7760 7816 taa->LOCADDR_length = sizeof (sin6_t);
7761 7817 taa->LOCADDR_offset = sizeof (*taa);
7762 7818
7763 7819 sin6 = (sin6_t *)&taa[1];
7764 7820 *sin6 = sin6_null;
7765 7821
7766 7822 sin6->sin6_family = AF_INET6;
7767 7823 sin6->sin6_addr = tcp->tcp_ip6h->ip6_src;
7768 7824 sin6->sin6_port = tcp->tcp_lport;
7769 7825
7770 7826 ackmp->b_wptr = (uchar_t *)&sin6[1];
7771 7827
7772 7828 if (tcp->tcp_state >= TCPS_SYN_RCVD) {
7773 7829 /*
7774 7830 * Fill in Remote address
7775 7831 */
7776 7832 taa->REMADDR_length = sizeof (sin6_t);
7777 7833 taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset +
7778 7834 taa->LOCADDR_length);
7779 7835
7780 7836 sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset);
7781 7837 *sin6 = sin6_null;
7782 7838 sin6->sin6_family = AF_INET6;
7783 7839 sin6->sin6_flowinfo =
7784 7840 tcp->tcp_ip6h->ip6_vcf &
7785 7841 ~IPV6_VERS_AND_FLOW_MASK;
7786 7842 sin6->sin6_addr = tcp->tcp_remote_v6;
7787 7843 sin6->sin6_port = tcp->tcp_fport;
7788 7844
7789 7845 ackmp->b_wptr = (uchar_t *)&sin6[1];
7790 7846 }
7791 7847 }
7792 7848 putnext(tcp->tcp_rq, ackmp);
7793 7849 }
7794 7850
7795 7851 /*
7796 7852 * Handle reinitialization of a tcp structure.
7797 7853 * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE.
 *
 * In order, this routine: stops the timers, folds the per-connection
 * segment counters into the stack MIB, frees the queued transmit,
 * reassembly, receive and urgent-data messages, releases kssl state,
 * calls tcp_reinit_values(), removes the conn from the ipcl hash, and
 * then sets tcp_state to TCPS_LISTEN (when tcp_conn_req_max != 0) or
 * TCPS_BOUND before calling tcp_init_values() and restoring the bound
 * local port and source address.
7798 7854 */
7799 7855 static void
7800 7856 tcp_reinit(tcp_t *tcp)
7801 7857 {
7802 7858 mblk_t *mp;
7803 7859 int err;
7804 7860 tcp_stack_t *tcps = tcp->tcp_tcps;
7805 7861
7806 7862 TCP_STAT(tcps, tcp_reinit_calls);
7807 7863
7808 7864 /* tcp_reinit should never be called for detached tcp_t's */
7809 7865 ASSERT(tcp->tcp_listener == NULL);
7810 7866 ASSERT((tcp->tcp_family == AF_INET &&
7811 7867 tcp->tcp_ipversion == IPV4_VERSION) ||
7812 7868 (tcp->tcp_family == AF_INET6 &&
7813 7869 (tcp->tcp_ipversion == IPV4_VERSION ||
7814 7870 tcp->tcp_ipversion == IPV6_VERSION)));
7815 7871
7816 7872 /* Cancel outstanding timers */
7817 7873 tcp_timers_stop(tcp);
7818 7874
7819 7875 /*
7820 7876 * Reset everything in the state vector, after updating global
7821 7877 * MIB data from instance counters.
7822 7878 */
7823 7879 UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs);
7824 7880 tcp->tcp_ibsegs = 0;
7825 7881 UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs);
7826 7882 tcp->tcp_obsegs = 0;
7827 7883
/* Drop the transmit list and reset its bookkeeping. */
7828 7884 tcp_close_mpp(&tcp->tcp_xmit_head);
7829 7885 if (tcp->tcp_snd_zcopy_aware)
7830 7886 tcp_zcopy_notify(tcp);
7831 7887 tcp->tcp_xmit_last = tcp->tcp_xmit_tail = NULL;
7832 7888 tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0;
/*
 * If the endpoint was flow-stopped and the unsent byte count is now at
 * or below the transmit low-water mark, clear the queue-full condition.
 */
7833 7889 mutex_enter(&tcp->tcp_non_sq_lock);
7834 7890 if (tcp->tcp_flow_stopped &&
7835 7891 TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
7836 7892 tcp_clrqfull(tcp);
7837 7893 }
7838 7894 mutex_exit(&tcp->tcp_non_sq_lock);
/* Drop the reassembly queue and any data held on the receive list. */
7839 7895 tcp_close_mpp(&tcp->tcp_reass_head);
7840 7896 tcp->tcp_reass_tail = NULL;
7841 7897 if (tcp->tcp_rcv_list != NULL) {
7842 7898 /* Free b_next chain */
7843 7899 tcp_close_mpp(&tcp->tcp_rcv_list);
7844 7900 tcp->tcp_rcv_last_head = NULL;
7845 7901 tcp->tcp_rcv_last_tail = NULL;
7846 7902 tcp->tcp_rcv_cnt = 0;
7847 7903 }
/* The tail pointer is cleared unconditionally, even with an empty list. */
7848 7904 tcp->tcp_rcv_last_tail = NULL;
7849 7905
/* Free the urgent-data related messages that are still held. */
7850 7906 if ((mp = tcp->tcp_urp_mp) != NULL) {
7851 7907 freemsg(mp);
7852 7908 tcp->tcp_urp_mp = NULL;
7853 7909 }
7854 7910 if ((mp = tcp->tcp_urp_mark_mp) != NULL) {
7855 7911 freemsg(mp);
7856 7912 tcp->tcp_urp_mark_mp = NULL;
7857 7913 }
7858 7914 if (tcp->tcp_fused_sigurg_mp != NULL) {
7859 7915 freeb(tcp->tcp_fused_sigurg_mp);
7860 7916 tcp->tcp_fused_sigurg_mp = NULL;
7861 7917 }
7862 7918
7863 7919 /*
7864 7920 * Following is a union with two members which are
7865 7921 * identical types and size so the following cleanup
7866 7922 * is enough.
7867 7923 */
7868 7924 tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
7869 7925
7870 7926 CL_INET_DISCONNECT(tcp);
7871 7927
7872 7928 /*
7873 7929 * The connection can't be on the tcp_time_wait_head list
7874 7930 * since it is not detached.
7875 7931 */
7876 7932 ASSERT(tcp->tcp_time_wait_next == NULL);
7877 7933 ASSERT(tcp->tcp_time_wait_prev == NULL);
7878 7934 ASSERT(tcp->tcp_time_wait_expire == 0);
7879 7935
7880 7936 if (tcp->tcp_kssl_pending) {
7881 7937 tcp->tcp_kssl_pending = B_FALSE;
7882 7938
7883 7939 /* Don't reset if the initialized by bind. */
7884 7940 if (tcp->tcp_kssl_ent != NULL) {
7885 7941 kssl_release_ent(tcp->tcp_kssl_ent, NULL,
7886 7942 KSSL_NO_PROXY);
7887 7943 }
7888 7944 }
7889 7945 if (tcp->tcp_kssl_ctx != NULL) {
7890 7946 kssl_release_ctx(tcp->tcp_kssl_ctx);
7891 7947 tcp->tcp_kssl_ctx = NULL;
7892 7948 }
7893 7949
7894 7950 /*
7895 7951 * Reset/preserve other values
7896 7952 */
7897 7953 tcp_reinit_values(tcp);
7898 7954 ipcl_hash_remove(tcp->tcp_connp);
7899 7955 conn_delete_ire(tcp->tcp_connp, NULL);
7900 7956 tcp_ipsec_cleanup(tcp);
7901 7957
7902 7958 if (tcp->tcp_conn_req_max != 0) {
7903 7959 /*
7904 7960 * This is the case when a TLI program uses the same
7905 7961 * transport end point to accept a connection. This
7906 7962 * makes the TCP both a listener and acceptor. When
7907 7963 * this connection is closed, we need to set the state
7908 7964 * back to TCPS_LISTEN. Make sure that the eager list
7909 7965 * is reinitialized.
7910 7966 *
7911 7967 * Note that this stream is still bound to the four
7912 7968 * tuples of the previous connection in IP. If a new
7913 7969 * SYN with different foreign address comes in, IP will
7914 7970 * not find it and will send it to the global queue. In
7915 7971 * the global queue, TCP will do a tcp_lookup_listener()
7916 7972 * to find this stream. This works because this stream
7917 7973 * is only removed from connected hash.
7918 7974 *
7919 7975 */
7976 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7977 + tcp_t *, tcp, int32_t, TCPS_LISTEN);
7920 7978 tcp->tcp_state = TCPS_LISTEN;
/* Empty q0 and drop-q0 lists are circular: they point back at the listener. */
7921 7979 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7922 7980 tcp->tcp_eager_next_drop_q0 = tcp;
7923 7981 tcp->tcp_eager_prev_drop_q0 = tcp;
7924 7982 tcp->tcp_connp->conn_recv = tcp_conn_request;
/* Re-insert the conn under its bound source address and local port. */
7925 7983 if (tcp->tcp_family == AF_INET6) {
7926 7984 ASSERT(tcp->tcp_connp->conn_af_isv6);
7927 7985 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7928 7986 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7929 7987 } else {
7930 7988 ASSERT(!tcp->tcp_connp->conn_af_isv6);
7931 7989 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7932 7990 tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7933 7991 }
7934 7992 } else {
7993 + DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7994 + tcp_t *, tcp, int32_t, TCPS_BOUND);
7935 7995 tcp->tcp_state = TCPS_BOUND;
7936 7996 }
7937 7997
7938 7998 /*
7939 7999 * Initialize to default values
7940 8000 * Can't fail since enough header template space already allocated
7941 8001 * at open().
7942 8002 */
7943 8003 err = tcp_init_values(tcp);
7944 8004 ASSERT(err == 0);
7945 8005 /* Restore state in tcp_tcph */
7946 8006 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
7947 8007 if (tcp->tcp_ipversion == IPV4_VERSION)
7948 8008 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
7949 8009 else
7950 8010 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
7951 8011 /*
7952 8012 * Copy of the src addr. in tcp_t is needed in tcp_t
7953 8013 * since the lookup funcs can only lookup on tcp_t
7954 8014 */
7955 8015 tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6;
7956 8016
7957 8017 ASSERT(tcp->tcp_ptpbhn != NULL);
/* Reset the receive high-water mark, receive window and MSS to stack defaults. */
7958 8018 tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat;
7959 8019 tcp->tcp_rwnd = tcps->tcps_recv_hiwat;
7960 8020 tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ?
7961 8021 tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4;
7962 8022 }
7963 8023
7964 8024 /*
7965 8025 * Force values to zero that need be zero.
7966 8026 * Do not touch values associated with the BOUND or LISTEN state
7967 8027 * since the connection will end up in that state after the reinit.
7968 8028 * NOTE: tcp_reinit_values MUST have a line for each field in the tcp_t
7969 8029 * structure!
7970 8030 */
7971 8031 static void
7972 8032 tcp_reinit_values(tcp)
7973 8033 tcp_t *tcp;
7974 8034 {
7975 8035 tcp_stack_t *tcps = tcp->tcp_tcps;
7976 8036
7977 8037 #ifndef lint
7978 8038 #define DONTCARE(x)
7979 8039 #define PRESERVE(x)
7980 8040 #else
7981 8041 #define DONTCARE(x) ((x) = (x))
7982 8042 #define PRESERVE(x) ((x) = (x))
7983 8043 #endif /* lint */
7984 8044
7985 8045 PRESERVE(tcp->tcp_bind_hash);
7986 8046 PRESERVE(tcp->tcp_ptpbhn);
7987 8047 PRESERVE(tcp->tcp_acceptor_hash);
7988 8048 PRESERVE(tcp->tcp_ptpahn);
7989 8049
7990 8050 /* Should be ASSERT NULL on these with new code! */
7991 8051 ASSERT(tcp->tcp_time_wait_next == NULL);
7992 8052 ASSERT(tcp->tcp_time_wait_prev == NULL);
7993 8053 ASSERT(tcp->tcp_time_wait_expire == 0);
7994 8054 PRESERVE(tcp->tcp_state);
7995 8055 PRESERVE(tcp->tcp_rq);
7996 8056 PRESERVE(tcp->tcp_wq);
7997 8057
7998 8058 ASSERT(tcp->tcp_xmit_head == NULL);
7999 8059 ASSERT(tcp->tcp_xmit_last == NULL);
8000 8060 ASSERT(tcp->tcp_unsent == 0);
8001 8061 ASSERT(tcp->tcp_xmit_tail == NULL);
8002 8062 ASSERT(tcp->tcp_xmit_tail_unsent == 0);
8003 8063
8004 8064 tcp->tcp_snxt = 0; /* Displayed in mib */
8005 8065 tcp->tcp_suna = 0; /* Displayed in mib */
8006 8066 tcp->tcp_swnd = 0;
8007 8067 DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */
8008 8068
8009 8069 ASSERT(tcp->tcp_ibsegs == 0);
8010 8070 ASSERT(tcp->tcp_obsegs == 0);
8011 8071
8012 8072 if (tcp->tcp_iphc != NULL) {
8013 8073 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8014 8074 bzero(tcp->tcp_iphc, tcp->tcp_iphc_len);
8015 8075 }
8016 8076
8017 8077 DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */
8018 8078 DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */
8019 8079 DONTCARE(tcp->tcp_ipha);
8020 8080 DONTCARE(tcp->tcp_ip6h);
8021 8081 DONTCARE(tcp->tcp_ip_hdr_len);
8022 8082 DONTCARE(tcp->tcp_tcph);
8023 8083 DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */
8024 8084 tcp->tcp_valid_bits = 0;
8025 8085
8026 8086 DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */
8027 8087 DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */
8028 8088 DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */
8029 8089 tcp->tcp_last_rcv_lbolt = 0;
8030 8090
8031 8091 tcp->tcp_init_cwnd = 0;
8032 8092
8033 8093 tcp->tcp_urp_last_valid = 0;
8034 8094 tcp->tcp_hard_binding = 0;
8035 8095 tcp->tcp_hard_bound = 0;
8036 8096 PRESERVE(tcp->tcp_cred);
8037 8097 PRESERVE(tcp->tcp_cpid);
8038 8098 PRESERVE(tcp->tcp_open_time);
8039 8099 PRESERVE(tcp->tcp_exclbind);
8040 8100
8041 8101 tcp->tcp_fin_acked = 0;
8042 8102 tcp->tcp_fin_rcvd = 0;
8043 8103 tcp->tcp_fin_sent = 0;
8044 8104 tcp->tcp_ordrel_done = 0;
8045 8105
8046 8106 tcp->tcp_debug = 0;
8047 8107 tcp->tcp_dontroute = 0;
8048 8108 tcp->tcp_broadcast = 0;
8049 8109
8050 8110 tcp->tcp_useloopback = 0;
8051 8111 tcp->tcp_reuseaddr = 0;
8052 8112 tcp->tcp_oobinline = 0;
8053 8113 tcp->tcp_dgram_errind = 0;
8054 8114
8055 8115 tcp->tcp_detached = 0;
8056 8116 tcp->tcp_bind_pending = 0;
8057 8117 tcp->tcp_unbind_pending = 0;
8058 8118 tcp->tcp_deferred_clean_death = 0;
8059 8119
8060 8120 tcp->tcp_snd_ws_ok = B_FALSE;
8061 8121 tcp->tcp_snd_ts_ok = B_FALSE;
8062 8122 tcp->tcp_linger = 0;
8063 8123 tcp->tcp_ka_enabled = 0;
8064 8124 tcp->tcp_zero_win_probe = 0;
8065 8125
8066 8126 tcp->tcp_loopback = 0;
8067 8127 tcp->tcp_localnet = 0;
8068 8128 tcp->tcp_syn_defense = 0;
8069 8129 tcp->tcp_set_timer = 0;
8070 8130
8071 8131 tcp->tcp_active_open = 0;
8072 8132 ASSERT(tcp->tcp_timeout == B_FALSE);
8073 8133 tcp->tcp_rexmit = B_FALSE;
8074 8134 tcp->tcp_xmit_zc_clean = B_FALSE;
8075 8135
8076 8136 tcp->tcp_snd_sack_ok = B_FALSE;
8077 8137 PRESERVE(tcp->tcp_recvdstaddr);
8078 8138 tcp->tcp_hwcksum = B_FALSE;
8079 8139
8080 8140 tcp->tcp_ire_ill_check_done = B_FALSE;
8081 8141 DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */
8082 8142
8083 8143 tcp->tcp_mdt = B_FALSE;
8084 8144 tcp->tcp_mdt_hdr_head = 0;
8085 8145 tcp->tcp_mdt_hdr_tail = 0;
8086 8146
8087 8147 tcp->tcp_conn_def_q0 = 0;
8088 8148 tcp->tcp_ip_forward_progress = B_FALSE;
8089 8149 tcp->tcp_anon_priv_bind = 0;
8090 8150 tcp->tcp_ecn_ok = B_FALSE;
8091 8151
8092 8152 tcp->tcp_cwr = B_FALSE;
8093 8153 tcp->tcp_ecn_echo_on = B_FALSE;
8094 8154
8095 8155 if (tcp->tcp_sack_info != NULL) {
8096 8156 if (tcp->tcp_notsack_list != NULL) {
8097 8157 TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list);
8098 8158 }
8099 8159 kmem_cache_free(tcp_sack_info_cache, tcp->tcp_sack_info);
8100 8160 tcp->tcp_sack_info = NULL;
8101 8161 }
8102 8162
8103 8163 tcp->tcp_rcv_ws = 0;
8104 8164 tcp->tcp_snd_ws = 0;
8105 8165 tcp->tcp_ts_recent = 0;
8106 8166 tcp->tcp_rnxt = 0; /* Displayed in mib */
8107 8167 DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */
8108 8168 tcp->tcp_if_mtu = 0;
8109 8169
8110 8170 ASSERT(tcp->tcp_reass_head == NULL);
8111 8171 ASSERT(tcp->tcp_reass_tail == NULL);
8112 8172
8113 8173 tcp->tcp_cwnd_cnt = 0;
8114 8174
8115 8175 ASSERT(tcp->tcp_rcv_list == NULL);
8116 8176 ASSERT(tcp->tcp_rcv_last_head == NULL);
8117 8177 ASSERT(tcp->tcp_rcv_last_tail == NULL);
8118 8178 ASSERT(tcp->tcp_rcv_cnt == 0);
8119 8179
8120 8180 DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */
8121 8181 DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */
8122 8182 tcp->tcp_csuna = 0;
8123 8183
8124 8184 tcp->tcp_rto = 0; /* Displayed in MIB */
8125 8185 DONTCARE(tcp->tcp_rtt_sa); /* Init in tcp_init_values */
8126 8186 DONTCARE(tcp->tcp_rtt_sd); /* Init in tcp_init_values */
8127 8187 tcp->tcp_rtt_update = 0;
8128 8188
8129 8189 DONTCARE(tcp->tcp_swl1); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
8130 8190 DONTCARE(tcp->tcp_swl2); /* Init in case TCPS_LISTEN/TCPS_SYN_SENT */
8131 8191
8132 8192 tcp->tcp_rack = 0; /* Displayed in mib */
8133 8193 tcp->tcp_rack_cnt = 0;
8134 8194 tcp->tcp_rack_cur_max = 0;
8135 8195 tcp->tcp_rack_abs_max = 0;
8136 8196
8137 8197 tcp->tcp_max_swnd = 0;
8138 8198
8139 8199 ASSERT(tcp->tcp_listener == NULL);
8140 8200
8141 8201 DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */
8142 8202
8143 8203 DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */
8144 8204 DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */
8145 8205 DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */
8146 8206 DONTCARE(tcp->tcp_urg); /* tcp_valid_bits cleared */
8147 8207
8148 8208 ASSERT(tcp->tcp_conn_req_cnt_q == 0);
8149 8209 ASSERT(tcp->tcp_conn_req_cnt_q0 == 0);
8150 8210 PRESERVE(tcp->tcp_conn_req_max);
8151 8211 PRESERVE(tcp->tcp_conn_req_seqnum);
8152 8212
8153 8213 DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */
8154 8214 DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */
8155 8215 DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */
8156 8216 DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */
8157 8217 DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */
8158 8218
8159 8219 tcp->tcp_lingertime = 0;
8160 8220
8161 8221 DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */
8162 8222 ASSERT(tcp->tcp_urp_mp == NULL);
8163 8223 ASSERT(tcp->tcp_urp_mark_mp == NULL);
8164 8224 ASSERT(tcp->tcp_fused_sigurg_mp == NULL);
8165 8225
8166 8226 ASSERT(tcp->tcp_eager_next_q == NULL);
8167 8227 ASSERT(tcp->tcp_eager_last_q == NULL);
8168 8228 ASSERT((tcp->tcp_eager_next_q0 == NULL &&
8169 8229 tcp->tcp_eager_prev_q0 == NULL) ||
8170 8230 tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
8171 8231 ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
8172 8232
8173 8233 ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
8174 8234 tcp->tcp_eager_prev_drop_q0 == NULL) ||
8175 8235 tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
8176 8236
8177 8237 tcp->tcp_client_errno = 0;
8178 8238
8179 8239 DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */
8180 8240
8181 8241 tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */
8182 8242
8183 8243 PRESERVE(tcp->tcp_bound_source_v6);
8184 8244 tcp->tcp_last_sent_len = 0;
8185 8245 tcp->tcp_dupack_cnt = 0;
8186 8246
8187 8247 tcp->tcp_fport = 0; /* Displayed in MIB */
8188 8248 PRESERVE(tcp->tcp_lport);
8189 8249
8190 8250 PRESERVE(tcp->tcp_acceptor_lockp);
8191 8251
8192 8252 ASSERT(tcp->tcp_ordrelid == 0);
8193 8253 PRESERVE(tcp->tcp_acceptor_id);
8194 8254 DONTCARE(tcp->tcp_ipsec_overhead);
8195 8255
8196 8256 /*
8197 8257 * If tcp_tracing flag is ON (i.e. We have a trace buffer
8198 8258 * in tcp structure and now tracing), Re-initialize all
8199 8259 * members of tcp_traceinfo.
8200 8260 */
8201 8261 if (tcp->tcp_tracebuf != NULL) {
8202 8262 bzero(tcp->tcp_tracebuf, sizeof (tcptrch_t));
8203 8263 }
8204 8264
8205 8265 PRESERVE(tcp->tcp_family);
8206 8266 if (tcp->tcp_family == AF_INET6) {
8207 8267 tcp->tcp_ipversion = IPV6_VERSION;
8208 8268 tcp->tcp_mss = tcps->tcps_mss_def_ipv6;
8209 8269 } else {
8210 8270 tcp->tcp_ipversion = IPV4_VERSION;
8211 8271 tcp->tcp_mss = tcps->tcps_mss_def_ipv4;
8212 8272 }
8213 8273
8214 8274 tcp->tcp_bound_if = 0;
8215 8275 tcp->tcp_ipv6_recvancillary = 0;
8216 8276 tcp->tcp_recvifindex = 0;
8217 8277 tcp->tcp_recvhops = 0;
8218 8278 tcp->tcp_closed = 0;
8219 8279 tcp->tcp_cleandeathtag = 0;
8220 8280 if (tcp->tcp_hopopts != NULL) {
8221 8281 mi_free(tcp->tcp_hopopts);
8222 8282 tcp->tcp_hopopts = NULL;
8223 8283 tcp->tcp_hopoptslen = 0;
8224 8284 }
8225 8285 ASSERT(tcp->tcp_hopoptslen == 0);
8226 8286 if (tcp->tcp_dstopts != NULL) {
8227 8287 mi_free(tcp->tcp_dstopts);
8228 8288 tcp->tcp_dstopts = NULL;
8229 8289 tcp->tcp_dstoptslen = 0;
8230 8290 }
8231 8291 ASSERT(tcp->tcp_dstoptslen == 0);
8232 8292 if (tcp->tcp_rtdstopts != NULL) {
8233 8293 mi_free(tcp->tcp_rtdstopts);
8234 8294 tcp->tcp_rtdstopts = NULL;
8235 8295 tcp->tcp_rtdstoptslen = 0;
8236 8296 }
8237 8297 ASSERT(tcp->tcp_rtdstoptslen == 0);
8238 8298 if (tcp->tcp_rthdr != NULL) {
8239 8299 mi_free(tcp->tcp_rthdr);
8240 8300 tcp->tcp_rthdr = NULL;
8241 8301 tcp->tcp_rthdrlen = 0;
8242 8302 }
8243 8303 ASSERT(tcp->tcp_rthdrlen == 0);
8244 8304 PRESERVE(tcp->tcp_drop_opt_ack_cnt);
8245 8305
8246 8306 /* Reset fusion-related fields */
8247 8307 tcp->tcp_fused = B_FALSE;
8248 8308 tcp->tcp_unfusable = B_FALSE;
8249 8309 tcp->tcp_fused_sigurg = B_FALSE;
8250 8310 tcp->tcp_direct_sockfs = B_FALSE;
8251 8311 tcp->tcp_fuse_syncstr_stopped = B_FALSE;
8252 8312 tcp->tcp_fuse_syncstr_plugged = B_FALSE;
8253 8313 tcp->tcp_loopback_peer = NULL;
8254 8314 tcp->tcp_fuse_rcv_hiwater = 0;
8255 8315 tcp->tcp_fuse_rcv_unread_hiwater = 0;
8256 8316 tcp->tcp_fuse_rcv_unread_cnt = 0;
8257 8317
8258 8318 tcp->tcp_lso = B_FALSE;
8259 8319
8260 8320 tcp->tcp_in_ack_unsent = 0;
8261 8321 tcp->tcp_cork = B_FALSE;
8262 8322 tcp->tcp_tconnind_started = B_FALSE;
8263 8323
8264 8324 PRESERVE(tcp->tcp_squeue_bytes);
8265 8325
8266 8326 ASSERT(tcp->tcp_kssl_ctx == NULL);
8267 8327 ASSERT(!tcp->tcp_kssl_pending);
8268 8328 PRESERVE(tcp->tcp_kssl_ent);
8269 8329
8270 8330 /* Sodirect */
8271 8331 tcp->tcp_sodirect = NULL;
8272 8332
8273 8333 tcp->tcp_closemp_used = B_FALSE;
8274 8334
8275 8335 #ifdef DEBUG
8276 8336 DONTCARE(tcp->tcmp_stk[0]);
8277 8337 #endif
8278 8338
8279 8339
8280 8340 #undef DONTCARE
8281 8341 #undef PRESERVE
8282 8342 }
8283 8343
8284 8344 /*
8285 8345 * Allocate necessary resources and initialize state vector.
|
↓ open down ↓ |
341 lines elided |
↑ open up ↑ |
8286 8346 * Guaranteed not to fail so that when an error is returned,
8287 8347 * the caller doesn't need to do any additional cleanup.
8288 8348 */
8289 8349 int
8290 8350 tcp_init(tcp_t *tcp, queue_t *q)
8291 8351 {
8292 8352 int err;
8293 8353
8294 8354 tcp->tcp_rq = q;
8295 8355 tcp->tcp_wq = WR(q);
8356 + /* DTrace ignores this - it isn't a tcp:::state-change */
8296 8357 tcp->tcp_state = TCPS_IDLE;
8297 8358 if ((err = tcp_init_values(tcp)) != 0)
8298 8359 tcp_timers_stop(tcp);
8299 8360 return (err);
8300 8361 }
8301 8362
8302 8363 static int
8303 8364 tcp_init_values(tcp_t *tcp)
8304 8365 {
8305 8366 int err;
8306 8367 tcp_stack_t *tcps = tcp->tcp_tcps;
8307 8368
8308 8369 ASSERT((tcp->tcp_family == AF_INET &&
8309 8370 tcp->tcp_ipversion == IPV4_VERSION) ||
8310 8371 (tcp->tcp_family == AF_INET6 &&
8311 8372 (tcp->tcp_ipversion == IPV4_VERSION ||
8312 8373 tcp->tcp_ipversion == IPV6_VERSION)));
8313 8374
8314 8375 /*
8315 8376 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
8316 8377 * will be close to tcp_rexmit_interval_initial. By doing this, we
8317 8378 * allow the algorithm to adjust slowly to large fluctuations of RTT
8318 8379 * during first few transmissions of a connection as seen in slow
8319 8380 * links.
8320 8381 */
8321 8382 tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
8322 8383 tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
8323 8384 tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
8324 8385 tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
8325 8386 tcps->tcps_conn_grace_period;
8326 8387 if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
8327 8388 tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
8328 8389 tcp->tcp_timer_backoff = 0;
8329 8390 tcp->tcp_ms_we_have_waited = 0;
8330 8391 tcp->tcp_last_recv_time = lbolt;
8331 8392 tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_;
8332 8393 tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
8333 8394 tcp->tcp_snd_burst = TCP_CWND_INFINITE;
8334 8395
8335 8396 tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier;
8336 8397
8337 8398 tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
8338 8399 tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
8339 8400 tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
8340 8401 /*
8341 8402 * Fix it to tcp_ip_abort_linterval later if it turns out to be a
8342 8403 * passive open.
8343 8404 */
8344 8405 tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
8345 8406
8346 8407 tcp->tcp_naglim = tcps->tcps_naglim_def;
8347 8408
8348 8409 /* NOTE: ISS is now set in tcp_adapt_ire(). */
8349 8410
8350 8411 tcp->tcp_mdt_hdr_head = 0;
8351 8412 tcp->tcp_mdt_hdr_tail = 0;
8352 8413
8353 8414 /* Reset fusion-related fields */
8354 8415 tcp->tcp_fused = B_FALSE;
8355 8416 tcp->tcp_unfusable = B_FALSE;
8356 8417 tcp->tcp_fused_sigurg = B_FALSE;
8357 8418 tcp->tcp_direct_sockfs = B_FALSE;
8358 8419 tcp->tcp_fuse_syncstr_stopped = B_FALSE;
8359 8420 tcp->tcp_fuse_syncstr_plugged = B_FALSE;
8360 8421 tcp->tcp_loopback_peer = NULL;
8361 8422 tcp->tcp_fuse_rcv_hiwater = 0;
8362 8423 tcp->tcp_fuse_rcv_unread_hiwater = 0;
8363 8424 tcp->tcp_fuse_rcv_unread_cnt = 0;
8364 8425
8365 8426 /* Sodirect */
8366 8427 tcp->tcp_sodirect = NULL;
8367 8428
8368 8429 /* Initialize the header template */
8369 8430 if (tcp->tcp_ipversion == IPV4_VERSION) {
8370 8431 err = tcp_header_init_ipv4(tcp);
8371 8432 } else {
8372 8433 err = tcp_header_init_ipv6(tcp);
8373 8434 }
8374 8435 if (err)
8375 8436 return (err);
8376 8437
8377 8438 /*
8378 8439 * Init the window scale to the max so tcp_rwnd_set() won't pare
8379 8440 * down tcp_rwnd. tcp_adapt_ire() will set the right value later.
8380 8441 */
8381 8442 tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT;
8382 8443 tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat;
8383 8444 tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat;
8384 8445
8385 8446 tcp->tcp_cork = B_FALSE;
8386 8447 /*
8387 8448 * Init the tcp_debug option. This value determines whether TCP
8388 8449 * calls strlog() to print out debug messages. Doing this
8389 8450 * initialization here means that this value is not inherited thru
8390 8451 * tcp_reinit().
8391 8452 */
8392 8453 tcp->tcp_debug = tcps->tcps_dbg;
8393 8454
8394 8455 tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
8395 8456 tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
8396 8457
8397 8458 return (0);
8398 8459 }
8399 8460
8400 8461 /*
8401 8462 * Initialize the IPv4 header. Loses any record of any IP options.
8402 8463 */
8403 8464 static int
8404 8465 tcp_header_init_ipv4(tcp_t *tcp)
8405 8466 {
8406 8467 tcph_t *tcph;
8407 8468 uint32_t sum;
8408 8469 conn_t *connp;
8409 8470 tcp_stack_t *tcps = tcp->tcp_tcps;
8410 8471
8411 8472 /*
8412 8473 * This is a simple initialization. If there's
8413 8474 * already a template, it should never be too small,
8414 8475 * so reuse it. Otherwise, allocate space for the new one.
8415 8476 */
8416 8477 if (tcp->tcp_iphc == NULL) {
8417 8478 ASSERT(tcp->tcp_iphc_len == 0);
8418 8479 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8419 8480 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8420 8481 if (tcp->tcp_iphc == NULL) {
8421 8482 tcp->tcp_iphc_len = 0;
8422 8483 return (ENOMEM);
8423 8484 }
8424 8485 }
8425 8486
8426 8487 /* options are gone; may need a new label */
8427 8488 connp = tcp->tcp_connp;
8428 8489 connp->conn_mlp_type = mlptSingle;
8429 8490 connp->conn_ulp_labeled = !is_system_labeled();
8430 8491 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8431 8492 tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc;
8432 8493 tcp->tcp_ip6h = NULL;
8433 8494 tcp->tcp_ipversion = IPV4_VERSION;
8434 8495 tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t);
8435 8496 tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8436 8497 tcp->tcp_ip_hdr_len = sizeof (ipha_t);
8437 8498 tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t));
8438 8499 tcp->tcp_ipha->ipha_version_and_hdr_length
8439 8500 = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS;
8440 8501 tcp->tcp_ipha->ipha_ident = 0;
8441 8502
8442 8503 tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
8443 8504 tcp->tcp_tos = 0;
8444 8505 tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
8445 8506 tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl;
8446 8507 tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP;
8447 8508
8448 8509 tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t));
8449 8510 tcp->tcp_tcph = tcph;
8450 8511 tcph->th_offset_and_rsrvd[0] = (5 << 4);
8451 8512 /*
8452 8513 * IP wants our header length in the checksum field to
8453 8514 * allow it to perform a single pseudo-header+checksum
8454 8515 * calculation on behalf of TCP.
8455 8516 * Include the adjustment for a source route once IP_OPTIONS is set.
8456 8517 */
8457 8518 sum = sizeof (tcph_t) + tcp->tcp_sum;
8458 8519 sum = (sum >> 16) + (sum & 0xFFFF);
8459 8520 U16_TO_ABE16(sum, tcph->th_sum);
8460 8521 return (0);
8461 8522 }
8462 8523
8463 8524 /*
8464 8525 * Initialize the IPv6 header. Loses any record of any IPv6 extension headers.
8465 8526 */
8466 8527 static int
8467 8528 tcp_header_init_ipv6(tcp_t *tcp)
8468 8529 {
8469 8530 tcph_t *tcph;
8470 8531 uint32_t sum;
8471 8532 conn_t *connp;
8472 8533 tcp_stack_t *tcps = tcp->tcp_tcps;
8473 8534
8474 8535 /*
8475 8536 * This is a simple initialization. If there's
8476 8537 * already a template, it should never be too small,
8477 8538 * so reuse it. Otherwise, allocate space for the new one.
8478 8539 * Ensure that there is enough space to "downgrade" the tcp_t
8479 8540 * to an IPv4 tcp_t. This requires having space for a full load
8480 8541 * of IPv4 options, as well as a full load of TCP options
8481 8542 * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space
8482 8543 * than a v6 header and a TCP header with a full load of TCP options
8483 8544 * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes).
8484 8545 * We want to avoid reallocation in the "downgraded" case when
8485 8546 * processing outbound IPv4 options.
8486 8547 */
8487 8548 if (tcp->tcp_iphc == NULL) {
8488 8549 ASSERT(tcp->tcp_iphc_len == 0);
8489 8550 tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH;
8490 8551 tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP);
8491 8552 if (tcp->tcp_iphc == NULL) {
8492 8553 tcp->tcp_iphc_len = 0;
8493 8554 return (ENOMEM);
8494 8555 }
8495 8556 }
8496 8557
8497 8558 /* options are gone; may need a new label */
8498 8559 connp = tcp->tcp_connp;
8499 8560 connp->conn_mlp_type = mlptSingle;
8500 8561 connp->conn_ulp_labeled = !is_system_labeled();
8501 8562
8502 8563 ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH);
8503 8564 tcp->tcp_ipversion = IPV6_VERSION;
8504 8565 tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t);
8505 8566 tcp->tcp_tcp_hdr_len = sizeof (tcph_t);
8506 8567 tcp->tcp_ip_hdr_len = IPV6_HDR_LEN;
8507 8568 tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc;
8508 8569 tcp->tcp_ipha = NULL;
8509 8570
8510 8571 /* Initialize the header template */
8511 8572
8512 8573 tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
8513 8574 tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t));
8514 8575 tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP;
8515 8576 tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit;
8516 8577
8517 8578 tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN);
8518 8579 tcp->tcp_tcph = tcph;
8519 8580 tcph->th_offset_and_rsrvd[0] = (5 << 4);
8520 8581 /*
8521 8582 * IP wants our header length in the checksum field to
8522 8583 * allow it to perform a single psuedo-header+checksum
8523 8584 * calculation on behalf of TCP.
8524 8585 * Include the adjustment for a source route when IPV6_RTHDR is set.
8525 8586 */
8526 8587 sum = sizeof (tcph_t) + tcp->tcp_sum;
8527 8588 sum = (sum >> 16) + (sum & 0xFFFF);
8528 8589 U16_TO_ABE16(sum, tcph->th_sum);
8529 8590 return (0);
8530 8591 }
8531 8592
8532 8593 /* At minimum we need 8 bytes in the TCP header for the lookup */
8533 8594 #define ICMP_MIN_TCP_HDR 8
8534 8595
8535 8596 /*
8536 8597 * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages
8537 8598 * passed up by IP. The message is always received on the correct tcp_t.
8538 8599 * Assumes that IP has pulled up everything up to and including the ICMP header.
8539 8600 */
8540 8601 void
8541 8602 tcp_icmp_error(tcp_t *tcp, mblk_t *mp)
8542 8603 {
8543 8604 icmph_t *icmph;
8544 8605 ipha_t *ipha;
8545 8606 int iph_hdr_length;
8546 8607 tcph_t *tcph;
8547 8608 boolean_t ipsec_mctl = B_FALSE;
8548 8609 boolean_t secure;
8549 8610 mblk_t *first_mp = mp;
8550 8611 uint32_t new_mss;
8551 8612 uint32_t ratio;
8552 8613 size_t mp_size = MBLKL(mp);
8553 8614 uint32_t seg_seq;
8554 8615 tcp_stack_t *tcps = tcp->tcp_tcps;
8555 8616
8556 8617 /* Assume IP provides aligned packets - otherwise toss */
8557 8618 if (!OK_32PTR(mp->b_rptr)) {
8558 8619 freemsg(mp);
8559 8620 return;
8560 8621 }
8561 8622
8562 8623 /*
8563 8624 * Since ICMP errors are normal data marked with M_CTL when sent
8564 8625 * to TCP or UDP, we have to look for a IPSEC_IN value to identify
8565 8626 * packets starting with an ipsec_info_t, see ipsec_info.h.
8566 8627 */
8567 8628 if ((mp_size == sizeof (ipsec_info_t)) &&
8568 8629 (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) {
8569 8630 ASSERT(mp->b_cont != NULL);
8570 8631 mp = mp->b_cont;
8571 8632 /* IP should have done this */
8572 8633 ASSERT(OK_32PTR(mp->b_rptr));
8573 8634 mp_size = MBLKL(mp);
8574 8635 ipsec_mctl = B_TRUE;
8575 8636 }
8576 8637
8577 8638 /*
8578 8639 * Verify that we have a complete outer IP header. If not, drop it.
8579 8640 */
8580 8641 if (mp_size < sizeof (ipha_t)) {
8581 8642 noticmpv4:
8582 8643 freemsg(first_mp);
8583 8644 return;
8584 8645 }
8585 8646
8586 8647 ipha = (ipha_t *)mp->b_rptr;
8587 8648 /*
8588 8649 * Verify IP version. Anything other than IPv4 or IPv6 packet is sent
8589 8650 * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6.
8590 8651 */
8591 8652 switch (IPH_HDR_VERSION(ipha)) {
8592 8653 case IPV6_VERSION:
8593 8654 tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl);
8594 8655 return;
8595 8656 case IPV4_VERSION:
8596 8657 break;
8597 8658 default:
8598 8659 goto noticmpv4;
8599 8660 }
8600 8661
8601 8662 /* Skip past the outer IP and ICMP headers */
8602 8663 iph_hdr_length = IPH_HDR_LENGTH(ipha);
8603 8664 icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length];
8604 8665 /*
8605 8666 * If we don't have the correct outer IP header length or if the ULP
8606 8667 * is not IPPROTO_ICMP or if we don't have a complete inner IP header
8607 8668 * send it upstream.
8608 8669 */
8609 8670 if (iph_hdr_length < sizeof (ipha_t) ||
8610 8671 ipha->ipha_protocol != IPPROTO_ICMP ||
8611 8672 (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) {
8612 8673 goto noticmpv4;
8613 8674 }
8614 8675 ipha = (ipha_t *)&icmph[1];
8615 8676
8616 8677 /* Skip past the inner IP and find the ULP header */
8617 8678 iph_hdr_length = IPH_HDR_LENGTH(ipha);
8618 8679 tcph = (tcph_t *)((char *)ipha + iph_hdr_length);
8619 8680 /*
8620 8681 * If we don't have the correct inner IP header length or if the ULP
8621 8682 * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR
8622 8683 * bytes of TCP header, drop it.
8623 8684 */
8624 8685 if (iph_hdr_length < sizeof (ipha_t) ||
8625 8686 ipha->ipha_protocol != IPPROTO_TCP ||
8626 8687 (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) {
8627 8688 goto noticmpv4;
8628 8689 }
8629 8690
8630 8691 if (TCP_IS_DETACHED_NONEAGER(tcp)) {
8631 8692 if (ipsec_mctl) {
8632 8693 secure = ipsec_in_is_secure(first_mp);
8633 8694 } else {
8634 8695 secure = B_FALSE;
8635 8696 }
8636 8697 if (secure) {
8637 8698 /*
8638 8699 * If we are willing to accept this in clear
8639 8700 * we don't have to verify policy.
8640 8701 */
8641 8702 if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) {
8642 8703 if (!tcp_check_policy(tcp, first_mp,
8643 8704 ipha, NULL, secure, ipsec_mctl)) {
8644 8705 /*
8645 8706 * tcp_check_policy called
8646 8707 * ip_drop_packet() on failure.
8647 8708 */
8648 8709 return;
8649 8710 }
8650 8711 }
8651 8712 }
8652 8713 } else if (ipsec_mctl) {
8653 8714 /*
8654 8715 * This is a hard_bound connection. IP has already
8655 8716 * verified policy. We don't have to do it again.
8656 8717 */
8657 8718 freeb(first_mp);
8658 8719 first_mp = mp;
8659 8720 ipsec_mctl = B_FALSE;
8660 8721 }
8661 8722
8662 8723 seg_seq = ABE32_TO_U32(tcph->th_seq);
8663 8724 /*
8664 8725 * TCP SHOULD check that the TCP sequence number contained in
8665 8726 * payload of the ICMP error message is within the range
8666 8727 * SND.UNA <= SEG.SEQ < SND.NXT.
8667 8728 */
8668 8729 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
8669 8730 /*
8670 8731 * If the ICMP message is bogus, should we kill the
8671 8732 * connection, or should we just drop the bogus ICMP
8672 8733 * message? It would probably make more sense to just
8673 8734 * drop the message so that if this one managed to get
8674 8735 * in, the real connection should not suffer.
8675 8736 */
8676 8737 goto noticmpv4;
8677 8738 }
8678 8739
8679 8740 switch (icmph->icmph_type) {
8680 8741 case ICMP_DEST_UNREACHABLE:
8681 8742 switch (icmph->icmph_code) {
8682 8743 case ICMP_FRAGMENTATION_NEEDED:
8683 8744 /*
8684 8745 * Reduce the MSS based on the new MTU. This will
8685 8746 * eliminate any fragmentation locally.
8686 8747 * N.B. There may well be some funny side-effects on
8687 8748 * the local send policy and the remote receive policy.
8688 8749 * Pending further research, we provide
8689 8750 * tcp_ignore_path_mtu just in case this proves
8690 8751 * disastrous somewhere.
8691 8752 *
8692 8753 * After updating the MSS, retransmit part of the
8693 8754 * dropped segment using the new mss by calling
8694 8755 * tcp_wput_data(). Need to adjust all those
8695 8756 * params to make sure tcp_wput_data() work properly.
8696 8757 */
8697 8758 if (tcps->tcps_ignore_path_mtu)
8698 8759 break;
8699 8760
8700 8761 /*
8701 8762 * Decrease the MSS by time stamp options
8702 8763 * IP options and IPSEC options. tcp_hdr_len
8703 8764 * includes time stamp option and IP option
8704 8765 * length.
8705 8766 */
8706 8767
8707 8768 new_mss = ntohs(icmph->icmph_du_mtu) -
8708 8769 tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead;
8709 8770
8710 8771 /*
8711 8772 * Only update the MSS if the new one is
8712 8773 * smaller than the previous one. This is
8713 8774 * to avoid problems when getting multiple
8714 8775 * ICMP errors for the same MTU.
8715 8776 */
8716 8777 if (new_mss >= tcp->tcp_mss)
8717 8778 break;
8718 8779
8719 8780 /*
8720 8781 * Stop doing PMTU if new_mss is less than 68
8721 8782 * or less than tcp_mss_min.
8722 8783 * The value 68 comes from rfc 1191.
8723 8784 */
8724 8785 if (new_mss < MAX(68, tcps->tcps_mss_min))
8725 8786 tcp->tcp_ipha->ipha_fragment_offset_and_flags =
8726 8787 0;
8727 8788
8728 8789 ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8729 8790 ASSERT(ratio >= 1);
8730 8791 tcp_mss_set(tcp, new_mss, B_TRUE);
8731 8792
8732 8793 /*
8733 8794 * Make sure we have something to
8734 8795 * send.
8735 8796 */
8736 8797 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8737 8798 (tcp->tcp_xmit_head != NULL)) {
8738 8799 /*
8739 8800 * Shrink tcp_cwnd in
8740 8801 * proportion to the old MSS/new MSS.
8741 8802 */
8742 8803 tcp->tcp_cwnd = ratio * tcp->tcp_mss;
8743 8804 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
8744 8805 (tcp->tcp_unsent == 0)) {
8745 8806 tcp->tcp_rexmit_max = tcp->tcp_fss;
8746 8807 } else {
8747 8808 tcp->tcp_rexmit_max = tcp->tcp_snxt;
8748 8809 }
8749 8810 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
8750 8811 tcp->tcp_rexmit = B_TRUE;
8751 8812 tcp->tcp_dupack_cnt = 0;
8752 8813 tcp->tcp_snd_burst = TCP_CWND_SS;
8753 8814 tcp_ss_rexmit(tcp);
8754 8815 }
8755 8816 break;
8756 8817 case ICMP_PORT_UNREACHABLE:
8757 8818 case ICMP_PROTOCOL_UNREACHABLE:
8758 8819 switch (tcp->tcp_state) {
8759 8820 case TCPS_SYN_SENT:
8760 8821 case TCPS_SYN_RCVD:
8761 8822 /*
8762 8823 * ICMP can snipe away incipient
8763 8824 * TCP connections as long as
8764 8825 * seq number is same as initial
8765 8826 * send seq number.
8766 8827 */
8767 8828 if (seg_seq == tcp->tcp_iss) {
8768 8829 (void) tcp_clean_death(tcp,
8769 8830 ECONNREFUSED, 6);
8770 8831 }
8771 8832 break;
8772 8833 }
8773 8834 break;
8774 8835 case ICMP_HOST_UNREACHABLE:
8775 8836 case ICMP_NET_UNREACHABLE:
8776 8837 /* Record the error in case we finally time out. */
8777 8838 if (icmph->icmph_code == ICMP_HOST_UNREACHABLE)
8778 8839 tcp->tcp_client_errno = EHOSTUNREACH;
8779 8840 else
8780 8841 tcp->tcp_client_errno = ENETUNREACH;
8781 8842 if (tcp->tcp_state == TCPS_SYN_RCVD) {
8782 8843 if (tcp->tcp_listener != NULL &&
8783 8844 tcp->tcp_listener->tcp_syn_defense) {
8784 8845 /*
8785 8846 * Ditch the half-open connection if we
8786 8847 * suspect a SYN attack is under way.
8787 8848 */
8788 8849 tcp_ip_ire_mark_advice(tcp);
8789 8850 (void) tcp_clean_death(tcp,
8790 8851 tcp->tcp_client_errno, 7);
8791 8852 }
8792 8853 }
8793 8854 break;
8794 8855 default:
8795 8856 break;
8796 8857 }
8797 8858 break;
8798 8859 case ICMP_SOURCE_QUENCH: {
8799 8860 /*
8800 8861 * use a global boolean to control
8801 8862 * whether TCP should respond to ICMP_SOURCE_QUENCH.
8802 8863 * The default is false.
8803 8864 */
8804 8865 if (tcp_icmp_source_quench) {
8805 8866 /*
8806 8867 * Reduce the sending rate as if we got a
8807 8868 * retransmit timeout
8808 8869 */
8809 8870 uint32_t npkt;
8810 8871
8811 8872 npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) /
8812 8873 tcp->tcp_mss;
8813 8874 tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss;
8814 8875 tcp->tcp_cwnd = tcp->tcp_mss;
8815 8876 tcp->tcp_cwnd_cnt = 0;
8816 8877 }
8817 8878 break;
8818 8879 }
8819 8880 }
8820 8881 freemsg(first_mp);
8821 8882 }
8822 8883
8823 8884 /*
8824 8885 * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6
8825 8886 * error messages passed up by IP.
8826 8887 * Assumes that IP has pulled up all the extension headers as well
8827 8888 * as the ICMPv6 header.
8828 8889 */
8829 8890 static void
8830 8891 tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl)
8831 8892 {
8832 8893 icmp6_t *icmp6;
8833 8894 ip6_t *ip6h;
8834 8895 uint16_t iph_hdr_length;
8835 8896 tcpha_t *tcpha;
8836 8897 uint8_t *nexthdrp;
8837 8898 uint32_t new_mss;
8838 8899 uint32_t ratio;
8839 8900 boolean_t secure;
8840 8901 mblk_t *first_mp = mp;
8841 8902 size_t mp_size;
8842 8903 uint32_t seg_seq;
8843 8904 tcp_stack_t *tcps = tcp->tcp_tcps;
8844 8905
8845 8906 /*
8846 8907 * The caller has determined if this is an IPSEC_IN packet and
8847 8908 * set ipsec_mctl appropriately (see tcp_icmp_error).
8848 8909 */
8849 8910 if (ipsec_mctl)
8850 8911 mp = mp->b_cont;
8851 8912
8852 8913 mp_size = MBLKL(mp);
8853 8914
8854 8915 /*
8855 8916 * Verify that we have a complete IP header. If not, send it upstream.
8856 8917 */
8857 8918 if (mp_size < sizeof (ip6_t)) {
8858 8919 noticmpv6:
8859 8920 freemsg(first_mp);
8860 8921 return;
8861 8922 }
8862 8923
8863 8924 /*
8864 8925 * Verify this is an ICMPV6 packet, else send it upstream.
8865 8926 */
8866 8927 ip6h = (ip6_t *)mp->b_rptr;
8867 8928 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
8868 8929 iph_hdr_length = IPV6_HDR_LEN;
8869 8930 } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length,
8870 8931 &nexthdrp) ||
8871 8932 *nexthdrp != IPPROTO_ICMPV6) {
8872 8933 goto noticmpv6;
8873 8934 }
8874 8935 icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length];
8875 8936 ip6h = (ip6_t *)&icmp6[1];
8876 8937 /*
8877 8938 * Verify if we have a complete ICMP and inner IP header.
8878 8939 */
8879 8940 if ((uchar_t *)&ip6h[1] > mp->b_wptr)
8880 8941 goto noticmpv6;
8881 8942
8882 8943 if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp))
8883 8944 goto noticmpv6;
8884 8945 tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length);
8885 8946 /*
8886 8947 * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't
8887 8948 * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the
8888 8949 * packet.
8889 8950 */
8890 8951 if ((*nexthdrp != IPPROTO_TCP) ||
8891 8952 ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) {
8892 8953 goto noticmpv6;
8893 8954 }
8894 8955
8895 8956 /*
8896 8957 * ICMP errors come on the right queue or come on
8897 8958 * listener/global queue for detached connections and
8898 8959 * get switched to the right queue. If it comes on the
8899 8960 * right queue, policy check has already been done by IP
8900 8961 * and thus free the first_mp without verifying the policy.
8901 8962 * If it has come for a non-hard bound connection, we need
8902 8963 * to verify policy as IP may not have done it.
8903 8964 */
8904 8965 if (!tcp->tcp_hard_bound) {
8905 8966 if (ipsec_mctl) {
8906 8967 secure = ipsec_in_is_secure(first_mp);
8907 8968 } else {
8908 8969 secure = B_FALSE;
8909 8970 }
8910 8971 if (secure) {
8911 8972 /*
8912 8973 * If we are willing to accept this in clear
8913 8974 * we don't have to verify policy.
8914 8975 */
8915 8976 if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) {
8916 8977 if (!tcp_check_policy(tcp, first_mp,
8917 8978 NULL, ip6h, secure, ipsec_mctl)) {
8918 8979 /*
8919 8980 * tcp_check_policy called
8920 8981 * ip_drop_packet() on failure.
8921 8982 */
8922 8983 return;
8923 8984 }
8924 8985 }
8925 8986 }
8926 8987 } else if (ipsec_mctl) {
8927 8988 /*
8928 8989 * This is a hard_bound connection. IP has already
8929 8990 * verified policy. We don't have to do it again.
8930 8991 */
8931 8992 freeb(first_mp);
8932 8993 first_mp = mp;
8933 8994 ipsec_mctl = B_FALSE;
8934 8995 }
8935 8996
8936 8997 seg_seq = ntohl(tcpha->tha_seq);
8937 8998 /*
8938 8999 * TCP SHOULD check that the TCP sequence number contained in
8939 9000 * payload of the ICMP error message is within the range
8940 9001 * SND.UNA <= SEG.SEQ < SND.NXT.
8941 9002 */
8942 9003 if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) {
8943 9004 /*
8944 9005 * If the ICMP message is bogus, should we kill the
8945 9006 * connection, or should we just drop the bogus ICMP
8946 9007 * message? It would probably make more sense to just
8947 9008 * drop the message so that if this one managed to get
8948 9009 * in, the real connection should not suffer.
8949 9010 */
8950 9011 goto noticmpv6;
8951 9012 }
8952 9013
8953 9014 switch (icmp6->icmp6_type) {
8954 9015 case ICMP6_PACKET_TOO_BIG:
8955 9016 /*
8956 9017 * Reduce the MSS based on the new MTU. This will
8957 9018 * eliminate any fragmentation locally.
8958 9019 * N.B. There may well be some funny side-effects on
8959 9020 * the local send policy and the remote receive policy.
8960 9021 * Pending further research, we provide
8961 9022 * tcp_ignore_path_mtu just in case this proves
8962 9023 * disastrous somewhere.
8963 9024 *
8964 9025 * After updating the MSS, retransmit part of the
8965 9026 * dropped segment using the new mss by calling
8966 9027 * tcp_wput_data(). Need to adjust all those
8967 9028 * params to make sure tcp_wput_data() work properly.
8968 9029 */
8969 9030 if (tcps->tcps_ignore_path_mtu)
8970 9031 break;
8971 9032
8972 9033 /*
8973 9034 * Decrease the MSS by time stamp options
8974 9035 * IP options and IPSEC options. tcp_hdr_len
8975 9036 * includes time stamp option and IP option
8976 9037 * length.
8977 9038 */
8978 9039 new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len -
8979 9040 tcp->tcp_ipsec_overhead;
8980 9041
8981 9042 /*
8982 9043 * Only update the MSS if the new one is
8983 9044 * smaller than the previous one. This is
8984 9045 * to avoid problems when getting multiple
8985 9046 * ICMP errors for the same MTU.
8986 9047 */
8987 9048 if (new_mss >= tcp->tcp_mss)
8988 9049 break;
8989 9050
8990 9051 ratio = tcp->tcp_cwnd / tcp->tcp_mss;
8991 9052 ASSERT(ratio >= 1);
8992 9053 tcp_mss_set(tcp, new_mss, B_TRUE);
8993 9054
8994 9055 /*
8995 9056 * Make sure we have something to
8996 9057 * send.
8997 9058 */
8998 9059 if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) &&
8999 9060 (tcp->tcp_xmit_head != NULL)) {
9000 9061 /*
9001 9062 * Shrink tcp_cwnd in
9002 9063 * proportion to the old MSS/new MSS.
9003 9064 */
9004 9065 tcp->tcp_cwnd = ratio * tcp->tcp_mss;
9005 9066 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
9006 9067 (tcp->tcp_unsent == 0)) {
9007 9068 tcp->tcp_rexmit_max = tcp->tcp_fss;
9008 9069 } else {
9009 9070 tcp->tcp_rexmit_max = tcp->tcp_snxt;
9010 9071 }
9011 9072 tcp->tcp_rexmit_nxt = tcp->tcp_suna;
9012 9073 tcp->tcp_rexmit = B_TRUE;
9013 9074 tcp->tcp_dupack_cnt = 0;
9014 9075 tcp->tcp_snd_burst = TCP_CWND_SS;
9015 9076 tcp_ss_rexmit(tcp);
9016 9077 }
9017 9078 break;
9018 9079
9019 9080 case ICMP6_DST_UNREACH:
9020 9081 switch (icmp6->icmp6_code) {
9021 9082 case ICMP6_DST_UNREACH_NOPORT:
9022 9083 if (((tcp->tcp_state == TCPS_SYN_SENT) ||
9023 9084 (tcp->tcp_state == TCPS_SYN_RCVD)) &&
9024 9085 (seg_seq == tcp->tcp_iss)) {
9025 9086 (void) tcp_clean_death(tcp,
9026 9087 ECONNREFUSED, 8);
9027 9088 }
9028 9089 break;
9029 9090
9030 9091 case ICMP6_DST_UNREACH_ADMIN:
9031 9092 case ICMP6_DST_UNREACH_NOROUTE:
9032 9093 case ICMP6_DST_UNREACH_BEYONDSCOPE:
9033 9094 case ICMP6_DST_UNREACH_ADDR:
9034 9095 /* Record the error in case we finally time out. */
9035 9096 tcp->tcp_client_errno = EHOSTUNREACH;
9036 9097 if (((tcp->tcp_state == TCPS_SYN_SENT) ||
9037 9098 (tcp->tcp_state == TCPS_SYN_RCVD)) &&
9038 9099 (seg_seq == tcp->tcp_iss)) {
9039 9100 if (tcp->tcp_listener != NULL &&
9040 9101 tcp->tcp_listener->tcp_syn_defense) {
9041 9102 /*
9042 9103 * Ditch the half-open connection if we
9043 9104 * suspect a SYN attack is under way.
9044 9105 */
9045 9106 tcp_ip_ire_mark_advice(tcp);
9046 9107 (void) tcp_clean_death(tcp,
9047 9108 tcp->tcp_client_errno, 9);
9048 9109 }
9049 9110 }
9050 9111
9051 9112
9052 9113 break;
9053 9114 default:
9054 9115 break;
9055 9116 }
9056 9117 break;
9057 9118
9058 9119 case ICMP6_PARAM_PROB:
9059 9120 /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */
9060 9121 if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER &&
9061 9122 (uchar_t *)ip6h + icmp6->icmp6_pptr ==
9062 9123 (uchar_t *)nexthdrp) {
9063 9124 if (tcp->tcp_state == TCPS_SYN_SENT ||
9064 9125 tcp->tcp_state == TCPS_SYN_RCVD) {
9065 9126 (void) tcp_clean_death(tcp,
9066 9127 ECONNREFUSED, 10);
9067 9128 }
9068 9129 break;
9069 9130 }
9070 9131 break;
9071 9132
9072 9133 case ICMP6_TIME_EXCEEDED:
9073 9134 default:
9074 9135 break;
9075 9136 }
9076 9137 freemsg(first_mp);
9077 9138 }
9078 9139
9079 9140 /*
9080 9141 * IP recognizes seven kinds of bind requests:
9081 9142 *
9082 9143 * - A zero-length address binds only to the protocol number.
9083 9144 *
9084 9145 * - A 4-byte address is treated as a request to
9085 9146 * validate that the address is a valid local IPv4
9086 9147 * address, appropriate for an application to bind to.
9087 9148 * IP does the verification, but does not make any note
9088 9149 * of the address at this time.
9089 9150 *
9090 9151 * - A 16-byte address contains is treated as a request
9091 9152 * to validate a local IPv6 address, as the 4-byte
9092 9153 * address case above.
9093 9154 *
9094 9155 * - A 16-byte sockaddr_in to validate the local IPv4 address and also
9095 9156 * use it for the inbound fanout of packets.
9096 9157 *
9097 9158 * - A 24-byte sockaddr_in6 to validate the local IPv6 address and also
9098 9159 * use it for the inbound fanout of packets.
9099 9160 *
9100 9161 * - A 12-byte address (ipa_conn_t) containing complete IPv4 fanout
9101 9162 * information consisting of local and remote addresses
9102 9163 * and ports. In this case, the addresses are both
9103 9164 * validated as appropriate for this operation, and, if
9104 9165 * so, the information is retained for use in the
9105 9166 * inbound fanout.
9106 9167 *
9107 9168 * - A 36-byte address address (ipa6_conn_t) containing complete IPv6
9108 9169 * fanout information, like the 12-byte case above.
9109 9170 *
9110 9171 * IP will also fill in the IRE request mblk with information
9111 9172 * regarding our peer. In all cases, we notify IP of our protocol
9112 9173 * type by appending a single protocol byte to the bind request.
9113 9174 */
9114 9175 static mblk_t *
9115 9176 tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim, t_scalar_t addr_length)
9116 9177 {
9117 9178 char *cp;
9118 9179 mblk_t *mp;
9119 9180 struct T_bind_req *tbr;
9120 9181 ipa_conn_t *ac;
9121 9182 ipa6_conn_t *ac6;
9122 9183 sin_t *sin;
9123 9184 sin6_t *sin6;
9124 9185
9125 9186 ASSERT(bind_prim == O_T_BIND_REQ || bind_prim == T_BIND_REQ);
9126 9187 ASSERT((tcp->tcp_family == AF_INET &&
9127 9188 tcp->tcp_ipversion == IPV4_VERSION) ||
9128 9189 (tcp->tcp_family == AF_INET6 &&
9129 9190 (tcp->tcp_ipversion == IPV4_VERSION ||
9130 9191 tcp->tcp_ipversion == IPV6_VERSION)));
9131 9192
9132 9193 mp = allocb(sizeof (*tbr) + addr_length + 1, BPRI_HI);
9133 9194 if (!mp)
9134 9195 return (mp);
9135 9196 mp->b_datap->db_type = M_PROTO;
9136 9197 tbr = (struct T_bind_req *)mp->b_rptr;
9137 9198 tbr->PRIM_type = bind_prim;
9138 9199 tbr->ADDR_offset = sizeof (*tbr);
9139 9200 tbr->CONIND_number = 0;
9140 9201 tbr->ADDR_length = addr_length;
9141 9202 cp = (char *)&tbr[1];
9142 9203 switch (addr_length) {
9143 9204 case sizeof (ipa_conn_t):
9144 9205 ASSERT(tcp->tcp_family == AF_INET);
9145 9206 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
9146 9207
9147 9208 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
9148 9209 if (mp->b_cont == NULL) {
9149 9210 freemsg(mp);
9150 9211 return (NULL);
9151 9212 }
9152 9213 mp->b_cont->b_wptr += sizeof (ire_t);
9153 9214 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
9154 9215
9155 9216 /* cp known to be 32 bit aligned */
9156 9217 ac = (ipa_conn_t *)cp;
9157 9218 ac->ac_laddr = tcp->tcp_ipha->ipha_src;
9158 9219 ac->ac_faddr = tcp->tcp_remote;
9159 9220 ac->ac_fport = tcp->tcp_fport;
9160 9221 ac->ac_lport = tcp->tcp_lport;
9161 9222 tcp->tcp_hard_binding = 1;
9162 9223 break;
9163 9224
9164 9225 case sizeof (ipa6_conn_t):
9165 9226 ASSERT(tcp->tcp_family == AF_INET6);
9166 9227
9167 9228 mp->b_cont = allocb(sizeof (ire_t), BPRI_HI);
9168 9229 if (mp->b_cont == NULL) {
9169 9230 freemsg(mp);
9170 9231 return (NULL);
9171 9232 }
9172 9233 mp->b_cont->b_wptr += sizeof (ire_t);
9173 9234 mp->b_cont->b_datap->db_type = IRE_DB_REQ_TYPE;
9174 9235
9175 9236 /* cp known to be 32 bit aligned */
9176 9237 ac6 = (ipa6_conn_t *)cp;
9177 9238 if (tcp->tcp_ipversion == IPV4_VERSION) {
9178 9239 IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src,
9179 9240 &ac6->ac6_laddr);
9180 9241 } else {
9181 9242 ac6->ac6_laddr = tcp->tcp_ip6h->ip6_src;
9182 9243 }
9183 9244 ac6->ac6_faddr = tcp->tcp_remote_v6;
9184 9245 ac6->ac6_fport = tcp->tcp_fport;
9185 9246 ac6->ac6_lport = tcp->tcp_lport;
9186 9247 tcp->tcp_hard_binding = 1;
9187 9248 break;
9188 9249
9189 9250 case sizeof (sin_t):
9190 9251 /*
9191 9252 * NOTE: IPV6_ADDR_LEN also has same size.
9192 9253 * Use family to discriminate.
9193 9254 */
9194 9255 if (tcp->tcp_family == AF_INET) {
9195 9256 sin = (sin_t *)cp;
9196 9257
9197 9258 *sin = sin_null;
9198 9259 sin->sin_family = AF_INET;
9199 9260 sin->sin_addr.s_addr = tcp->tcp_bound_source;
9200 9261 sin->sin_port = tcp->tcp_lport;
9201 9262 break;
9202 9263 } else {
9203 9264 *(in6_addr_t *)cp = tcp->tcp_bound_source_v6;
9204 9265 }
9205 9266 break;
9206 9267
9207 9268 case sizeof (sin6_t):
9208 9269 ASSERT(tcp->tcp_family == AF_INET6);
9209 9270 sin6 = (sin6_t *)cp;
9210 9271
9211 9272 *sin6 = sin6_null;
9212 9273 sin6->sin6_family = AF_INET6;
9213 9274 sin6->sin6_addr = tcp->tcp_bound_source_v6;
9214 9275 sin6->sin6_port = tcp->tcp_lport;
9215 9276 break;
9216 9277
9217 9278 case IP_ADDR_LEN:
9218 9279 ASSERT(tcp->tcp_ipversion == IPV4_VERSION);
9219 9280 *(uint32_t *)cp = tcp->tcp_ipha->ipha_src;
9220 9281 break;
9221 9282
9222 9283 }
9223 9284 /* Add protocol number to end */
9224 9285 cp[addr_length] = (char)IPPROTO_TCP;
9225 9286 mp->b_wptr = (uchar_t *)&cp[addr_length + 1];
9226 9287 return (mp);
9227 9288 }
9228 9289
9229 9290 /*
9230 9291 * Notify IP that we are having trouble with this connection. IP should
9231 9292 * blow the IRE away and start over.
9232 9293 */
9233 9294 static void
9234 9295 tcp_ip_notify(tcp_t *tcp)
9235 9296 {
9236 9297 struct iocblk *iocp;
9237 9298 ipid_t *ipid;
9238 9299 mblk_t *mp;
9239 9300
9240 9301 /* IPv6 has NUD thus notification to delete the IRE is not needed */
9241 9302 if (tcp->tcp_ipversion == IPV6_VERSION)
9242 9303 return;
9243 9304
9244 9305 mp = mkiocb(IP_IOCTL);
9245 9306 if (mp == NULL)
9246 9307 return;
9247 9308
9248 9309 iocp = (struct iocblk *)mp->b_rptr;
9249 9310 iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst);
9250 9311
9251 9312 mp->b_cont = allocb(iocp->ioc_count, BPRI_HI);
9252 9313 if (!mp->b_cont) {
9253 9314 freeb(mp);
9254 9315 return;
9255 9316 }
9256 9317
9257 9318 ipid = (ipid_t *)mp->b_cont->b_rptr;
9258 9319 mp->b_cont->b_wptr += iocp->ioc_count;
9259 9320 bzero(ipid, sizeof (*ipid));
9260 9321 ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY;
9261 9322 ipid->ipid_ire_type = IRE_CACHE;
9262 9323 ipid->ipid_addr_offset = sizeof (ipid_t);
9263 9324 ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst);
9264 9325 /*
9265 9326 * Note: in the case of source routing we want to blow away the
9266 9327 * route to the first source route hop.
9267 9328 */
9268 9329 bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1],
9269 9330 sizeof (tcp->tcp_ipha->ipha_dst));
9270 9331
9271 9332 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp);
9272 9333 }
9273 9334
9274 9335 /* Unlink and return any mblk that looks like it contains an ire */
9275 9336 static mblk_t *
9276 9337 tcp_ire_mp(mblk_t *mp)
9277 9338 {
9278 9339 mblk_t *prev_mp;
9279 9340
9280 9341 for (;;) {
9281 9342 prev_mp = mp;
9282 9343 mp = mp->b_cont;
9283 9344 if (mp == NULL)
9284 9345 break;
9285 9346 switch (DB_TYPE(mp)) {
9286 9347 case IRE_DB_TYPE:
9287 9348 case IRE_DB_REQ_TYPE:
9288 9349 if (prev_mp != NULL)
9289 9350 prev_mp->b_cont = mp->b_cont;
9290 9351 mp->b_cont = NULL;
9291 9352 return (mp);
9292 9353 default:
9293 9354 break;
9294 9355 }
9295 9356 }
9296 9357 return (mp);
9297 9358 }
9298 9359
9299 9360 /*
9300 9361 * Timer callback routine for keepalive probe. We do a fake resend of
9301 9362 * last ACKed byte. Then set a timer using RTO. When the timer expires,
9302 9363 * check to see if we have heard anything from the other end for the last
9303 9364 * RTO period. If we have, set the timer to expire for another
9304 9365 * tcp_keepalive_intrvl and check again. If we have not, set a timer using
9305 9366 * RTO << 1 and check again when it expires. Keep exponentially increasing
9306 9367 * the timeout if we have not heard from the other side. If for more than
9307 9368 * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything,
9308 9369 * kill the connection unless the keepalive abort threshold is 0. In
9309 9370 * that case, we will probe "forever."
9310 9371 */
9311 9372 static void
9312 9373 tcp_keepalive_killer(void *arg)
9313 9374 {
9314 9375 mblk_t *mp;
9315 9376 conn_t *connp = (conn_t *)arg;
9316 9377 tcp_t *tcp = connp->conn_tcp;
9317 9378 int32_t firetime;
9318 9379 int32_t idletime;
9319 9380 int32_t ka_intrvl;
9320 9381 tcp_stack_t *tcps = tcp->tcp_tcps;
9321 9382
9322 9383 tcp->tcp_ka_tid = 0;
9323 9384
9324 9385 if (tcp->tcp_fused)
9325 9386 return;
9326 9387
9327 9388 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive);
9328 9389 ka_intrvl = tcp->tcp_ka_interval;
9329 9390
9330 9391 /*
9331 9392 * Keepalive probe should only be sent if the application has not
9332 9393 * done a close on the connection.
9333 9394 */
9334 9395 if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
9335 9396 return;
9336 9397 }
9337 9398 /* Timer fired too early, restart it. */
9338 9399 if (tcp->tcp_state < TCPS_ESTABLISHED) {
9339 9400 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9340 9401 MSEC_TO_TICK(ka_intrvl));
9341 9402 return;
9342 9403 }
9343 9404
9344 9405 idletime = TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time);
9345 9406 /*
9346 9407 * If we have not heard from the other side for a long
9347 9408 * time, kill the connection unless the keepalive abort
9348 9409 * threshold is 0. In that case, we will probe "forever."
9349 9410 */
9350 9411 if (tcp->tcp_ka_abort_thres != 0 &&
9351 9412 idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
9352 9413 BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop);
9353 9414 (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
9354 9415 tcp->tcp_client_errno : ETIMEDOUT, 11);
9355 9416 return;
9356 9417 }
9357 9418
9358 9419 if (tcp->tcp_snxt == tcp->tcp_suna &&
9359 9420 idletime >= ka_intrvl) {
9360 9421 /* Fake resend of last ACKed byte. */
9361 9422 mblk_t *mp1 = allocb(1, BPRI_LO);
9362 9423
9363 9424 if (mp1 != NULL) {
9364 9425 *mp1->b_wptr++ = '\0';
9365 9426 mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
9366 9427 tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
9367 9428 freeb(mp1);
9368 9429 /*
9369 9430 * if allocation failed, fall through to start the
9370 9431 * timer back.
9371 9432 */
9372 9433 if (mp != NULL) {
9373 9434 TCP_RECORD_TRACE(tcp, mp,
9374 9435 TCP_TRACE_SEND_PKT);
9375 9436 tcp_send_data(tcp, tcp->tcp_wq, mp);
9376 9437 BUMP_MIB(&tcps->tcps_mib,
9377 9438 tcpTimKeepaliveProbe);
9378 9439 if (tcp->tcp_ka_last_intrvl != 0) {
9379 9440 int max;
9380 9441 /*
9381 9442 * We should probe again at least
9382 9443 * in ka_intrvl, but not more than
9383 9444 * tcp_rexmit_interval_max.
9384 9445 */
9385 9446 max = tcps->tcps_rexmit_interval_max;
9386 9447 firetime = MIN(ka_intrvl - 1,
9387 9448 tcp->tcp_ka_last_intrvl << 1);
9388 9449 if (firetime > max)
9389 9450 firetime = max;
9390 9451 } else {
9391 9452 firetime = tcp->tcp_rto;
9392 9453 }
9393 9454 tcp->tcp_ka_tid = TCP_TIMER(tcp,
9394 9455 tcp_keepalive_killer,
9395 9456 MSEC_TO_TICK(firetime));
9396 9457 tcp->tcp_ka_last_intrvl = firetime;
9397 9458 return;
9398 9459 }
9399 9460 }
9400 9461 } else {
9401 9462 tcp->tcp_ka_last_intrvl = 0;
9402 9463 }
9403 9464
9404 9465 /* firetime can be negative if (mp1 == NULL || mp == NULL) */
9405 9466 if ((firetime = ka_intrvl - idletime) < 0) {
9406 9467 firetime = ka_intrvl;
9407 9468 }
9408 9469 tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer,
9409 9470 MSEC_TO_TICK(firetime));
9410 9471 }
9411 9472
9412 9473 int
9413 9474 tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk)
9414 9475 {
9415 9476 queue_t *q = tcp->tcp_rq;
9416 9477 int32_t mss = tcp->tcp_mss;
9417 9478 int maxpsz;
9418 9479
9419 9480 if (TCP_IS_DETACHED(tcp))
9420 9481 return (mss);
9421 9482
9422 9483 if (tcp->tcp_fused) {
9423 9484 maxpsz = tcp_fuse_maxpsz_set(tcp);
9424 9485 mss = INFPSZ;
9425 9486 } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) {
9426 9487 /*
9427 9488 * Set the sd_qn_maxpsz according to the socket send buffer
9428 9489 * size, and sd_maxblk to INFPSZ (-1). This will essentially
9429 9490 * instruct the stream head to copyin user data into contiguous
9430 9491 * kernel-allocated buffers without breaking it up into smaller
9431 9492 * chunks. We round up the buffer size to the nearest SMSS.
9432 9493 */
9433 9494 maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss);
9434 9495 if (tcp->tcp_kssl_ctx == NULL)
9435 9496 mss = INFPSZ;
9436 9497 else
9437 9498 mss = SSL3_MAX_RECORD_LEN;
9438 9499 } else {
9439 9500 /*
9440 9501 * Set sd_qn_maxpsz to approx half the (receivers) buffer
9441 9502 * (and a multiple of the mss). This instructs the stream
9442 9503 * head to break down larger than SMSS writes into SMSS-
9443 9504 * size mblks, up to tcp_maxpsz_multiplier mblks at a time.
9444 9505 */
9445 9506 maxpsz = tcp->tcp_maxpsz * mss;
9446 9507 if (maxpsz > tcp->tcp_xmit_hiwater/2) {
9447 9508 maxpsz = tcp->tcp_xmit_hiwater/2;
9448 9509 /* Round up to nearest mss */
9449 9510 maxpsz = MSS_ROUNDUP(maxpsz, mss);
9450 9511 }
9451 9512 }
9452 9513 (void) setmaxps(q, maxpsz);
9453 9514 tcp->tcp_wq->q_maxpsz = maxpsz;
9454 9515
9455 9516 if (set_maxblk)
9456 9517 (void) mi_set_sth_maxblk(q, mss);
9457 9518
9458 9519 return (mss);
9459 9520 }
9460 9521
9461 9522 /*
9462 9523 * Extract option values from a tcp header. We put any found values into the
9463 9524 * tcpopt struct and return a bitmask saying which options were found.
9464 9525 */
9465 9526 static int
9466 9527 tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt)
9467 9528 {
9468 9529 uchar_t *endp;
9469 9530 int len;
9470 9531 uint32_t mss;
9471 9532 uchar_t *up = (uchar_t *)tcph;
9472 9533 int found = 0;
9473 9534 int32_t sack_len;
9474 9535 tcp_seq sack_begin, sack_end;
9475 9536 tcp_t *tcp;
9476 9537
9477 9538 endp = up + TCP_HDR_LENGTH(tcph);
9478 9539 up += TCP_MIN_HEADER_LENGTH;
9479 9540 while (up < endp) {
9480 9541 len = endp - up;
9481 9542 switch (*up) {
9482 9543 case TCPOPT_EOL:
9483 9544 break;
9484 9545
9485 9546 case TCPOPT_NOP:
9486 9547 up++;
9487 9548 continue;
9488 9549
9489 9550 case TCPOPT_MAXSEG:
9490 9551 if (len < TCPOPT_MAXSEG_LEN ||
9491 9552 up[1] != TCPOPT_MAXSEG_LEN)
9492 9553 break;
9493 9554
9494 9555 mss = BE16_TO_U16(up+2);
9495 9556 /* Caller must handle tcp_mss_min and tcp_mss_max_* */
9496 9557 tcpopt->tcp_opt_mss = mss;
9497 9558 found |= TCP_OPT_MSS_PRESENT;
9498 9559
9499 9560 up += TCPOPT_MAXSEG_LEN;
9500 9561 continue;
9501 9562
9502 9563 case TCPOPT_WSCALE:
9503 9564 if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN)
9504 9565 break;
9505 9566
9506 9567 if (up[2] > TCP_MAX_WINSHIFT)
9507 9568 tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT;
9508 9569 else
9509 9570 tcpopt->tcp_opt_wscale = up[2];
9510 9571 found |= TCP_OPT_WSCALE_PRESENT;
9511 9572
9512 9573 up += TCPOPT_WS_LEN;
9513 9574 continue;
9514 9575
9515 9576 case TCPOPT_SACK_PERMITTED:
9516 9577 if (len < TCPOPT_SACK_OK_LEN ||
9517 9578 up[1] != TCPOPT_SACK_OK_LEN)
9518 9579 break;
9519 9580 found |= TCP_OPT_SACK_OK_PRESENT;
9520 9581 up += TCPOPT_SACK_OK_LEN;
9521 9582 continue;
9522 9583
9523 9584 case TCPOPT_SACK:
9524 9585 if (len <= 2 || up[1] <= 2 || len < up[1])
9525 9586 break;
9526 9587
9527 9588 /* If TCP is not interested in SACK blks... */
9528 9589 if ((tcp = tcpopt->tcp) == NULL) {
9529 9590 up += up[1];
9530 9591 continue;
9531 9592 }
9532 9593 sack_len = up[1] - TCPOPT_HEADER_LEN;
9533 9594 up += TCPOPT_HEADER_LEN;
9534 9595
9535 9596 /*
9536 9597 * If the list is empty, allocate one and assume
9537 9598 * nothing is sack'ed.
9538 9599 */
9539 9600 ASSERT(tcp->tcp_sack_info != NULL);
9540 9601 if (tcp->tcp_notsack_list == NULL) {
9541 9602 tcp_notsack_update(&(tcp->tcp_notsack_list),
9542 9603 tcp->tcp_suna, tcp->tcp_snxt,
9543 9604 &(tcp->tcp_num_notsack_blk),
9544 9605 &(tcp->tcp_cnt_notsack_list));
9545 9606
9546 9607 /*
9547 9608 * Make sure tcp_notsack_list is not NULL.
9548 9609 * This happens when kmem_alloc(KM_NOSLEEP)
9549 9610 * returns NULL.
9550 9611 */
9551 9612 if (tcp->tcp_notsack_list == NULL) {
9552 9613 up += sack_len;
9553 9614 continue;
9554 9615 }
9555 9616 tcp->tcp_fack = tcp->tcp_suna;
9556 9617 }
9557 9618
9558 9619 while (sack_len > 0) {
9559 9620 if (up + 8 > endp) {
9560 9621 up = endp;
956