3460 tbr->PRIM_type = T_BIND_ACK;
3461 mp->b_datap->db_type = M_PCPROTO;
3462
3463 /* Chain in the reply mp for tcp_rput() */
3464 mp1->b_cont = mp;
3465 mp = mp1;
3466
3467 tcp->tcp_conn_req_max = tbr->CONIND_number;
3468 if (tcp->tcp_conn_req_max) {
3469 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
3470 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473 /*
3474 * If this is a listener, do not reset the eager list
3475 * and other state. Note that we don't check if the
3476 * existing eager list meets the new tcp_conn_req_max
3477 * requirement.
3478 */
3479 if (tcp->tcp_state != TCPS_LISTEN) {
3480 tcp->tcp_state = TCPS_LISTEN;
3481 /* Initialize the chain. Don't need the eager_lock */
3482 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3483 tcp->tcp_eager_next_drop_q0 = tcp;
3484 tcp->tcp_eager_prev_drop_q0 = tcp;
3485 tcp->tcp_second_ctimer_threshold =
3486 tcps->tcps_ip_abort_linterval;
3487 }
3488 }
3489
3490 /*
3491 * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3492 * processing continues in tcp_rput_other().
3493 *
3494 * We need to make sure that the conn_recv is set to a non-null
3495 * value before we insert the conn into the classifier table.
3496 * This is to avoid a race with an incoming packet which does an
3497 * ipcl_classify().
3498 */
3499 connp->conn_recv = tcp_conn_request;
3756 * address and source port, which is
3757 * refused regardless of the
3758 * SO_REUSEADDR setting, so we break.
3759 */
3760 if (IN6_ARE_ADDR_EQUAL(laddr,
3761 &ltcp->tcp_bound_source_v6) &&
3762 (ltcp->tcp_state == TCPS_LISTEN ||
3763 ltcp->tcp_state == TCPS_BOUND))
3764 break;
3765 }
3766 }
3767 if (ltcp != NULL) {
3768 /* The port number is busy */
3769 mutex_exit(&tbf->tf_lock);
3770 } else {
3771 /*
3772 * This port is ours. Insert in fanout and mark as
3773 * bound to prevent others from getting the port
3774 * number.
3775 */
3776 tcp->tcp_state = TCPS_BOUND;
3777 tcp->tcp_lport = htons(port);
3778 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3779
3780 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3781 tcp->tcp_lport)] == tbf);
3782 tcp_bind_hash_insert(tbf, tcp, 1);
3783
3784 mutex_exit(&tbf->tf_lock);
3785
3786 /*
3787 * We don't want tcp_next_port_to_try to "inherit"
3788 * a port number supplied by the user in a bind.
3789 */
3790 if (user_specified)
3791 return (port);
3792
3793 /*
3794 * This is the only place where tcp_next_port_to_try
3795 * is updated. After the update, it may or may not
3887 tcp->tcp_ipversion == IPV6_VERSION)));
3888
3889 if (TCP_IS_DETACHED(tcp)) {
3890 if (tcp->tcp_hard_binding) {
3891 /*
3892 * It's an eager that we are dealing with. We close the
3893 * eager but in case a conn_ind has already gone to the
3894 * listener, let tcp_accept_finish() send a discon_ind
3895 * to the listener and drop the last reference. If the
3896 * listener doesn't even know about the eager i.e. the
3897 * conn_ind hasn't gone up, blow away the eager and drop
3898 * the last reference as well. If the conn_ind has gone
3899 * up, state should be BOUND. tcp_accept_finish
3900 * will figure out that the connection has received a
3901 * RST and will send a DISCON_IND to the application.
3902 */
3903 tcp_closei_local(tcp);
3904 if (!tcp->tcp_tconnind_started) {
3905 CONN_DEC_REF(tcp->tcp_connp);
3906 } else {
3907 tcp->tcp_state = TCPS_BOUND;
3908 }
3909 } else {
3910 tcp_close_detached(tcp);
3911 }
3912 return (0);
3913 }
3914
3915 TCP_STAT(tcps, tcp_clean_death_nondetached);
3916
3917 /*
3918 * If T_ORDREL_IND has not been sent yet (done when service routine
3919 * is run) postpone cleaning up the endpoint until service routine
3920 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3921 * client_errno since tcp_close uses the client_errno field.
3922 */
3923 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3924 if (err != 0)
3925 tcp->tcp_client_errno = err;
3926
4620 (void) tcp_time_wait_remove(tcp, NULL);
4621 CL_INET_DISCONNECT(tcp);
4622 ipcl_hash_remove(connp);
4623
4624 /*
4625 * Delete the cached ire in conn_ire_cache and also mark
4626 * the conn as CONDEMNED
4627 */
4628 mutex_enter(&connp->conn_lock);
4629 connp->conn_state_flags |= CONN_CONDEMNED;
4630 ire = connp->conn_ire_cache;
4631 connp->conn_ire_cache = NULL;
4632 mutex_exit(&connp->conn_lock);
4633 if (ire != NULL)
4634 IRE_REFRELE_NOTR(ire);
4635
4636 /* Need to cleanup any pending ioctls */
4637 ASSERT(tcp->tcp_time_wait_next == NULL);
4638 ASSERT(tcp->tcp_time_wait_prev == NULL);
4639 ASSERT(tcp->tcp_time_wait_expire == 0);
4640 tcp->tcp_state = TCPS_CLOSED;
4641
4642 /* Release any SSL context */
4643 if (tcp->tcp_kssl_ent != NULL) {
4644 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4645 tcp->tcp_kssl_ent = NULL;
4646 }
4647 if (tcp->tcp_kssl_ctx != NULL) {
4648 kssl_release_ctx(tcp->tcp_kssl_ctx);
4649 tcp->tcp_kssl_ctx = NULL;
4650 }
4651 tcp->tcp_kssl_pending = B_FALSE;
4652
4653 tcp_ipsec_cleanup(tcp);
4654 }
4655
4656 /*
4657 * tcp is dying (called from ipcl_conn_destroy and error cases).
4658 * Free the tcp_t in either case.
4659 */
5861 DTRACE_PROBE3(
5862 tx__ip__log__error__connrequest__tcp,
5863 char *, "eager connp(1) label on SYN mp(2) failed",
5864 conn_t *, econnp, mblk_t *, mp);
5865 goto error3;
5866 }
5867 }
5868
5869 eager->tcp_hard_binding = B_TRUE;
5870
5871 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
5872 TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
5873
5874 CL_INET_CONNECT(eager);
5875
5876 /*
5877 * No need to check for multicast destination since ip will only pass
5878 * up multicasts to those that have expressed interest
5879 * TODO: what about rejecting broadcasts?
5880 * Also check that source is not a multicast or broadcast address.
5881 */
5882 eager->tcp_state = TCPS_SYN_RCVD;
5883
5884
5885 /*
5886 * There should be no ire in the mp as we are being called after
5887 * receiving the SYN.
5888 */
5889 ASSERT(tcp_ire_mp(mp) == NULL);
5890
5891 /*
5892 * Adapt our mss, ttl, ... according to information provided in IRE.
5893 */
5894
5895 if (tcp_adapt_ire(eager, NULL) == 0) {
5896 /* Undo the bind_hash_insert */
5897 tcp_bind_hash_remove(eager);
5898 goto error3;
5899 }
5900
5901 /* Process all TCP options. */
5902 tcp_process_options(eager, tcph);
5903
5904 /* Is the other end ECN capable? */
5905 if (tcps->tcps_ecn_permitted >= 1 &&
5906 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
5907 eager->tcp_ecn_ok = B_TRUE;
5908 }
5909
5910 /*
5911 * listener->tcp_rq->q_hiwat should be the default window size or a
5912 * window size changed via SO_RCVBUF option. First round up the
5913 * eager's tcp_rwnd to the nearest MSS. Then find out the window
5914 * scale option value if needed. Call tcp_rwnd_set() to finish the
5915 * setting.
5916 *
5917 * Note if there is a rpipe metric associated with the remote host,
5918 * we should not inherit receive window size from listener.
5919 */
5920 eager->tcp_rwnd = MSS_ROUNDUP(
5997 if (addr_cache != NULL && eager->tcp_remote ==
5998 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
5999 eager->tcp_dontdrop = B_TRUE;
6000 }
6001 }
6002
6003 /*
6004 * We need to insert the eager in its own perimeter but as soon
6005 * as we do that, we expose the eager to the classifier and
6006 * should not touch any field outside the eager's perimeter.
6007 * So do all the work necessary before inserting the eager
6008 * in its own perimeter. Be optimistic that ipcl_conn_insert()
6009 * will succeed but undo everything if it fails.
6010 */
6011 seg_seq = ABE32_TO_U32(tcph->th_seq);
6012 eager->tcp_irs = seg_seq;
6013 eager->tcp_rack = seg_seq;
6014 eager->tcp_rnxt = seg_seq + 1;
6015 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
6016 BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
6017 eager->tcp_state = TCPS_SYN_RCVD;
6018 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
6019 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
6020 if (mp1 == NULL) {
6021 /*
6022 * Increment the ref count as we are going to
6023 * enqueue an mp in squeue
6024 */
6025 CONN_INC_REF(econnp);
6026 goto error;
6027 }
6028 DB_CPID(mp1) = tcp->tcp_cpid;
6029 eager->tcp_cpid = tcp->tcp_cpid;
6030 eager->tcp_open_time = lbolt64;
6031
6032 /*
6033 * We need to start the rto timer. In normal case, we start
6034 * the timer after sending the packet on the wire (or at
6035 * least believing that packet was sent by waiting for
6036 * CALL_IP_WPUT() to return). Since this is the first packet
6106 * anymore (someone blew it away). Just
6107 * free this message and hopefully remote
6108 * will retransmit at which time the SYN can be
6109 * treated as a new connection or dealt with
6110 * a TH_RST if a connection already exists.
6111 */
6112 CONN_DEC_REF(econnp);
6113 freemsg(mp);
6114 } else {
6115 squeue_fill(econnp->conn_sqp, mp, tcp_input,
6116 econnp, SQTAG_TCP_CONN_REQ_1);
6117 }
6118 } else {
6119 /* Nobody wants this packet */
6120 freemsg(mp);
6121 }
6122 return;
6123 error3:
6124 CONN_DEC_REF(econnp);
6125 error2:
6126 freemsg(mp);
6127 }
6128
6129 /*
6130 * In an ideal case of vertical partition in NUMA architecture, it's
6131 * beneficial to have the listener and all the incoming connections
6132 * tied to the same squeue. The other constraint is that incoming
6133 * connections should be tied to the squeue attached to interrupted
6134 * CPU for obvious locality reason so this leaves the listener to
6135 * be tied to the same squeue. Our only problem is that when listener
6136 * is binding, the CPU that will get interrupted by the NIC whose
6137 * IP address the listener is binding to is not even known. So
6138 * the code below allows us to change that binding at the time the
6139 * CPU is interrupted by virtue of incoming connection's squeue.
6140 *
6141 * This is useful only in case of a listener bound to a specific IP
6142 * address. For other kind of listeners, they get bound the
6143 * very first time and there is no attempt to rebind them.
6144 */
6145 void
6604 * At this point the remote destination address and remote port fields
6605 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6606 * have to see which state tcp was in so we can take appropriate action.
6607 */
6608 if (oldstate == TCPS_IDLE) {
6609 /*
6610 * We support a quick connect capability here, allowing
6611 * clients to transition directly from IDLE to SYN_SENT.
6612 * tcp_bindi will pick an unused port, insert the connection
6613 * in the bind hash and transition to BOUND state.
6614 */
6615 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6616 tcp, B_TRUE);
6617 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6618 B_FALSE, B_FALSE);
6619 if (lport == 0) {
6620 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6621 goto failed;
6622 }
6623 }
6624 tcp->tcp_state = TCPS_SYN_SENT;
6625
6626 /*
6627 * TODO: allow data with connect requests
6628 * by unlinking M_DATA trailers here and
6629 * linking them in behind the T_OK_ACK mblk.
6630 * The tcp_rput() bind ack handler would then
6631 * feed them to tcp_wput_data() rather than call
6632 * tcp_timer().
6633 */
6634 mp = mi_tpi_ok_ack_alloc(mp);
6635 if (!mp) {
6636 tcp->tcp_state = oldstate;
6637 goto failed;
6638 }
6639 if (tcp->tcp_family == AF_INET) {
6640 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6641 sizeof (ipa_conn_t));
6642 } else {
6643 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6644 sizeof (ipa6_conn_t));
6645 }
6646 if (mp1) {
6647 /*
6648 * We need to make sure that the conn_recv is set to a non-null
6649 * value before we insert the conn_t into the classifier table.
6650 * This is to avoid a race with an incoming packet which does
6651 * an ipcl_classify().
6652 */
6653 tcp->tcp_connp->conn_recv = tcp_input;
6654
6655 /* Hang onto the T_OK_ACK for later. */
6656 linkb(mp1, mp);
6657 mblk_setcred(mp1, tcp->tcp_cred);
6658 if (tcp->tcp_family == AF_INET)
6659 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6660 else {
6661 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6662 &tcp->tcp_sticky_ipp);
6663 }
6664 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6665 tcp->tcp_active_open = 1;
6666 /*
6667 * If the bind cannot complete immediately
6668 * IP will arrange to call tcp_rput_other
6669 * when the bind completes.
6670 */
6671 if (mp1 != NULL)
6672 tcp_rput_other(tcp, mp1);
6673 return;
6674 }
6675 /* Error case */
6676 tcp->tcp_state = oldstate;
6677 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6678
6679 failed:
6680 /* return error ack and blow away saved option results if any */
6681 if (mp != NULL)
6682 putnext(tcp->tcp_rq, mp);
6683 else {
6684 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6685 TSYSERR, ENOMEM);
6686 }
6687 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6688 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6689
6690 }
6691
6692 /*
6693 * Handle connect to IPv6 destinations.
6694 */
6695 static void
6812 * At this point the remote destination address and remote port fields
6813 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6814 * have to see which state tcp was in so we can take appropriate action.
6815 */
6816 if (oldstate == TCPS_IDLE) {
6817 /*
6818 * We support a quick connect capability here, allowing
6819 * clients to transition directly from IDLE to SYN_SENT.
6820 * tcp_bindi will pick an unused port, insert the connection
6821 * in the bind hash and transition to BOUND state.
6822 */
6823 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6824 tcp, B_TRUE);
6825 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6826 B_FALSE, B_FALSE);
6827 if (lport == 0) {
6828 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6829 goto failed;
6830 }
6831 }
6832 tcp->tcp_state = TCPS_SYN_SENT;
6833 /*
6834 * TODO: allow data with connect requests
6835 * by unlinking M_DATA trailers here and
6836 * linking them in behind the T_OK_ACK mblk.
6837 * The tcp_rput() bind ack handler would then
6838 * feed them to tcp_wput_data() rather than call
6839 * tcp_timer().
6840 */
6841 mp = mi_tpi_ok_ack_alloc(mp);
6842 if (!mp) {
6843 tcp->tcp_state = oldstate;
6844 goto failed;
6845 }
6846 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6847 if (mp1) {
6848 /*
6849 * We need to make sure that the conn_recv is set to a non-null
6850 * value before we insert the conn_t into the classifier table.
6851 * This is to avoid a race with an incoming packet which does
6852 * an ipcl_classify().
6853 */
6854 tcp->tcp_connp->conn_recv = tcp_input;
6855
6856 /* Hang onto the T_OK_ACK for later. */
6857 linkb(mp1, mp);
6858 mblk_setcred(mp1, tcp->tcp_cred);
6859 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6860 &tcp->tcp_sticky_ipp);
6861 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6862 tcp->tcp_active_open = 1;
6863 /* ip_bind_v6() may return ACK or ERROR */
6864 if (mp1 != NULL)
6865 tcp_rput_other(tcp, mp1);
6866 return;
6867 }
6868 /* Error case */
6869 tcp->tcp_state = oldstate;
6870 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6871
6872 failed:
6873 /* return error ack and blow away saved option results if any */
6874 if (mp != NULL)
6875 putnext(tcp->tcp_rq, mp);
6876 else {
6877 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6878 TSYSERR, ENOMEM);
6879 }
6880 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6881 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6882 }
6883
6884 /*
6885 * We need a stream q for detached closing tcp connections
6886 * to use. Our client hereby indicates that this q is the
6887 * one to use.
6888 */
6997 ltcp = NULL;
6998 /*
6999 * If it used to be a listener, check to make sure no one else
7000 * has taken the port before switching back to LISTEN state.
7001 */
7002 if (tcp->tcp_ipversion == IPV4_VERSION) {
7003 connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7004 tcp->tcp_ipha->ipha_src,
7005 tcp->tcp_connp->conn_zoneid, ipst);
7006 if (connp != NULL)
7007 ltcp = connp->conn_tcp;
7008 } else {
7009 /* Allow tcp_bound_if listeners? */
7010 connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7011 &tcp->tcp_ip6h->ip6_src, 0,
7012 tcp->tcp_connp->conn_zoneid, ipst);
7013 if (connp != NULL)
7014 ltcp = connp->conn_tcp;
7015 }
7016 if (tcp->tcp_conn_req_max && ltcp == NULL) {
7017 tcp->tcp_state = TCPS_LISTEN;
7018 } else if (old_state > TCPS_BOUND) {
7019 tcp->tcp_conn_req_max = 0;
7020 tcp->tcp_state = TCPS_BOUND;
7021 }
7022 if (ltcp != NULL)
7023 CONN_DEC_REF(ltcp->tcp_connp);
7024 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7025 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7026 } else if (old_state == TCPS_ESTABLISHED ||
7027 old_state == TCPS_CLOSE_WAIT) {
7028 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7029 }
7030
7031 if (tcp->tcp_fused)
7032 tcp_unfuse(tcp);
7033
7034 mutex_enter(&tcp->tcp_eager_lock);
7035 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7036 (tcp->tcp_conn_req_cnt_q != 0)) {
7037 tcp_eager_cleanup(tcp, 0);
7038 }
7039 mutex_exit(&tcp->tcp_eager_lock);
7900 tcp_ipsec_cleanup(tcp);
7901
7902 if (tcp->tcp_conn_req_max != 0) {
7903 /*
7904 * This is the case when a TLI program uses the same
7905 * transport end point to accept a connection. This
7906 * makes the TCP both a listener and acceptor. When
7907 * this connection is closed, we need to set the state
7908 * back to TCPS_LISTEN. Make sure that the eager list
7909 * is reinitialized.
7910 *
7911 * Note that this stream is still bound to the four
7912 * tuples of the previous connection in IP. If a new
7913 * SYN with different foreign address comes in, IP will
7914 * not find it and will send it to the global queue. In
7915 * the global queue, TCP will do a tcp_lookup_listener()
7916 * to find this stream. This works because this stream
7917 * is only removed from connected hash.
7918 *
7919 */
7920 tcp->tcp_state = TCPS_LISTEN;
7921 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7922 tcp->tcp_eager_next_drop_q0 = tcp;
7923 tcp->tcp_eager_prev_drop_q0 = tcp;
7924 tcp->tcp_connp->conn_recv = tcp_conn_request;
7925 if (tcp->tcp_family == AF_INET6) {
7926 ASSERT(tcp->tcp_connp->conn_af_isv6);
7927 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7928 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7929 } else {
7930 ASSERT(!tcp->tcp_connp->conn_af_isv6);
7931 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7932 tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7933 }
7934 } else {
7935 tcp->tcp_state = TCPS_BOUND;
7936 }
7937
7938 /*
7939 * Initialize to default values
7940 * Can't fail since enough header template space already allocated
7941 * at open().
7942 */
7943 err = tcp_init_values(tcp);
7944 ASSERT(err == 0);
7945 /* Restore state in tcp_tcph */
7946 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
7947 if (tcp->tcp_ipversion == IPV4_VERSION)
7948 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
7949 else
7950 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
7951 /*
7952 * Copy of the src addr. in tcp_t is needed in tcp_t
7953 * since the lookup funcs can only lookup on tcp_t
7954 */
8276 DONTCARE(tcp->tcmp_stk[0]);
8277 #endif
8278
8279
8280 #undef DONTCARE
8281 #undef PRESERVE
8282 }
8283
8284 /*
8285 * Attach tcp to the queue pair (q and WR(q)), set it to TCPS_IDLE and call
8286 * tcp_init_values().  Returns 0 on success or tcp_init_values()'s error; on
8287 * error tcp_timers_stop() has been called, so callers need no extra cleanup.
8288 */
8289 int
8290 tcp_init(tcp_t *tcp, queue_t *q)
8291 {
8292 int err;
8293
8294 tcp->tcp_rq = q;	/* q is recorded as the read queue */
8295 tcp->tcp_wq = WR(q);	/* WR(q) is recorded as the write queue */
8296 tcp->tcp_state = TCPS_IDLE;
8297 if ((err = tcp_init_values(tcp)) != 0)
8298 tcp_timers_stop(tcp);	/* clean up here so the caller need not */
8299 return (err);
8300 }
8301
8302 static int
8303 tcp_init_values(tcp_t *tcp)
8304 {
8305 int err;
8306 tcp_stack_t *tcps = tcp->tcp_tcps;
8307
8308 ASSERT((tcp->tcp_family == AF_INET &&
8309 tcp->tcp_ipversion == IPV4_VERSION) ||
8310 (tcp->tcp_family == AF_INET6 &&
8311 (tcp->tcp_ipversion == IPV4_VERSION ||
8312 tcp->tcp_ipversion == IPV6_VERSION)));
8313
8314 /*
8315 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
13277 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
13278 freemsg(mp);
13279 return;
13280 }
13281 /* Update pointers into message */
13282 iphdr = rptr = mp->b_rptr;
13283 tcph = (tcph_t *)&rptr[ip_hdr_len];
13284 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
13285 /*
13286 * Since we can't handle any data with this urgent
13287 * pointer that is out of sequence, we expunge
13288 * the data. This allows us to still register
13289 * the urgent mark and generate the M_PCSIG,
13290 * which we can do.
13291 */
13292 mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
13293 seg_len = 0;
13294 }
13295 }
13296
13297 switch (tcp->tcp_state) {
13298 case TCPS_SYN_SENT:
13299 if (flags & TH_ACK) {
13300 /*
13301 * Note that our stack cannot send data before a
13302 * connection is established, therefore the
13303 * following check is valid. Otherwise, it has
13304 * to be changed.
13305 */
13306 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
13307 SEQ_GT(seg_ack, tcp->tcp_snxt)) {
13308 freemsg(mp);
13309 if (flags & TH_RST)
13310 return;
13311 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
13312 tcp, seg_ack, 0, TH_RST);
13313 return;
13314 }
13315 ASSERT(tcp->tcp_suna + 1 == seg_ack);
13316 }
13317 if (flags & TH_RST) {
13318 freemsg(mp);
13319 if (flags & TH_ACK)
13320 (void) tcp_clean_death(tcp,
13321 ECONNREFUSED, 13);
13322 return;
13323 }
13324 if (!(flags & TH_SYN)) {
13325 freemsg(mp);
13326 return;
13327 }
13328
13329 /* Process all TCP options. */
13330 tcp_process_options(tcp, tcph);
13331 /*
13332 * The following changes our rwnd to be a multiple of the
13333 * MIN(peer MSS, our MSS) for performance reason.
13334 */
13335 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat,
13336 tcp->tcp_mss));
13337
13372 *
13373 * XXX: how can we pretend we didn't see it if we
13374 * have updated rnxt et al.
13375 *
13376 * For loopback we defer sending up the T_CONN_CON
13377 * until after some checks below.
13378 */
13379 mp1 = NULL;
13380 if (!tcp_conn_con(tcp, iphdr, tcph, mp,
13381 tcp->tcp_loopback ? &mp1 : NULL)) {
13382 freemsg(mp);
13383 return;
13384 }
13385 /* SYN was acked - making progress */
13386 if (tcp->tcp_ipversion == IPV6_VERSION)
13387 tcp->tcp_ip_forward_progress = B_TRUE;
13388
13389 /* One for the SYN */
13390 tcp->tcp_suna = tcp->tcp_iss + 1;
13391 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
13392 tcp->tcp_state = TCPS_ESTABLISHED;
13393
13394 /*
13395 * If SYN was retransmitted, need to reset all
13396 * retransmission info. This is because this
13397 * segment will be treated as a dup ACK.
13398 */
13399 if (tcp->tcp_rexmit) {
13400 tcp->tcp_rexmit = B_FALSE;
13401 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
13402 tcp->tcp_rexmit_max = tcp->tcp_snxt;
13403 tcp->tcp_snd_burst = tcp->tcp_localnet ?
13404 TCP_CWND_INFINITE : TCP_CWND_NORMAL;
13405 tcp->tcp_ms_we_have_waited = 0;
13406
13407 /*
13408 * Set tcp_cwnd back to 1 MSS, per
13409 * recommendation from
13410 * draft-floyd-incr-init-win-01.txt,
13411 * Increasing TCP's Initial Window.
13412 */
13413 tcp->tcp_cwnd = tcp->tcp_mss;
13414 }
13481
13482 /*
13483 * Check to see if there is data to be sent. If
13484 * yes, set the transmit flag. Then check to see
13485 * if received data processing needs to be done.
13486 * If not, go straight to xmit_check. This short
13487 * cut is OK as we don't support T/TCP.
13488 */
13489 if (tcp->tcp_unsent)
13490 flags |= TH_XMIT_NEEDED;
13491
13492 if (seg_len == 0 && !(flags & TH_URG)) {
13493 freemsg(mp);
13494 goto xmit_check;
13495 }
13496
13497 flags &= ~TH_SYN;
13498 seg_seq++;
13499 break;
13500 }
13501 tcp->tcp_state = TCPS_SYN_RCVD;
13502 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
13503 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
13504 if (mp1) {
13505 DB_CPID(mp1) = tcp->tcp_cpid;
13506 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
13507 tcp_send_data(tcp, tcp->tcp_wq, mp1);
13508 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
13509 }
13510 freemsg(mp);
13511 return;
13512 case TCPS_SYN_RCVD:
13513 if (flags & TH_ACK) {
13514 /*
13515 * In this state, a SYN|ACK packet is either bogus
13516 * because the other side must be ACKing our SYN which
13517 * indicates it has seen the ACK for their SYN and
13518 * shouldn't retransmit it or we're crossing SYNs
13519 * on active open.
13520 */
14400 tcp->tcp_cwnd = mss;
14401 }
14402
14403 /*
14404 * We set the send window to zero here.
14405 * This is needed if there is data to be
14406 * processed already on the queue.
14407 * Later (at swnd_update label), the
14408 * "new_swnd > tcp_swnd" condition is satisfied and
14409 * the XMIT_NEEDED flag is set in the current
14410 * (SYN_RCVD) state. This ensures tcp_wput_data() is
14411 * called if there is already data on queue in
14412 * this state.
14413 */
14414 tcp->tcp_swnd = 0;
14415
14416 if (new_swnd > tcp->tcp_max_swnd)
14417 tcp->tcp_max_swnd = new_swnd;
14418 tcp->tcp_swl1 = seg_seq;
14419 tcp->tcp_swl2 = seg_ack;
14420 tcp->tcp_state = TCPS_ESTABLISHED;
14421 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
14422
14423 /* Fuse when both sides are in ESTABLISHED state */
14424 if (tcp->tcp_loopback && do_tcp_fusion)
14425 tcp_fuse(tcp, iphdr, tcph);
14426
14427 }
14428 /* This code follows 4.4BSD-Lite2 mostly. */
14429 if (bytes_acked < 0)
14430 goto est;
14431
14432 /*
14433 * If TCP is ECN capable and the congestion experience bit is
14434 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
14435 * done once per window (or more loosely, per RTT).
14436 */
14437 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
14438 tcp->tcp_cwr = B_FALSE;
14439 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
15036 *
15037 * 1. the segment acknowledges some data. Or
15038 * 2. the segment is new, i.e. it has a higher seq num. Or
15039 * 3. the segment is not old and the advertised window is
15040 * larger than the previous advertised window.
15041 */
15042 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
15043 flags |= TH_XMIT_NEEDED;
15044 tcp->tcp_swnd = new_swnd;
15045 if (new_swnd > tcp->tcp_max_swnd)
15046 tcp->tcp_max_swnd = new_swnd;
15047 tcp->tcp_swl1 = seg_seq;
15048 tcp->tcp_swl2 = seg_ack;
15049 }
15050 est:
15051 if (tcp->tcp_state > TCPS_ESTABLISHED) {
15052
15053 switch (tcp->tcp_state) {
15054 case TCPS_FIN_WAIT_1:
15055 if (tcp->tcp_fin_acked) {
15056 tcp->tcp_state = TCPS_FIN_WAIT_2;
15057 /*
15058 * We implement the non-standard BSD/SunOS
15059 * FIN_WAIT_2 flushing algorithm.
15060 * If there is no user attached to this
15061 * TCP endpoint, then this TCP struct
15062 * could hang around forever in FIN_WAIT_2
15063 * state if the peer forgets to send us
15064 * a FIN. To prevent this, we wait only
15065 * 2*MSL (a convenient time value) for
15066 * the FIN to arrive. If it doesn't show up,
15067 * we flush the TCP endpoint. This algorithm,
15068 * though a violation of RFC-793, has worked
15069 * for over 10 years in BSD systems.
15070 * Note: SunOS 4.x waits 675 seconds before
15071 * flushing the FIN_WAIT_2 connection.
15072 */
15073 TCP_TIMER_RESTART(tcp,
15074 tcps->tcps_fin_wait_2_flush_interval);
15075 }
15076 break;
15077 case TCPS_FIN_WAIT_2:
15078 break; /* Shutdown hook? */
15079 case TCPS_LAST_ACK:
15080 freemsg(mp);
15081 if (tcp->tcp_fin_acked) {
15082 (void) tcp_clean_death(tcp, 0, 19);
15083 return;
15084 }
15085 goto xmit_check;
15086 case TCPS_CLOSING:
15087 if (tcp->tcp_fin_acked) {
15088 tcp->tcp_state = TCPS_TIME_WAIT;
15089 /*
15090 * Unconditionally clear the exclusive binding
15091 * bit so this TIME-WAIT connection won't
15092 * interfere with new ones.
15093 */
15094 tcp->tcp_exclbind = 0;
15095 if (!TCP_IS_DETACHED(tcp)) {
15096 TCP_TIMER_RESTART(tcp,
15097 tcps->tcps_time_wait_interval);
15098 } else {
15099 tcp_time_wait_append(tcp);
15100 TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15101 }
15102 }
15103 /*FALLTHRU*/
15104 case TCPS_CLOSE_WAIT:
15105 freemsg(mp);
15106 goto xmit_check;
15107 default:
15113 /* Make sure we ack the fin */
15114 flags |= TH_ACK_NEEDED;
15115 if (!tcp->tcp_fin_rcvd) {
15116 tcp->tcp_fin_rcvd = B_TRUE;
15117 tcp->tcp_rnxt++;
15118 tcph = tcp->tcp_tcph;
15119 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
15120
15121 /*
15122 * Generate the ordrel_ind at the end unless we
15123 * are an eager guy.
15124 * In the eager case tcp_rsrv will do this when run
15125 * after tcp_accept is done.
15126 */
15127 if (tcp->tcp_listener == NULL &&
15128 !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding))
15129 flags |= TH_ORDREL_NEEDED;
15130 switch (tcp->tcp_state) {
15131 case TCPS_SYN_RCVD:
15132 case TCPS_ESTABLISHED:
15133 tcp->tcp_state = TCPS_CLOSE_WAIT;
15134 /* Keepalive? */
15135 break;
15136 case TCPS_FIN_WAIT_1:
15137 if (!tcp->tcp_fin_acked) {
15138 tcp->tcp_state = TCPS_CLOSING;
15139 break;
15140 }
15141 /* FALLTHRU */
15142 case TCPS_FIN_WAIT_2:
15143 tcp->tcp_state = TCPS_TIME_WAIT;
15144 /*
15145 * Unconditionally clear the exclusive binding
15146 * bit so this TIME-WAIT connection won't
15147 * interfere with new ones.
15148 */
15149 tcp->tcp_exclbind = 0;
15150 if (!TCP_IS_DETACHED(tcp)) {
15151 TCP_TIMER_RESTART(tcp,
15152 tcps->tcps_time_wait_interval);
15153 } else {
15154 tcp_time_wait_append(tcp);
15155 TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15156 }
15157 if (seg_len) {
15158 /*
15159 * implies data piggybacked on FIN.
15160 * break to handle data.
15161 */
15162 break;
15983 tea = (struct T_error_ack *)mp->b_rptr;
15984 tea->PRIM_type = T_ERROR_ACK;
15985 tea->TLI_error = TSYSERR;
15986 tea->UNIX_error = error;
15987 if (tcp->tcp_state >= TCPS_SYN_SENT) {
15988 tea->ERROR_prim = T_CONN_REQ;
15989 } else {
15990 tea->ERROR_prim = O_T_BIND_REQ;
15991 }
15992 break;
15993
15994 case T_ERROR_ACK:
15995 if (tcp->tcp_state >= TCPS_SYN_SENT)
15996 tea->ERROR_prim = T_CONN_REQ;
15997 break;
15998 default:
15999 panic("tcp_bind_failed: unexpected TPI type");
16000 /*NOTREACHED*/
16001 }
16002
16003 tcp->tcp_state = TCPS_IDLE;
16004 if (tcp->tcp_ipversion == IPV4_VERSION)
16005 tcp->tcp_ipha->ipha_src = 0;
16006 else
16007 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
16008 /*
16009 * Copy of the src addr. in tcp_t is needed since
16010 * the lookup funcs. can only look at tcp_t
16011 */
16012 V6_SET_ZERO(tcp->tcp_ip_src_v6);
16013
16014 tcph = tcp->tcp_tcph;
16015 tcph->th_lport[0] = 0;
16016 tcph->th_lport[1] = 0;
16017 tcp_bind_hash_remove(tcp);
16018 bzero(&connp->u_port, sizeof (connp->u_port));
16019 /* blow away saved option results if any */
16020 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
16021 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
16022
16025 }
16026
16027 /*
16028 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA
16029 * messages.
16030 */
16031 void
16032 tcp_rput_other(tcp_t *tcp, mblk_t *mp)
16033 {
16034 mblk_t *mp1;
16035 uchar_t *rptr = mp->b_rptr;
16036 queue_t *q = tcp->tcp_rq;
16037 struct T_error_ack *tea;
16038 uint32_t mss;
16039 mblk_t *syn_mp;
16040 mblk_t *mdti;
16041 mblk_t *lsoi;
16042 int retval;
16043 mblk_t *ire_mp;
16044 tcp_stack_t *tcps = tcp->tcp_tcps;
16045
16046 switch (mp->b_datap->db_type) {
16047 case M_PROTO:
16048 case M_PCPROTO:
16049 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
16050 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t))
16051 break;
16052 tea = (struct T_error_ack *)rptr;
16053 switch (tea->PRIM_type) {
16054 case T_BIND_ACK:
16055 /*
16056 * Adapt Multidata information, if any. The
16057 * following tcp_mdt_update routine will free
16058 * the message.
16059 */
16060 if ((mdti = tcp_mdt_info_mp(mp)) != NULL) {
16061 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
16062 b_rptr)->mdt_capab, B_TRUE);
16063 freemsg(mdti);
16064 }
16209 * Obtain the credential from the
16210 * thread calling connect(); the credential
16211 * lives on in the second mblk which
16212 * originated from T_CONN_REQ and is echoed
16213 * with the T_BIND_ACK from ip. If none
16214 * can be found, default to the creator
16215 * of the socket.
16216 */
16217 if (mp->b_cont == NULL ||
16218 (cr = DB_CRED(mp->b_cont)) == NULL) {
16219 cr = tcp->tcp_cred;
16220 pid = tcp->tcp_cpid;
16221 } else {
16222 pid = DB_CPID(mp->b_cont);
16223 }
16224
16225 TCP_RECORD_TRACE(tcp, syn_mp,
16226 TCP_TRACE_SEND_PKT);
16227 mblk_setcred(syn_mp, cr);
16228 DB_CPID(syn_mp) = pid;
16229 tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
16230 }
16231 after_syn_sent:
16232 /*
16233 * A trailer mblk indicates a waiting client upstream.
16234 * We complete here the processing begun in
16235 * either tcp_bind() or tcp_connect() by passing
16236 * upstream the reply message they supplied.
16237 */
16238 mp1 = mp;
16239 mp = mp->b_cont;
16240 freeb(mp1);
16241 if (mp)
16242 break;
16243 return;
16244 case T_ERROR_ACK:
16245 if (tcp->tcp_debug) {
16246 (void) strlog(TCP_MOD_ID, 0, 1,
16247 SL_TRACE|SL_ERROR,
16248 "tcp_rput_other: case T_ERROR_ACK, "
17828 }
17829
17830 /*
17831 * Need to clean up all the eagers since after the unbind, segments
17832 * will no longer be delivered to this listener stream.
17833 */
17834 mutex_enter(&tcp->tcp_eager_lock);
17835 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
17836 tcp_eager_cleanup(tcp, 0);
17837 }
17838 mutex_exit(&tcp->tcp_eager_lock);
17839
17840 if (tcp->tcp_ipversion == IPV4_VERSION) {
17841 tcp->tcp_ipha->ipha_src = 0;
17842 } else {
17843 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
17844 }
17845 V6_SET_ZERO(tcp->tcp_ip_src_v6);
17846 bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport));
17847 tcp_bind_hash_remove(tcp);
17848 tcp->tcp_state = TCPS_IDLE;
17849 tcp->tcp_mdt = B_FALSE;
17850 /* Send M_FLUSH according to TPI */
17851 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
17852 connp = tcp->tcp_connp;
17853 connp->conn_mdt_ok = B_FALSE;
17854 ipcl_hash_remove(connp);
17855 bzero(&connp->conn_ports, sizeof (connp->conn_ports));
17856 mp = mi_tpi_ok_ack_alloc(mp);
17857 putnext(tcp->tcp_rq, mp);
17858 }
17859
17860 /*
17861 * Don't let port fall into the privileged range.
17862 * Since the extra privileged ports can be arbitrary we also
17863 * ensure that we exclude those from consideration.
17864 * tcp_g_epriv_ports is not sorted thus we loop over it until
17865 * there are no changes.
17866 *
17867 * Note: No locks are held when inspecting tcp_g_*epriv_ports
19506
19507 return (B_TRUE);
19508 }
19509
19510 static void
19511 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
19512 {
19513 ipha_t *ipha;
19514 ipaddr_t src;
19515 ipaddr_t dst;
19516 uint32_t cksum;
19517 ire_t *ire;
19518 uint16_t *up;
19519 ill_t *ill;
19520 conn_t *connp = tcp->tcp_connp;
19521 uint32_t hcksum_txflags = 0;
19522 mblk_t *ire_fp_mp;
19523 uint_t ire_fp_mp_len;
19524 tcp_stack_t *tcps = tcp->tcp_tcps;
19525 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
19526
19527 ASSERT(DB_TYPE(mp) == M_DATA);
19528
19529 if (DB_CRED(mp) == NULL)
19530 mblk_setcred(mp, CONN_CRED(connp));
19531
19532 ipha = (ipha_t *)mp->b_rptr;
19533 src = ipha->ipha_src;
19534 dst = ipha->ipha_dst;
19535
19536 /*
19537 * Drop off fast path for IPv6 and also if options are present or
19538 * we need to resolve a TS label.
19539 */
19540 if (tcp->tcp_ipversion != IPV4_VERSION ||
19541 !IPCL_IS_CONNECTED(connp) ||
19542 !CONN_IS_LSO_MD_FASTPATH(connp) ||
19543 (connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
19544 !connp->conn_ulp_labeled ||
19545 ipha->ipha_ident == IP_HDR_INCLUDED ||
19546 ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
19547 IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
19548 if (tcp->tcp_snd_zcopy_aware)
19549 mp = tcp_zcopy_disable(tcp, mp);
19550 TCP_STAT(tcps, tcp_ip_send);
19551 CALL_IP_WPUT(connp, q, mp);
19552 return;
19553 }
19554
19555 if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) {
19556 if (tcp->tcp_snd_zcopy_aware)
19557 mp = tcp_zcopy_backoff(tcp, mp, 0);
19558 CALL_IP_WPUT(connp, q, mp);
19559 return;
19560 }
19561 ire_fp_mp = ire->ire_nce->nce_fp_mp;
19562 ire_fp_mp_len = MBLKL(ire_fp_mp);
19563
19564 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
19565 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
19566 #ifndef _BIG_ENDIAN
19567 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
19568 #endif
19569
19570 /*
20939
20940 /*
20941 * Set FIN bit if this is our last segment; snxt
20942 * already includes its length, and it will not
20943 * be adjusted after this point.
20944 */
20945 if (tcp->tcp_valid_bits == TCP_FSS_VALID &&
20946 *snxt == tcp->tcp_fss) {
20947 if (!tcp->tcp_fin_acked) {
20948 tcp->tcp_tcph->th_flags[0] |= TH_FIN;
20949 BUMP_MIB(&tcps->tcps_mib,
20950 tcpOutControl);
20951 }
20952 if (!tcp->tcp_fin_sent) {
20953 tcp->tcp_fin_sent = B_TRUE;
20954 /*
20955 * tcp state must be ESTABLISHED
20956 * in order for us to get here in
20957 * the first place.
20958 */
20959 tcp->tcp_state = TCPS_FIN_WAIT_1;
20960
20961 /*
20962 * Upon returning from this routine,
20963 * tcp_wput_data() will set tcp_snxt
20964 * to be equal to snxt + tcp_fin_sent.
20965 * This is essentially the same as
20966 * setting it to tcp_fss + 1.
20967 */
20968 }
20969 }
20970
20971 tcp->tcp_last_sent_len = (ushort_t)len;
20972
20973 len += tcp_hdr_len;
20974 if (tcp->tcp_ipversion == IPV4_VERSION)
20975 tcp->tcp_ipha->ipha_length = htons(len);
20976 else
20977 tcp->tcp_ip6h->ip6_plen = htons(len -
20978 ((char *)&tcp->tcp_ip6h[1] -
21574 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
21575 ASSERT(ill->ill_hcksum_capab != NULL);
21576 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
21577 }
21578
21579 /*
21580 * Since the TCP checksum should be recalculated by h/w, we can just
21581 * zero the checksum field for HCK_FULLCKSUM, or calculate partial
21582 * pseudo-header checksum for HCK_PARTIALCKSUM.
21583 * The partial pseudo-header excludes TCP length, that was calculated
21584 * in tcp_send(), so to zero *up before further processing.
21585 */
21586 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
21587
21588 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
21589 *up = 0;
21590
21591 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
21592 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
21593
21594 /*
21595 * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp).
21596 */
21597 DB_LSOFLAGS(mp) |= HW_LSO;
21598 DB_LSOMSS(mp) = mss;
21599
21600 ipha->ipha_fragment_offset_and_flags |=
21601 (uint32_t)htons(ire->ire_frag_flag);
21602
21603 ire_fp_mp = ire->ire_nce->nce_fp_mp;
21604 ire_fp_mp_len = MBLKL(ire_fp_mp);
21605 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
21606 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
21607 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
21608
21609 UPDATE_OB_PKT_COUNT(ire);
21610 ire->ire_last_used_time = lbolt;
21611 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
21612 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
21613 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
23219 }
23220
23221 if (mctl_present) {
23222 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
23223
23224 ASSERT(ii->ipsec_in_type == IPSEC_IN);
23225 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) {
23226 return;
23227 }
23228 }
23229 if (zoneid == ALL_ZONES)
23230 zoneid = GLOBAL_ZONEID;
23231
23232 /* Add the zoneid so ip_output routes it properly */
23233 if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) {
23234 freemsg(ipsec_mp);
23235 return;
23236 }
23237 ipsec_mp = nmp;
23238
23239 /*
23240 * NOTE: one might consider tracing a TCP packet here, but
23241 * this function has no active TCP state and no tcp structure
23242 * that has a trace buffer. If we traced here, we would have
23243 * to keep a local trace buffer in tcp_record_trace().
23244 *
23245 * TSol note: The mblk that contains the incoming packet was
23246 * reused by tcp_xmit_listener_reset, so it already contains
23247 * the right credentials and we don't need to call mblk_setcred.
23248 * Also the conn's cred is not right since it is associated
23249 * with tcps_g_q.
23250 */
23251 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp);
23252
23253 /*
23254 * Tell IP to mark the IRE used for this destination temporary.
23255 * This way, we can limit our exposure to DoS attack because IP
23256 * creates an IRE for each destination. If there are too many,
23257 * the time to do any routing lookup will be extremely long. And
23258 * the lookup can be in interrupt context.
23439 if (ipsec_mp == NULL)
23440 return;
23441 }
23442 if (is_system_labeled() && !tsol_can_reply_error(mp)) {
23443 DTRACE_PROBE2(
23444 tx__ip__log__error__nolistener__tcp,
23445 char *, "Could not reply with RST to mp(1)",
23446 mblk_t *, mp);
23447 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
23448 freemsg(ipsec_mp);
23449 return;
23450 }
23451
23452 rptr = mp->b_rptr;
23453
23454 tcph = (tcph_t *)&rptr[ip_hdr_len];
23455 seg_seq = BE32_TO_U32(tcph->th_seq);
23456 seg_ack = BE32_TO_U32(tcph->th_ack);
23457 flags = tcph->th_flags[0];
23458
23459 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
23460 if (flags & TH_RST) {
23461 freemsg(ipsec_mp);
23462 } else if (flags & TH_ACK) {
23463 tcp_xmit_early_reset("no tcp, reset",
23464 ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps,
23465 connp);
23466 } else {
23467 if (flags & TH_SYN) {
23468 seg_len++;
23469 } else {
23470 /*
23471 * Here we violate the RFC. Note that a normal
23472 * TCP will never send a segment without the ACK
23473 * flag, except for RST or SYN segment. This
23474 * segment is neither. Just drop it on the
23475 * floor.
23476 */
23477 freemsg(ipsec_mp);
23478 tcps->tcps_rst_unsent++;
23776 /*
23777 * Get IP set to checksum on our behalf
23778 * Include the adjustment for a source route if any.
23779 */
23780 u1 += tcp->tcp_sum;
23781 u1 = (u1 >> 16) + (u1 & 0xFFFF);
23782 U16_TO_BE16(u1, tcph->th_sum);
23783 BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23784 }
23785 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
23786 (seq + data_length) == tcp->tcp_fss) {
23787 if (!tcp->tcp_fin_acked) {
23788 flags |= TH_FIN;
23789 BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23790 }
23791 if (!tcp->tcp_fin_sent) {
23792 tcp->tcp_fin_sent = B_TRUE;
23793 switch (tcp->tcp_state) {
23794 case TCPS_SYN_RCVD:
23795 case TCPS_ESTABLISHED:
23796 tcp->tcp_state = TCPS_FIN_WAIT_1;
23797 break;
23798 case TCPS_CLOSE_WAIT:
23799 tcp->tcp_state = TCPS_LAST_ACK;
23800 break;
23801 }
23802 if (tcp->tcp_suna == tcp->tcp_snxt)
23803 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
23804 tcp->tcp_snxt = tcp->tcp_fss + 1;
23805 }
23806 }
23807 /*
23808 * Note the trick here. u1 is unsigned. When tcp_urg
23809 * is smaller than seq, u1 will become a very huge value.
23810 * So the comparison will fail. Also note that tcp_urp
23811 * should be positive, see RFC 793 page 17.
23812 */
23813 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
23814 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
23815 u1 < (uint32_t)(64 * 1024)) {
23816 flags |= TH_URG;
23817 BUMP_MIB(&tcps->tcps_mib, tcpOutUrg);
23818 U32_TO_ABE16(u1, tcph->th_urp);
/*
 * tcp_alloc_temp_tcp: build a minimal placeholder tcp_t for `port`.
 *
 * Creates an IPCL_TCPCONN conn_t through ipcl_conn_create() on the netstack
 * of `tcps`, takes the tcp_t hanging off it (conn_tcp), points it back at
 * `tcps` and fills in only the handful of fields assigned below: state
 * TCPS_BOUND, the local port, tcp_exclbind, tcp_reserved_port and an IPv4
 * placeholder IP version.
 *
 * Params:
 *	in_port_t port: value stored unchanged into tcp_lport.
 *	tcp_stack_t *tcps: stack instance the new tcp_t is attached to.
 *
 * Return:
 *	the new tcp_t, or NULL if ipcl_conn_create() returned NULL.
 *
 * NOTE(review): `port` is presumably already in the byte order tcp_lport
 * expects -- confirm against the callers.
 * NOTE(review): both parameters are referenced in the body, so the
 * ARGSUSED lint directive below looks unnecessary.
 */
24128 /* ARGSUSED */
24129 static tcp_t *
24130 tcp_alloc_temp_tcp(in_port_t port, tcp_stack_t *tcps)
24131 {
24132 conn_t *connp;
24133 tcp_t *tcp;
24134
/* The conn_t is requested with KM_SLEEP; a NULL result is still handled. */
24135 connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP, tcps->tcps_netstack);
24136 if (connp == NULL)
24137 return (NULL);
24138
/*
 * Use the tcp_t that belongs to the new conn_t and record which stack
 * instance it lives in. TCPS_REFHOLD presumably takes a reference on
 * tcps for the pointer stored here -- confirm against its definition.
 */
24139 tcp = connp->conn_tcp;
24140 tcp->tcp_tcps = tcps;
24141 TCPS_REFHOLD(tcps);
24142
24143 /*
24144 * Only initialize the fields that are actually needed. Note
24145 * that since INADDR_ANY is all 0, we do not need to set
24146 * tcp_bound_source to INADDR_ANY here.
24147 */
24148 tcp->tcp_state = TCPS_BOUND;
24149 tcp->tcp_lport = port;
24150 tcp->tcp_exclbind = 1;
24151 tcp->tcp_reserved_port = 1;
24152
24153 /* Placeholder value only. */
24154 tcp->tcp_ipversion = IPV4_VERSION;
24155
24156 return (tcp);
24157 }
24158
24159 /*
24160 * To remove a port range specified by lo_port and hi_port from the
24161 * reserved port ranges. This is one of the three public functions of
24162 * the reserved port interface. Note that a port range has to be removed
24163 * as a whole. Ports in a range cannot be removed individually.
24164 *
24165 * Params:
24166 * in_port_t lo_port: the beginning port of the reserved port range to
24167 * be deleted.
|
3460 tbr->PRIM_type = T_BIND_ACK;
3461 mp->b_datap->db_type = M_PCPROTO;
3462
3463 /* Chain in the reply mp for tcp_rput() */
3464 mp1->b_cont = mp;
3465 mp = mp1;
3466
3467 tcp->tcp_conn_req_max = tbr->CONIND_number;
3468 if (tcp->tcp_conn_req_max) {
3469 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
3470 tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472 tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473 /*
3474 * If this is a listener, do not reset the eager list
3475 * and other stuffs. Note that we don't check if the
3476 * existing eager list meets the new tcp_conn_req_max
3477 * requirement.
3478 */
3479 if (tcp->tcp_state != TCPS_LISTEN) {
3480 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3481 tcp_t *, tcp, int32_t, TCPS_LISTEN);
3482 tcp->tcp_state = TCPS_LISTEN;
3483 /* Initialize the chain. Don't need the eager_lock */
3484 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3485 tcp->tcp_eager_next_drop_q0 = tcp;
3486 tcp->tcp_eager_prev_drop_q0 = tcp;
3487 tcp->tcp_second_ctimer_threshold =
3488 tcps->tcps_ip_abort_linterval;
3489 }
3490 }
3491
3492 /*
3493 * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3494 * processing continues in tcp_rput_other().
3495 *
3496 * We need to make sure that the conn_recv is set to a non-null
3497 * value before we insert the conn into the classifier table.
3498 * This is to avoid a race with an incoming packet which does an
3499 * ipcl_classify().
3500 */
3501 connp->conn_recv = tcp_conn_request;
3758 * address and source port, which is
3759 * refused regardless of the
3760 * SO_REUSEADDR setting, so we break.
3761 */
3762 if (IN6_ARE_ADDR_EQUAL(laddr,
3763 <cp->tcp_bound_source_v6) &&
3764 (ltcp->tcp_state == TCPS_LISTEN ||
3765 ltcp->tcp_state == TCPS_BOUND))
3766 break;
3767 }
3768 }
3769 if (ltcp != NULL) {
3770 /* The port number is busy */
3771 mutex_exit(&tbf->tf_lock);
3772 } else {
3773 /*
3774 * This port is ours. Insert in fanout and mark as
3775 * bound to prevent others from getting the port
3776 * number.
3777 */
3778 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3779 tcp_t *, tcp, int32_t, TCPS_BOUND);
3780 tcp->tcp_state = TCPS_BOUND;
3781 tcp->tcp_lport = htons(port);
3782 *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3783
3784 ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3785 tcp->tcp_lport)] == tbf);
3786 tcp_bind_hash_insert(tbf, tcp, 1);
3787
3788 mutex_exit(&tbf->tf_lock);
3789
3790 /*
3791 * We don't want tcp_next_port_to_try to "inherit"
3792 * a port number supplied by the user in a bind.
3793 */
3794 if (user_specified)
3795 return (port);
3796
3797 /*
3798 * This is the only place where tcp_next_port_to_try
3799 * is updated. After the update, it may or may not
3891 tcp->tcp_ipversion == IPV6_VERSION)));
3892
3893 if (TCP_IS_DETACHED(tcp)) {
3894 if (tcp->tcp_hard_binding) {
3895 /*
3896 * Its an eager that we are dealing with. We close the
3897 * eager but in case a conn_ind has already gone to the
3898 * listener, let tcp_accept_finish() send a discon_ind
3899 * to the listener and drop the last reference. If the
3900 * listener doesn't even know about the eager i.e. the
3901 * conn_ind hasn't gone up, blow away the eager and drop
3902 * the last reference as well. If the conn_ind has gone
3903 * up, state should be BOUND. tcp_accept_finish
3904 * will figure out that the connection has received a
3905 * RST and will send a DISCON_IND to the application.
3906 */
3907 tcp_closei_local(tcp);
3908 if (!tcp->tcp_tconnind_started) {
3909 CONN_DEC_REF(tcp->tcp_connp);
3910 } else {
3911 DTRACE_TCP4(state__change, void, NULL,
3912 conn_t *, NULL, tcp_t *, tcp, int32_t,
3913 TCPS_BOUND);
3914 tcp->tcp_state = TCPS_BOUND;
3915 }
3916 } else {
3917 tcp_close_detached(tcp);
3918 }
3919 return (0);
3920 }
3921
3922 TCP_STAT(tcps, tcp_clean_death_nondetached);
3923
3924 /*
3925 * If T_ORDREL_IND has not been sent yet (done when service routine
3926 * is run) postpone cleaning up the endpoint until service routine
3927 * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3928 * client_errno since tcp_close uses the client_errno field.
3929 */
3930 if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3931 if (err != 0)
3932 tcp->tcp_client_errno = err;
3933
4627 (void) tcp_time_wait_remove(tcp, NULL);
4628 CL_INET_DISCONNECT(tcp);
4629 ipcl_hash_remove(connp);
4630
4631 /*
4632 * Delete the cached ire in conn_ire_cache and also mark
4633 * the conn as CONDEMNED
4634 */
4635 mutex_enter(&connp->conn_lock);
4636 connp->conn_state_flags |= CONN_CONDEMNED;
4637 ire = connp->conn_ire_cache;
4638 connp->conn_ire_cache = NULL;
4639 mutex_exit(&connp->conn_lock);
4640 if (ire != NULL)
4641 IRE_REFRELE_NOTR(ire);
4642
4643 /* Need to cleanup any pending ioctls */
4644 ASSERT(tcp->tcp_time_wait_next == NULL);
4645 ASSERT(tcp->tcp_time_wait_prev == NULL);
4646 ASSERT(tcp->tcp_time_wait_expire == 0);
4647 if (connp->conn_fully_bound) {
4648 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
4649 tcp_t *, tcp, int32_t, TCPS_CLOSED);
4650 }
4651 tcp->tcp_state = TCPS_CLOSED;
4652
4653 /* Release any SSL context */
4654 if (tcp->tcp_kssl_ent != NULL) {
4655 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4656 tcp->tcp_kssl_ent = NULL;
4657 }
4658 if (tcp->tcp_kssl_ctx != NULL) {
4659 kssl_release_ctx(tcp->tcp_kssl_ctx);
4660 tcp->tcp_kssl_ctx = NULL;
4661 }
4662 tcp->tcp_kssl_pending = B_FALSE;
4663
4664 tcp_ipsec_cleanup(tcp);
4665 }
4666
4667 /*
4668 * tcp is dying (called from ipcl_conn_destroy and error cases).
4669 * Free the tcp_t in either case.
4670 */
5872 DTRACE_PROBE3(
5873 tx__ip__log__error__connrequest__tcp,
5874 char *, "eager connp(1) label on SYN mp(2) failed",
5875 conn_t *, econnp, mblk_t *, mp);
5876 goto error3;
5877 }
5878 }
5879
5880 eager->tcp_hard_binding = B_TRUE;
5881
5882 tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
5883 TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
5884
5885 CL_INET_CONNECT(eager);
5886
5887 /*
5888 * No need to check for multicast destination since ip will only pass
5889 * up multicasts to those that have expressed interest
5890 * TODO: what about rejecting broadcasts?
5891 * Also check that source is not a multicast or broadcast address.
5892 *
5893 * DTrace tcp:::state-change is probed a little further down,
5894 * where it is set for the second time.
5895 */
5896 eager->tcp_state = TCPS_SYN_RCVD;
5897
5898
5899 /*
5900 * There should be no ire in the mp as we are being called after
5901 * receiving the SYN.
5902 */
5903 ASSERT(tcp_ire_mp(mp) == NULL);
5904
5905 /*
5906 * Adapt our mss, ttl, ... according to information provided in IRE.
5907 */
5908
5909 if (tcp_adapt_ire(eager, NULL) == 0) {
5910 /* Undo the bind_hash_insert */
5911 tcp_bind_hash_remove(eager);
5912 goto error3;
5913 }
5914
5915 /*
5916 * DTrace the first SYN as a tcp:::receive. This is placed after
5917 * tcp_adapt_ire() so that tcp->tcp_loopback has been set.
5918 */
5919 DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
5920 mp->b_rptr, tcp_t *, tcp, tcph_t *, tcph);
5921
5922 /* Process all TCP options. */
5923 tcp_process_options(eager, tcph);
5924
5925 /* Is the other end ECN capable? */
5926 if (tcps->tcps_ecn_permitted >= 1 &&
5927 (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
5928 eager->tcp_ecn_ok = B_TRUE;
5929 }
5930
5931 /*
5932 * listener->tcp_rq->q_hiwat should be the default window size or a
5933 * window size changed via SO_RCVBUF option. First round up the
5934 * eager's tcp_rwnd to the nearest MSS. Then find out the window
5935 * scale option value if needed. Call tcp_rwnd_set() to finish the
5936 * setting.
5937 *
5938 * Note if there is a rpipe metric associated with the remote host,
5939 * we should not inherit receive window size from listener.
5940 */
5941 eager->tcp_rwnd = MSS_ROUNDUP(
6018 if (addr_cache != NULL && eager->tcp_remote ==
6019 addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
6020 eager->tcp_dontdrop = B_TRUE;
6021 }
6022 }
6023
6024 /*
6025 * We need to insert the eager in its own perimeter but as soon
6026 * as we do that, we expose the eager to the classifier and
6027 * should not touch any field outside the eager's perimeter.
6028 * So do all the work necessary before inserting the eager
6029 * in its own perimeter. Be optimistic that ipcl_conn_insert()
6030 * will succeed but undo everything if it fails.
6031 */
6032 seg_seq = ABE32_TO_U32(tcph->th_seq);
6033 eager->tcp_irs = seg_seq;
6034 eager->tcp_rack = seg_seq;
6035 eager->tcp_rnxt = seg_seq + 1;
6036 U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
6037 BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
6038 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, eager,
6039 int32_t, TCPS_SYN_RCVD);
6040 eager->tcp_state = TCPS_SYN_RCVD;
6041 mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
6042 NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
6043 if (mp1 == NULL) {
6044 /*
6045 * Increment the ref count as we are going to
6046 * enqueueing an mp in squeue
6047 */
6048 CONN_INC_REF(econnp);
6049 goto error;
6050 }
6051 DB_CPID(mp1) = tcp->tcp_cpid;
6052 eager->tcp_cpid = tcp->tcp_cpid;
6053 eager->tcp_open_time = lbolt64;
6054
6055 /*
6056 * We need to start the rto timer. In normal case, we start
6057 * the timer after sending the packet on the wire (or at
6058 * least believing that packet was sent by waiting for
6059 * CALL_IP_WPUT() to return). Since this is the first packet
6129 * anymore (someone blew it away). Just
6130 * free this message and hopefully remote
6131 * will retransmit at which time the SYN can be
6132 * treated as a new connection or dealth with
6133 * a TH_RST if a connection already exists.
6134 */
6135 CONN_DEC_REF(econnp);
6136 freemsg(mp);
6137 } else {
6138 squeue_fill(econnp->conn_sqp, mp, tcp_input,
6139 econnp, SQTAG_TCP_CONN_REQ_1);
6140 }
6141 } else {
6142 /* Nobody wants this packet */
6143 freemsg(mp);
6144 }
6145 return;
6146 error3:
6147 CONN_DEC_REF(econnp);
6148 error2:
6149 /*
6150 * DTrace this tcp:::receive event, as we skipped the previous receive
6151 * probe. For DTrace only, we find the IP header length so that the
6152 * TCP header can be found.
6153 */
6154 ipvers = IPH_HDR_VERSION(mp->b_rptr);
6155 if (OK_32PTR(mp->b_rptr) &&
6156 (ipvers == IPV4_VERSION || ipvers == IPV6_VERSION)) {
6157 if (ipvers == IPV4_VERSION)
6158 ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)mp->b_rptr);
6159 else
6160 ip_hdr_len = ip_hdr_length_v6(mp, (ip6_t *)mp->b_rptr);
6161 DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL,
6162 void_ip_t *, mp->b_rptr, tcp_t *, NULL, tcph_t *,
6163 &mp->b_rptr[ip_hdr_len]);
6164 }
6165
6166 freemsg(mp);
6167 }
6168
6169 /*
6170 * In an ideal case of vertical partition in NUMA architecture, it is
6171 * beneficial to have the listener and all the incoming connections
6172 * tied to the same squeue. The other constraint is that incoming
6173 * connections should be tied to the squeue attached to the interrupted
6174 * CPU for obvious locality reasons, so this leaves the listener to
6175 * be tied to the same squeue. Our only problem is that when listener
6176 * is binding, the CPU that will get interrupted by the NIC whose
6177 * IP address the listener is binding to is not even known. So
6178 * the code below allows us to change that binding at the time the
6179 * CPU is interrupted by virtue of incoming connection's squeue.
6180 *
6181 * This is useful only in case of a listener bound to a specific IP
6182 * address. Other kinds of listeners get bound the
6183 * very first time and there is no attempt to rebind them.
6184 */
6185 void
6644 * At this point the remote destination address and remote port fields
6645 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6646 * have to see which state tcp was in so we can take apropriate action.
6647 */
6648 if (oldstate == TCPS_IDLE) {
6649 /*
6650 * We support a quick connect capability here, allowing
6651 * clients to transition directly from IDLE to SYN_SENT
6652 * tcp_bindi will pick an unused port, insert the connection
6653 * in the bind hash and transition to BOUND state.
6654 */
6655 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6656 tcp, B_TRUE);
6657 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6658 B_FALSE, B_FALSE);
6659 if (lport == 0) {
6660 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6661 goto failed;
6662 }
6663 }
6664 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6665 int32_t, TCPS_SYN_SENT);
6666 tcp->tcp_state = TCPS_SYN_SENT;
6667
6668 /*
6669 * TODO: allow data with connect requests
6670 * by unlinking M_DATA trailers here and
6671 * linking them in behind the T_OK_ACK mblk.
6672 * The tcp_rput() bind ack handler would then
6673 * feed them to tcp_wput_data() rather than call
6674 * tcp_timer().
6675 */
6676 mp = mi_tpi_ok_ack_alloc(mp);
6677 if (!mp) {
6678 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6679 tcp_t *, tcp, int32_t, oldstate);
6680 tcp->tcp_state = oldstate;
6681 goto failed;
6682 }
6683 if (tcp->tcp_family == AF_INET) {
6684 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6685 sizeof (ipa_conn_t));
6686 } else {
6687 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6688 sizeof (ipa6_conn_t));
6689 }
6690 if (mp1) {
6691 /*
6692 * We need to make sure that the conn_recv is set to a non-null
6693 * value before we insert the conn_t into the classifier table.
6694 * This is to avoid a race with an incoming packet which does
6695 * an ipcl_classify().
6696 */
6697 tcp->tcp_connp->conn_recv = tcp_input;
6698
6699 /* Hang onto the T_OK_ACK for later. */
6700 linkb(mp1, mp);
6701 mblk_setcred(mp1, tcp->tcp_cred);
6702 if (tcp->tcp_family == AF_INET)
6703 mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6704 else {
6705 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6706 &tcp->tcp_sticky_ipp);
6707 }
6708 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6709 tcp->tcp_active_open = 1;
6710 /*
6711 * If the bind cannot complete immediately
6712 * IP will arrange to call tcp_rput_other
6713 * when the bind completes.
6714 */
6715 if (mp1 != NULL)
6716 tcp_rput_other(tcp, mp1);
6717 return;
6718 }
6719 /* Error case */
6720 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6721 int32_t, oldstate);
6722 tcp->tcp_state = oldstate;
6723 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6724
6725 failed:
6726 /* return error ack and blow away saved option results if any */
6727 if (mp != NULL)
6728 putnext(tcp->tcp_rq, mp);
6729 else {
6730 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6731 TSYSERR, ENOMEM);
6732 }
6733 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6734 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6735
6736 }
6737
6738 /*
6739 * Handle connect to IPv6 destinations.
6740 */
6741 static void
6858 * At this point the remote destination address and remote port fields
6859 * in the tcp-four-tuple have been filled in the tcp structure. Now we
6860 * have to see which state tcp was in so we can take apropriate action.
6861 */
6862 if (oldstate == TCPS_IDLE) {
6863 /*
6864 * We support a quick connect capability here, allowing
6865 * clients to transition directly from IDLE to SYN_SENT
6866 * tcp_bindi will pick an unused port, insert the connection
6867 * in the bind hash and transition to BOUND state.
6868 */
6869 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6870 tcp, B_TRUE);
6871 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6872 B_FALSE, B_FALSE);
6873 if (lport == 0) {
6874 mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6875 goto failed;
6876 }
6877 }
6878 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6879 int32_t, TCPS_SYN_SENT);
6880 tcp->tcp_state = TCPS_SYN_SENT;
6881 /*
6882 * TODO: allow data with connect requests
6883 * by unlinking M_DATA trailers here and
6884 * linking them in behind the T_OK_ACK mblk.
6885 * The tcp_rput() bind ack handler would then
6886 * feed them to tcp_wput_data() rather than call
6887 * tcp_timer().
6888 */
6889 mp = mi_tpi_ok_ack_alloc(mp);
6890 if (!mp) {
6891 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6892 tcp_t *, tcp, int32_t, oldstate);
6893 tcp->tcp_state = oldstate;
6894 goto failed;
6895 }
6896 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6897 if (mp1) {
6898 /*
6899 * We need to make sure that the conn_recv is set to a non-null
6900 * value before we insert the conn_t into the classifier table.
6901 * This is to avoid a race with an incoming packet which does
6902 * an ipcl_classify().
6903 */
6904 tcp->tcp_connp->conn_recv = tcp_input;
6905
6906 /* Hang onto the T_OK_ACK for later. */
6907 linkb(mp1, mp);
6908 mblk_setcred(mp1, tcp->tcp_cred);
6909 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6910 &tcp->tcp_sticky_ipp);
6911 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6912 tcp->tcp_active_open = 1;
6913 /* ip_bind_v6() may return ACK or ERROR */
6914 if (mp1 != NULL)
6915 tcp_rput_other(tcp, mp1);
6916 return;
6917 }
6918 /* Error case */
6919 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6920 int32_t, oldstate);
6921 tcp->tcp_state = oldstate;
6922 mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6923
6924 failed:
6925 /* return error ack and blow away saved option results if any */
6926 if (mp != NULL)
6927 putnext(tcp->tcp_rq, mp);
6928 else {
6929 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6930 TSYSERR, ENOMEM);
6931 }
6932 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6933 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6934 }
6935
6936 /*
6937 * We need a stream q for detached closing tcp connections
6938 * to use. Our client hereby indicates that this q is the
6939 * one to use.
6940 */
7049 ltcp = NULL;
7050 /*
7051 * If it used to be a listener, check to make sure no one else
7052 * has taken the port before switching back to LISTEN state.
7053 */
7054 if (tcp->tcp_ipversion == IPV4_VERSION) {
7055 connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7056 tcp->tcp_ipha->ipha_src,
7057 tcp->tcp_connp->conn_zoneid, ipst);
7058 if (connp != NULL)
7059 ltcp = connp->conn_tcp;
7060 } else {
7061 /* Allow tcp_bound_if listeners? */
7062 connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7063 &tcp->tcp_ip6h->ip6_src, 0,
7064 tcp->tcp_connp->conn_zoneid, ipst);
7065 if (connp != NULL)
7066 ltcp = connp->conn_tcp;
7067 }
7068 if (tcp->tcp_conn_req_max && ltcp == NULL) {
7069 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7070 tcp_t *, tcp, int32_t, TCPS_LISTEN);
7071 tcp->tcp_state = TCPS_LISTEN;
7072 } else if (old_state > TCPS_BOUND) {
7073 tcp->tcp_conn_req_max = 0;
7074 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7075 tcp_t *, tcp, int32_t, TCPS_BOUND);
7076 tcp->tcp_state = TCPS_BOUND;
7077 }
7078 if (ltcp != NULL)
7079 CONN_DEC_REF(ltcp->tcp_connp);
7080 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7081 BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7082 } else if (old_state == TCPS_ESTABLISHED ||
7083 old_state == TCPS_CLOSE_WAIT) {
7084 BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7085 }
7086
7087 if (tcp->tcp_fused)
7088 tcp_unfuse(tcp);
7089
7090 mutex_enter(&tcp->tcp_eager_lock);
7091 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7092 (tcp->tcp_conn_req_cnt_q != 0)) {
7093 tcp_eager_cleanup(tcp, 0);
7094 }
7095 mutex_exit(&tcp->tcp_eager_lock);
7956 tcp_ipsec_cleanup(tcp);
7957
7958 if (tcp->tcp_conn_req_max != 0) {
7959 /*
7960 * This is the case when a TLI program uses the same
7961 * transport end point to accept a connection. This
7962 * makes the TCP both a listener and acceptor. When
7963 * this connection is closed, we need to set the state
7964 * back to TCPS_LISTEN. Make sure that the eager list
7965 * is reinitialized.
7966 *
7967 * Note that this stream is still bound to the four
7968 * tuples of the previous connection in IP. If a new
7969 * SYN with different foreign address comes in, IP will
7970 * not find it and will send it to the global queue. In
7971 * the global queue, TCP will do a tcp_lookup_listener()
7972 * to find this stream. This works because this stream
7973 * is only removed from connected hash.
7974 *
7975 */
7976 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7977 tcp_t *, tcp, int32_t, TCPS_LISTEN);
7978 tcp->tcp_state = TCPS_LISTEN;
7979 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7980 tcp->tcp_eager_next_drop_q0 = tcp;
7981 tcp->tcp_eager_prev_drop_q0 = tcp;
7982 tcp->tcp_connp->conn_recv = tcp_conn_request;
7983 if (tcp->tcp_family == AF_INET6) {
7984 ASSERT(tcp->tcp_connp->conn_af_isv6);
7985 (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7986 &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7987 } else {
7988 ASSERT(!tcp->tcp_connp->conn_af_isv6);
7989 (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7990 tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7991 }
7992 } else {
7993 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7994 tcp_t *, tcp, int32_t, TCPS_BOUND);
7995 tcp->tcp_state = TCPS_BOUND;
7996 }
7997
7998 /*
7999 * Initialize to default values
8000 * Can't fail since enough header template space already allocated
8001 * at open().
8002 */
8003 err = tcp_init_values(tcp);
8004 ASSERT(err == 0);
8005 /* Restore state in tcp_tcph */
8006 bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
8007 if (tcp->tcp_ipversion == IPV4_VERSION)
8008 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
8009 else
8010 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
8011 /*
8012 * Copy of the src addr. in tcp_t is needed in tcp_t
8013 * since the lookup funcs can only lookup on tcp_t
8014 */
8336 DONTCARE(tcp->tcmp_stk[0]);
8337 #endif
8338
8339
8340 #undef DONTCARE
8341 #undef PRESERVE
8342 }
8343
8344 /*
8345 * Allocate necessary resources and initialize state vector.
8346 * Guaranteed not to fail so that when an error is returned,
8347 * the caller doesn't need to do any additional cleanup.
8348 */
8349 int
8350 tcp_init(tcp_t *tcp, queue_t *q)
8351 {
8352 int err;
8353
8354 tcp->tcp_rq = q;
8355 tcp->tcp_wq = WR(q);
8356 /* DTrace ignores this - it isn't a tcp:::state-change */
8357 tcp->tcp_state = TCPS_IDLE;
8358 if ((err = tcp_init_values(tcp)) != 0)
8359 tcp_timers_stop(tcp);
8360 return (err);
8361 }
8362
8363 static int
8364 tcp_init_values(tcp_t *tcp)
8365 {
8366 int err;
8367 tcp_stack_t *tcps = tcp->tcp_tcps;
8368
8369 ASSERT((tcp->tcp_family == AF_INET &&
8370 tcp->tcp_ipversion == IPV4_VERSION) ||
8371 (tcp->tcp_family == AF_INET6 &&
8372 (tcp->tcp_ipversion == IPV4_VERSION ||
8373 tcp->tcp_ipversion == IPV6_VERSION)));
8374
8375 /*
8376 * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
13338 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
13339 freemsg(mp);
13340 return;
13341 }
13342 /* Update pointers into message */
13343 iphdr = rptr = mp->b_rptr;
13344 tcph = (tcph_t *)&rptr[ip_hdr_len];
13345 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
13346 /*
13347 * Since we can't handle any data with this urgent
13348 * pointer that is out of sequence, we expunge
13349 * the data. This allows us to still register
13350 * the urgent mark and generate the M_PCSIG,
13351 * which we can do.
13352 */
13353 mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
13354 seg_len = 0;
13355 }
13356 }
13357
13358 DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
13359 iphdr, tcp_t *, tcp, tcph_t *, tcph);
13360 if (tcp->tcp_state == TCPS_SYN_RCVD && (flags & TH_ACK)) {
13361 DTRACE_TCP5(accept__established, mblk_t *, NULL, conn_t *,
13362 NULL, void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcph);
13363 }
13364
13365 switch (tcp->tcp_state) {
13366 case TCPS_SYN_SENT:
13367 if (flags & TH_ACK) {
13368 /*
13369 * Note that our stack cannot send data before a
13370 * connection is established, therefore the
13371 * following check is valid. Otherwise, it has
13372 * to be changed.
13373 */
13374 if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
13375 SEQ_GT(seg_ack, tcp->tcp_snxt)) {
13376 freemsg(mp);
13377 if (flags & TH_RST)
13378 return;
13379 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
13380 tcp, seg_ack, 0, TH_RST);
13381 return;
13382 }
13383 ASSERT(tcp->tcp_suna + 1 == seg_ack);
13384 }
13385 if (flags & TH_RST) {
13386 DTRACE_TCP5(connect__refused, mblk_t *, NULL,
13387 conn_t *, NULL, void_ip_t *, iphdr, tcp_t *, NULL,
13388 tcph_t *, tcph);
13389
13390 freemsg(mp);
13391 if (flags & TH_ACK)
13392 (void) tcp_clean_death(tcp,
13393 ECONNREFUSED, 13);
13394 return;
13395 }
13396 if (!(flags & TH_SYN)) {
13397 freemsg(mp);
13398 return;
13399 }
13400
13401 /* Process all TCP options. */
13402 tcp_process_options(tcp, tcph);
13403 /*
13404 * The following changes our rwnd to be a multiple of the
13405 * MIN(peer MSS, our MSS) for performance reason.
13406 */
13407 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat,
13408 tcp->tcp_mss));
13409
13444 *
13445 * XXX: how can we pretend we didn't see it if we
13446 * have updated rnxt et al.
13447 *
13448 * For loopback we defer sending up the T_CONN_CON
13449 * until after some checks below.
13450 */
13451 mp1 = NULL;
13452 if (!tcp_conn_con(tcp, iphdr, tcph, mp,
13453 tcp->tcp_loopback ? &mp1 : NULL)) {
13454 freemsg(mp);
13455 return;
13456 }
13457 /* SYN was acked - making progress */
13458 if (tcp->tcp_ipversion == IPV6_VERSION)
13459 tcp->tcp_ip_forward_progress = B_TRUE;
13460
13461 /* One for the SYN */
13462 tcp->tcp_suna = tcp->tcp_iss + 1;
13463 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
13464 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
13465 tcp_t *, tcp, int32_t, TCPS_ESTABLISHED);
13466 tcp->tcp_state = TCPS_ESTABLISHED;
13467
13468 /*
13469 * For DTrace observability, remember that we just
13470 * established a connection and are about to send
13471 * the final ACK.
13472 */
13473 tcp->tcp_dtrace_connect_established = B_TRUE;
13474
13475 /*
13476 * If SYN was retransmitted, need to reset all
13477 * retransmission info. This is because this
13478 * segment will be treated as a dup ACK.
13479 */
13480 if (tcp->tcp_rexmit) {
13481 tcp->tcp_rexmit = B_FALSE;
13482 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
13483 tcp->tcp_rexmit_max = tcp->tcp_snxt;
13484 tcp->tcp_snd_burst = tcp->tcp_localnet ?
13485 TCP_CWND_INFINITE : TCP_CWND_NORMAL;
13486 tcp->tcp_ms_we_have_waited = 0;
13487
13488 /*
13489 * Set tcp_cwnd back to 1 MSS, per
13490 * recommendation from
13491 * draft-floyd-incr-init-win-01.txt,
13492 * Increasing TCP's Initial Window.
13493 */
13494 tcp->tcp_cwnd = tcp->tcp_mss;
13495 }
13562
13563 /*
13564 * Check to see if there is data to be sent. If
13565 * yes, set the transmit flag. Then check to see
13566 * if received data processing needs to be done.
13567 * If not, go straight to xmit_check. This short
13568 * cut is OK as we don't support T/TCP.
13569 */
13570 if (tcp->tcp_unsent)
13571 flags |= TH_XMIT_NEEDED;
13572
13573 if (seg_len == 0 && !(flags & TH_URG)) {
13574 freemsg(mp);
13575 goto xmit_check;
13576 }
13577
13578 flags &= ~TH_SYN;
13579 seg_seq++;
13580 break;
13581 }
13582 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
13583 tcp_t *, tcp, int32_t, TCPS_SYN_RCVD);
13584 tcp->tcp_state = TCPS_SYN_RCVD;
13585 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
13586 NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
13587 if (mp1) {
13588 DB_CPID(mp1) = tcp->tcp_cpid;
13589 TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
13590 tcp_send_data(tcp, tcp->tcp_wq, mp1);
13591 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
13592 }
13593 freemsg(mp);
13594 return;
13595 case TCPS_SYN_RCVD:
13596 if (flags & TH_ACK) {
13597 /*
13598 * In this state, a SYN|ACK packet is either bogus
13599 * because the other side must be ACKing our SYN which
13600 * indicates it has seen the ACK for their SYN and
13601 * shouldn't retransmit it or we're crossing SYNs
13602 * on active open.
13603 */
14483 tcp->tcp_cwnd = mss;
14484 }
14485
14486 /*
14487 * We set the send window to zero here.
14488 * This is needed if there is data to be
14489 * processed already on the queue.
14490 * Later (at swnd_update label), the
14491 * "new_swnd > tcp_swnd" condition is satisfied and
14492 * the XMIT_NEEDED flag is set in the current
14493 * (SYN_RCVD) state. This ensures tcp_wput_data() is
14494 * called if there is already data on queue in
14495 * this state.
14496 */
14497 tcp->tcp_swnd = 0;
14498
14499 if (new_swnd > tcp->tcp_max_swnd)
14500 tcp->tcp_max_swnd = new_swnd;
14501 tcp->tcp_swl1 = seg_seq;
14502 tcp->tcp_swl2 = seg_ack;
14503 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
14504 tcp_t *, tcp, int32_t, TCPS_ESTABLISHED);
14505 tcp->tcp_state = TCPS_ESTABLISHED;
14506 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
14507
14508 /* Fuse when both sides are in ESTABLISHED state */
14509 if (tcp->tcp_loopback && do_tcp_fusion)
14510 tcp_fuse(tcp, iphdr, tcph);
14511
14512 }
14513 /* This code follows 4.4BSD-Lite2 mostly. */
14514 if (bytes_acked < 0)
14515 goto est;
14516
14517 /*
14518 * If TCP is ECN capable and the congestion experience bit is
14519 * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be
14520 * done once per window (or more loosely, per RTT).
14521 */
14522 if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
14523 tcp->tcp_cwr = B_FALSE;
14524 if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {
15121 *
15122 * 1. the segment acknowledges some data. Or
15123 * 2. the segment is new, i.e. it has a higher seq num. Or
15124 * 3. the segment is not old and the advertised window is
15125 * larger than the previous advertised window.
15126 */
15127 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
15128 flags |= TH_XMIT_NEEDED;
15129 tcp->tcp_swnd = new_swnd;
15130 if (new_swnd > tcp->tcp_max_swnd)
15131 tcp->tcp_max_swnd = new_swnd;
15132 tcp->tcp_swl1 = seg_seq;
15133 tcp->tcp_swl2 = seg_ack;
15134 }
15135 est:
15136 if (tcp->tcp_state > TCPS_ESTABLISHED) {
15137
15138 switch (tcp->tcp_state) {
15139 case TCPS_FIN_WAIT_1:
15140 if (tcp->tcp_fin_acked) {
15141 DTRACE_TCP4(state__change, void, NULL,
15142 conn_t *, NULL, tcp_t *, tcp, int32_t,
15143 TCPS_FIN_WAIT_2);
15144 tcp->tcp_state = TCPS_FIN_WAIT_2;
15145 /*
15146 * We implement the non-standard BSD/SunOS
15147 * FIN_WAIT_2 flushing algorithm.
15148 * If there is no user attached to this
15149 * TCP endpoint, then this TCP struct
15150 * could hang around forever in FIN_WAIT_2
15151 * state if the peer forgets to send us
15152 * a FIN. To prevent this, we wait only
15153 * 2*MSL (a convenient time value) for
15154 * the FIN to arrive. If it doesn't show up,
15155 * we flush the TCP endpoint. This algorithm,
15156 * though a violation of RFC-793, has worked
15157 * for over 10 years in BSD systems.
15158 * Note: SunOS 4.x waits 675 seconds before
15159 * flushing the FIN_WAIT_2 connection.
15160 */
15161 TCP_TIMER_RESTART(tcp,
15162 tcps->tcps_fin_wait_2_flush_interval);
15163 }
15164 break;
15165 case TCPS_FIN_WAIT_2:
15166 break; /* Shutdown hook? */
15167 case TCPS_LAST_ACK:
15168 freemsg(mp);
15169 if (tcp->tcp_fin_acked) {
15170 (void) tcp_clean_death(tcp, 0, 19);
15171 return;
15172 }
15173 goto xmit_check;
15174 case TCPS_CLOSING:
15175 if (tcp->tcp_fin_acked) {
15176 DTRACE_TCP4(state__change, void, NULL,
15177 conn_t *, NULL, tcp_t *, tcp, int32_t,
15178 TCPS_TIME_WAIT);
15179 tcp->tcp_state = TCPS_TIME_WAIT;
15180 /*
15181 * Unconditionally clear the exclusive binding
15182 * bit so this TIME-WAIT connection won't
15183 * interfere with new ones.
15184 */
15185 tcp->tcp_exclbind = 0;
15186 if (!TCP_IS_DETACHED(tcp)) {
15187 TCP_TIMER_RESTART(tcp,
15188 tcps->tcps_time_wait_interval);
15189 } else {
15190 tcp_time_wait_append(tcp);
15191 TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15192 }
15193 }
15194 /*FALLTHRU*/
15195 case TCPS_CLOSE_WAIT:
15196 freemsg(mp);
15197 goto xmit_check;
15198 default:
15204 /* Make sure we ack the fin */
15205 flags |= TH_ACK_NEEDED;
15206 if (!tcp->tcp_fin_rcvd) {
15207 tcp->tcp_fin_rcvd = B_TRUE;
15208 tcp->tcp_rnxt++;
15209 tcph = tcp->tcp_tcph;
15210 U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
15211
15212 /*
15213 * Generate the ordrel_ind at the end unless we
15214 * are an eager guy.
15215 * In the eager case tcp_rsrv will do this when run
15216 * after tcp_accept is done.
15217 */
15218 if (tcp->tcp_listener == NULL &&
15219 !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding))
15220 flags |= TH_ORDREL_NEEDED;
15221 switch (tcp->tcp_state) {
15222 case TCPS_SYN_RCVD:
15223 case TCPS_ESTABLISHED:
15224 DTRACE_TCP4(state__change, void, NULL,
15225 conn_t *, NULL, tcp_t *, tcp, int32_t,
15226 TCPS_CLOSE_WAIT);
15227 tcp->tcp_state = TCPS_CLOSE_WAIT;
15228 /* Keepalive? */
15229 break;
15230 case TCPS_FIN_WAIT_1:
15231 if (!tcp->tcp_fin_acked) {
15232 DTRACE_TCP4(state__change, void, NULL,
15233 conn_t *, NULL, tcp_t *, tcp,
15234 int32_t, TCPS_CLOSING);
15235 tcp->tcp_state = TCPS_CLOSING;
15236 break;
15237 }
15238 /* FALLTHRU */
15239 case TCPS_FIN_WAIT_2:
15240 DTRACE_TCP4(state__change, void, NULL,
15241 conn_t *, NULL, tcp_t *, tcp, int32_t,
15242 TCPS_TIME_WAIT);
15243 tcp->tcp_state = TCPS_TIME_WAIT;
15244 /*
15245 * Unconditionally clear the exclusive binding
15246 * bit so this TIME-WAIT connection won't
15247 * interfere with new ones.
15248 */
15249 tcp->tcp_exclbind = 0;
15250 if (!TCP_IS_DETACHED(tcp)) {
15251 TCP_TIMER_RESTART(tcp,
15252 tcps->tcps_time_wait_interval);
15253 } else {
15254 tcp_time_wait_append(tcp);
15255 TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15256 }
15257 if (seg_len) {
15258 /*
15259 * implies data piggybacked on FIN.
15260 * break to handle data.
15261 */
15262 break;
16083 tea = (struct T_error_ack *)mp->b_rptr;
16084 tea->PRIM_type = T_ERROR_ACK;
16085 tea->TLI_error = TSYSERR;
16086 tea->UNIX_error = error;
16087 if (tcp->tcp_state >= TCPS_SYN_SENT) {
16088 tea->ERROR_prim = T_CONN_REQ;
16089 } else {
16090 tea->ERROR_prim = O_T_BIND_REQ;
16091 }
16092 break;
16093
16094 case T_ERROR_ACK:
16095 if (tcp->tcp_state >= TCPS_SYN_SENT)
16096 tea->ERROR_prim = T_CONN_REQ;
16097 break;
16098 default:
16099 panic("tcp_bind_failed: unexpected TPI type");
16100 /*NOTREACHED*/
16101 }
16102
16103 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
16104 int32_t, TCPS_IDLE);
16105 tcp->tcp_state = TCPS_IDLE;
16106 if (tcp->tcp_ipversion == IPV4_VERSION)
16107 tcp->tcp_ipha->ipha_src = 0;
16108 else
16109 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
16110 /*
16111 * Copy of the src addr. in tcp_t is needed since
16112 * the lookup funcs. can only look at tcp_t
16113 */
16114 V6_SET_ZERO(tcp->tcp_ip_src_v6);
16115
16116 tcph = tcp->tcp_tcph;
16117 tcph->th_lport[0] = 0;
16118 tcph->th_lport[1] = 0;
16119 tcp_bind_hash_remove(tcp);
16120 bzero(&connp->u_port, sizeof (connp->u_port));
16121 /* blow away saved option results if any */
16122 if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
16123 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
16124
16127 }
16128
16129 /*
16130 * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA
16131 * messages.
16132 */
16133 void
16134 tcp_rput_other(tcp_t *tcp, mblk_t *mp)
16135 {
16136 mblk_t *mp1;
16137 uchar_t *rptr = mp->b_rptr;
16138 queue_t *q = tcp->tcp_rq;
16139 struct T_error_ack *tea;
16140 uint32_t mss;
16141 mblk_t *syn_mp;
16142 mblk_t *mdti;
16143 mblk_t *lsoi;
16144 int retval;
16145 mblk_t *ire_mp;
16146 tcp_stack_t *tcps = tcp->tcp_tcps;
16147 uint_t ip_hdr_len;
16148
16149 switch (mp->b_datap->db_type) {
16150 case M_PROTO:
16151 case M_PCPROTO:
16152 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
16153 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t))
16154 break;
16155 tea = (struct T_error_ack *)rptr;
16156 switch (tea->PRIM_type) {
16157 case T_BIND_ACK:
16158 /*
16159 * Adapt Multidata information, if any. The
16160 * following tcp_mdt_update routine will free
16161 * the message.
16162 */
16163 if ((mdti = tcp_mdt_info_mp(mp)) != NULL) {
16164 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
16165 b_rptr)->mdt_capab, B_TRUE);
16166 freemsg(mdti);
16167 }
16312 * Obtain the credential from the
16313 * thread calling connect(); the credential
16314 * lives on in the second mblk which
16315 * originated from T_CONN_REQ and is echoed
16316 * with the T_BIND_ACK from ip. If none
16317 * can be found, default to the creator
16318 * of the socket.
16319 */
16320 if (mp->b_cont == NULL ||
16321 (cr = DB_CRED(mp->b_cont)) == NULL) {
16322 cr = tcp->tcp_cred;
16323 pid = tcp->tcp_cpid;
16324 } else {
16325 pid = DB_CPID(mp->b_cont);
16326 }
16327
16328 TCP_RECORD_TRACE(tcp, syn_mp,
16329 TCP_TRACE_SEND_PKT);
16330 mblk_setcred(syn_mp, cr);
16331 DB_CPID(syn_mp) = pid;
16332
16333 /*
16334 * DTrace sending the first SYN as a
16335 * tcp:::connect-request event. For DTrace
16336 * only, the IP header length is found
16337 * so that the TCP header can be retrieved.
16338 */
16339 if (tcp->tcp_ipversion == IPV4_VERSION)
16340 ip_hdr_len = IPH_HDR_LENGTH(
16341 (ipha_t *)syn_mp->b_rptr);
16342 else
16343 ip_hdr_len = ip_hdr_length_v6(mp,
16344 (ip6_t *)syn_mp->b_rptr);
16345 DTRACE_TCP5(connect__request, mblk_t *, NULL,
16346 conn_t *, NULL, void_ip_t *,
16347 syn_mp->b_rptr, tcp_t *, tcp, tcph_t *,
16348 &syn_mp->b_rptr[ip_hdr_len]);
16349
16350 tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
16351 }
16352 after_syn_sent:
16353 /*
16354 * A trailer mblk indicates a waiting client upstream.
16355 * We complete here the processing begun in
16356 * either tcp_bind() or tcp_connect() by passing
16357 * upstream the reply message they supplied.
16358 */
16359 mp1 = mp;
16360 mp = mp->b_cont;
16361 freeb(mp1);
16362 if (mp)
16363 break;
16364 return;
16365 case T_ERROR_ACK:
16366 if (tcp->tcp_debug) {
16367 (void) strlog(TCP_MOD_ID, 0, 1,
16368 SL_TRACE|SL_ERROR,
16369 "tcp_rput_other: case T_ERROR_ACK, "
17949 }
17950
17951 /*
17952 * Need to clean up all the eagers since after the unbind, segments
17953 * will no longer be delivered to this listener stream.
17954 */
17955 mutex_enter(&tcp->tcp_eager_lock);
17956 if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
17957 tcp_eager_cleanup(tcp, 0);
17958 }
17959 mutex_exit(&tcp->tcp_eager_lock);
17960
17961 if (tcp->tcp_ipversion == IPV4_VERSION) {
17962 tcp->tcp_ipha->ipha_src = 0;
17963 } else {
17964 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
17965 }
17966 V6_SET_ZERO(tcp->tcp_ip_src_v6);
17967 bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport));
17968 tcp_bind_hash_remove(tcp);
17969 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
17970 int32_t, TCPS_IDLE);
17971 tcp->tcp_state = TCPS_IDLE;
17972 tcp->tcp_mdt = B_FALSE;
17973 /* Send M_FLUSH according to TPI */
17974 (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
17975 connp = tcp->tcp_connp;
17976 connp->conn_mdt_ok = B_FALSE;
17977 ipcl_hash_remove(connp);
17978 bzero(&connp->conn_ports, sizeof (connp->conn_ports));
17979 mp = mi_tpi_ok_ack_alloc(mp);
17980 putnext(tcp->tcp_rq, mp);
17981 }
17982
17983 /*
17984 * Don't let port fall into the privileged range.
17985 * Since the extra privileged ports can be arbitrary we also
17986 * ensure that we exclude those from consideration.
17987 * tcp_g_epriv_ports is not sorted thus we loop over it until
17988 * there are no changes.
17989 *
17990 * Note: No locks are held when inspecting tcp_g_*epriv_ports
19629
19630 return (B_TRUE);
19631 }
19632
19633 static void
19634 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
19635 {
19636 ipha_t *ipha;
19637 ipaddr_t src;
19638 ipaddr_t dst;
19639 uint32_t cksum;
19640 ire_t *ire;
19641 uint16_t *up;
19642 ill_t *ill;
19643 conn_t *connp = tcp->tcp_connp;
19644 uint32_t hcksum_txflags = 0;
19645 mblk_t *ire_fp_mp;
19646 uint_t ire_fp_mp_len;
19647 tcp_stack_t *tcps = tcp->tcp_tcps;
19648 ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
19649 uint_t ip_hdr_len;
19650
19651 ASSERT(DB_TYPE(mp) == M_DATA);
19652
19653 if (DB_CRED(mp) == NULL)
19654 mblk_setcred(mp, CONN_CRED(connp));
19655
19656 ipha = (ipha_t *)mp->b_rptr;
19657 src = ipha->ipha_src;
19658 dst = ipha->ipha_dst;
19659
19660 if (tcp->tcp_ipversion == IPV4_VERSION) {
19661 DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL,
19662 void_ip_t *, ipha, tcp_t *, tcp, tcph_t *,
19663 &mp->b_rptr[IPH_HDR_LENGTH(mp->b_rptr)]);
19664 if (tcp->tcp_dtrace_connect_established) {
19665 DTRACE_TCP5(connect__established, mblk_t *, NULL,
19666 conn_t *, NULL, void_ip_t *, ipha, tcp_t *, tcp,
19667 tcph_t *, &mp->b_rptr[IPH_HDR_LENGTH(mp->b_rptr)]);
19668 tcp->tcp_dtrace_connect_established = B_FALSE;
19669 }
19670 }
19671
19672 /*
19673 * Drop off fast path for IPv6 and also if options are present or
19674 * we need to resolve a TS label.
19675 */
19676 if (tcp->tcp_ipversion != IPV4_VERSION ||
19677 !IPCL_IS_CONNECTED(connp) ||
19678 !CONN_IS_LSO_MD_FASTPATH(connp) ||
19679 (connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
19680 !connp->conn_ulp_labeled ||
19681 ipha->ipha_ident == IP_HDR_INCLUDED ||
19682 ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
19683 IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
19684 if (tcp->tcp_snd_zcopy_aware)
19685 mp = tcp_zcopy_disable(tcp, mp);
19686 TCP_STAT(tcps, tcp_ip_send);
19687
19688 if (tcp->tcp_ipversion == IPV6_VERSION) {
19689 ip_hdr_len = ip_hdr_length_v6(mp, (ip6_t *)mp->b_rptr);
19690 DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL,
19691 void_ip_t *, mp->b_rptr, tcp_t *, tcp, tcph_t *,
19692 &mp->b_rptr[ip_hdr_len]);
19693 if (tcp->tcp_dtrace_connect_established) {
19694 DTRACE_TCP5(connect__established, mblk_t *,
19695 NULL, conn_t *, NULL, void_ip_t *,
19696 mp->b_rptr, tcp_t *, tcp, tcph_t *,
19697 &mp->b_rptr[ip_hdr_len]);
19698 tcp->tcp_dtrace_connect_established = B_FALSE;
19699 }
19700 }
19701
19702 CALL_IP_WPUT(connp, q, mp);
19703 return;
19704 }
19705
19706 if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) {
19707 if (tcp->tcp_snd_zcopy_aware)
19708 mp = tcp_zcopy_backoff(tcp, mp, 0);
19709 CALL_IP_WPUT(connp, q, mp);
19710 return;
19711 }
19712 ire_fp_mp = ire->ire_nce->nce_fp_mp;
19713 ire_fp_mp_len = MBLKL(ire_fp_mp);
19714
19715 ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
19716 ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
19717 #ifndef _BIG_ENDIAN
19718 ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
19719 #endif
19720
19721 /*
21090
21091 /*
21092 * Set FIN bit if this is our last segment; snxt
21093 * already includes its length, and it will not
21094 * be adjusted after this point.
21095 */
21096 if (tcp->tcp_valid_bits == TCP_FSS_VALID &&
21097 *snxt == tcp->tcp_fss) {
21098 if (!tcp->tcp_fin_acked) {
21099 tcp->tcp_tcph->th_flags[0] |= TH_FIN;
21100 BUMP_MIB(&tcps->tcps_mib,
21101 tcpOutControl);
21102 }
21103 if (!tcp->tcp_fin_sent) {
21104 tcp->tcp_fin_sent = B_TRUE;
21105 /*
21106 * tcp state must be ESTABLISHED
21107 * in order for us to get here in
21108 * the first place.
21109 */
21110 DTRACE_TCP4(state__change, void, NULL,
21111 conn_t *, NULL, tcp_t *, tcp,
21112 int32_t, TCPS_FIN_WAIT_1);
21113 tcp->tcp_state = TCPS_FIN_WAIT_1;
21114
21115 /*
21116 * Upon returning from this routine,
21117 * tcp_wput_data() will set tcp_snxt
21118 * to be equal to snxt + tcp_fin_sent.
21119 * This is essentially the same as
21120 * setting it to tcp_fss + 1.
21121 */
21122 }
21123 }
21124
21125 tcp->tcp_last_sent_len = (ushort_t)len;
21126
21127 len += tcp_hdr_len;
21128 if (tcp->tcp_ipversion == IPV4_VERSION)
21129 tcp->tcp_ipha->ipha_length = htons(len);
21130 else
21131 tcp->tcp_ip6h->ip6_plen = htons(len -
21132 ((char *)&tcp->tcp_ip6h[1] -
21728 if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
21729 ASSERT(ill->ill_hcksum_capab != NULL);
21730 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
21731 }
21732
21733 /*
21734 * Since the TCP checksum should be recalculated by h/w, we can just
21735 * zero the checksum field for HCK_FULLCKSUM, or calculate partial
21736 * pseudo-header checksum for HCK_PARTIALCKSUM.
21737 * The partial pseudo-header excludes TCP length, that was calculated
21738 * in tcp_send(), so to zero *up before further processing.
21739 */
21740 cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
21741
21742 up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
21743 *up = 0;
21744
21745 IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
21746 IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
21747
21748 DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL, void_ip_t *, ipha,
21749 tcp_t *, tcp, tcph_t *, &mp->b_rptr[IPH_HDR_LENGTH(mp->b_rptr)]);
21750
21751 /*
21752 * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp).
21753 */
21754 DB_LSOFLAGS(mp) |= HW_LSO;
21755 DB_LSOMSS(mp) = mss;
21756
21757 ipha->ipha_fragment_offset_and_flags |=
21758 (uint32_t)htons(ire->ire_frag_flag);
21759
21760 ire_fp_mp = ire->ire_nce->nce_fp_mp;
21761 ire_fp_mp_len = MBLKL(ire_fp_mp);
21762 ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
21763 mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
21764 bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
21765
21766 UPDATE_OB_PKT_COUNT(ire);
21767 ire->ire_last_used_time = lbolt;
21768 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
21769 BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
21770 UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,
23376 }
23377
23378 if (mctl_present) {
23379 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
23380
23381 ASSERT(ii->ipsec_in_type == IPSEC_IN);
23382 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) {
23383 return;
23384 }
23385 }
23386 if (zoneid == ALL_ZONES)
23387 zoneid = GLOBAL_ZONEID;
23388
23389 /* Add the zoneid so ip_output routes it properly */
23390 if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) {
23391 freemsg(ipsec_mp);
23392 return;
23393 }
23394 ipsec_mp = nmp;
23395
23396 DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
23397 mp->b_rptr, tcp_t *, NULL, tcph_t *, tcph);
23398 if (tcph->th_flags[0] == (TH_RST|TH_ACK)) {
23399 DTRACE_TCP5(accept__refused, mblk_t *, NULL, conn_t *, NULL,
23400 void_ip_t *, mp->b_rptr, tcp_t *, NULL, tcph_t *, tcph);
23401 }
23402
23403 /*
23404 * NOTE: one might consider tracing a TCP packet here, but
23405 * this function has no active TCP state and no tcp structure
23406 * that has a trace buffer. If we traced here, we would have
23407 * to keep a local trace buffer in tcp_record_trace().
23408 *
23409 * TSol note: The mblk that contains the incoming packet was
23410 * reused by tcp_xmit_listener_reset, so it already contains
23411 * the right credentials and we don't need to call mblk_setcred.
23412 * Also the conn's cred is not right since it is associated
23413 * with tcps_g_q.
23414 */
23415 CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp);
23416
23417 /*
23418 * Tell IP to mark the IRE used for this destination temporary.
23419 * This way, we can limit our exposure to DoS attack because IP
23420 * creates an IRE for each destination. If there are too many,
23421 * the time to do any routing lookup will be extremely long. And
23422 * the lookup can be in interrupt context.
23603 if (ipsec_mp == NULL)
23604 return;
23605 }
23606 if (is_system_labeled() && !tsol_can_reply_error(mp)) {
23607 DTRACE_PROBE2(
23608 tx__ip__log__error__nolistener__tcp,
23609 char *, "Could not reply with RST to mp(1)",
23610 mblk_t *, mp);
23611 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
23612 freemsg(ipsec_mp);
23613 return;
23614 }
23615
23616 rptr = mp->b_rptr;
23617
23618 tcph = (tcph_t *)&rptr[ip_hdr_len];
23619 seg_seq = BE32_TO_U32(tcph->th_seq);
23620 seg_ack = BE32_TO_U32(tcph->th_ack);
23621 flags = tcph->th_flags[0];
23622
23623 /*
23624 * DTrace this "unknown" segment as a tcp:::receive, as we did
23625 * just receive something that was TCP.
23626 */
23627 DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *, rptr,
23628 tcp_t *, NULL, tcph_t *, tcph);
23629
23630 seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
23631 if (flags & TH_RST) {
23632 freemsg(ipsec_mp);
23633 } else if (flags & TH_ACK) {
23634 tcp_xmit_early_reset("no tcp, reset",
23635 ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps,
23636 connp);
23637 } else {
23638 if (flags & TH_SYN) {
23639 seg_len++;
23640 } else {
23641 /*
23642 * Here we violate the RFC. Note that a normal
23643 * TCP will never send a segment without the ACK
23644 * flag, except for RST or SYN segment. This
23645 * segment is neither. Just drop it on the
23646 * floor.
23647 */
23648 freemsg(ipsec_mp);
23649 tcps->tcps_rst_unsent++;
23947 /*
23948 * Get IP set to checksum on our behalf
23949 * Include the adjustment for a source route if any.
23950 */
23951 u1 += tcp->tcp_sum;
23952 u1 = (u1 >> 16) + (u1 & 0xFFFF);
23953 U16_TO_BE16(u1, tcph->th_sum);
23954 BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23955 }
23956 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
23957 (seq + data_length) == tcp->tcp_fss) {
23958 if (!tcp->tcp_fin_acked) {
23959 flags |= TH_FIN;
23960 BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23961 }
23962 if (!tcp->tcp_fin_sent) {
23963 tcp->tcp_fin_sent = B_TRUE;
23964 switch (tcp->tcp_state) {
23965 case TCPS_SYN_RCVD:
23966 case TCPS_ESTABLISHED:
23967 DTRACE_TCP4(state__change, void, NULL,
23968 conn_t *, NULL, tcp_t *, tcp,
23969 int32_t, TCPS_FIN_WAIT_1);
23970 tcp->tcp_state = TCPS_FIN_WAIT_1;
23971 break;
23972 case TCPS_CLOSE_WAIT:
23973 DTRACE_TCP4(state__change, void, NULL,
23974 conn_t *, NULL, tcp_t *, tcp,
23975 int32_t, TCPS_LAST_ACK);
23976 tcp->tcp_state = TCPS_LAST_ACK;
23977 break;
23978 }
23979 if (tcp->tcp_suna == tcp->tcp_snxt)
23980 TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
23981 tcp->tcp_snxt = tcp->tcp_fss + 1;
23982 }
23983 }
23984 /*
23985 * Note the trick here. u1 is unsigned. When tcp_urg
23986 * is smaller than seq, u1 will become a very huge value.
23987 * So the comparison will fail. Also note that tcp_urp
23988 * should be positive, see RFC 793 page 17.
23989 */
23990 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
23991 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
23992 u1 < (uint32_t)(64 * 1024)) {
23993 flags |= TH_URG;
23994 BUMP_MIB(&tcps->tcps_mib, tcpOutUrg);
23995 U32_TO_ABE16(u1, tcph->th_urp);
24305 /* ARGSUSED */
24306 static tcp_t *
24307 tcp_alloc_temp_tcp(in_port_t port, tcp_stack_t *tcps)
24308 {
24309 conn_t *connp;
24310 tcp_t *tcp;
24311
24312 connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP, tcps->tcps_netstack);
24313 if (connp == NULL)
24314 return (NULL);
24315
24316 tcp = connp->conn_tcp;
24317 tcp->tcp_tcps = tcps;
24318 TCPS_REFHOLD(tcps);
24319
24320 /*
24321 * Only initialize the necessary info in those structures. Note
24322 * that since INADDR_ANY is all 0, we do not need to set
24323 * tcp_bound_source to INADDR_ANY here.
24324 */
24325 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
24326 int32_t, TCPS_BOUND);
24327 tcp->tcp_state = TCPS_BOUND;
24328 tcp->tcp_lport = port;
24329 tcp->tcp_exclbind = 1;
24330 tcp->tcp_reserved_port = 1;
24331
24332 /* Just for place holding... */
24333 tcp->tcp_ipversion = IPV4_VERSION;
24334
24335 return (tcp);
24336 }
24337
24338 /*
24339 * To remove a port range specified by lo_port and hi_port from the
24340 * reserved port ranges. This is one of the three public functions of
24341 * the reserved port interface. Note that a port range has to be removed
24342 * as a whole. Ports in a range cannot be removed individually.
24343 *
24344 * Params:
24345 * in_port_t lo_port: the beginning port of the reserved port range to
24346 * be deleted.
|