Print this page
*** NO COMMENTS ***


3460         tbr->PRIM_type = T_BIND_ACK;
3461         mp->b_datap->db_type = M_PCPROTO;
3462 
3463         /* Chain in the reply mp for tcp_rput() */
3464         mp1->b_cont = mp;
3465         mp = mp1;
3466 
3467         tcp->tcp_conn_req_max = tbr->CONIND_number;
3468         if (tcp->tcp_conn_req_max) {
3469                 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
3470                         tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471                 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472                         tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473                 /*
3474                  * If this is a listener, do not reset the eager list
3475                  * and other stuffs.  Note that we don't check if the
3476                  * existing eager list meets the new tcp_conn_req_max
3477                  * requirement.
3478                  */
3479                 if (tcp->tcp_state != TCPS_LISTEN) {


3480                         tcp->tcp_state = TCPS_LISTEN;
3481                         /* Initialize the chain. Don't need the eager_lock */
3482                         tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3483                         tcp->tcp_eager_next_drop_q0 = tcp;
3484                         tcp->tcp_eager_prev_drop_q0 = tcp;
3485                         tcp->tcp_second_ctimer_threshold =
3486                             tcps->tcps_ip_abort_linterval;
3487                 }
3488         }
3489 
3490         /*
3491          * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3492          * processing continues in tcp_rput_other().
3493          *
3494          * We need to make sure that the conn_recv is set to a non-null
3495          * value before we insert the conn into the classifier table.
3496          * This is to avoid a race with an incoming packet which does an
3497          * ipcl_classify().
3498          */
3499         connp->conn_recv = tcp_conn_request;


3756                                  * address and source port, which is
3757                                  * refused regardless of the
3758                                  * SO_REUSEADDR setting, so we break.
3759                                  */
3760                                 if (IN6_ARE_ADDR_EQUAL(laddr,
3761                                     &ltcp->tcp_bound_source_v6) &&
3762                                     (ltcp->tcp_state == TCPS_LISTEN ||
3763                                     ltcp->tcp_state == TCPS_BOUND))
3764                                         break;
3765                         }
3766                 }
3767                 if (ltcp != NULL) {
3768                         /* The port number is busy */
3769                         mutex_exit(&tbf->tf_lock);
3770                 } else {
3771                         /*
3772                          * This port is ours. Insert in fanout and mark as
3773                          * bound to prevent others from getting the port
3774                          * number.
3775                          */


3776                         tcp->tcp_state = TCPS_BOUND;
3777                         tcp->tcp_lport = htons(port);
3778                         *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3779 
3780                         ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3781                             tcp->tcp_lport)] == tbf);
3782                         tcp_bind_hash_insert(tbf, tcp, 1);
3783 
3784                         mutex_exit(&tbf->tf_lock);
3785 
3786                         /*
3787                          * We don't want tcp_next_port_to_try to "inherit"
3788                          * a port number supplied by the user in a bind.
3789                          */
3790                         if (user_specified)
3791                                 return (port);
3792 
3793                         /*
3794                          * This is the only place where tcp_next_port_to_try
3795                          * is updated. After the update, it may or may not


3887             tcp->tcp_ipversion == IPV6_VERSION)));
3888 
3889         if (TCP_IS_DETACHED(tcp)) {
3890                 if (tcp->tcp_hard_binding) {
3891                         /*
3892                          * It's an eager that we are dealing with. We close the
3893                          * eager but in case a conn_ind has already gone to the
3894                          * listener, let tcp_accept_finish() send a discon_ind
3895                          * to the listener and drop the last reference. If the
3896                          * listener doesn't even know about the eager i.e. the
3897                          * conn_ind hasn't gone up, blow away the eager and drop
3898                          * the last reference as well. If the conn_ind has gone
3899                          * up, state should be BOUND. tcp_accept_finish
3900                          * will figure out that the connection has received a
3901                          * RST and will send a DISCON_IND to the application.
3902                          */
3903                         tcp_closei_local(tcp);
3904                         if (!tcp->tcp_tconnind_started) {
3905                                 CONN_DEC_REF(tcp->tcp_connp);
3906                         } else {



3907                                 tcp->tcp_state = TCPS_BOUND;
3908                         }
3909                 } else {
3910                         tcp_close_detached(tcp);
3911                 }
3912                 return (0);
3913         }
3914 
3915         TCP_STAT(tcps, tcp_clean_death_nondetached);
3916 
3917         /*
3918          * If T_ORDREL_IND has not been sent yet (done when service routine
3919          * is run) postpone cleaning up the endpoint until service routine
3920          * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3921          * client_errno since tcp_close uses the client_errno field.
3922          */
3923         if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3924                 if (err != 0)
3925                         tcp->tcp_client_errno = err;
3926 


4620                 (void) tcp_time_wait_remove(tcp, NULL);
4621         CL_INET_DISCONNECT(tcp);
4622         ipcl_hash_remove(connp);
4623 
4624         /*
4625          * Delete the cached ire in conn_ire_cache and also mark
4626          * the conn as CONDEMNED
4627          */
4628         mutex_enter(&connp->conn_lock);
4629         connp->conn_state_flags |= CONN_CONDEMNED;
4630         ire = connp->conn_ire_cache;
4631         connp->conn_ire_cache = NULL;
4632         mutex_exit(&connp->conn_lock);
4633         if (ire != NULL)
4634                 IRE_REFRELE_NOTR(ire);
4635 
4636         /* Need to cleanup any pending ioctls */
4637         ASSERT(tcp->tcp_time_wait_next == NULL);
4638         ASSERT(tcp->tcp_time_wait_prev == NULL);
4639         ASSERT(tcp->tcp_time_wait_expire == 0);




4640         tcp->tcp_state = TCPS_CLOSED;
4641 
4642         /* Release any SSL context */
4643         if (tcp->tcp_kssl_ent != NULL) {
4644                 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4645                 tcp->tcp_kssl_ent = NULL;
4646         }
4647         if (tcp->tcp_kssl_ctx != NULL) {
4648                 kssl_release_ctx(tcp->tcp_kssl_ctx);
4649                 tcp->tcp_kssl_ctx = NULL;
4650         }
4651         tcp->tcp_kssl_pending = B_FALSE;
4652 
4653         tcp_ipsec_cleanup(tcp);
4654 }
4655 
4656 /*
4657  * tcp is dying (called from ipcl_conn_destroy and error cases).
4658  * Free the tcp_t in either case.
4659  */


5861                         DTRACE_PROBE3(
5862                             tx__ip__log__error__connrequest__tcp,
5863                             char *, "eager connp(1) label on SYN mp(2) failed",
5864                             conn_t *, econnp, mblk_t *, mp);
5865                         goto error3;
5866                 }
5867         }
5868 
5869         eager->tcp_hard_binding = B_TRUE;
5870 
5871         tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
5872             TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
5873 
5874         CL_INET_CONNECT(eager);
5875 
5876         /*
5877          * No need to check for multicast destination since ip will only pass
5878          * up multicasts to those that have expressed interest
5879          * TODO: what about rejecting broadcasts?
5880          * Also check that source is not a multicast or broadcast address.



5881          */
5882         eager->tcp_state = TCPS_SYN_RCVD;
5883 
5884 
5885         /*
5886          * There should be no ire in the mp as we are being called after
5887          * receiving the SYN.
5888          */
5889         ASSERT(tcp_ire_mp(mp) == NULL);
5890 
5891         /*
5892          * Adapt our mss, ttl, ... according to information provided in IRE.
5893          */
5894 
5895         if (tcp_adapt_ire(eager, NULL) == 0) {
5896                 /* Undo the bind_hash_insert */
5897                 tcp_bind_hash_remove(eager);
5898                 goto error3;
5899         }
5900 







5901         /* Process all TCP options. */
5902         tcp_process_options(eager, tcph);
5903 
5904         /* Is the other end ECN capable? */
5905         if (tcps->tcps_ecn_permitted >= 1 &&
5906             (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
5907                 eager->tcp_ecn_ok = B_TRUE;
5908         }
5909 
5910         /*
5911          * listener->tcp_rq->q_hiwat should be the default window size or a
5912          * window size changed via SO_RCVBUF option.  First round up the
5913          * eager's tcp_rwnd to the nearest MSS.  Then find out the window
5914          * scale option value if needed.  Call tcp_rwnd_set() to finish the
5915          * setting.
5916          *
5917          * Note if there is a rpipe metric associated with the remote host,
5918          * we should not inherit receive window size from listener.
5919          */
5920         eager->tcp_rwnd = MSS_ROUNDUP(


5997                 if (addr_cache != NULL && eager->tcp_remote ==
5998                     addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
5999                         eager->tcp_dontdrop = B_TRUE;
6000                 }
6001         }
6002 
6003         /*
6004          * We need to insert the eager in its own perimeter but as soon
6005          * as we do that, we expose the eager to the classifier and
6006          * should not touch any field outside the eager's perimeter.
6007          * So do all the work necessary before inserting the eager
6008          * in its own perimeter. Be optimistic that ipcl_conn_insert()
6009          * will succeed but undo everything if it fails.
6010          */
6011         seg_seq = ABE32_TO_U32(tcph->th_seq);
6012         eager->tcp_irs = seg_seq;
6013         eager->tcp_rack = seg_seq;
6014         eager->tcp_rnxt = seg_seq + 1;
6015         U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
6016         BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);


6017         eager->tcp_state = TCPS_SYN_RCVD;
6018         mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
6019             NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
6020         if (mp1 == NULL) {
6021                 /*
6022                  * Increment the ref count as we are going to
6023                  * enqueueing an mp in squeue
6024                  */
6025                 CONN_INC_REF(econnp);
6026                 goto error;
6027         }
6028         DB_CPID(mp1) = tcp->tcp_cpid;
6029         eager->tcp_cpid = tcp->tcp_cpid;
6030         eager->tcp_open_time = lbolt64;
6031 
6032         /*
6033          * We need to start the rto timer. In normal case, we start
6034          * the timer after sending the packet on the wire (or at
6035          * least believing that packet was sent by waiting for
6036          * CALL_IP_WPUT() to return). Since this is the first packet


6106                          * anymore (someone blew it away). Just
6107                          * free this message and hopefully remote
6108                          * will retransmit at which time the SYN can be
6109                          * treated as a new connection or dealt with
6110                          * a TH_RST if a connection already exists.
6111                          */
6112                         CONN_DEC_REF(econnp);
6113                         freemsg(mp);
6114                 } else {
6115                         squeue_fill(econnp->conn_sqp, mp, tcp_input,
6116                             econnp, SQTAG_TCP_CONN_REQ_1);
6117                 }
6118         } else {
6119                 /* Nobody wants this packet */
6120                 freemsg(mp);
6121         }
6122         return;
6123 error3:
6124         CONN_DEC_REF(econnp);
6125 error2:

















6126         freemsg(mp);
6127 }
6128 
6129 /*
6130  * In an ideal case of vertical partition in NUMA architecture, it's
6131  * beneficial to have the listener and all the incoming connections
6132  * tied to the same squeue. The other constraint is that incoming
6133  * connections should be tied to the squeue attached to the interrupted
6134  * CPU for obvious locality reasons, so this leaves the listener to
6135  * be tied to the same squeue. Our only problem is that when listener
6136  * is binding, the CPU that will get interrupted by the NIC whose
6137  * IP address the listener is binding to is not even known. So
6138  * the code below allows us to change that binding at the time the
6139  * CPU is interrupted by virtue of incoming connection's squeue.
6140  *
6141  * This is useful only in case of a listener bound to a specific IP
6142  * address. For other kinds of listeners, they get bound the
6143  * very first time and there is no attempt to rebind them.
6144  */
6145 void


6604          * At this point the remote destination address and remote port fields
6605          * in the tcp-four-tuple have been filled in the tcp structure. Now we
6606          * have to see which state tcp was in so we can take appropriate action.
6607          */
6608         if (oldstate == TCPS_IDLE) {
6609                 /*
6610                  * We support a quick connect capability here, allowing
6611                  * clients to transition directly from IDLE to SYN_SENT
6612                  * tcp_bindi will pick an unused port, insert the connection
6613                  * in the bind hash and transition to BOUND state.
6614                  */
6615                 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6616                     tcp, B_TRUE);
6617                 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6618                     B_FALSE, B_FALSE);
6619                 if (lport == 0) {
6620                         mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6621                         goto failed;
6622                 }
6623         }


6624         tcp->tcp_state = TCPS_SYN_SENT;
6625 
6626         /*
6627          * TODO: allow data with connect requests
6628          * by unlinking M_DATA trailers here and
6629          * linking them in behind the T_OK_ACK mblk.
6630          * The tcp_rput() bind ack handler would then
6631          * feed them to tcp_wput_data() rather than call
6632          * tcp_timer().
6633          */
6634         mp = mi_tpi_ok_ack_alloc(mp);
6635         if (!mp) {


6636                 tcp->tcp_state = oldstate;
6637                 goto failed;
6638         }
6639         if (tcp->tcp_family == AF_INET) {
6640                 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6641                     sizeof (ipa_conn_t));
6642         } else {
6643                 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6644                     sizeof (ipa6_conn_t));
6645         }
6646         if (mp1) {
6647                 /*
6648                  * We need to make sure that the conn_recv is set to a non-null
6649                  * value before we insert the conn_t into the classifier table.
6650                  * This is to avoid a race with an incoming packet which does
6651                  * an ipcl_classify().
6652                  */
6653                 tcp->tcp_connp->conn_recv = tcp_input;
6654 
6655                 /* Hang onto the T_OK_ACK for later. */
6656                 linkb(mp1, mp);
6657                 mblk_setcred(mp1, tcp->tcp_cred);
6658                 if (tcp->tcp_family == AF_INET)
6659                         mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6660                 else {
6661                         mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6662                             &tcp->tcp_sticky_ipp);
6663                 }
6664                 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6665                 tcp->tcp_active_open = 1;
6666                 /*
6667                  * If the bind cannot complete immediately
6668                  * IP will arrange to call tcp_rput_other
6669                  * when the bind completes.
6670                  */
6671                 if (mp1 != NULL)
6672                         tcp_rput_other(tcp, mp1);
6673                 return;
6674         }
6675         /* Error case */


6676         tcp->tcp_state = oldstate;
6677         mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6678 
6679 failed:
6680         /* return error ack and blow away saved option results if any */
6681         if (mp != NULL)
6682                 putnext(tcp->tcp_rq, mp);
6683         else {
6684                 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6685                     TSYSERR, ENOMEM);
6686         }
6687         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6688                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6689 
6690 }
6691 
6692 /*
6693  * Handle connect to IPv6 destinations.
6694  */
6695 static void


6812          * At this point the remote destination address and remote port fields
6813          * in the tcp-four-tuple have been filled in the tcp structure. Now we
6814          * have to see which state tcp was in so we can take appropriate action.
6815          */
6816         if (oldstate == TCPS_IDLE) {
6817                 /*
6818                  * We support a quick connect capability here, allowing
6819                  * clients to transition directly from IDLE to SYN_SENT
6820                  * tcp_bindi will pick an unused port, insert the connection
6821                  * in the bind hash and transition to BOUND state.
6822                  */
6823                 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6824                     tcp, B_TRUE);
6825                 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6826                     B_FALSE, B_FALSE);
6827                 if (lport == 0) {
6828                         mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6829                         goto failed;
6830                 }
6831         }


6832         tcp->tcp_state = TCPS_SYN_SENT;
6833         /*
6834          * TODO: allow data with connect requests
6835          * by unlinking M_DATA trailers here and
6836          * linking them in behind the T_OK_ACK mblk.
6837          * The tcp_rput() bind ack handler would then
6838          * feed them to tcp_wput_data() rather than call
6839          * tcp_timer().
6840          */
6841         mp = mi_tpi_ok_ack_alloc(mp);
6842         if (!mp) {


6843                 tcp->tcp_state = oldstate;
6844                 goto failed;
6845         }
6846         mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6847         if (mp1) {
6848                 /*
6849                  * We need to make sure that the conn_recv is set to a non-null
6850                  * value before we insert the conn_t into the classifier table.
6851                  * This is to avoid a race with an incoming packet which does
6852                  * an ipcl_classify().
6853                  */
6854                 tcp->tcp_connp->conn_recv = tcp_input;
6855 
6856                 /* Hang onto the T_OK_ACK for later. */
6857                 linkb(mp1, mp);
6858                 mblk_setcred(mp1, tcp->tcp_cred);
6859                 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6860                     &tcp->tcp_sticky_ipp);
6861                 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6862                 tcp->tcp_active_open = 1;
6863                 /* ip_bind_v6() may return ACK or ERROR */
6864                 if (mp1 != NULL)
6865                         tcp_rput_other(tcp, mp1);
6866                 return;
6867         }
6868         /* Error case */


6869         tcp->tcp_state = oldstate;
6870         mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6871 
6872 failed:
6873         /* return error ack and blow away saved option results if any */
6874         if (mp != NULL)
6875                 putnext(tcp->tcp_rq, mp);
6876         else {
6877                 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6878                     TSYSERR, ENOMEM);
6879         }
6880         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6881                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6882 }
6883 
6884 /*
6885  * We need a stream q for detached closing tcp connections
6886  * to use.  Our client hereby indicates that this q is the
6887  * one to use.
6888  */


6997                 ltcp = NULL;
6998                 /*
6999                  * If it used to be a listener, check to make sure no one else
7000                  * has taken the port before switching back to LISTEN state.
7001                  */
7002                 if (tcp->tcp_ipversion == IPV4_VERSION) {
7003                         connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7004                             tcp->tcp_ipha->ipha_src,
7005                             tcp->tcp_connp->conn_zoneid, ipst);
7006                         if (connp != NULL)
7007                                 ltcp = connp->conn_tcp;
7008                 } else {
7009                         /* Allow tcp_bound_if listeners? */
7010                         connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7011                             &tcp->tcp_ip6h->ip6_src, 0,
7012                             tcp->tcp_connp->conn_zoneid, ipst);
7013                         if (connp != NULL)
7014                                 ltcp = connp->conn_tcp;
7015                 }
7016                 if (tcp->tcp_conn_req_max && ltcp == NULL) {


7017                         tcp->tcp_state = TCPS_LISTEN;
7018                 } else if (old_state > TCPS_BOUND) {
7019                         tcp->tcp_conn_req_max = 0;


7020                         tcp->tcp_state = TCPS_BOUND;
7021                 }
7022                 if (ltcp != NULL)
7023                         CONN_DEC_REF(ltcp->tcp_connp);
7024                 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7025                         BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7026                 } else if (old_state == TCPS_ESTABLISHED ||
7027                     old_state == TCPS_CLOSE_WAIT) {
7028                         BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7029                 }
7030 
7031                 if (tcp->tcp_fused)
7032                         tcp_unfuse(tcp);
7033 
7034                 mutex_enter(&tcp->tcp_eager_lock);
7035                 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7036                     (tcp->tcp_conn_req_cnt_q != 0)) {
7037                         tcp_eager_cleanup(tcp, 0);
7038                 }
7039                 mutex_exit(&tcp->tcp_eager_lock);


7900         tcp_ipsec_cleanup(tcp);
7901 
7902         if (tcp->tcp_conn_req_max != 0) {
7903                 /*
7904                  * This is the case when a TLI program uses the same
7905                  * transport end point to accept a connection.  This
7906                  * makes the TCP both a listener and acceptor.  When
7907                  * this connection is closed, we need to set the state
7908                  * back to TCPS_LISTEN.  Make sure that the eager list
7909                  * is reinitialized.
7910                  *
7911                  * Note that this stream is still bound to the four
7912                  * tuples of the previous connection in IP.  If a new
7913                  * SYN with different foreign address comes in, IP will
7914                  * not find it and will send it to the global queue.  In
7915                  * the global queue, TCP will do a tcp_lookup_listener()
7916                  * to find this stream.  This works because this stream
7917                  * is only removed from connected hash.
7918                  *
7919                  */


7920                 tcp->tcp_state = TCPS_LISTEN;
7921                 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7922                 tcp->tcp_eager_next_drop_q0 = tcp;
7923                 tcp->tcp_eager_prev_drop_q0 = tcp;
7924                 tcp->tcp_connp->conn_recv = tcp_conn_request;
7925                 if (tcp->tcp_family == AF_INET6) {
7926                         ASSERT(tcp->tcp_connp->conn_af_isv6);
7927                         (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7928                             &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7929                 } else {
7930                         ASSERT(!tcp->tcp_connp->conn_af_isv6);
7931                         (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7932                             tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7933                 }
7934         } else {


7935                 tcp->tcp_state = TCPS_BOUND;
7936         }
7937 
7938         /*
7939          * Initialize to default values
7940          * Can't fail since enough header template space already allocated
7941          * at open().
7942          */
7943         err = tcp_init_values(tcp);
7944         ASSERT(err == 0);
7945         /* Restore state in tcp_tcph */
7946         bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
7947         if (tcp->tcp_ipversion == IPV4_VERSION)
7948                 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
7949         else
7950                 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
7951         /*
7952          * Copy of the src addr. in tcp_t is needed in tcp_t
7953          * since the lookup funcs can only lookup on tcp_t
7954          */


8276         DONTCARE(tcp->tcmp_stk[0]);
8277 #endif
8278 
8279 
8280 #undef  DONTCARE
8281 #undef  PRESERVE
8282 }
8283 
8284 /*
8285  * Attach tcp to queue q, mark it TCPS_IDLE and run tcp_init_values().
8286  * On failure tcp_timers_stop() is called so the caller need not clean up.
8287  */
8288 int
8289 tcp_init(tcp_t *tcp, queue_t *q)
8290 {
8291         int     rc;
8292 
8293         tcp->tcp_rq = q;
8294         tcp->tcp_wq = WR(q);
8295         tcp->tcp_state = TCPS_IDLE;
8296         rc = tcp_init_values(tcp);
8297         if (rc != 0)
8298                 tcp_timers_stop(tcp);
8299         return (rc);
8300 }
8301 
8302 static int
8303 tcp_init_values(tcp_t *tcp)
8304 {
8305         int     err;
8306         tcp_stack_t     *tcps = tcp->tcp_tcps;
8307 
8308         ASSERT((tcp->tcp_family == AF_INET &&
8309             tcp->tcp_ipversion == IPV4_VERSION) ||
8310             (tcp->tcp_family == AF_INET6 &&
8311             (tcp->tcp_ipversion == IPV4_VERSION ||
8312             tcp->tcp_ipversion == IPV6_VERSION)));
8313 
8314         /*
8315          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO


13277                 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
13278                         freemsg(mp);
13279                         return;
13280                 }
13281                 /* Update pointers into message */
13282                 iphdr = rptr = mp->b_rptr;
13283                 tcph = (tcph_t *)&rptr[ip_hdr_len];
13284                 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
13285                         /*
13286                          * Since we can't handle any data with this urgent
13287                          * pointer that is out of sequence, we expunge
13288                          * the data.  This allows us to still register
13289                          * the urgent mark and generate the M_PCSIG,
13290                          * which we can do.
13291                          */
13292                         mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
13293                         seg_len = 0;
13294                 }
13295         }
13296 







13297         switch (tcp->tcp_state) {
13298         case TCPS_SYN_SENT:
13299                 if (flags & TH_ACK) {
13300                         /*
13301                          * Note that our stack cannot send data before a
13302                          * connection is established, therefore the
13303                          * following check is valid.  Otherwise, it has
13304                          * to be changed.
13305                          */
13306                         if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
13307                             SEQ_GT(seg_ack, tcp->tcp_snxt)) {
13308                                 freemsg(mp);
13309                                 if (flags & TH_RST)
13310                                         return;
13311                                 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
13312                                     tcp, seg_ack, 0, TH_RST);
13313                                 return;
13314                         }
13315                         ASSERT(tcp->tcp_suna + 1 == seg_ack);
13316                 }
13317                 if (flags & TH_RST) {




13318                         freemsg(mp);
13319                         if (flags & TH_ACK)
13320                                 (void) tcp_clean_death(tcp,
13321                                     ECONNREFUSED, 13);
13322                         return;
13323                 }
13324                 if (!(flags & TH_SYN)) {
13325                         freemsg(mp);
13326                         return;
13327                 }
13328 
13329                 /* Process all TCP options. */
13330                 tcp_process_options(tcp, tcph);
13331                 /*
13332                  * The following changes our rwnd to be a multiple of the
13333                  * MIN(peer MSS, our MSS) for performance reason.
13334                  */
13335                 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat,
13336                     tcp->tcp_mss));
13337 


13372                          *
13373                          * XXX: how can we pretend we didn't see it if we
13374                          * have updated rnxt et. al.
13375                          *
13376                          * For loopback we defer sending up the T_CONN_CON
13377                          * until after some checks below.
13378                          */
13379                         mp1 = NULL;
13380                         if (!tcp_conn_con(tcp, iphdr, tcph, mp,
13381                             tcp->tcp_loopback ? &mp1 : NULL)) {
13382                                 freemsg(mp);
13383                                 return;
13384                         }
13385                         /* SYN was acked - making progress */
13386                         if (tcp->tcp_ipversion == IPV6_VERSION)
13387                                 tcp->tcp_ip_forward_progress = B_TRUE;
13388 
13389                         /* One for the SYN */
13390                         tcp->tcp_suna = tcp->tcp_iss + 1;
13391                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;


13392                         tcp->tcp_state = TCPS_ESTABLISHED;
13393 
13394                         /*







13395                          * If SYN was retransmitted, need to reset all
13396                          * retransmission info.  This is because this
13397                          * segment will be treated as a dup ACK.
13398                          */
13399                         if (tcp->tcp_rexmit) {
13400                                 tcp->tcp_rexmit = B_FALSE;
13401                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
13402                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
13403                                 tcp->tcp_snd_burst = tcp->tcp_localnet ?
13404                                     TCP_CWND_INFINITE : TCP_CWND_NORMAL;
13405                                 tcp->tcp_ms_we_have_waited = 0;
13406 
13407                                 /*
13408                                  * Set tcp_cwnd back to 1 MSS, per
13409                                  * recommendation from
13410                                  * draft-floyd-incr-init-win-01.txt,
13411                                  * Increasing TCP's Initial Window.
13412                                  */
13413                                 tcp->tcp_cwnd = tcp->tcp_mss;
13414                         }


13481 
13482                         /*
13483                          * Check to see if there is data to be sent.  If
13484                          * yes, set the transmit flag.  Then check to see
13485                          * if received data processing needs to be done.
13486                          * If not, go straight to xmit_check.  This short
13487                          * cut is OK as we don't support T/TCP.
13488                          */
13489                         if (tcp->tcp_unsent)
13490                                 flags |= TH_XMIT_NEEDED;
13491 
13492                         if (seg_len == 0 && !(flags & TH_URG)) {
13493                                 freemsg(mp);
13494                                 goto xmit_check;
13495                         }
13496 
13497                         flags &= ~TH_SYN;
13498                         seg_seq++;
13499                         break;
13500                 }


13501                 tcp->tcp_state = TCPS_SYN_RCVD;
13502                 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
13503                     NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
13504                 if (mp1) {
13505                         DB_CPID(mp1) = tcp->tcp_cpid;
13506                         TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
13507                         tcp_send_data(tcp, tcp->tcp_wq, mp1);
13508                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
13509                 }
13510                 freemsg(mp);
13511                 return;
13512         case TCPS_SYN_RCVD:
13513                 if (flags & TH_ACK) {
13514                         /*
13515                          * In this state, a SYN|ACK packet is either bogus
13516                          * because the other side must be ACKing our SYN which
13517                          * indicates it has seen the ACK for their SYN and
13518                          * shouldn't retransmit it or we're crossing SYNs
13519                          * on active open.
13520                          */


14400                         tcp->tcp_cwnd = mss;
14401                 }
14402 
14403                 /*
14404                  * We set the send window to zero here.
14405                  * This is needed if there is data to be
14406                  * processed already on the queue.
14407                  * Later (at swnd_update label), the
14408                  * "new_swnd > tcp_swnd" condition is satisfied
14409                  * and the XMIT_NEEDED flag is set in the current
14410                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
14411                  * called if there is already data on queue in
14412                  * this state.
14413                  */
14414                 tcp->tcp_swnd = 0;
14415 
14416                 if (new_swnd > tcp->tcp_max_swnd)
14417                         tcp->tcp_max_swnd = new_swnd;
14418                 tcp->tcp_swl1 = seg_seq;
14419                 tcp->tcp_swl2 = seg_ack;


14420                 tcp->tcp_state = TCPS_ESTABLISHED;
14421                 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
14422 
14423                 /* Fuse when both sides are in ESTABLISHED state */
14424                 if (tcp->tcp_loopback && do_tcp_fusion)
14425                         tcp_fuse(tcp, iphdr, tcph);
14426 
14427         }
14428         /* This code follows 4.4BSD-Lite2 mostly. */
14429         if (bytes_acked < 0)
14430                 goto est;
14431 
14432         /*
14433          * If TCP is ECN capable and the congestion experience bit is
14434          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
14435          * done once per window (or more loosely, per RTT).
14436          */
14437         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
14438                 tcp->tcp_cwr = B_FALSE;
14439         if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {


15036                  *
15037                  * 1. the segment acknowledges some data.  Or
15038                  * 2. the segment is new, i.e. it has a higher seq num. Or
15039                  * 3. the segment is not old and the advertised window is
15040                  * larger than the previous advertised window.
15041                  */
15042                 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
15043                         flags |= TH_XMIT_NEEDED;
15044                 tcp->tcp_swnd = new_swnd;
15045                 if (new_swnd > tcp->tcp_max_swnd)
15046                         tcp->tcp_max_swnd = new_swnd;
15047                 tcp->tcp_swl1 = seg_seq;
15048                 tcp->tcp_swl2 = seg_ack;
15049         }
15050 est:
15051         if (tcp->tcp_state > TCPS_ESTABLISHED) {
15052 
15053                 switch (tcp->tcp_state) {
15054                 case TCPS_FIN_WAIT_1:
15055                         if (tcp->tcp_fin_acked) {



15056                                 tcp->tcp_state = TCPS_FIN_WAIT_2;
15057                                 /*
15058                                  * We implement the non-standard BSD/SunOS
15059                                  * FIN_WAIT_2 flushing algorithm.
15060                                  * If there is no user attached to this
15061                                  * TCP endpoint, then this TCP struct
15062                                  * could hang around forever in FIN_WAIT_2
15063                                  * state if the peer forgets to send us
15064                                  * a FIN.  To prevent this, we wait only
15065                                  * 2*MSL (a convenient time value) for
15066                                  * the FIN to arrive.  If it doesn't show up,
15067                                  * we flush the TCP endpoint.  This algorithm,
15068                                  * though a violation of RFC-793, has worked
15069                                  * for over 10 years in BSD systems.
15070                                  * Note: SunOS 4.x waits 675 seconds before
15071                                  * flushing the FIN_WAIT_2 connection.
15072                                  */
15073                                 TCP_TIMER_RESTART(tcp,
15074                                     tcps->tcps_fin_wait_2_flush_interval);
15075                         }
15076                         break;
15077                 case TCPS_FIN_WAIT_2:
15078                         break;  /* Shutdown hook? */
15079                 case TCPS_LAST_ACK:
15080                         freemsg(mp);
15081                         if (tcp->tcp_fin_acked) {
15082                                 (void) tcp_clean_death(tcp, 0, 19);
15083                                 return;
15084                         }
15085                         goto xmit_check;
15086                 case TCPS_CLOSING:
15087                         if (tcp->tcp_fin_acked) {



15088                                 tcp->tcp_state = TCPS_TIME_WAIT;
15089                                 /*
15090                                  * Unconditionally clear the exclusive binding
15091                                  * bit so this TIME-WAIT connection won't
15092                                  * interfere with new ones.
15093                                  */
15094                                 tcp->tcp_exclbind = 0;
15095                                 if (!TCP_IS_DETACHED(tcp)) {
15096                                         TCP_TIMER_RESTART(tcp,
15097                                             tcps->tcps_time_wait_interval);
15098                                 } else {
15099                                         tcp_time_wait_append(tcp);
15100                                         TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15101                                 }
15102                         }
15103                         /*FALLTHRU*/
15104                 case TCPS_CLOSE_WAIT:
15105                         freemsg(mp);
15106                         goto xmit_check;
15107                 default:


15113                 /* Make sure we ack the fin */
15114                 flags |= TH_ACK_NEEDED;
15115                 if (!tcp->tcp_fin_rcvd) {
15116                         tcp->tcp_fin_rcvd = B_TRUE;
15117                         tcp->tcp_rnxt++;
15118                         tcph = tcp->tcp_tcph;
15119                         U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
15120 
15121                         /*
15122                          * Generate the ordrel_ind at the end unless we
15123                          * are an eager guy.
15124                          * In the eager case tcp_rsrv will do this when run
15125                          * after tcp_accept is done.
15126                          */
15127                         if (tcp->tcp_listener == NULL &&
15128                             !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding))
15129                                 flags |= TH_ORDREL_NEEDED;
15130                         switch (tcp->tcp_state) {
15131                         case TCPS_SYN_RCVD:
15132                         case TCPS_ESTABLISHED:



15133                                 tcp->tcp_state = TCPS_CLOSE_WAIT;
15134                                 /* Keepalive? */
15135                                 break;
15136                         case TCPS_FIN_WAIT_1:
15137                                 if (!tcp->tcp_fin_acked) {



15138                                         tcp->tcp_state = TCPS_CLOSING;
15139                                         break;
15140                                 }
15141                                 /* FALLTHRU */
15142                         case TCPS_FIN_WAIT_2:



15143                                 tcp->tcp_state = TCPS_TIME_WAIT;
15144                                 /*
15145                                  * Unconditionally clear the exclusive binding
15146                                  * bit so this TIME-WAIT connection won't
15147                                  * interfere with new ones.
15148                                  */
15149                                 tcp->tcp_exclbind = 0;
15150                                 if (!TCP_IS_DETACHED(tcp)) {
15151                                         TCP_TIMER_RESTART(tcp,
15152                                             tcps->tcps_time_wait_interval);
15153                                 } else {
15154                                         tcp_time_wait_append(tcp);
15155                                         TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15156                                 }
15157                                 if (seg_len) {
15158                                         /*
15159                                          * implies data piggybacked on FIN.
15160                                          * break to handle data.
15161                                          */
15162                                         break;


15983                 tea = (struct T_error_ack *)mp->b_rptr;
15984                 tea->PRIM_type = T_ERROR_ACK;
15985                 tea->TLI_error = TSYSERR;
15986                 tea->UNIX_error = error;
15987                 if (tcp->tcp_state >= TCPS_SYN_SENT) {
15988                         tea->ERROR_prim = T_CONN_REQ;
15989                 } else {
15990                         tea->ERROR_prim = O_T_BIND_REQ;
15991                 }
15992                 break;
15993 
15994         case T_ERROR_ACK:
15995                 if (tcp->tcp_state >= TCPS_SYN_SENT)
15996                         tea->ERROR_prim = T_CONN_REQ;
15997                 break;
15998         default:
15999                 panic("tcp_bind_failed: unexpected TPI type");
16000                 /*NOTREACHED*/
16001         }
16002 


16003         tcp->tcp_state = TCPS_IDLE;
16004         if (tcp->tcp_ipversion == IPV4_VERSION)
16005                 tcp->tcp_ipha->ipha_src = 0;
16006         else
16007                 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
16008         /*
16009          * Copy of the src addr. in tcp_t is needed since
16010          * the lookup funcs. can only look at tcp_t
16011          */
16012         V6_SET_ZERO(tcp->tcp_ip_src_v6);
16013 
16014         tcph = tcp->tcp_tcph;
16015         tcph->th_lport[0] = 0;
16016         tcph->th_lport[1] = 0;
16017         tcp_bind_hash_remove(tcp);
16018         bzero(&connp->u_port, sizeof (connp->u_port));
16019         /* blow away saved option results if any */
16020         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
16021                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
16022 


16025 }
16026 
16027 /*
16028  * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA
16029  * messages.
16030  */
16031 void
16032 tcp_rput_other(tcp_t *tcp, mblk_t *mp)
16033 {
16034         mblk_t  *mp1;
16035         uchar_t *rptr = mp->b_rptr;
16036         queue_t *q = tcp->tcp_rq;
16037         struct T_error_ack *tea;
16038         uint32_t mss;
16039         mblk_t *syn_mp;
16040         mblk_t *mdti;
16041         mblk_t *lsoi;
16042         int     retval;
16043         mblk_t *ire_mp;
16044         tcp_stack_t     *tcps = tcp->tcp_tcps;

16045 
16046         switch (mp->b_datap->db_type) {
16047         case M_PROTO:
16048         case M_PCPROTO:
16049                 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
16050                 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t))
16051                         break;
16052                 tea = (struct T_error_ack *)rptr;
16053                 switch (tea->PRIM_type) {
16054                 case T_BIND_ACK:
16055                         /*
16056                          * Adapt Multidata information, if any.  The
16057                          * following tcp_mdt_update routine will free
16058                          * the message.
16059                          */
16060                         if ((mdti = tcp_mdt_info_mp(mp)) != NULL) {
16061                                 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
16062                                     b_rptr)->mdt_capab, B_TRUE);
16063                                 freemsg(mdti);
16064                         }


16209                                  * Obtain the credential from the
16210                                  * thread calling connect(); the credential
16211                                  * lives on in the second mblk which
16212                                  * originated from T_CONN_REQ and is echoed
16213                                  * with the T_BIND_ACK from ip.  If none
16214                                  * can be found, default to the creator
16215                                  * of the socket.
16216                                  */
16217                                 if (mp->b_cont == NULL ||
16218                                     (cr = DB_CRED(mp->b_cont)) == NULL) {
16219                                         cr = tcp->tcp_cred;
16220                                         pid = tcp->tcp_cpid;
16221                                 } else {
16222                                         pid = DB_CPID(mp->b_cont);
16223                                 }
16224 
16225                                 TCP_RECORD_TRACE(tcp, syn_mp,
16226                                     TCP_TRACE_SEND_PKT);
16227                                 mblk_setcred(syn_mp, cr);
16228                                 DB_CPID(syn_mp) = pid;


















16229                                 tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
16230                         }
16231                 after_syn_sent:
16232                         /*
16233                          * A trailer mblk indicates a waiting client upstream.
16234                          * We complete here the processing begun in
16235                          * either tcp_bind() or tcp_connect() by passing
16236                          * upstream the reply message they supplied.
16237                          */
16238                         mp1 = mp;
16239                         mp = mp->b_cont;
16240                         freeb(mp1);
16241                         if (mp)
16242                                 break;
16243                         return;
16244                 case T_ERROR_ACK:
16245                         if (tcp->tcp_debug) {
16246                                 (void) strlog(TCP_MOD_ID, 0, 1,
16247                                     SL_TRACE|SL_ERROR,
16248                                     "tcp_rput_other: case T_ERROR_ACK, "


17828         }
17829 
17830         /*
17831          * Need to clean up all the eagers since after the unbind, segments
17832          * will no longer be delivered to this listener stream.
17833          */
17834         mutex_enter(&tcp->tcp_eager_lock);
17835         if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
17836                 tcp_eager_cleanup(tcp, 0);
17837         }
17838         mutex_exit(&tcp->tcp_eager_lock);
17839 
17840         if (tcp->tcp_ipversion == IPV4_VERSION) {
17841                 tcp->tcp_ipha->ipha_src = 0;
17842         } else {
17843                 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
17844         }
17845         V6_SET_ZERO(tcp->tcp_ip_src_v6);
17846         bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport));
17847         tcp_bind_hash_remove(tcp);


17848         tcp->tcp_state = TCPS_IDLE;
17849         tcp->tcp_mdt = B_FALSE;
17850         /* Send M_FLUSH according to TPI */
17851         (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
17852         connp = tcp->tcp_connp;
17853         connp->conn_mdt_ok = B_FALSE;
17854         ipcl_hash_remove(connp);
17855         bzero(&connp->conn_ports, sizeof (connp->conn_ports));
17856         mp = mi_tpi_ok_ack_alloc(mp);
17857         putnext(tcp->tcp_rq, mp);
17858 }
17859 
17860 /*
17861  * Don't let port fall into the privileged range.
17862  * Since the extra privileged ports can be arbitrary we also
17863  * ensure that we exclude those from consideration.
17864  * tcp_g_epriv_ports is not sorted thus we loop over it until
17865  * there are no changes.
17866  *
17867  * Note: No locks are held when inspecting tcp_g_*epriv_ports


19506 
19507         return (B_TRUE);
19508 }
19509 
19510 static void
19511 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
19512 {
19513         ipha_t          *ipha;
19514         ipaddr_t        src;
19515         ipaddr_t        dst;
19516         uint32_t        cksum;
19517         ire_t           *ire;
19518         uint16_t        *up;
19519         ill_t           *ill;
19520         conn_t          *connp = tcp->tcp_connp;
19521         uint32_t        hcksum_txflags = 0;
19522         mblk_t          *ire_fp_mp;
19523         uint_t          ire_fp_mp_len;
19524         tcp_stack_t     *tcps = tcp->tcp_tcps;
19525         ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;

19526 
19527         ASSERT(DB_TYPE(mp) == M_DATA);
19528 
19529         if (DB_CRED(mp) == NULL)
19530                 mblk_setcred(mp, CONN_CRED(connp));
19531 
19532         ipha = (ipha_t *)mp->b_rptr;
19533         src = ipha->ipha_src;
19534         dst = ipha->ipha_dst;
19535 












19536         /*
19537          * Drop off fast path for IPv6 and also if options are present or
19538          * we need to resolve a TS label.
19539          */
19540         if (tcp->tcp_ipversion != IPV4_VERSION ||
19541             !IPCL_IS_CONNECTED(connp) ||
19542             !CONN_IS_LSO_MD_FASTPATH(connp) ||
19543             (connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
19544             !connp->conn_ulp_labeled ||
19545             ipha->ipha_ident == IP_HDR_INCLUDED ||
19546             ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
19547             IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
19548                 if (tcp->tcp_snd_zcopy_aware)
19549                         mp = tcp_zcopy_disable(tcp, mp);
19550                 TCP_STAT(tcps, tcp_ip_send);















19551                 CALL_IP_WPUT(connp, q, mp);
19552                 return;
19553         }
19554 
19555         if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) {
19556                 if (tcp->tcp_snd_zcopy_aware)
19557                         mp = tcp_zcopy_backoff(tcp, mp, 0);
19558                 CALL_IP_WPUT(connp, q, mp);
19559                 return;
19560         }
19561         ire_fp_mp = ire->ire_nce->nce_fp_mp;
19562         ire_fp_mp_len = MBLKL(ire_fp_mp);
19563 
19564         ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
19565         ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
19566 #ifndef _BIG_ENDIAN
19567         ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
19568 #endif
19569 
19570         /*


20939 
20940                         /*
20941                          * Set FIN bit if this is our last segment; snxt
20942                          * already includes its length, and it will not
20943                          * be adjusted after this point.
20944                          */
20945                         if (tcp->tcp_valid_bits == TCP_FSS_VALID &&
20946                             *snxt == tcp->tcp_fss) {
20947                                 if (!tcp->tcp_fin_acked) {
20948                                         tcp->tcp_tcph->th_flags[0] |= TH_FIN;
20949                                         BUMP_MIB(&tcps->tcps_mib,
20950                                             tcpOutControl);
20951                                 }
20952                                 if (!tcp->tcp_fin_sent) {
20953                                         tcp->tcp_fin_sent = B_TRUE;
20954                                         /*
20955                                          * tcp state must be ESTABLISHED
20956                                          * in order for us to get here in
20957                                          * the first place.
20958                                          */



20959                                         tcp->tcp_state = TCPS_FIN_WAIT_1;
20960 
20961                                         /*
20962                                          * Upon returning from this routine,
20963                                          * tcp_wput_data() will set tcp_snxt
20964                                          * to be equal to snxt + tcp_fin_sent.
20965                                          * This is essentially the same as
20966                                          * setting it to tcp_fss + 1.
20967                                          */
20968                                 }
20969                         }
20970 
20971                         tcp->tcp_last_sent_len = (ushort_t)len;
20972 
20973                         len += tcp_hdr_len;
20974                         if (tcp->tcp_ipversion == IPV4_VERSION)
20975                                 tcp->tcp_ipha->ipha_length = htons(len);
20976                         else
20977                                 tcp->tcp_ip6h->ip6_plen = htons(len -
20978                                     ((char *)&tcp->tcp_ip6h[1] -


21574         if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
21575                 ASSERT(ill->ill_hcksum_capab != NULL);
21576                 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
21577         }
21578 
21579         /*
21580          * Since the TCP checksum should be recalculated by h/w, we can just
21581          * zero the checksum field for HCK_FULLCKSUM, or calculate partial
21582          * pseudo-header checksum for HCK_PARTIALCKSUM.
21583          * The partial pseudo-header excludes TCP length, that was calculated
21584          * in tcp_send(), so zero *up before further processing.
21585          */
21586         cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
21587 
21588         up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
21589         *up = 0;
21590 
21591         IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
21592             IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
21593 



21594         /*
21595          * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp).
21596          */
21597         DB_LSOFLAGS(mp) |= HW_LSO;
21598         DB_LSOMSS(mp) = mss;
21599 
21600         ipha->ipha_fragment_offset_and_flags |=
21601             (uint32_t)htons(ire->ire_frag_flag);
21602 
21603         ire_fp_mp = ire->ire_nce->nce_fp_mp;
21604         ire_fp_mp_len = MBLKL(ire_fp_mp);
21605         ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
21606         mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
21607         bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
21608 
21609         UPDATE_OB_PKT_COUNT(ire);
21610         ire->ire_last_used_time = lbolt;
21611         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
21612         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
21613         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,


23219         }
23220 
23221         if (mctl_present) {
23222                 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
23223 
23224                 ASSERT(ii->ipsec_in_type == IPSEC_IN);
23225                 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) {
23226                         return;
23227                 }
23228         }
23229         if (zoneid == ALL_ZONES)
23230                 zoneid = GLOBAL_ZONEID;
23231 
23232         /* Add the zoneid so ip_output routes it properly */
23233         if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) {
23234                 freemsg(ipsec_mp);
23235                 return;
23236         }
23237         ipsec_mp = nmp;
23238 







23239         /*
23240          * NOTE:  one might consider tracing a TCP packet here, but
23241          * this function has no active TCP state and no tcp structure
23242          * that has a trace buffer.  If we traced here, we would have
23243          * to keep a local trace buffer in tcp_record_trace().
23244          *
23245          * TSol note: The mblk that contains the incoming packet was
23246          * reused by tcp_xmit_listener_reset, so it already contains
23247          * the right credentials and we don't need to call mblk_setcred.
23248          * Also the conn's cred is not right since it is associated
23249          * with tcps_g_q.
23250          */
23251         CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp);
23252 
23253         /*
23254          * Tell IP to mark the IRE used for this destination temporary.
23255          * This way, we can limit our exposure to DoS attack because IP
23256          * creates an IRE for each destination.  If there are too many,
23257          * the time to do any routing lookup will be extremely long.  And
23258          * the lookup can be in interrupt context.


23439                 if (ipsec_mp == NULL)
23440                         return;
23441         }
23442         if (is_system_labeled() && !tsol_can_reply_error(mp)) {
23443                 DTRACE_PROBE2(
23444                     tx__ip__log__error__nolistener__tcp,
23445                     char *, "Could not reply with RST to mp(1)",
23446                     mblk_t *, mp);
23447                 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
23448                 freemsg(ipsec_mp);
23449                 return;
23450         }
23451 
23452         rptr = mp->b_rptr;
23453 
23454         tcph = (tcph_t *)&rptr[ip_hdr_len];
23455         seg_seq = BE32_TO_U32(tcph->th_seq);
23456         seg_ack = BE32_TO_U32(tcph->th_ack);
23457         flags = tcph->th_flags[0];
23458 







23459         seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
23460         if (flags & TH_RST) {
23461                 freemsg(ipsec_mp);
23462         } else if (flags & TH_ACK) {
23463                 tcp_xmit_early_reset("no tcp, reset",
23464                     ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps,
23465                     connp);
23466         } else {
23467                 if (flags & TH_SYN) {
23468                         seg_len++;
23469                 } else {
23470                         /*
23471                          * Here we violate the RFC.  Note that a normal
23472                          * TCP will never send a segment without the ACK
23473                          * flag, except for RST or SYN segment.  This
23474                          * segment is neither.  Just drop it on the
23475                          * floor.
23476                          */
23477                         freemsg(ipsec_mp);
23478                         tcps->tcps_rst_unsent++;


23776                         /*
23777                          * Get IP set to checksum on our behalf
23778                          * Include the adjustment for a source route if any.
23779                          */
23780                         u1 += tcp->tcp_sum;
23781                         u1 = (u1 >> 16) + (u1 & 0xFFFF);
23782                         U16_TO_BE16(u1, tcph->th_sum);
23783                         BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23784                 }
23785                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
23786                     (seq + data_length) == tcp->tcp_fss) {
23787                         if (!tcp->tcp_fin_acked) {
23788                                 flags |= TH_FIN;
23789                                 BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23790                         }
23791                         if (!tcp->tcp_fin_sent) {
23792                                 tcp->tcp_fin_sent = B_TRUE;
23793                                 switch (tcp->tcp_state) {
23794                                 case TCPS_SYN_RCVD:
23795                                 case TCPS_ESTABLISHED:



23796                                         tcp->tcp_state = TCPS_FIN_WAIT_1;
23797                                         break;
23798                                 case TCPS_CLOSE_WAIT:



23799                                         tcp->tcp_state = TCPS_LAST_ACK;
23800                                         break;
23801                                 }
23802                                 if (tcp->tcp_suna == tcp->tcp_snxt)
23803                                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
23804                                 tcp->tcp_snxt = tcp->tcp_fss + 1;
23805                         }
23806                 }
23807                 /*
23808                  * Note the trick here.  u1 is unsigned.  When tcp_urg
23809                  * is smaller than seq, u1 will become a very huge value.
23810                  * So the comparison will fail.  Also note that tcp_urp
23811                  * should be positive, see RFC 793 page 17.
23812                  */
23813                 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
23814                 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
23815                     u1 < (uint32_t)(64 * 1024)) {
23816                         flags |= TH_URG;
23817                         BUMP_MIB(&tcps->tcps_mib, tcpOutUrg);
23818                         U32_TO_ABE16(u1, tcph->th_urp);


24128 /* ARGSUSED */
24129 static tcp_t *
24130 tcp_alloc_temp_tcp(in_port_t port, tcp_stack_t *tcps)
24131 {
24132         conn_t  *connp;
24133         tcp_t   *tcp;
24134 
24135         connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP, tcps->tcps_netstack);
24136         if (connp == NULL)
24137                 return (NULL);  /* no conn_t could be created */
24138 
24139         tcp = connp->conn_tcp;  /* tcp_t that comes with the new conn_t */
24140         tcp->tcp_tcps = tcps;
24141         TCPS_REFHOLD(tcps);     /* presumably the ref backing tcp_tcps above */
24142 
24143         /*
24144          * Only initialize the necessary info in those structures.  Note
24145          * that since INADDR_ANY is all 0, we do not need to set
24146          * tcp_bound_source to INADDR_ANY here.
24147          */


24148         tcp->tcp_state = TCPS_BOUND;    /* assigned directly, no bind call */
24149         tcp->tcp_lport = port;          /* stored exactly as passed in */
24150         tcp->tcp_exclbind = 1;
24151         tcp->tcp_reserved_port = 1;
24152 
24153         /* Placeholder value only; IPv4 is simply the version chosen here. */
24154         tcp->tcp_ipversion = IPV4_VERSION;
24155 
24156         return (tcp);
24157 }
24158 
24159 /*
24160  * To remove a port range specified by lo_port and hi_port from the
24161  * reserved port ranges.  This is one of the three public functions of
24162  * the reserved port interface.  Note that a port range has to be removed
24163  * as a whole.  Ports in a range cannot be removed individually.
24164  *
24165  * Params:
24166  *      in_port_t lo_port: the beginning port of the reserved port range to
24167  *              be deleted.




3460         tbr->PRIM_type = T_BIND_ACK;
3461         mp->b_datap->db_type = M_PCPROTO;
3462 
3463         /* Chain in the reply mp for tcp_rput() */
3464         mp1->b_cont = mp;
3465         mp = mp1;
3466 
3467         tcp->tcp_conn_req_max = tbr->CONIND_number;
3468         if (tcp->tcp_conn_req_max) {
3469                 if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min)
3470                         tcp->tcp_conn_req_max = tcps->tcps_conn_req_min;
3471                 if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q)
3472                         tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q;
3473                 /*
3474                  * If this is a listener, do not reset the eager list
3475                  * and other stuffs.  Note that we don't check if the
3476                  * existing eager list meets the new tcp_conn_req_max
3477                  * requirement.
3478                  */
3479                 if (tcp->tcp_state != TCPS_LISTEN) {
3480                         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3481                             tcp_t *, tcp, int32_t, TCPS_LISTEN);
3482                         tcp->tcp_state = TCPS_LISTEN;
3483                         /* Initialize the chain. Don't need the eager_lock */
3484                         tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
3485                         tcp->tcp_eager_next_drop_q0 = tcp;
3486                         tcp->tcp_eager_prev_drop_q0 = tcp;
3487                         tcp->tcp_second_ctimer_threshold =
3488                             tcps->tcps_ip_abort_linterval;
3489                 }
3490         }
3491 
3492         /*
3493          * We can call ip_bind directly which returns a T_BIND_ACK mp. The
3494          * processing continues in tcp_rput_other().
3495          *
3496          * We need to make sure that the conn_recv is set to a non-null
3497          * value before we insert the conn into the classifier table.
3498          * This is to avoid a race with an incoming packet which does an
3499          * ipcl_classify().
3500          */
3501         connp->conn_recv = tcp_conn_request;


3758                                  * address and source port, which is
3759                                  * refused regardless of the
3760                                  * SO_REUSEADDR setting, so we break.
3761                                  */
3762                                 if (IN6_ARE_ADDR_EQUAL(laddr,
3763                                     &ltcp->tcp_bound_source_v6) &&
3764                                     (ltcp->tcp_state == TCPS_LISTEN ||
3765                                     ltcp->tcp_state == TCPS_BOUND))
3766                                         break;
3767                         }
3768                 }
3769                 if (ltcp != NULL) {
3770                         /* The port number is busy */
3771                         mutex_exit(&tbf->tf_lock);
3772                 } else {
3773                         /*
3774                          * This port is ours. Insert in fanout and mark as
3775                          * bound to prevent others from getting the port
3776                          * number.
3777                          */
3778                         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
3779                             tcp_t *, tcp, int32_t, TCPS_BOUND);
3780                         tcp->tcp_state = TCPS_BOUND;
3781                         tcp->tcp_lport = htons(port);
3782                         *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport;
3783 
3784                         ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH(
3785                             tcp->tcp_lport)] == tbf);
3786                         tcp_bind_hash_insert(tbf, tcp, 1);
3787 
3788                         mutex_exit(&tbf->tf_lock);
3789 
3790                         /*
3791                          * We don't want tcp_next_port_to_try to "inherit"
3792                          * a port number supplied by the user in a bind.
3793                          */
3794                         if (user_specified)
3795                                 return (port);
3796 
3797                         /*
3798                          * This is the only place where tcp_next_port_to_try
3799                          * is updated. After the update, it may or may not


3891             tcp->tcp_ipversion == IPV6_VERSION)));
3892 
3893         if (TCP_IS_DETACHED(tcp)) {
3894                 if (tcp->tcp_hard_binding) {
3895                         /*
3896                          * It's an eager that we are dealing with. We close the
3897                          * eager but in case a conn_ind has already gone to the
3898                          * listener, let tcp_accept_finish() send a discon_ind
3899                          * to the listener and drop the last reference. If the
3900                          * listener doesn't even know about the eager i.e. the
3901                          * conn_ind hasn't gone up, blow away the eager and drop
3902                          * the last reference as well. If the conn_ind has gone
3903                          * up, state should be BOUND. tcp_accept_finish
3904                          * will figure out that the connection has received a
3905                          * RST and will send a DISCON_IND to the application.
3906                          */
3907                         tcp_closei_local(tcp);
3908                         if (!tcp->tcp_tconnind_started) {
3909                                 CONN_DEC_REF(tcp->tcp_connp);
3910                         } else {
3911                                 DTRACE_TCP4(state__change, void, NULL,
3912                                     conn_t *, NULL, tcp_t *, tcp, int32_t,
3913                                     TCPS_BOUND);
3914                                 tcp->tcp_state = TCPS_BOUND;
3915                         }
3916                 } else {
3917                         tcp_close_detached(tcp);
3918                 }
3919                 return (0);
3920         }
3921 
3922         TCP_STAT(tcps, tcp_clean_death_nondetached);
3923 
3924         /*
3925          * If T_ORDREL_IND has not been sent yet (done when service routine
3926          * is run) postpone cleaning up the endpoint until service routine
3927          * has sent up the T_ORDREL_IND. Avoid clearing out an existing
3928          * client_errno since tcp_close uses the client_errno field.
3929          */
3930         if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
3931                 if (err != 0)
3932                         tcp->tcp_client_errno = err;
3933 


4627                 (void) tcp_time_wait_remove(tcp, NULL);
4628         CL_INET_DISCONNECT(tcp);
4629         ipcl_hash_remove(connp);
4630 
4631         /*
4632          * Delete the cached ire in conn_ire_cache and also mark
4633          * the conn as CONDEMNED
4634          */
4635         mutex_enter(&connp->conn_lock);
4636         connp->conn_state_flags |= CONN_CONDEMNED;
4637         ire = connp->conn_ire_cache;
4638         connp->conn_ire_cache = NULL;
4639         mutex_exit(&connp->conn_lock);
4640         if (ire != NULL)
4641                 IRE_REFRELE_NOTR(ire);
4642 
4643         /* Need to cleanup any pending ioctls */
4644         ASSERT(tcp->tcp_time_wait_next == NULL);
4645         ASSERT(tcp->tcp_time_wait_prev == NULL);
4646         ASSERT(tcp->tcp_time_wait_expire == 0);
4647         if (connp->conn_fully_bound) {
4648                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
4649                     tcp_t *, tcp, int32_t, TCPS_CLOSED);
4650         }
4651         tcp->tcp_state = TCPS_CLOSED;
4652 
4653         /* Release any SSL context */
4654         if (tcp->tcp_kssl_ent != NULL) {
4655                 kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
4656                 tcp->tcp_kssl_ent = NULL;
4657         }
4658         if (tcp->tcp_kssl_ctx != NULL) {
4659                 kssl_release_ctx(tcp->tcp_kssl_ctx);
4660                 tcp->tcp_kssl_ctx = NULL;
4661         }
4662         tcp->tcp_kssl_pending = B_FALSE;
4663 
4664         tcp_ipsec_cleanup(tcp);
4665 }
4666 
4667 /*
4668  * tcp is dying (called from ipcl_conn_destroy and error cases).
4669  * Free the tcp_t in either case.
4670  */


5872                         DTRACE_PROBE3(
5873                             tx__ip__log__error__connrequest__tcp,
5874                             char *, "eager connp(1) label on SYN mp(2) failed",
5875                             conn_t *, econnp, mblk_t *, mp);
5876                         goto error3;
5877                 }
5878         }
5879 
5880         eager->tcp_hard_binding = B_TRUE;
5881 
5882         tcp_bind_hash_insert(&tcps->tcps_bind_fanout[
5883             TCP_BIND_HASH(eager->tcp_lport)], eager, 0);
5884 
5885         CL_INET_CONNECT(eager);
5886 
5887         /*
5888          * No need to check for multicast destination since ip will only pass
5889          * up multicasts to those that have expressed interest
5890          * TODO: what about rejecting broadcasts?
5891          * Also check that source is not a multicast or broadcast address.
5892          *
5893          * DTrace tcp:::state-change is probed a little further down,
5894          * where it is set for the second time.
5895          */
5896         eager->tcp_state = TCPS_SYN_RCVD;
5897 
5898 
5899         /*
5900          * There should be no ire in the mp as we are being called after
5901          * receiving the SYN.
5902          */
5903         ASSERT(tcp_ire_mp(mp) == NULL);
5904 
5905         /*
5906          * Adapt our mss, ttl, ... according to information provided in IRE.
5907          */
5908 
5909         if (tcp_adapt_ire(eager, NULL) == 0) {
5910                 /* Undo the bind_hash_insert */
5911                 tcp_bind_hash_remove(eager);
5912                 goto error3;
5913         }
5914 
5915         /*
5916          * DTrace the first SYN as a tcp:::receive. This is placed after
5917          * tcp_adapt_ire() so that tcp->tcp_loopback has been set.
5918          */
5919         DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
5920             mp->b_rptr, tcp_t *, tcp, tcph_t *, tcph);
5921 
5922         /* Process all TCP options. */
5923         tcp_process_options(eager, tcph);
5924 
5925         /* Is the other end ECN capable? */
5926         if (tcps->tcps_ecn_permitted >= 1 &&
5927             (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
5928                 eager->tcp_ecn_ok = B_TRUE;
5929         }
5930 
5931         /*
5932          * listener->tcp_rq->q_hiwat should be the default window size or a
5933          * window size changed via SO_RCVBUF option.  First round up the
5934          * eager's tcp_rwnd to the nearest MSS.  Then find out the window
5935          * scale option value if needed.  Call tcp_rwnd_set() to finish the
5936          * setting.
5937          *
5938          * Note if there is a rpipe metric associated with the remote host,
5939          * we should not inherit receive window size from listener.
5940          */
5941         eager->tcp_rwnd = MSS_ROUNDUP(


6018                 if (addr_cache != NULL && eager->tcp_remote ==
6019                     addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) {
6020                         eager->tcp_dontdrop = B_TRUE;
6021                 }
6022         }
6023 
6024         /*
6025          * We need to insert the eager in its own perimeter but as soon
6026          * as we do that, we expose the eager to the classifier and
6027          * should not touch any field outside the eager's perimeter.
6028          * So do all the work necessary before inserting the eager
6029          * in its own perimeter. Be optimistic that ipcl_conn_insert()
6030          * will succeed but undo everything if it fails.
6031          */
6032         seg_seq = ABE32_TO_U32(tcph->th_seq);
6033         eager->tcp_irs = seg_seq;
6034         eager->tcp_rack = seg_seq;
6035         eager->tcp_rnxt = seg_seq + 1;
6036         U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack);
6037         BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens);
6038         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, eager,
6039             int32_t, TCPS_SYN_RCVD);
6040         eager->tcp_state = TCPS_SYN_RCVD;
6041         mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss,
6042             NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE);
6043         if (mp1 == NULL) {
6044                 /*
6045                  * Increment the ref count as we are going to
6046                  * enqueue an mp in squeue
6047                  */
6048                 CONN_INC_REF(econnp);
6049                 goto error;
6050         }
6051         DB_CPID(mp1) = tcp->tcp_cpid;
6052         eager->tcp_cpid = tcp->tcp_cpid;
6053         eager->tcp_open_time = lbolt64;
6054 
6055         /*
6056          * We need to start the rto timer. In normal case, we start
6057          * the timer after sending the packet on the wire (or at
6058          * least believing that packet was sent by waiting for
6059          * CALL_IP_WPUT() to return). Since this is the first packet


6129                          * anymore (someone blew it away). Just
6130                          * free this message and hopefully remote
6131                          * will retransmit at which time the SYN can be
6132                          * treated as a new connection or dealt with
6133                          * a TH_RST if a connection already exists.
6134                          */
6135                         CONN_DEC_REF(econnp);
6136                         freemsg(mp);
6137                 } else {
6138                         squeue_fill(econnp->conn_sqp, mp, tcp_input,
6139                             econnp, SQTAG_TCP_CONN_REQ_1);
6140                 }
6141         } else {
6142                 /* Nobody wants this packet */
6143                 freemsg(mp);
6144         }
6145         return;
6146 error3:
6147         CONN_DEC_REF(econnp);
6148 error2:
6149         /*
6150          * DTrace this tcp:::receive event, as we skipped the previous receive
6151          * probe. For DTrace only, we find the IP header length so that the
6152          * TCP header can be found.
6153          */
6154         ipvers = IPH_HDR_VERSION(mp->b_rptr);
6155         if (OK_32PTR(mp->b_rptr) &&
6156             (ipvers == IPV4_VERSION || ipvers == IPV6_VERSION)) {
6157                 if (ipvers == IPV4_VERSION)
6158                         ip_hdr_len = IPH_HDR_LENGTH((ipha_t *)mp->b_rptr);
6159                 else
6160                         ip_hdr_len = ip_hdr_length_v6(mp, (ip6_t *)mp->b_rptr);
6161                 DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL,
6162                     void_ip_t *, mp->b_rptr, tcp_t *, NULL, tcph_t *,
6163                     &mp->b_rptr[ip_hdr_len]);
6164         }
6165 
6166         freemsg(mp);
6167 }
6168 
6169 /*
6170  * In an ideal case of vertical partition in NUMA architecture, it's
6171  * beneficial to have the listener and all the incoming connections
6172  * tied to the same squeue. The other constraint is that incoming
6173  * connections should be tied to the squeue attached to interrupted
6174  * CPU for obvious locality reason so this leaves the listener to
6175  * be tied to the same squeue. Our only problem is that when listener
6176  * is binding, the CPU that will get interrupted by the NIC whose
6177  * IP address the listener is binding to is not even known. So
6178  * the code below allows us to change that binding at the time the
6179  * CPU is interrupted by virtue of incoming connection's squeue.
6180  *
6181  * This is useful only in case of a listener bound to a specific IP
6182  * address. For other kind of listeners, they get bound the
6183  * very first time and there is no attempt to rebind them.
6184  */
6185 void


6644          * At this point the remote destination address and remote port fields
6645          * in the tcp-four-tuple have been filled in the tcp structure. Now we
6646          * have to see which state tcp was in so we can take appropriate action.
6647          */
6648         if (oldstate == TCPS_IDLE) {
6649                 /*
6650                  * We support a quick connect capability here, allowing
6651                  * clients to transition directly from IDLE to SYN_SENT
6652                  * tcp_bindi will pick an unused port, insert the connection
6653                  * in the bind hash and transition to BOUND state.
6654                  */
6655                 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6656                     tcp, B_TRUE);
6657                 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6658                     B_FALSE, B_FALSE);
6659                 if (lport == 0) {
6660                         mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6661                         goto failed;
6662                 }
6663         }
6664         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6665             int32_t, TCPS_SYN_SENT);
6666         tcp->tcp_state = TCPS_SYN_SENT;
6667 
6668         /*
6669          * TODO: allow data with connect requests
6670          * by unlinking M_DATA trailers here and
6671          * linking them in behind the T_OK_ACK mblk.
6672          * The tcp_rput() bind ack handler would then
6673          * feed them to tcp_wput_data() rather than call
6674          * tcp_timer().
6675          */
6676         mp = mi_tpi_ok_ack_alloc(mp);
6677         if (!mp) {
6678                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6679                     tcp_t *, tcp, int32_t, oldstate);
6680                 tcp->tcp_state = oldstate;
6681                 goto failed;
6682         }
6683         if (tcp->tcp_family == AF_INET) {
6684                 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6685                     sizeof (ipa_conn_t));
6686         } else {
6687                 mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ,
6688                     sizeof (ipa6_conn_t));
6689         }
6690         if (mp1) {
6691                 /*
6692                  * We need to make sure that the conn_recv is set to a non-null
6693                  * value before we insert the conn_t into the classifier table.
6694                  * This is to avoid a race with an incoming packet which does
6695                  * an ipcl_classify().
6696                  */
6697                 tcp->tcp_connp->conn_recv = tcp_input;
6698 
6699                 /* Hang onto the T_OK_ACK for later. */
6700                 linkb(mp1, mp);
6701                 mblk_setcred(mp1, tcp->tcp_cred);
6702                 if (tcp->tcp_family == AF_INET)
6703                         mp1 = ip_bind_v4(tcp->tcp_wq, mp1, tcp->tcp_connp);
6704                 else {
6705                         mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6706                             &tcp->tcp_sticky_ipp);
6707                 }
6708                 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6709                 tcp->tcp_active_open = 1;
6710                 /*
6711                  * If the bind cannot complete immediately
6712                  * IP will arrange to call tcp_rput_other
6713                  * when the bind completes.
6714                  */
6715                 if (mp1 != NULL)
6716                         tcp_rput_other(tcp, mp1);
6717                 return;
6718         }
6719         /* Error case */
6720         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6721             int32_t, oldstate);
6722         tcp->tcp_state = oldstate;
6723         mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6724 
6725 failed:
6726         /* return error ack and blow away saved option results if any */
6727         if (mp != NULL)
6728                 putnext(tcp->tcp_rq, mp);
6729         else {
6730                 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6731                     TSYSERR, ENOMEM);
6732         }
6733         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6734                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6735 
6736 }
6737 
6738 /*
6739  * Handle connect to IPv6 destinations.
6740  */
6741 static void


6858          * At this point the remote destination address and remote port fields
6859          * in the tcp-four-tuple have been filled in the tcp structure. Now we
6860          * have to see which state tcp was in so we can take appropriate action.
6861          */
6862         if (oldstate == TCPS_IDLE) {
6863                 /*
6864                  * We support a quick connect capability here, allowing
6865                  * clients to transition directly from IDLE to SYN_SENT
6866                  * tcp_bindi will pick an unused port, insert the connection
6867                  * in the bind hash and transition to BOUND state.
6868                  */
6869                 lport = tcp_update_next_port(tcps->tcps_next_port_to_try,
6870                     tcp, B_TRUE);
6871                 lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE,
6872                     B_FALSE, B_FALSE);
6873                 if (lport == 0) {
6874                         mp = mi_tpi_err_ack_alloc(mp, TNOADDR, 0);
6875                         goto failed;
6876                 }
6877         }
6878         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6879             int32_t, TCPS_SYN_SENT);
6880         tcp->tcp_state = TCPS_SYN_SENT;
6881         /*
6882          * TODO: allow data with connect requests
6883          * by unlinking M_DATA trailers here and
6884          * linking them in behind the T_OK_ACK mblk.
6885          * The tcp_rput() bind ack handler would then
6886          * feed them to tcp_wput_data() rather than call
6887          * tcp_timer().
6888          */
6889         mp = mi_tpi_ok_ack_alloc(mp);
6890         if (!mp) {
6891                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
6892                     tcp_t *, tcp, int32_t, oldstate);
6893                 tcp->tcp_state = oldstate;
6894                 goto failed;
6895         }
6896         mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, sizeof (ipa6_conn_t));
6897         if (mp1) {
6898                 /*
6899                  * We need to make sure that the conn_recv is set to a non-null
6900                  * value before we insert the conn_t into the classifier table.
6901                  * This is to avoid a race with an incoming packet which does
6902                  * an ipcl_classify().
6903                  */
6904                 tcp->tcp_connp->conn_recv = tcp_input;
6905 
6906                 /* Hang onto the T_OK_ACK for later. */
6907                 linkb(mp1, mp);
6908                 mblk_setcred(mp1, tcp->tcp_cred);
6909                 mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp,
6910                     &tcp->tcp_sticky_ipp);
6911                 BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens);
6912                 tcp->tcp_active_open = 1;
6913                 /* ip_bind_v6() may return ACK or ERROR */
6914                 if (mp1 != NULL)
6915                         tcp_rput_other(tcp, mp1);
6916                 return;
6917         }
6918         /* Error case */
6919         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
6920             int32_t, oldstate);
6921         tcp->tcp_state = oldstate;
6922         mp = mi_tpi_err_ack_alloc(mp, TSYSERR, ENOMEM);
6923 
6924 failed:
6925         /* return error ack and blow away saved option results if any */
6926         if (mp != NULL)
6927                 putnext(tcp->tcp_rq, mp);
6928         else {
6929                 tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
6930                     TSYSERR, ENOMEM);
6931         }
6932         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
6933                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
6934 }
6935 
6936 /*
6937  * We need a stream q for detached closing tcp connections
6938  * to use.  Our client hereby indicates that this q is the
6939  * one to use.
6940  */


7049                 ltcp = NULL;
7050                 /*
7051                  * If it used to be a listener, check to make sure no one else
7052                  * has taken the port before switching back to LISTEN state.
7053                  */
7054                 if (tcp->tcp_ipversion == IPV4_VERSION) {
7055                         connp = ipcl_lookup_listener_v4(tcp->tcp_lport,
7056                             tcp->tcp_ipha->ipha_src,
7057                             tcp->tcp_connp->conn_zoneid, ipst);
7058                         if (connp != NULL)
7059                                 ltcp = connp->conn_tcp;
7060                 } else {
7061                         /* Allow tcp_bound_if listeners? */
7062                         connp = ipcl_lookup_listener_v6(tcp->tcp_lport,
7063                             &tcp->tcp_ip6h->ip6_src, 0,
7064                             tcp->tcp_connp->conn_zoneid, ipst);
7065                         if (connp != NULL)
7066                                 ltcp = connp->conn_tcp;
7067                 }
7068                 if (tcp->tcp_conn_req_max && ltcp == NULL) {
7069                         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7070                             tcp_t *, tcp, int32_t, TCPS_LISTEN);
7071                         tcp->tcp_state = TCPS_LISTEN;
7072                 } else if (old_state > TCPS_BOUND) {
7073                         tcp->tcp_conn_req_max = 0;
7074                         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7075                             tcp_t *, tcp, int32_t, TCPS_BOUND);
7076                         tcp->tcp_state = TCPS_BOUND;
7077                 }
7078                 if (ltcp != NULL)
7079                         CONN_DEC_REF(ltcp->tcp_connp);
7080                 if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) {
7081                         BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails);
7082                 } else if (old_state == TCPS_ESTABLISHED ||
7083                     old_state == TCPS_CLOSE_WAIT) {
7084                         BUMP_MIB(&tcps->tcps_mib, tcpEstabResets);
7085                 }
7086 
7087                 if (tcp->tcp_fused)
7088                         tcp_unfuse(tcp);
7089 
7090                 mutex_enter(&tcp->tcp_eager_lock);
7091                 if ((tcp->tcp_conn_req_cnt_q0 != 0) ||
7092                     (tcp->tcp_conn_req_cnt_q != 0)) {
7093                         tcp_eager_cleanup(tcp, 0);
7094                 }
7095                 mutex_exit(&tcp->tcp_eager_lock);


7956         tcp_ipsec_cleanup(tcp);
7957 
7958         if (tcp->tcp_conn_req_max != 0) {
7959                 /*
7960                  * This is the case when a TLI program uses the same
7961                  * transport end point to accept a connection.  This
7962                  * makes the TCP both a listener and acceptor.  When
7963                  * this connection is closed, we need to set the state
7964                  * back to TCPS_LISTEN.  Make sure that the eager list
7965                  * is reinitialized.
7966                  *
7967                  * Note that this stream is still bound to the four
7968                  * tuples of the previous connection in IP.  If a new
7969                  * SYN with different foreign address comes in, IP will
7970                  * not find it and will send it to the global queue.  In
7971                  * the global queue, TCP will do a tcp_lookup_listener()
7972                  * to find this stream.  This works because this stream
7973                  * is only removed from connected hash.
7974                  *
7975                  */
7976                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7977                     tcp_t *, tcp, int32_t, TCPS_LISTEN);
7978                 tcp->tcp_state = TCPS_LISTEN;
7979                 tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
7980                 tcp->tcp_eager_next_drop_q0 = tcp;
7981                 tcp->tcp_eager_prev_drop_q0 = tcp;
7982                 tcp->tcp_connp->conn_recv = tcp_conn_request;
7983                 if (tcp->tcp_family == AF_INET6) {
7984                         ASSERT(tcp->tcp_connp->conn_af_isv6);
7985                         (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP,
7986                             &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport);
7987                 } else {
7988                         ASSERT(!tcp->tcp_connp->conn_af_isv6);
7989                         (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP,
7990                             tcp->tcp_ipha->ipha_src, tcp->tcp_lport);
7991                 }
7992         } else {
7993                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
7994                     tcp_t *, tcp, int32_t, TCPS_BOUND);
7995                 tcp->tcp_state = TCPS_BOUND;
7996         }
7997 
7998         /*
7999          * Initialize to default values
8000          * Can't fail since enough header template space already allocated
8001          * at open().
8002          */
8003         err = tcp_init_values(tcp);
8004         ASSERT(err == 0);
8005         /* Restore state in tcp_tcph */
8006         bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN);
8007         if (tcp->tcp_ipversion == IPV4_VERSION)
8008                 tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source;
8009         else
8010                 tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6;
8011         /*
8012          * A copy of the src addr. is needed in tcp_t
8013          * since the lookup funcs can only look at tcp_t
8014          */


8336         DONTCARE(tcp->tcmp_stk[0]);
8337 #endif
8338 
8339 
8340 #undef  DONTCARE
8341 #undef  PRESERVE
8342 }
8343 
8344 /*
8345  * Allocate necessary resources and initialize state vector.
8346  * Guaranteed not to fail so that when an error is returned,
8347  * the caller doesn't need to do any additional cleanup.
8348  */
8349 int
8350 tcp_init(tcp_t *tcp, queue_t *q)
8351 {
8352         int     err;
8353 
8354         tcp->tcp_rq = q;
8355         tcp->tcp_wq = WR(q);
8356         /* DTrace ignores this - it isn't a tcp:::state-change */
8357         tcp->tcp_state = TCPS_IDLE;
8358         if ((err = tcp_init_values(tcp)) != 0)
8359                 tcp_timers_stop(tcp);
8360         return (err);
8361 }
8362 
8363 static int
8364 tcp_init_values(tcp_t *tcp)
8365 {
8366         int     err;
8367         tcp_stack_t     *tcps = tcp->tcp_tcps;
8368 
8369         ASSERT((tcp->tcp_family == AF_INET &&
8370             tcp->tcp_ipversion == IPV4_VERSION) ||
8371             (tcp->tcp_family == AF_INET6 &&
8372             (tcp->tcp_ipversion == IPV4_VERSION ||
8373             tcp->tcp_ipversion == IPV6_VERSION)));
8374 
8375         /*
8376          * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO


13338                 if (tcp->tcp_detached || !pullupmsg(mp, -1)) {
13339                         freemsg(mp);
13340                         return;
13341                 }
13342                 /* Update pointers into message */
13343                 iphdr = rptr = mp->b_rptr;
13344                 tcph = (tcph_t *)&rptr[ip_hdr_len];
13345                 if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) {
13346                         /*
13347                          * Since we can't handle any data with this urgent
13348                          * pointer that is out of sequence, we expunge
13349                          * the data.  This allows us to still register
13350                          * the urgent mark and generate the M_PCSIG,
13351                          * which we can do.
13352                          */
13353                         mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph);
13354                         seg_len = 0;
13355                 }
13356         }
13357 
13358         DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
13359             iphdr, tcp_t *, tcp, tcph_t *, tcph);
13360         if (tcp->tcp_state == TCPS_SYN_RCVD && (flags & TH_ACK)) {
13361                 DTRACE_TCP5(accept__established, mblk_t *, NULL, conn_t *,
13362                     NULL, void_ip_t *, iphdr, tcp_t *, tcp, tcph_t *, tcph);
13363         }
13364 
13365         switch (tcp->tcp_state) {
13366         case TCPS_SYN_SENT:
13367                 if (flags & TH_ACK) {
13368                         /*
13369                          * Note that our stack cannot send data before a
13370                          * connection is established, therefore the
13371                          * following check is valid.  Otherwise, it has
13372                          * to be changed.
13373                          */
13374                         if (SEQ_LEQ(seg_ack, tcp->tcp_iss) ||
13375                             SEQ_GT(seg_ack, tcp->tcp_snxt)) {
13376                                 freemsg(mp);
13377                                 if (flags & TH_RST)
13378                                         return;
13379                                 tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq",
13380                                     tcp, seg_ack, 0, TH_RST);
13381                                 return;
13382                         }
13383                         ASSERT(tcp->tcp_suna + 1 == seg_ack);
13384                 }
13385                 if (flags & TH_RST) {
13386                         DTRACE_TCP5(connect__refused, mblk_t *, NULL,
13387                             conn_t *, NULL, void_ip_t *, iphdr, tcp_t *, NULL,
13388                             tcph_t *, tcph);
13389 
13390                         freemsg(mp);
13391                         if (flags & TH_ACK)
13392                                 (void) tcp_clean_death(tcp,
13393                                     ECONNREFUSED, 13);
13394                         return;
13395                 }
13396                 if (!(flags & TH_SYN)) {
13397                         freemsg(mp);
13398                         return;
13399                 }
13400 
13401                 /* Process all TCP options. */
13402                 tcp_process_options(tcp, tcph);
13403                 /*
13404                  * The following changes our rwnd to be a multiple of the
13405                  * MIN(peer MSS, our MSS) for performance reason.
13406                  */
13407                 (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(tcp->tcp_rq->q_hiwat,
13408                     tcp->tcp_mss));
13409 


13444                          *
13445                          * XXX: how can we pretend we didn't see it if we
13446                          * have updated rnxt et al.
13447                          *
13448                          * For loopback we defer sending up the T_CONN_CON
13449                          * until after some checks below.
13450                          */
13451                         mp1 = NULL;
13452                         if (!tcp_conn_con(tcp, iphdr, tcph, mp,
13453                             tcp->tcp_loopback ? &mp1 : NULL)) {
13454                                 freemsg(mp);
13455                                 return;
13456                         }
13457                         /* SYN was acked - making progress */
13458                         if (tcp->tcp_ipversion == IPV6_VERSION)
13459                                 tcp->tcp_ip_forward_progress = B_TRUE;
13460 
13461                         /* One for the SYN */
13462                         tcp->tcp_suna = tcp->tcp_iss + 1;
13463                         tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
13464                         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
13465                             tcp_t *, tcp, int32_t, TCPS_ESTABLISHED);
13466                         tcp->tcp_state = TCPS_ESTABLISHED;
13467 
13468                         /*
13469                          * For DTrace observability, remember that we just
13470                          * established a connection and are about to send
13471                          * the final ACK.
13472                          */
13473                         tcp->tcp_dtrace_connect_established = B_TRUE;
13474 
13475                         /*
13476                          * If SYN was retransmitted, need to reset all
13477                          * retransmission info.  This is because this
13478                          * segment will be treated as a dup ACK.
13479                          */
13480                         if (tcp->tcp_rexmit) {
13481                                 tcp->tcp_rexmit = B_FALSE;
13482                                 tcp->tcp_rexmit_nxt = tcp->tcp_snxt;
13483                                 tcp->tcp_rexmit_max = tcp->tcp_snxt;
13484                                 tcp->tcp_snd_burst = tcp->tcp_localnet ?
13485                                     TCP_CWND_INFINITE : TCP_CWND_NORMAL;
13486                                 tcp->tcp_ms_we_have_waited = 0;
13487 
13488                                 /*
13489                                  * Set tcp_cwnd back to 1 MSS, per
13490                                  * recommendation from
13491                                  * draft-floyd-incr-init-win-01.txt,
13492                                  * Increasing TCP's Initial Window.
13493                                  */
13494                                 tcp->tcp_cwnd = tcp->tcp_mss;
13495                         }


13562 
13563                         /*
13564                          * Check to see if there is data to be sent.  If
13565                          * yes, set the transmit flag.  Then check to see
13566                          * if received data processing needs to be done.
13567                          * If not, go straight to xmit_check.  This short
13568                          * cut is OK as we don't support T/TCP.
13569                          */
13570                         if (tcp->tcp_unsent)
13571                                 flags |= TH_XMIT_NEEDED;
13572 
13573                         if (seg_len == 0 && !(flags & TH_URG)) {
13574                                 freemsg(mp);
13575                                 goto xmit_check;
13576                         }
13577 
13578                         flags &= ~TH_SYN;
13579                         seg_seq++;
13580                         break;
13581                 }
13582                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
13583                     tcp_t *, tcp, int32_t, TCPS_SYN_RCVD);
13584                 tcp->tcp_state = TCPS_SYN_RCVD;
13585                 mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss,
13586                     NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE);
13587                 if (mp1) {
13588                         DB_CPID(mp1) = tcp->tcp_cpid;
13589                         TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT);
13590                         tcp_send_data(tcp, tcp->tcp_wq, mp1);
13591                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
13592                 }
13593                 freemsg(mp);
13594                 return;
13595         case TCPS_SYN_RCVD:
13596                 if (flags & TH_ACK) {
13597                         /*
13598                          * In this state, a SYN|ACK packet is either bogus
13599                          * because the other side must be ACKing our SYN which
13600                          * indicates it has seen the ACK for their SYN and
13601                          * shouldn't retransmit it or we're crossing SYNs
13602                          * on active open.
13603                          */


14483                         tcp->tcp_cwnd = mss;
14484                 }
14485 
14486                 /*
14487                  * We set the send window to zero here.
14488                  * This is needed if there is data to be
14489                  * processed already on the queue.
14490                  * Later (at swnd_update label), the
14491                  * "new_swnd > tcp_swnd" condition is satisfied, so
14492                  * the XMIT_NEEDED flag is set in the current
14493                  * (SYN_RCVD) state. This ensures tcp_wput_data() is
14494                  * called if there is already data on queue in
14495                  * this state.
14496                  */
14497                 tcp->tcp_swnd = 0;
14498 
14499                 if (new_swnd > tcp->tcp_max_swnd)
14500                         tcp->tcp_max_swnd = new_swnd;
14501                 tcp->tcp_swl1 = seg_seq;
14502                 tcp->tcp_swl2 = seg_ack;
14503                 DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
14504                     tcp_t *, tcp, int32_t, TCPS_ESTABLISHED);
14505                 tcp->tcp_state = TCPS_ESTABLISHED;
14506                 tcp->tcp_valid_bits &= ~TCP_ISS_VALID;
14507 
14508                 /* Fuse when both sides are in ESTABLISHED state */
14509                 if (tcp->tcp_loopback && do_tcp_fusion)
14510                         tcp_fuse(tcp, iphdr, tcph);
14511 
14512         }
14513         /* This code follows 4.4BSD-Lite2 mostly. */
14514         if (bytes_acked < 0)
14515                 goto est;
14516 
14517         /*
14518          * If TCP is ECN capable and the congestion experience bit is
14519          * set, reduce tcp_cwnd and tcp_ssthresh.  But this should only be
14520          * done once per window (or more loosely, per RTT).
14521          */
14522         if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max))
14523                 tcp->tcp_cwr = B_FALSE;
14524         if (tcp->tcp_ecn_ok && (flags & TH_ECE)) {


15121                  *
15122                  * 1. the segment acknowledges some data.  Or
15123                  * 2. the segment is new, i.e. it has a higher seq num. Or
15124                  * 3. the segment is not old and the advertised window is
15125                  * larger than the previous advertised window.
15126                  */
15127                 if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
15128                         flags |= TH_XMIT_NEEDED;
15129                 tcp->tcp_swnd = new_swnd;
15130                 if (new_swnd > tcp->tcp_max_swnd)
15131                         tcp->tcp_max_swnd = new_swnd;
15132                 tcp->tcp_swl1 = seg_seq;
15133                 tcp->tcp_swl2 = seg_ack;
15134         }
15135 est:
15136         if (tcp->tcp_state > TCPS_ESTABLISHED) {
15137 
15138                 switch (tcp->tcp_state) {
15139                 case TCPS_FIN_WAIT_1:
15140                         if (tcp->tcp_fin_acked) {
15141                                 DTRACE_TCP4(state__change, void, NULL,
15142                                     conn_t *, NULL, tcp_t *, tcp, int32_t,
15143                                     TCPS_FIN_WAIT_2);
15144                                 tcp->tcp_state = TCPS_FIN_WAIT_2;
15145                                 /*
15146                                  * We implement the non-standard BSD/SunOS
15147                                  * FIN_WAIT_2 flushing algorithm.
15148                                  * If there is no user attached to this
15149                                  * TCP endpoint, then this TCP struct
15150                                  * could hang around forever in FIN_WAIT_2
15151                                  * state if the peer forgets to send us
15152                                  * a FIN.  To prevent this, we wait only
15153                                  * 2*MSL (a convenient time value) for
15154                                  * the FIN to arrive.  If it doesn't show up,
15155                                  * we flush the TCP endpoint.  This algorithm,
15156                                  * though a violation of RFC-793, has worked
15157                                  * for over 10 years in BSD systems.
15158                                  * Note: SunOS 4.x waits 675 seconds before
15159                                  * flushing the FIN_WAIT_2 connection.
15160                                  */
15161                                 TCP_TIMER_RESTART(tcp,
15162                                     tcps->tcps_fin_wait_2_flush_interval);
15163                         }
15164                         break;
15165                 case TCPS_FIN_WAIT_2:
15166                         break;  /* Shutdown hook? */
15167                 case TCPS_LAST_ACK:
15168                         freemsg(mp);
15169                         if (tcp->tcp_fin_acked) {
15170                                 (void) tcp_clean_death(tcp, 0, 19);
15171                                 return;
15172                         }
15173                         goto xmit_check;
15174                 case TCPS_CLOSING:
15175                         if (tcp->tcp_fin_acked) {
15176                                 DTRACE_TCP4(state__change, void, NULL,
15177                                     conn_t *, NULL, tcp_t *, tcp, int32_t,
15178                                     TCPS_TIME_WAIT);
15179                                 tcp->tcp_state = TCPS_TIME_WAIT;
15180                                 /*
15181                                  * Unconditionally clear the exclusive binding
15182                                  * bit so this TIME-WAIT connection won't
15183                                  * interfere with new ones.
15184                                  */
15185                                 tcp->tcp_exclbind = 0;
15186                                 if (!TCP_IS_DETACHED(tcp)) {
15187                                         TCP_TIMER_RESTART(tcp,
15188                                             tcps->tcps_time_wait_interval);
15189                                 } else {
15190                                         tcp_time_wait_append(tcp);
15191                                         TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15192                                 }
15193                         }
15194                         /*FALLTHRU*/
15195                 case TCPS_CLOSE_WAIT:
15196                         freemsg(mp);
15197                         goto xmit_check;
15198                 default:


15204                 /* Make sure we ack the fin */
15205                 flags |= TH_ACK_NEEDED;
15206                 if (!tcp->tcp_fin_rcvd) {
15207                         tcp->tcp_fin_rcvd = B_TRUE;
15208                         tcp->tcp_rnxt++;
15209                         tcph = tcp->tcp_tcph;
15210                         U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack);
15211 
15212                         /*
15213                          * Generate the ordrel_ind at the end unless we
15214                          * are an eager guy.
15215                          * In the eager case tcp_rsrv will do this when run
15216                          * after tcp_accept is done.
15217                          */
15218                         if (tcp->tcp_listener == NULL &&
15219                             !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding))
15220                                 flags |= TH_ORDREL_NEEDED;
15221                         switch (tcp->tcp_state) {
15222                         case TCPS_SYN_RCVD:
15223                         case TCPS_ESTABLISHED:
15224                                 DTRACE_TCP4(state__change, void, NULL,
15225                                     conn_t *, NULL, tcp_t *, tcp, int32_t,
15226                                     TCPS_CLOSE_WAIT);
15227                                 tcp->tcp_state = TCPS_CLOSE_WAIT;
15228                                 /* Keepalive? */
15229                                 break;
15230                         case TCPS_FIN_WAIT_1:
15231                                 if (!tcp->tcp_fin_acked) {
15232                                         DTRACE_TCP4(state__change, void, NULL,
15233                                             conn_t *, NULL, tcp_t *, tcp,
15234                                             int32_t, TCPS_CLOSING);
15235                                         tcp->tcp_state = TCPS_CLOSING;
15236                                         break;
15237                                 }
15238                                 /* FALLTHRU */
15239                         case TCPS_FIN_WAIT_2:
15240                                 DTRACE_TCP4(state__change, void, NULL,
15241                                     conn_t *, NULL, tcp_t *, tcp, int32_t,
15242                                     TCPS_TIME_WAIT);
15243                                 tcp->tcp_state = TCPS_TIME_WAIT;
15244                                 /*
15245                                  * Unconditionally clear the exclusive binding
15246                                  * bit so this TIME-WAIT connection won't
15247                                  * interfere with new ones.
15248                                  */
15249                                 tcp->tcp_exclbind = 0;
15250                                 if (!TCP_IS_DETACHED(tcp)) {
15251                                         TCP_TIMER_RESTART(tcp,
15252                                             tcps->tcps_time_wait_interval);
15253                                 } else {
15254                                         tcp_time_wait_append(tcp);
15255                                         TCP_DBGSTAT(tcps, tcp_rput_time_wait);
15256                                 }
15257                                 if (seg_len) {
15258                                         /*
15259                                          * implies data piggybacked on FIN.
15260                                          * break to handle data.
15261                                          */
15262                                         break;


16083                 tea = (struct T_error_ack *)mp->b_rptr;
16084                 tea->PRIM_type = T_ERROR_ACK;
16085                 tea->TLI_error = TSYSERR;
16086                 tea->UNIX_error = error;
16087                 if (tcp->tcp_state >= TCPS_SYN_SENT) {
16088                         tea->ERROR_prim = T_CONN_REQ;
16089                 } else {
16090                         tea->ERROR_prim = O_T_BIND_REQ;
16091                 }
16092                 break;
16093 
16094         case T_ERROR_ACK:
16095                 if (tcp->tcp_state >= TCPS_SYN_SENT)
16096                         tea->ERROR_prim = T_CONN_REQ;
16097                 break;
16098         default:
16099                 panic("tcp_bind_failed: unexpected TPI type");
16100                 /*NOTREACHED*/
16101         }
16102 
16103         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
16104             int32_t, TCPS_IDLE);
16105         tcp->tcp_state = TCPS_IDLE;
16106         if (tcp->tcp_ipversion == IPV4_VERSION)
16107                 tcp->tcp_ipha->ipha_src = 0;
16108         else
16109                 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
16110         /*
16111          * Copy of the src addr. in tcp_t is needed since
16112          * the lookup funcs. can only look at tcp_t
16113          */
16114         V6_SET_ZERO(tcp->tcp_ip_src_v6);
16115 
16116         tcph = tcp->tcp_tcph;
16117         tcph->th_lport[0] = 0;
16118         tcph->th_lport[1] = 0;
16119         tcp_bind_hash_remove(tcp);
16120         bzero(&connp->u_port, sizeof (connp->u_port));
16121         /* blow away saved option results if any */
16122         if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
16123                 tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
16124 


16127 }
16128 
16129 /*
16130  * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA
16131  * messages.
16132  */
16133 void
16134 tcp_rput_other(tcp_t *tcp, mblk_t *mp)
16135 {
16136         mblk_t  *mp1;
16137         uchar_t *rptr = mp->b_rptr;
16138         queue_t *q = tcp->tcp_rq;
16139         struct T_error_ack *tea;
16140         uint32_t mss;
16141         mblk_t *syn_mp;
16142         mblk_t *mdti;
16143         mblk_t *lsoi;
16144         int     retval;
16145         mblk_t *ire_mp;
16146         tcp_stack_t     *tcps = tcp->tcp_tcps;
16147         uint_t  ip_hdr_len;
16148 
16149         switch (mp->b_datap->db_type) {
16150         case M_PROTO:
16151         case M_PCPROTO:
16152                 ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX);
16153                 if ((mp->b_wptr - rptr) < sizeof (t_scalar_t))
16154                         break;
16155                 tea = (struct T_error_ack *)rptr;
16156                 switch (tea->PRIM_type) {
16157                 case T_BIND_ACK:
16158                         /*
16159                          * Adapt Multidata information, if any.  The
16160                          * following tcp_mdt_update routine will free
16161                          * the message.
16162                          */
16163                         if ((mdti = tcp_mdt_info_mp(mp)) != NULL) {
16164                                 tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti->
16165                                     b_rptr)->mdt_capab, B_TRUE);
16166                                 freemsg(mdti);
16167                         }


16312                                  * Obtain the credential from the
16313                                  * thread calling connect(); the credential
16314                                  * lives on in the second mblk which
16315                                  * originated from T_CONN_REQ and is echoed
16316                                  * with the T_BIND_ACK from ip.  If none
16317                                  * can be found, default to the creator
16318                                  * of the socket.
16319                                  */
16320                                 if (mp->b_cont == NULL ||
16321                                     (cr = DB_CRED(mp->b_cont)) == NULL) {
16322                                         cr = tcp->tcp_cred;
16323                                         pid = tcp->tcp_cpid;
16324                                 } else {
16325                                         pid = DB_CPID(mp->b_cont);
16326                                 }
16327 
16328                                 TCP_RECORD_TRACE(tcp, syn_mp,
16329                                     TCP_TRACE_SEND_PKT);
16330                                 mblk_setcred(syn_mp, cr);
16331                                 DB_CPID(syn_mp) = pid;
16332 
16333                                 /*
16334                                  * DTrace sending the first SYN as a
16335                                  * tcp:::connect-request event. For DTrace
16336                                  * only, the IP header length is found
16337                                  * so that the TCP header can be retrieved.
16338                                  */
16339                                 if (tcp->tcp_ipversion == IPV4_VERSION)
16340                                         ip_hdr_len = IPH_HDR_LENGTH(
16341                                             (ipha_t *)syn_mp->b_rptr);
16342                                 else
16343                                         ip_hdr_len = ip_hdr_length_v6(mp,
16344                                             (ip6_t *)syn_mp->b_rptr);
16345                                 DTRACE_TCP5(connect__request, mblk_t *, NULL,
16346                                     conn_t *, NULL, void_ip_t *,
16347                                     syn_mp->b_rptr, tcp_t *, tcp, tcph_t *,
16348                                     &syn_mp->b_rptr[ip_hdr_len]);
16349 
16350                                 tcp_send_data(tcp, tcp->tcp_wq, syn_mp);
16351                         }
16352                 after_syn_sent:
16353                         /*
16354                          * A trailer mblk indicates a waiting client upstream.
16355                          * We complete here the processing begun in
16356                          * either tcp_bind() or tcp_connect() by passing
16357                          * upstream the reply message they supplied.
16358                          */
16359                         mp1 = mp;
16360                         mp = mp->b_cont;
16361                         freeb(mp1);
16362                         if (mp)
16363                                 break;
16364                         return;
16365                 case T_ERROR_ACK:
16366                         if (tcp->tcp_debug) {
16367                                 (void) strlog(TCP_MOD_ID, 0, 1,
16368                                     SL_TRACE|SL_ERROR,
16369                                     "tcp_rput_other: case T_ERROR_ACK, "


17949         }
17950 
17951         /*
17952          * Need to clean up all the eagers since after the unbind, segments
17953          * will no longer be delivered to this listener stream.
17954          */
17955         mutex_enter(&tcp->tcp_eager_lock);
17956         if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) {
17957                 tcp_eager_cleanup(tcp, 0);
17958         }
17959         mutex_exit(&tcp->tcp_eager_lock);
17960 
17961         if (tcp->tcp_ipversion == IPV4_VERSION) {
17962                 tcp->tcp_ipha->ipha_src = 0;
17963         } else {
17964                 V6_SET_ZERO(tcp->tcp_ip6h->ip6_src);
17965         }
17966         V6_SET_ZERO(tcp->tcp_ip_src_v6);
17967         bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport));
17968         tcp_bind_hash_remove(tcp);
17969         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL, tcp_t *, tcp,
17970             int32_t, TCPS_IDLE);
17971         tcp->tcp_state = TCPS_IDLE;
17972         tcp->tcp_mdt = B_FALSE;
17973         /* Send M_FLUSH according to TPI */
17974         (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW);
17975         connp = tcp->tcp_connp;
17976         connp->conn_mdt_ok = B_FALSE;
17977         ipcl_hash_remove(connp);
17978         bzero(&connp->conn_ports, sizeof (connp->conn_ports));
17979         mp = mi_tpi_ok_ack_alloc(mp);
17980         putnext(tcp->tcp_rq, mp);
17981 }
17982 
17983 /*
17984  * Don't let port fall into the privileged range.
17985  * Since the extra privileged ports can be arbitrary we also
17986  * ensure that we exclude those from consideration.
17987  * tcp_g_epriv_ports is not sorted thus we loop over it until
17988  * there are no changes.
17989  *
17990  * Note: No locks are held when inspecting tcp_g_*epriv_ports


19629 
19630         return (B_TRUE);
19631 }
19632 
19633 static void
19634 tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
19635 {
19636         ipha_t          *ipha;
19637         ipaddr_t        src;
19638         ipaddr_t        dst;
19639         uint32_t        cksum;
19640         ire_t           *ire;
19641         uint16_t        *up;
19642         ill_t           *ill;
19643         conn_t          *connp = tcp->tcp_connp;
19644         uint32_t        hcksum_txflags = 0;
19645         mblk_t          *ire_fp_mp;
19646         uint_t          ire_fp_mp_len;
19647         tcp_stack_t     *tcps = tcp->tcp_tcps;
19648         ip_stack_t      *ipst = tcps->tcps_netstack->netstack_ip;
19649         uint_t          ip_hdr_len;
19650 
19651         ASSERT(DB_TYPE(mp) == M_DATA);
19652 
19653         if (DB_CRED(mp) == NULL)
19654                 mblk_setcred(mp, CONN_CRED(connp));
19655 
19656         ipha = (ipha_t *)mp->b_rptr;
19657         src = ipha->ipha_src;
19658         dst = ipha->ipha_dst;
19659 
19660         if (tcp->tcp_ipversion == IPV4_VERSION) {
19661                 DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL,
19662                     void_ip_t *, ipha, tcp_t *, tcp, tcph_t *,
19663                     &mp->b_rptr[IPH_HDR_LENGTH(mp->b_rptr)]);
19664                 if (tcp->tcp_dtrace_connect_established) {
19665                         DTRACE_TCP5(connect__established, mblk_t *, NULL,
19666                             conn_t *, NULL, void_ip_t *, ipha, tcp_t *, tcp,
19667                             tcph_t *, &mp->b_rptr[IPH_HDR_LENGTH(mp->b_rptr)]);
19668                         tcp->tcp_dtrace_connect_established = B_FALSE;
19669                 }
19670         }
19671 
19672         /*
19673          * Drop off fast path for IPv6 and also if options are present or
19674          * we need to resolve a TS label.
19675          */
19676         if (tcp->tcp_ipversion != IPV4_VERSION ||
19677             !IPCL_IS_CONNECTED(connp) ||
19678             !CONN_IS_LSO_MD_FASTPATH(connp) ||
19679             (connp->conn_flags & IPCL_CHECK_POLICY) != 0 ||
19680             !connp->conn_ulp_labeled ||
19681             ipha->ipha_ident == IP_HDR_INCLUDED ||
19682             ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION ||
19683             IPP_ENABLED(IPP_LOCAL_OUT, ipst)) {
19684                 if (tcp->tcp_snd_zcopy_aware)
19685                         mp = tcp_zcopy_disable(tcp, mp);
19686                 TCP_STAT(tcps, tcp_ip_send);
19687 
19688                 if (tcp->tcp_ipversion == IPV6_VERSION) {
19689                         ip_hdr_len = ip_hdr_length_v6(mp, (ip6_t *)mp->b_rptr);
19690                         DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL,
19691                             void_ip_t *, mp->b_rptr, tcp_t *, tcp, tcph_t *,
19692                             &mp->b_rptr[ip_hdr_len]);
19693                         if (tcp->tcp_dtrace_connect_established) {
19694                                 DTRACE_TCP5(connect__established, mblk_t *,
19695                                     NULL, conn_t *, NULL, void_ip_t *,
19696                                     mp->b_rptr, tcp_t *, tcp, tcph_t *,
19697                                     &mp->b_rptr[ip_hdr_len]);
19698                                 tcp->tcp_dtrace_connect_established = B_FALSE;
19699                         }
19700                 }
19701 
19702                 CALL_IP_WPUT(connp, q, mp);
19703                 return;
19704         }
19705 
19706         if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) {
19707                 if (tcp->tcp_snd_zcopy_aware)
19708                         mp = tcp_zcopy_backoff(tcp, mp, 0);
19709                 CALL_IP_WPUT(connp, q, mp);
19710                 return;
19711         }
19712         ire_fp_mp = ire->ire_nce->nce_fp_mp;
19713         ire_fp_mp_len = MBLKL(ire_fp_mp);
19714 
19715         ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED);
19716         ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1);
19717 #ifndef _BIG_ENDIAN
19718         ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8);
19719 #endif
19720 
19721         /*


21090 
21091                         /*
21092                          * Set FIN bit if this is our last segment; snxt
21093                          * already includes its length, and it will not
21094                          * be adjusted after this point.
21095                          */
21096                         if (tcp->tcp_valid_bits == TCP_FSS_VALID &&
21097                             *snxt == tcp->tcp_fss) {
21098                                 if (!tcp->tcp_fin_acked) {
21099                                         tcp->tcp_tcph->th_flags[0] |= TH_FIN;
21100                                         BUMP_MIB(&tcps->tcps_mib,
21101                                             tcpOutControl);
21102                                 }
21103                                 if (!tcp->tcp_fin_sent) {
21104                                         tcp->tcp_fin_sent = B_TRUE;
21105                                         /*
21106                                          * tcp state must be ESTABLISHED
21107                                          * in order for us to get here in
21108                                          * the first place.
21109                                          */
21110                                         DTRACE_TCP4(state__change, void, NULL,
21111                                             conn_t *, NULL, tcp_t *, tcp,
21112                                             int32_t, TCPS_FIN_WAIT_1);
21113                                         tcp->tcp_state = TCPS_FIN_WAIT_1;
21114 
21115                                         /*
21116                                          * Upon returning from this routine,
21117                                          * tcp_wput_data() will set tcp_snxt
21118                                          * to be equal to snxt + tcp_fin_sent.
21119                                          * This is essentially the same as
21120                                          * setting it to tcp_fss + 1.
21121                                          */
21122                                 }
21123                         }
21124 
21125                         tcp->tcp_last_sent_len = (ushort_t)len;
21126 
21127                         len += tcp_hdr_len;
21128                         if (tcp->tcp_ipversion == IPV4_VERSION)
21129                                 tcp->tcp_ipha->ipha_length = htons(len);
21130                         else
21131                                 tcp->tcp_ip6h->ip6_plen = htons(len -
21132                                     ((char *)&tcp->tcp_ip6h[1] -


21728         if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) {
21729                 ASSERT(ill->ill_hcksum_capab != NULL);
21730                 hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags;
21731         }
21732 
21733         /*
21734          * Since the TCP checksum should be recalculated by h/w, we can just
21735          * zero the checksum field for HCK_FULLCKSUM, or calculate partial
21736          * pseudo-header checksum for HCK_PARTIALCKSUM.
21737          * The partial pseudo-header excludes the TCP length, which was
21738          * calculated in tcp_send(), so zero *up before further processing.
21739          */
21740         cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF);
21741 
21742         up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
21743         *up = 0;
21744 
21745         IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up,
21746             IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum);
21747 
21748         DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL, void_ip_t *, ipha,
21749             tcp_t *, tcp, tcph_t *, &mp->b_rptr[IPH_HDR_LENGTH(mp->b_rptr)]);
21750 
21751         /*
21752          * Append LSO flag to DB_LSOFLAGS(mp) and set the mss to DB_LSOMSS(mp).
21753          */
21754         DB_LSOFLAGS(mp) |= HW_LSO;
21755         DB_LSOMSS(mp) = mss;
21756 
21757         ipha->ipha_fragment_offset_and_flags |=
21758             (uint32_t)htons(ire->ire_frag_flag);
21759 
21760         ire_fp_mp = ire->ire_nce->nce_fp_mp;
21761         ire_fp_mp_len = MBLKL(ire_fp_mp);
21762         ASSERT(DB_TYPE(ire_fp_mp) == M_DATA);
21763         mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len;
21764         bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len);
21765 
21766         UPDATE_OB_PKT_COUNT(ire);
21767         ire->ire_last_used_time = lbolt;
21768         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests);
21769         BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits);
21770         UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets,


23376         }
23377 
23378         if (mctl_present) {
23379                 ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr;
23380 
23381                 ASSERT(ii->ipsec_in_type == IPSEC_IN);
23382                 if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h)) {
23383                         return;
23384                 }
23385         }
23386         if (zoneid == ALL_ZONES)
23387                 zoneid = GLOBAL_ZONEID;
23388 
23389         /* Add the zoneid so ip_output routes it properly */
23390         if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) {
23391                 freemsg(ipsec_mp);
23392                 return;
23393         }
23394         ipsec_mp = nmp;
23395 
23396         DTRACE_TCP5(send, mblk_t *, NULL, conn_t *, NULL, void_ip_t *,
23397             mp->b_rptr, tcp_t *, NULL, tcph_t *, tcph);
23398         if (tcph->th_flags[0] == (TH_RST|TH_ACK)) {
23399                 DTRACE_TCP5(accept__refused, mblk_t *, NULL, conn_t *, NULL,
23400                     void_ip_t *, mp->b_rptr, tcp_t *, NULL, tcph_t *, tcph);
23401         }
23402 
23403         /*
23404          * NOTE:  one might consider tracing a TCP packet here, but
23405          * this function has no active TCP state and no tcp structure
23406          * that has a trace buffer.  If we traced here, we would have
23407          * to keep a local trace buffer in tcp_record_trace().
23408          *
23409          * TSol note: The mblk that contains the incoming packet was
23410          * reused by tcp_xmit_listeners_reset, so it already contains
23411          * the right credentials and we don't need to call mblk_setcred.
23412          * Also the conn's cred is not right since it is associated
23413          * with tcps_g_q.
23414          */
23415         CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp);
23416 
23417         /*
23418          * Tell IP to mark the IRE used for this destination temporary.
23419          * This way, we can limit our exposure to DoS attack because IP
23420          * creates an IRE for each destination.  If there are too many,
23421          * the time to do any routing lookup will be extremely long.  And
23422          * the lookup can be in interrupt context.


23603                 if (ipsec_mp == NULL)
23604                         return;
23605         }
23606         if (is_system_labeled() && !tsol_can_reply_error(mp)) {
23607                 DTRACE_PROBE2(
23608                     tx__ip__log__error__nolistener__tcp,
23609                     char *, "Could not reply with RST to mp(1)",
23610                     mblk_t *, mp);
23611                 ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n"));
23612                 freemsg(ipsec_mp);
23613                 return;
23614         }
23615 
23616         rptr = mp->b_rptr;
23617 
23618         tcph = (tcph_t *)&rptr[ip_hdr_len];
23619         seg_seq = BE32_TO_U32(tcph->th_seq);
23620         seg_ack = BE32_TO_U32(tcph->th_ack);
23621         flags = tcph->th_flags[0];
23622 
23623         /*
23624          * DTrace this "unknown" segment as a tcp:::receive, as we did
23625          * just receive something that was TCP.
23626          */
23627         DTRACE_TCP5(receive, mblk_t *, NULL, conn_t *, NULL, void_ip_t *, rptr,
23628             tcp_t *, NULL, tcph_t *, tcph);
23629 
23630         seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len);
23631         if (flags & TH_RST) {
23632                 freemsg(ipsec_mp);
23633         } else if (flags & TH_ACK) {
23634                 tcp_xmit_early_reset("no tcp, reset",
23635                     ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps,
23636                     connp);
23637         } else {
23638                 if (flags & TH_SYN) {
23639                         seg_len++;
23640                 } else {
23641                         /*
23642                          * Here we violate the RFC.  Note that a normal
23643                          * TCP will never send a segment without the ACK
23644                          * flag, except for RST or SYN segment.  This
23645                          * segment is neither.  Just drop it on the
23646                          * floor.
23647                          */
23648                         freemsg(ipsec_mp);
23649                         tcps->tcps_rst_unsent++;


23947                         /*
23948                          * Get IP set to checksum on our behalf
23949                          * Include the adjustment for a source route if any.
23950                          */
23951                         u1 += tcp->tcp_sum;
23952                         u1 = (u1 >> 16) + (u1 & 0xFFFF);
23953                         U16_TO_BE16(u1, tcph->th_sum);
23954                         BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23955                 }
23956                 if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
23957                     (seq + data_length) == tcp->tcp_fss) {
23958                         if (!tcp->tcp_fin_acked) {
23959                                 flags |= TH_FIN;
23960                                 BUMP_MIB(&tcps->tcps_mib, tcpOutControl);
23961                         }
23962                         if (!tcp->tcp_fin_sent) {
23963                                 tcp->tcp_fin_sent = B_TRUE;
23964                                 switch (tcp->tcp_state) {
23965                                 case TCPS_SYN_RCVD:
23966                                 case TCPS_ESTABLISHED:
23967                                         DTRACE_TCP4(state__change, void, NULL,
23968                                             conn_t *, NULL, tcp_t *, tcp,
23969                                             int32_t, TCPS_FIN_WAIT_1);
23970                                         tcp->tcp_state = TCPS_FIN_WAIT_1;
23971                                         break;
23972                                 case TCPS_CLOSE_WAIT:
23973                                         DTRACE_TCP4(state__change, void, NULL,
23974                                             conn_t *, NULL, tcp_t *, tcp,
23975                                             int32_t, TCPS_LAST_ACK);
23976                                         tcp->tcp_state = TCPS_LAST_ACK;
23977                                         break;
23978                                 }
23979                                 if (tcp->tcp_suna == tcp->tcp_snxt)
23980                                         TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
23981                                 tcp->tcp_snxt = tcp->tcp_fss + 1;
23982                         }
23983                 }
23984                 /*
23985                  * Note the trick here.  u1 is unsigned.  When tcp_urg
23986                  * is smaller than seq, u1 will become a very large value.
23987                  * So the comparison will fail.  Also note that tcp_urp
23988                  * should be positive, see RFC 793 page 17.
23989                  */
23990                 u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION;
23991                 if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 &&
23992                     u1 < (uint32_t)(64 * 1024)) {
23993                         flags |= TH_URG;
23994                         BUMP_MIB(&tcps->tcps_mib, tcpOutUrg);
23995                         U32_TO_ABE16(u1, tcph->th_urp);


24305 /* ARGSUSED */
24306 static tcp_t *
24307 tcp_alloc_temp_tcp(in_port_t port, tcp_stack_t *tcps)
24308 {
24309         tcp_t   *tmp_tcp;
24310         conn_t  *tmp_connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP,
24311             tcps->tcps_netstack);
24312 
24313         /* ipcl_conn_create() gave us nothing, so no tcp_t can be returned. */
24314         if (tmp_connp == NULL) {
24315                 return (NULL);
24316         }
24317 
24318         /* Link the conn's tcp_t to its stack, then TCPS_REFHOLD the stack. */
24319         tmp_tcp = tmp_connp->conn_tcp;
24320         tmp_tcp->tcp_tcps = tcps;
24321         TCPS_REFHOLD(tcps);
24322 
24323         /*
24324          * Fill in only the fields that are needed.  tcp_bound_source need
24325          * not be set to INADDR_ANY because INADDR_ANY is all zeros.  The
24326          * state-change probe fires before tcp_state is updated.
24327          */
24328         DTRACE_TCP4(state__change, void, NULL, conn_t *, NULL,
24329             tcp_t *, tmp_tcp, int32_t, TCPS_BOUND);
24330         tmp_tcp->tcp_state = TCPS_BOUND;
24331         tmp_tcp->tcp_lport = port;
24332         tmp_tcp->tcp_exclbind = 1;
24333         tmp_tcp->tcp_reserved_port = 1;
24334         tmp_tcp->tcp_ipversion = IPV4_VERSION;  /* placeholder only */
24335         return (tmp_tcp);
24336 }
24337 
24338 /*
24339  * To remove a port range specified by lo_port and hi_port from the
24340  * reserved port ranges.  This is one of the three public functions of
24341  * the reserved port interface.  Note that a port range has to be removed
24342  * as a whole.  Ports in a range cannot be removed individually.
24343  *
24344  * Params:
24345  *      in_port_t lo_port: the beginning port of the reserved port range to
24346  *              be deleted.