Diffstat (limited to 'sys/netinet/tcp_input.c')
-rw-r--r-- | sys/netinet/tcp_input.c | 344
1 file changed, 200 insertions(+), 144 deletions(-)
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index a247138..eca5cb2 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
 #define ND6_HINT(tp) \
 do { \
     if ((tp) && (tp)->t_inpcb && \
-        ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
-        (tp)->t_inpcb->in6p_route.ro_rt) \
-        nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+        ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+        nd6_nud_hint(NULL, NULL, 0); \
 } while (0)
 #else
 #define ND6_HINT(tp)
@@ -358,8 +357,7 @@ tcp_input(m, off0)
     int todrop, acked, ourfinisacked, needoutput = 0;
     u_long tiwin;
     struct tcpopt to;               /* options in this segment */
-    struct rmxp_tao *taop;          /* pointer to our TAO cache entry */
-    struct rmxp_tao tao_noncached;  /* in case there's no cached entry */
+    struct rmxp_tao tao;            /* our TAO cache entry */
     int headlocked = 0;
     struct sockaddr_in *next_hop = NULL;
     int rstreason; /* For badport_bandlim accounting purposes */
@@ -389,6 +387,7 @@ tcp_input(m, off0)
 #ifdef INET6
     isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
 #endif
+    bzero(&tao, sizeof(tao));
     bzero((char *)&to, sizeof(to));
 
     tcpstat.tcps_rcvtotal++;
@@ -707,11 +706,9 @@ findpcb:
         if (isipv6) {
             inc.inc6_faddr = ip6->ip6_src;
             inc.inc6_laddr = ip6->ip6_dst;
-            inc.inc6_route.ro_rt = NULL;        /* XXX */
         } else {
             inc.inc_faddr = ip->ip_src;
             inc.inc_laddr = ip->ip_dst;
-            inc.inc_route.ro_rt = NULL;         /* XXX */
         }
         inc.inc_fport = th->th_sport;
         inc.inc_lport = th->th_dport;
@@ -916,7 +913,7 @@ findpcb:
     }
 after_listen:
 
-/* XXX temp debugging */
+    /* XXX temp debugging */
     /* should not happen - syncache should pick up these connections */
     if (tp->t_state == TCPS_LISTEN)
         panic("tcp_input: TCPS_LISTEN");
@@ -930,8 +927,9 @@ after_listen:
         callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
 
     /*
-     * Process options.
-     * XXX this is tradtitional behavior, may need to be cleaned up.
+     * Process options only when we get SYN/ACK back. The SYN case
+     * for incoming connections is handled in tcp_syncache.
+     * XXX this is traditional behavior, may need to be cleaned up.
      */
     tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
     if (thflags & TH_SYN) {
@@ -1179,10 +1177,8 @@ after_listen:
      *  continue processing rest of data/controls, beginning with URG
      */
     case TCPS_SYN_SENT:
-        if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
-            taop = &tao_noncached;
-            bzero(taop, sizeof(*taop));
-        }
+        if (tcp_do_rfc1644)
+            tcp_hc_gettao(&inp->inp_inc, &tao);
 
         if ((thflags & TH_ACK) &&
             (SEQ_LEQ(th->th_ack, tp->iss) ||
@@ -1195,7 +1191,7 @@ after_listen:
          * Our new SYN, when it arrives, will serve as the
          * needed ACK.
          */
-            if (taop->tao_ccsent != 0)
+            if (tao.tao_ccsent != 0)
                 goto drop;
             else {
                 rstreason = BANDLIM_UNLIMITED;
@@ -1225,7 +1221,7 @@ after_listen:
          */
         if (to.to_flags & TOF_CCECHO) {
             if (tp->cc_send != to.to_ccecho) {
-                if (taop->tao_ccsent != 0)
+                if (tao.tao_ccsent != 0)
                     goto drop;
                 else {
                     rstreason = BANDLIM_UNLIMITED;
@@ -1246,8 +1242,8 @@ after_listen:
             tp->rcv_scale = tp->request_r_scale;
         }
         /* Segment is acceptable, update cache if undefined. */
-        if (taop->tao_ccsent == 0)
-            taop->tao_ccsent = to.to_ccecho;
+        if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
+            tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
 
         tp->rcv_adv += tp->rcv_wnd;
         tp->snd_una++;          /* SYN is acked */
@@ -1290,14 +1286,16 @@ after_listen:
             tp->t_flags |= TF_ACKNOW;
             callout_stop(tp->tt_rexmt);
             if (to.to_flags & TOF_CC) {
-                if (taop->tao_cc != 0 &&
-                    CC_GT(to.to_cc, taop->tao_cc)) {
+                if (tao.tao_cc != 0 &&
+                    CC_GT(to.to_cc, tao.tao_cc)) {
                     /*
                      * update cache and make transition:
                      *        SYN-SENT -> ESTABLISHED*
                      *        SYN-SENT* -> FIN-WAIT-1*
                      */
-                    taop->tao_cc = to.to_cc;
+                    tao.tao_cc = to.to_cc;
+                    tcp_hc_updatetao(&inp->inp_inc,
+                        TCP_HC_TAO_CC, to.to_cc, 0);
                     tp->t_starttime = ticks;
                     if (tp->t_flags & TF_NEEDFIN) {
                         tp->t_state = TCPS_FIN_WAIT_1;
@@ -1313,8 +1311,12 @@ after_listen:
                 } else
                     tp->t_state = TCPS_SYN_RECEIVED;
             } else {
-                /* CC.NEW or no option => invalidate cache */
-                taop->tao_cc = 0;
+                if (tcp_do_rfc1644) {
+                    /* CC.NEW or no option => invalidate cache */
+                    tao.tao_cc = 0;
+                    tcp_hc_updatetao(&inp->inp_inc,
+                        TCP_HC_TAO_CC, to.to_cc, 0);
+                }
                 tp->t_state = TCPS_SYN_RECEIVED;
             }
         }
@@ -1682,13 +1684,14 @@ trimthenstep6:
         }
         /*
          * Upon successful completion of 3-way handshake,
-         * update cache.CC if it was undefined, pass any queued
-         * data to the user, and advance state appropriately.
+         * update cache.CC, pass any queued data to the user,
+         * and advance state appropriately.
          */
-        if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
-            taop->tao_cc == 0)
-            taop->tao_cc = tp->cc_recv;
-
+        if (tcp_do_rfc1644) {
+            tao.tao_cc = tp->cc_recv;
+            tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
+                tp->cc_recv, 0);
+        }
         /*
          * Make transitions:
          *  SYN-RECEIVED -> ESTABLISHED
@@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
  * are present. Store the upper limit of the length of options plus
  * data in maxopd.
  *
- * NOTE that this routine is only called when we process an incoming
- * segment, for outgoing segments only tcp_mssopt is called.
  *
  * In case of T/TCP, we call this routine during implicit connection
  * setup as well (offer = -1), to initialize maxseg from the cached
  * MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
  */
 void
 tcp_mss(tp, offer)
     struct tcpcb *tp;
     int offer;
 {
-    register struct rtentry *rt;
-    struct ifnet *ifp;
-    register int rtt, mss;
+    int rtt, mss;
     u_long bufsize;
+    u_long maxmtu;
     struct inpcb *inp = tp->t_inpcb;
     struct socket *so;
-    struct rmxp_tao *taop;
+    struct hc_metrics_lite metrics;
+    struct rmxp_tao tao;
     int origoffer = offer;
 #ifdef INET6
     int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
                   sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
                   sizeof (struct tcpiphdr);
 #else
-    const int isipv6 = 0;
-    const size_t min_protoh = sizeof (struct tcpiphdr);
+    const size_t min_protoh = sizeof(struct tcpiphdr);
 #endif
+    bzero(&tao, sizeof(tao));
 
-    if (isipv6)
-        rt = tcp_rtlookup6(&inp->inp_inc);
-    else
-        rt = tcp_rtlookup(&inp->inp_inc);
-    if (rt == NULL) {
-        tp->t_maxopd = tp->t_maxseg =
-            isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
-        return;
+    /* initialize */
+#ifdef INET6
+    if (isipv6) {
+        maxmtu = tcp_maxmtu6(&inp->inp_inc);
+        tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
+    } else
+#endif
+    {
+        maxmtu = tcp_maxmtu(&inp->inp_inc);
+        tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
     }
-    ifp = rt->rt_ifp;
     so = inp->inp_socket;
-    taop = rmx_taop(rt->rt_rmx);
+
     /*
-     * Offer == -1 means that we didn't receive SYN yet,
-     * use cached value in that case;
+     * no route to sender, take default mss and return
      */
-    if (offer == -1)
-        offer = taop->tao_mssopt;
-    /*
-     * Offer == 0 means that there was no MSS on the SYN segment,
-     * in this case we use tcp_mssdflt.
-     */
-    if (offer == 0)
-        offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
-    else
-        /*
-         * Sanity check: make sure that maxopd will be large
-         * enough to allow some data on segments even is the
-         * all the option space is used (40bytes). Otherwise
-         * funny things may happen in tcp_output.
-         */
-        offer = max(offer, 64);
-    taop->tao_mssopt = offer;
+    if (maxmtu == 0)
+        return;
+
+    /* what have we got? */
+    switch (offer) {
+    case 0:
+        /*
+         * Offer == 0 means that there was no MSS on the SYN
+         * segment, in this case we use tcp_mssdflt.
+         */
+        offer =
+#ifdef INET6
+            isipv6 ? tcp_v6mssdflt :
+#endif
+            tcp_mssdflt;
+        break;
+
+    case -1:
+        /*
+         * Offer == -1 means that we didn't receive SYN yet,
+         * use cached value in that case;
+         */
+        if (tcp_do_rfc1644)
+            tcp_hc_gettao(&inp->inp_inc, &tao);
+        if (tao.tao_mssopt != 0)
+            offer = tao.tao_mssopt;
+        /* FALLTHROUGH */
+
+    default:
+        /*
+         * Sanity check: make sure that maxopd will be large
+         * enough to allow some data on segments even if the
+         * all the option space is used (40bytes). Otherwise
+         * funny things may happen in tcp_output.
+         */
+        offer = max(offer, 64);
+        if (tcp_do_rfc1644)
+            tcp_hc_updatetao(&inp->inp_inc,
+                TCP_HC_TAO_MSSOPT, 0, offer);
+    }
 
     /*
-     * While we're here, check if there's an initial rtt
-     * or rttvar. Convert from the route-table units
-     * to scaled multiples of the slow timeout timer.
+     * rmx information is now retrieved from tcp_hostcache
      */
-    if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
-        /*
-         * XXX the lock bit for RTT indicates that the value
-         * is also a minimum value; this is subject to time.
-         */
-        if (rt->rt_rmx.rmx_locks & RTV_RTT)
-            tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
-        tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
-        tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
-        tcpstat.tcps_usedrtt++;
-        if (rt->rt_rmx.rmx_rttvar) {
-            tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
-                (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
-            tcpstat.tcps_usedrttvar++;
-        } else {
-            /* default variation is +- 1 rtt */
-            tp->t_rttvar =
-                tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
-        }
-        TCPT_RANGESET(tp->t_rxtcur,
-            ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
-            tp->t_rttmin, TCPTV_REXMTMAX);
-    }
+    tcp_hc_get(&inp->inp_inc, &metrics);
+
     /*
-     * if there's an mtu associated with the route, use it
+     * if there's a discovered mtu int tcp hostcache, use it
      * else, use the link mtu.
      */
-    if (rt->rt_rmx.rmx_mtu)
-        mss = rt->rt_rmx.rmx_mtu - min_protoh;
+    if (metrics.rmx_mtu)
+        mss = metrics.rmx_mtu - min_protoh;
     else {
 #ifdef INET6
-        mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu) -
-            min_protoh;
-#else
-        mss = ifp->if_mtu - min_protoh;
-#endif
-#ifdef INET6
         if (isipv6) {
-            if (!in6_localaddr(&inp->in6p_faddr))
+            mss = maxmtu - min_protoh;
+            if (!path_mtu_discovery &&
+                !in6_localaddr(&inp->in6p_faddr))
                 mss = min(mss, tcp_v6mssdflt);
         } else
 #endif
-        if (!in_localaddr(inp->inp_faddr))
+        {
+            mss = maxmtu - min_protoh;
+            if (!path_mtu_discovery &&
+                !in_localaddr(inp->inp_faddr))
                 mss = min(mss, tcp_mssdflt);
+        }
     }
     mss = min(mss, offer);
+
     /*
      * maxopd stores the maximum length of data AND options
      * in a segment; maxseg is the amount of data in a normal
@@ -2749,6 +2753,7 @@
      (origoffer == -1 ||
       (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
         mss -= TCPOLEN_CC_APPA;
+    tp->t_maxseg = mss;
 
 #if (MCLBYTES & (MCLBYTES - 1)) == 0
     if (mss > MCLBYTES)
@@ -2757,15 +2762,18 @@
     if (mss > MCLBYTES)
         mss = mss / MCLBYTES * MCLBYTES;
 #endif
+    tp->t_maxseg = mss;
+
     /*
-     * If there's a pipesize, change the socket buffer
-     * to that size. Make the socket buffers an integral
-     * number of mss units; if the mss is larger than
-     * the socket buffer, decrease the mss.
+     * If there's a pipesize, change the socket buffer to that size,
+     * don't change if sb_hiwat is different than default (then it
+     * has been changed on purpose with setsockopt).
+     * Make the socket buffers an integral number of mss units;
+     * if the mss is larger than the socket buffer, decrease the mss.
      */
-#ifdef RTV_SPIPE
-    if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
-#endif
+    if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+        bufsize = metrics.rmx_sendpipe;
+    else
         bufsize = so->so_snd.sb_hiwat;
     if (bufsize < mss)
         mss = bufsize;
@@ -2778,9 +2786,9 @@
     }
     tp->t_maxseg = mss;
 
-#ifdef RTV_RPIPE
-    if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
-#endif
+    if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+        bufsize = metrics.rmx_recvpipe;
+    else
        bufsize = so->so_rcv.sb_hiwat;
     if (bufsize > mss) {
         bufsize = roundup(bufsize, mss);
@@ -2789,62 +2797,110 @@
         if (bufsize > so->so_rcv.sb_hiwat)
             (void)sbreserve(&so->so_rcv, bufsize, so, NULL);
     }
+
+    /*
+     * While we're here, check the others too
+     */
+    if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+        tp->t_srtt = rtt;
+        tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+        tcpstat.tcps_usedrtt++;
+        if (metrics.rmx_rttvar) {
+            tp->t_rttvar = metrics.rmx_rttvar;
+            tcpstat.tcps_usedrttvar++;
+        } else {
+            /* default variation is +- 1 rtt */
+            tp->t_rttvar =
+                tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+        }
+        TCPT_RANGESET(tp->t_rxtcur,
+            ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+            tp->t_rttmin, TCPTV_REXMTMAX);
+    }
+    if (metrics.rmx_ssthresh) {
+        /*
+         * There's some sort of gateway or interface
+         * buffer limit on the path. Use this to set
+         * the slow start threshhold, but set the
+         * threshold to no less than 2*mss.
+         */
+        tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+        tcpstat.tcps_usedssthresh++;
+    }
+    if (metrics.rmx_bandwidth)
+        tp->snd_bandwidth = metrics.rmx_bandwidth;
     /*
      * Set the slow-start flight size depending on whether this
      * is a local network or not.
+     *
+     * Extend this so we cache the cwnd too and retrieve it here.
+     * Make cwnd even bigger than RFC3390 suggests but only if we
+     * have previous experience with the remote host. Be careful
+     * not make cwnd bigger than remote receive window or our own
+     * send socket buffer. Maybe put some additional upper bound
+     * on the retrieved cwnd. Should do incremental updates to
+     * hostcache when cwnd collapses so next connection doesn't
+     * overloads the path again.
+     *
+     * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
+     * We currently check only in syncache_socket for that.
      */
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+    if (metrics.rmx_cwnd)
+        tp->snd_cwnd = max(mss,
+            min(metrics.rmx_cwnd / 2,
+                min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+    else
+#endif
     if (tcp_do_rfc3390)
         tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
     else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
              (!isipv6 && in_localaddr(inp->inp_faddr)))
         tp->snd_cwnd = mss * ss_fltsz_local;
+#endif
     else
         tp->snd_cwnd = mss * ss_fltsz;
-
-    if (rt->rt_rmx.rmx_ssthresh) {
-        /*
-         * There's some sort of gateway or interface
-         * buffer limit on the path. Use this to set
-         * the slow start threshhold, but set the
-         * threshold to no less than 2*mss.
-         */
-        tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
-        tcpstat.tcps_usedssthresh++;
-    }
 }
 
 /*
  * Determine the MSS option to send on an outgoing SYN.
  */
 int
-tcp_mssopt(tp)
-    struct tcpcb *tp;
+tcp_mssopt(inc)
+    struct in_conninfo *inc;
 {
-    struct rtentry *rt;
+    int mss = 0;
+    u_long maxmtu = 0;
+    u_long thcmtu = 0;
+    size_t min_protoh;
 #ifdef INET6
-    int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
-    size_t min_protoh = isipv6 ?
-                        sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
-                        sizeof (struct tcpiphdr);
-#else
-    const int isipv6 = 0;
-    const size_t min_protoh = sizeof (struct tcpiphdr);
+    int isipv6 = inc->inc_isipv6 ? 1 : 0;
 #endif
 
-    if (isipv6)
-        rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
-    else
-        rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
-    if (rt == NULL)
-        return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
+    KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
 
 #ifdef INET6
-    return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
-        rt->rt_ifp->if_mtu - min_protoh);
-#else
-    return (rt->rt_ifp->if_mtu - min_protoh);
+    if (isipv6) {
+        mss = tcp_v6mssdflt;
+        maxmtu = tcp_maxmtu6(inc);
+        thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+        min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+    } else
 #endif
+    {
+        mss = tcp_mssdflt;
+        maxmtu = tcp_maxmtu(inc);
+        thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+        min_protoh = sizeof(struct tcpiphdr);
+    }
+    if (maxmtu && thcmtu)
+        mss = min(maxmtu, thcmtu) - min_protoh;
+    else if (maxmtu || thcmtu)
+        mss = max(maxmtu, thcmtu) - min_protoh;
+
+    return (mss);
 }
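
The core of this change is that tcp_mss() and tcp_mssopt() no longer take per-destination data from a routing-table entry (rt->rt_rmx). They now ask for an interface/path upper bound via tcp_maxmtu()/tcp_maxmtu6() and consult the new TCP hostcache for remembered values (tcp_hc_getmtu(), tcp_hc_get(), and tcp_hc_gettao()/tcp_hc_updatetao() for the T/TCP TAO entries). The sketch below is not part of the commit; it is a minimal userland approximation of two pieces of the new logic, assuming nothing beyond what the diff shows: the MSS chosen for an outgoing SYN (smaller of link MTU and cached path MTU when both are known, otherwise whichever is known, otherwise the protocol default) and the initial congestion window seeded from a cached cwnd with the RFC 3390 formula as fallback. The function names (mss_for_syn, initial_cwnd) and the sample numbers are illustrative stand-ins, and the ss_fltsz local-network cases are omitted.

/*
 * Standalone sketch (not kernel code): mirrors the MSS and initial-cwnd
 * selection introduced above. In the kernel the inputs come from
 * tcp_maxmtu()/tcp_maxmtu6(), tcp_hc_getmtu() and tcp_hc_get(); here they
 * are plain parameters.
 */
#include <stdio.h>
#include <stddef.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*
 * MSS advertised on an outgoing SYN: prefer the smaller of the link MTU
 * and a cached path MTU when both are known, fall back to whichever is
 * known, else to the protocol default (tcp_mssdflt analogue).
 */
static unsigned long
mss_for_syn(unsigned long maxmtu, unsigned long hc_mtu,
    size_t min_protoh, unsigned long mssdflt)
{
    unsigned long mss = mssdflt;

    if (maxmtu && hc_mtu)
        mss = MIN(maxmtu, hc_mtu) - min_protoh;
    else if (maxmtu || hc_mtu)
        mss = MAX(maxmtu, hc_mtu) - min_protoh;
    return (mss);
}

/*
 * Initial congestion window: seed from a cached cwnd (halved and clamped
 * to the peer's window and our send buffer, but never below one mss) when
 * available, otherwise use the RFC 3390 formula.
 */
static unsigned long
initial_cwnd(unsigned long mss, unsigned long cached_cwnd,
    unsigned long snd_wnd, unsigned long sb_hiwat)
{
    if (cached_cwnd)
        return (MAX(mss, MIN(cached_cwnd / 2, MIN(snd_wnd, sb_hiwat))));
    return (MIN(4 * mss, MAX(2 * mss, 4380UL)));
}

int
main(void)
{
    /* Ethernet link MTU, a smaller cached path MTU, IPv4+TCP header size. */
    unsigned long mss = mss_for_syn(1500, 1400, 40, 512);

    printf("mss  = %lu\n", mss);
    printf("cwnd = %lu\n", initial_cwnd(mss, 0, 65535, 32768));
    printf("cwnd = %lu\n", initial_cwnd(mss, 20000, 65535, 32768));
    return (0);
}

With the sample inputs above this prints mss = 1360, then 4380 (the RFC 3390 fallback) and 10000 (the cached-cwnd path, halved and clamped by the peer window and send buffer), which is the same clamping the committed code applies to metrics.rmx_cwnd.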