summaryrefslogtreecommitdiffstats
path: root/sys/netinet/tcp_input.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet/tcp_input.c')
-rw-r--r--sys/netinet/tcp_input.c344
1 files changed, 200 insertions, 144 deletions
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index a247138..eca5cb2 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
- (tp)->t_inpcb->in6p_route.ro_rt) \
- nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
@@ -358,8 +357,7 @@ tcp_input(m, off0)
int todrop, acked, ourfinisacked, needoutput = 0;
u_long tiwin;
struct tcpopt to; /* options in this segment */
- struct rmxp_tao *taop; /* pointer to our TAO cache entry */
- struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ struct rmxp_tao tao; /* our TAO cache entry */
int headlocked = 0;
struct sockaddr_in *next_hop = NULL;
int rstreason; /* For badport_bandlim accounting purposes */
@@ -389,6 +387,7 @@ tcp_input(m, off0)
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ bzero(&tao, sizeof(tao));
bzero((char *)&to, sizeof(to));
tcpstat.tcps_rcvtotal++;
@@ -707,11 +706,9 @@ findpcb:
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
- inc.inc6_route.ro_rt = NULL; /* XXX */
} else {
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
- inc.inc_route.ro_rt = NULL; /* XXX */
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
@@ -916,7 +913,7 @@ findpcb:
}
after_listen:
-/* XXX temp debugging */
+ /* XXX temp debugging */
/* should not happen - syncache should pick up these connections */
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
@@ -930,8 +927,9 @@ after_listen:
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
/*
- * Process options.
- * XXX this is tradtitional behavior, may need to be cleaned up.
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
if (thflags & TH_SYN) {
@@ -1179,10 +1177,8 @@ after_listen:
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
- if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
@@ -1195,7 +1191,7 @@ after_listen:
* Our new SYN, when it arrives, will serve as the
* needed ACK.
*/
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1225,7 +1221,7 @@ after_listen:
*/
if (to.to_flags & TOF_CCECHO) {
if (tp->cc_send != to.to_ccecho) {
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1246,8 +1242,8 @@ after_listen:
tp->rcv_scale = tp->request_r_scale;
}
/* Segment is acceptable, update cache if undefined. */
- if (taop->tao_ccsent == 0)
- taop->tao_ccsent = to.to_ccecho;
+ if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
@@ -1290,14 +1286,16 @@ after_listen:
tp->t_flags |= TF_ACKNOW;
callout_stop(tp->tt_rexmt);
if (to.to_flags & TOF_CC) {
- if (taop->tao_cc != 0 &&
- CC_GT(to.to_cc, taop->tao_cc)) {
+ if (tao.tao_cc != 0 &&
+ CC_GT(to.to_cc, tao.tao_cc)) {
/*
* update cache and make transition:
* SYN-SENT -> ESTABLISHED*
* SYN-SENT* -> FIN-WAIT-1*
*/
- taop->tao_cc = to.to_cc;
+ tao.tao_cc = to.to_cc;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
@@ -1313,8 +1311,12 @@ after_listen:
} else
tp->t_state = TCPS_SYN_RECEIVED;
} else {
- /* CC.NEW or no option => invalidate cache */
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ /* CC.NEW or no option => invalidate cache */
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
+ }
tp->t_state = TCPS_SYN_RECEIVED;
}
}
@@ -1682,13 +1684,14 @@ trimthenstep6:
}
/*
* Upon successful completion of 3-way handshake,
- * update cache.CC if it was undefined, pass any queued
- * data to the user, and advance state appropriately.
+ * update cache.CC, pass any queued data to the user,
+ * and advance state appropriately.
*/
- if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
- taop->tao_cc == 0)
- taop->tao_cc = tp->cc_recv;
-
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = tp->cc_recv;
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
+ tp->cc_recv, 0);
+ }
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
@@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
- * NOTE that this routine is only called when we process an incoming
- * segment, for outgoing segments only tcp_mssopt is called.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss(tp, offer)
struct tcpcb *tp;
int offer;
{
- register struct rtentry *rt;
- struct ifnet *ifp;
- register int rtt, mss;
+ int rtt, mss;
u_long bufsize;
+ u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- struct rmxp_tao *taop;
+ struct hc_metrics_lite metrics;
+ struct rmxp_tao tao;
int origoffer = offer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
+ bzero(&tao, sizeof(tao));
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
- rt = tcp_rtlookup(&inp->inp_inc);
- if (rt == NULL) {
- tp->t_maxopd = tp->t_maxseg =
- isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- return;
+ /* initialize */
+#ifdef INET6
+ if (isipv6) {
+ maxmtu = tcp_maxmtu6(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
+ } else
+#endif
+ {
+ maxmtu = tcp_maxmtu(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
}
- ifp = rt->rt_ifp;
so = inp->inp_socket;
- taop = rmx_taop(rt->rt_rmx);
/*
- * Offer == -1 means that we didn't receive SYN yet,
- * use cached value in that case;
+ * no route to sender, take default mss and return
*/
- if (offer == -1)
- offer = taop->tao_mssopt;
- /*
- * Offer == 0 means that there was no MSS on the SYN segment,
- * in this case we use tcp_mssdflt.
- */
- if (offer == 0)
- offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- else
- /*
- * Sanity check: make sure that maxopd will be large
- * enough to allow some data on segments even is the
- * all the option space is used (40bytes). Otherwise
- * funny things may happen in tcp_output.
- */
- offer = max(offer, 64);
- taop->tao_mssopt = offer;
+ if (maxmtu == 0)
+ return;
+
+ /* what have we got? */
+ switch (offer) {
+ case 0:
+ /*
+ * Offer == 0 means that there was no MSS on the SYN
+ * segment, in this case we use tcp_mssdflt.
+ */
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif
+ tcp_mssdflt;
+ break;
+
+ case -1:
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+ * use cached value in that case;
+ */
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt != 0)
+ offer = tao.tao_mssopt;
+ /* FALLTHROUGH */
+
+ default:
+ /*
+ * Sanity check: make sure that maxopd will be large
+ * enough to allow some data on segments even if the
+ * all the option space is used (40bytes). Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_MSSOPT, 0, offer);
+ }
/*
- * While we're here, check if there's an initial rtt
- * or rttvar. Convert from the route-table units
- * to scaled multiples of the slow timeout timer.
+ * rmx information is now retrieved from tcp_hostcache
*/
- if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
- /*
- * XXX the lock bit for RTT indicates that the value
- * is also a minimum value; this is subject to time.
- */
- if (rt->rt_rmx.rmx_locks & RTV_RTT)
- tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
- tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
- tcpstat.tcps_usedrtt++;
- if (rt->rt_rmx.rmx_rttvar) {
- tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- tcpstat.tcps_usedrttvar++;
- } else {
- /* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
- }
- TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
- }
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
/*
- * if there's an mtu associated with the route, use it
+ * if there's a discovered mtu int tcp hostcache, use it
* else, use the link mtu.
*/
- if (rt->rt_rmx.rmx_mtu)
- mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ if (metrics.rmx_mtu)
+ mss = metrics.rmx_mtu - min_protoh;
else {
#ifdef INET6
- mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
- - min_protoh;
-#else
- mss = ifp->if_mtu - min_protoh;
-#endif
-#ifdef INET6
if (isipv6) {
- if (!in6_localaddr(&inp->in6p_faddr))
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in6_localaddr(&inp->in6p_faddr))
mss = min(mss, tcp_v6mssdflt);
} else
#endif
- if (!in_localaddr(inp->inp_faddr))
+ {
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in_localaddr(inp->inp_faddr))
mss = min(mss, tcp_mssdflt);
+ }
}
mss = min(mss, offer);
+
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
@@ -2749,6 +2753,7 @@ tcp_mss(tp, offer)
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
mss -= TCPOLEN_CC_APPA;
+ tp->t_maxseg = mss;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
@@ -2757,15 +2762,18 @@ tcp_mss(tp, offer)
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
+ tp->t_maxseg = mss;
+
/*
- * If there's a pipesize, change the socket buffer
- * to that size. Make the socket buffers an integral
- * number of mss units; if the mss is larger than
- * the socket buffer, decrease the mss.
+ * If there's a pipesize, change the socket buffer to that size,
+ * don't change if sb_hiwat is different than default (then it
+ * has been changed on purpose with setsockopt).
+ * Make the socket buffers an integral number of mss units;
+ * if the mss is larger than the socket buffer, decrease the mss.
*/
-#ifdef RTV_SPIPE
- if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
-#endif
+ if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ bufsize = metrics.rmx_sendpipe;
+ else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
@@ -2778,9 +2786,9 @@ tcp_mss(tp, offer)
}
tp->t_maxseg = mss;
-#ifdef RTV_RPIPE
- if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
-#endif
+ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ bufsize = metrics.rmx_recvpipe;
+ else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -2789,62 +2797,110 @@ tcp_mss(tp, offer)
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
}
+ /*
+ * While we're here, check the others too
+ */
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tcpstat.tcps_usedrtt++;
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+ * the slow start threshhold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+ if (metrics.rmx_bandwidth)
+ tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+ * not make cwnd bigger than remote receive window or our own
+ * send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so next connection doesn't
+ * overloads the path again.
+ *
+ * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
+ * We currently check only in syncache_socket for that.
*/
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(mss,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
+#endif
else
tp->snd_cwnd = mss * ss_fltsz;
-
- if (rt->rt_rmx.rmx_ssthresh) {
- /*
- * There's some sort of gateway or interface
- * buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
- * threshold to no less than 2*mss.
- */
- tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
- tcpstat.tcps_usedssthresh++;
- }
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
-tcp_mssopt(tp)
- struct tcpcb *tp;
+tcp_mssopt(inc)
+ struct in_conninfo *inc;
{
- struct rtentry *rt;
+ int mss = 0;
+ u_long maxmtu = 0;
+ u_long thcmtu = 0;
+ size_t min_protoh;
#ifdef INET6
- int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
- size_t min_protoh = isipv6 ?
- sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
- sizeof (struct tcpiphdr);
-#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
- if (isipv6)
- rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
- else
- rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
- if (rt == NULL)
- return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
+ KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
- return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
- rt->rt_ifp->if_mtu - min_protoh);
-#else
- return (rt->rt_ifp->if_mtu - min_protoh);
+ if (isipv6) {
+ mss = tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
#endif
+ {
+ mss = tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ return (mss);
}
OpenPOWER on IntegriCloud