summary | refs | log | tree | commit | diff | stats
path: root/sys/netinet/tcp_subr.c
diff options
context:
space:
mode:
author: andre <andre@FreeBSD.org> 2003-11-20 20:07:39 +0000
committer: andre <andre@FreeBSD.org> 2003-11-20 20:07:39 +0000
commit: 6164d7c280688f20cf827e8374984c6e0175fab0 (patch)
tree: f947a08d66395dd498056038f0c360783fa281c7 /sys/netinet/tcp_subr.c
parent: 6dca20de0718f19b3cdc5a7d5ebb71cd54b2374e (diff)
download: FreeBSD-src-6164d7c280688f20cf827e8374984c6e0175fab0.zip
FreeBSD-src-6164d7c280688f20cf827e8374984c6e0175fab0.tar.gz
Introduce tcp_hostcache and remove the tcp specific metrics from
the routing table. Move all usage and references in the tcp stack
from the routing table metrics to the tcp hostcache.

It caches measured parameters of past tcp sessions to provide better
initial start values for following connections from or to the same
source or destination. Depending on the network parameters to/from
the remote host this can lead to significant speedups for new tcp
connections after the first one because they inherit and shortcut
the learning curve.

tcp_hostcache is designed for multiple concurrent access in SMP
environments with high contention and is hash indexed by remote
ip address.

It removes significant locking requirements from the tcp stack with
regard to the routing table.

Reviewed by: sam (mentor), bms
Reviewed by: -net, -current, core@kame.net (IPv6 parts)
Approved by: re (scottl)
Diffstat (limited to 'sys/netinet/tcp_subr.c')
-rw-r--r-- sys/netinet/tcp_subr.c 348
1 files changed, 125 insertions, 223 deletions
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 7ce06f6..dfd6de1 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
- tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
+ tcp_hc_init();
}
/*
@@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
- struct route *ro = 0;
- struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
- struct route_in6 *ro6 = 0;
- struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
- struct inpcb *inp;
+ struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
-#ifdef INET6
- if (isipv6)
- ro6 = &inp->in6p_route;
- else
-#endif /* INET6 */
- ro = &inp->inp_route;
- } else {
- inp = NULL;
-#ifdef INET6
- if (isipv6) {
- ro6 = &sro6;
- bzero(ro6, sizeof *ro6);
- } else
-#endif /* INET6 */
- {
- ro = &sro;
- bzero(ro, sizeof *ro);
- }
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
- ip6->ip6_hlim = in6_selecthlim(inp,
- ro6 && ro6->ro_rt ?
- ro6->ro_rt->rt_ifp :
- NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
- if (isipv6) {
- (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
- if (ro6 == &sro6 && ro6->ro_rt) {
- RTFREE(ro6->ro_rt);
- ro6->ro_rt = NULL;
- }
- } else
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
#endif /* INET6 */
- {
- (void) ip_output(m, NULL, ro, ipflags, NULL, inp);
- if (ro == &sro && ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- struct rtentry *rt;
- int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
- * 'Enough' is arbitrarily defined as the 16 samples.
- * 16 samples is enough for the srtt filter to converge
- * to within 5% of the correct value; fewer samples and
- * we could save a very bogus rtt.
- *
- * Don't update the default route's characteristics and don't
- * update anything that the user "locked".
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples is enough for the srtt filter to converge
+ * to within enough % of the correct value; fewer samples
+ * and we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
*/
- if (tp->t_rttupdated >= 16) {
- register u_long i = 0;
-#ifdef INET6
- if (isipv6) {
- struct sockaddr_in6 *sin6;
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
- if ((rt = inp->in6p_route.ro_rt) == NULL)
- goto no_valid_rt;
- sin6 = (struct sockaddr_in6 *)rt_key(rt);
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- goto no_valid_rt;
- }
- else
-#endif /* INET6 */
- if ((rt = inp->inp_route.ro_rt) == NULL ||
- ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
- == INADDR_ANY)
- goto no_valid_rt;
-
- if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
- i = tp->t_srtt *
- (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- if (rt->rt_rmx.rmx_rtt && i)
- /*
- * filter this update to half the old & half
- * the new values, converting scale.
- * See route.h and tcp_var.h for a
- * description of the scaling constants.
- */
- rt->rt_rmx.rmx_rtt =
- (rt->rt_rmx.rmx_rtt + i) / 2;
- else
- rt->rt_rmx.rmx_rtt = i;
- tcpstat.tcps_cachedrtt++;
- }
- if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
- i = tp->t_rttvar *
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- if (rt->rt_rmx.rmx_rttvar && i)
- rt->rt_rmx.rmx_rttvar =
- (rt->rt_rmx.rmx_rttvar + i) / 2;
- else
- rt->rt_rmx.rmx_rttvar = i;
- tcpstat.tcps_cachedrttvar++;
- }
+ bzero(&metrics, sizeof(metrics));
/*
- * The old comment here said:
- * update the pipelimit (ssthresh) if it has been updated
- * already or if a pipesize was specified & the threshhold
- * got below half the pipesize. I.e., wait for bad news
- * before we start updating, then update on both good
- * and bad news.
- *
- * But we want to save the ssthresh even if no pipesize is
- * specified explicitly in the route, because such
- * connections still have an implicit pipesize specified
- * by the global tcp_sendspace. In the absence of a reliable
- * way to calculate the pipesize, it will have to do.
+ * Update the ssthresh always when the conditions below
+ * are satisfied. This gives us better new start value
+ * for the congestion avoidance for new connections.
+ * ssthresh is only set if packet loss occured on a session.
*/
- i = tp->snd_ssthresh;
- if (rt->rt_rmx.rmx_sendpipe != 0)
- dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
- else
- dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
- if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
- i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
- || dosavessthresh) {
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
- i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
- if (i < 2)
- i = 2;
- i *= (u_long)(tp->t_maxseg +
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
- if (rt->rt_rmx.rmx_ssthresh)
- rt->rt_rmx.rmx_ssthresh =
- (rt->rt_rmx.rmx_ssthresh + i) / 2;
- else
- rt->rt_rmx.rmx_ssthresh = i;
- tcpstat.tcps_cachedssthresh++;
- }
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
}
- no_valid_rt:
+
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
@@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
- else if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
- struct rtentry *rt;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
- int offered;
+ u_int maxmtu;
+ u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ bzero(&tao, sizeof(tao));
if (tp) {
+ maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
+ romtu =
#ifdef INET6
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
+ isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
- rt = tcp_rtlookup(&inp->inp_inc);
- if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tcp_maxmtu(&inp->inp_inc);
+ if (!maxmtu)
+ maxmtu = romtu;
+ else
+ maxmtu = min(maxmtu, romtu);
+ if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
- taop = rmx_taop(rt->rt_rmx);
- offered = taop->tao_mssopt;
- mss = rt->rt_rmx.rmx_mtu -
+ mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
- if (offered)
- mss = min(mss, offered);
+ if (tcp_do_rfc1644) {
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt)
+ mss = min(mss, tao.tao_mssopt);
+ }
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
-struct rtentry *
-tcp_rtlookup(inc)
+u_long
+tcp_maxmtu(inc)
struct in_conninfo *inc;
{
- struct route *ro;
- struct rtentry *rt;
-
- ro = &inc->inc_route;
- rt = ro->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (inc->inc_faddr.s_addr != INADDR_ANY) {
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
- inc->inc_faddr;
- rtalloc(ro);
- rt = ro->ro_rt;
- }
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ sro.ro_rt = NULL;
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ RTFREE(sro.ro_rt);
}
- return rt;
+ return (maxmtu);
}
#ifdef INET6
-struct rtentry *
-tcp_rtlookup6(inc)
+u_long
+tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
- struct route_in6 *ro6;
- struct rtentry *rt;
-
- ro6 = &inc->inc6_route;
- rt = ro6->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- ro6->ro_dst.sin6_family = AF_INET6;
- ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- ro6->ro_dst.sin6_addr = inc->inc6_faddr;
- rtalloc((struct route *)ro6);
- rt = ro6->ro_rt;
- }
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ sro6.ro_rt = NULL;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
- return rt;
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
}
#endif /* INET6 */
@@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp)
#endif /*IPSEC*/
/*
- * Return a pointer to the cached information about the remote host.
- * The cached information is stored in the protocol specific part of
- * the route metrics.
- */
-struct rmxp_tao *
-tcp_gettaocache(inc)
- struct in_conninfo *inc;
-{
- struct rtentry *rt;
-
-#ifdef INET6
- if (inc->inc_isipv6)
- rt = tcp_rtlookup6(inc);
- else
-#endif /* INET6 */
- rt = tcp_rtlookup(inc);
-
- /* Make sure this is a host route and is up. */
- if (rt == NULL ||
- (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
- return NULL;
-
- return rmx_taop(rt->rt_rmx);
-}
-
-/*
- * Clear all the TAO cache entries, called from tcp_init.
- *
- * XXX
- * This routine is just an empty one, because we assume that the routing
- * routing tables are initialized at the same time when TCP, so there is
- * nothing in the cache left over.
- */
-static void
-tcp_cleartaocache()
-{
-}
-
-/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
* inp is locked, and is unlocked before returning.
@@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
- ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
- inp->in6p_route.ro_rt->rt_ifp : NULL);
- error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)
OpenPOWER on IntegriCloud