summary refs log tree commit diff stats
path: root/sys/netinet/tcp_subr.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/netinet/tcp_subr.c')
-rw-r--r-- sys/netinet/tcp_subr.c 348
1 files changed, 125 insertions, 223 deletions
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 7ce06f6..dfd6de1 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
- tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
+ tcp_hc_init();
}
/*
@@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
- struct route *ro = 0;
- struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
- struct route_in6 *ro6 = 0;
- struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
- struct inpcb *inp;
+ struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
-#ifdef INET6
- if (isipv6)
- ro6 = &inp->in6p_route;
- else
-#endif /* INET6 */
- ro = &inp->inp_route;
- } else {
- inp = NULL;
-#ifdef INET6
- if (isipv6) {
- ro6 = &sro6;
- bzero(ro6, sizeof *ro6);
- } else
-#endif /* INET6 */
- {
- ro = &sro;
- bzero(ro, sizeof *ro);
- }
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
- ip6->ip6_hlim = in6_selecthlim(inp,
- ro6 && ro6->ro_rt ?
- ro6->ro_rt->rt_ifp :
- NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
- if (isipv6) {
- (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
- if (ro6 == &sro6 && ro6->ro_rt) {
- RTFREE(ro6->ro_rt);
- ro6->ro_rt = NULL;
- }
- } else
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
#endif /* INET6 */
- {
- (void) ip_output(m, NULL, ro, ipflags, NULL, inp);
- if (ro == &sro && ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- struct rtentry *rt;
- int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
- * 'Enough' is arbitrarily defined as the 16 samples.
- * 16 samples is enough for the srtt filter to converge
- * to within 5% of the correct value; fewer samples and
- * we could save a very bogus rtt.
- *
- * Don't update the default route's characteristics and don't
- * update anything that the user "locked".
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples are enough for the srtt filter to converge
+ * close enough to the correct value; with fewer samples
+ * we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
*/
- if (tp->t_rttupdated >= 16) {
- register u_long i = 0;
-#ifdef INET6
- if (isipv6) {
- struct sockaddr_in6 *sin6;
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
- if ((rt = inp->in6p_route.ro_rt) == NULL)
- goto no_valid_rt;
- sin6 = (struct sockaddr_in6 *)rt_key(rt);
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- goto no_valid_rt;
- }
- else
-#endif /* INET6 */
- if ((rt = inp->inp_route.ro_rt) == NULL ||
- ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
- == INADDR_ANY)
- goto no_valid_rt;
-
- if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
- i = tp->t_srtt *
- (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- if (rt->rt_rmx.rmx_rtt && i)
- /*
- * filter this update to half the old & half
- * the new values, converting scale.
- * See route.h and tcp_var.h for a
- * description of the scaling constants.
- */
- rt->rt_rmx.rmx_rtt =
- (rt->rt_rmx.rmx_rtt + i) / 2;
- else
- rt->rt_rmx.rmx_rtt = i;
- tcpstat.tcps_cachedrtt++;
- }
- if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
- i = tp->t_rttvar *
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- if (rt->rt_rmx.rmx_rttvar && i)
- rt->rt_rmx.rmx_rttvar =
- (rt->rt_rmx.rmx_rttvar + i) / 2;
- else
- rt->rt_rmx.rmx_rttvar = i;
- tcpstat.tcps_cachedrttvar++;
- }
+ bzero(&metrics, sizeof(metrics));
/*
- * The old comment here said:
- * update the pipelimit (ssthresh) if it has been updated
- * already or if a pipesize was specified & the threshhold
- * got below half the pipesize. I.e., wait for bad news
- * before we start updating, then update on both good
- * and bad news.
- *
- * But we want to save the ssthresh even if no pipesize is
- * specified explicitly in the route, because such
- * connections still have an implicit pipesize specified
- * by the global tcp_sendspace. In the absence of a reliable
- * way to calculate the pipesize, it will have to do.
+ * Always update the ssthresh when the conditions below
+ * are satisfied. This gives us a better starting value
+ * for congestion avoidance on new connections.
+ * ssthresh is only set if packet loss occurred on a session.
*/
- i = tp->snd_ssthresh;
- if (rt->rt_rmx.rmx_sendpipe != 0)
- dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
- else
- dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
- if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
- i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
- || dosavessthresh) {
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
- i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
- if (i < 2)
- i = 2;
- i *= (u_long)(tp->t_maxseg +
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
- if (rt->rt_rmx.rmx_ssthresh)
- rt->rt_rmx.rmx_ssthresh =
- (rt->rt_rmx.rmx_ssthresh + i) / 2;
- else
- rt->rt_rmx.rmx_ssthresh = i;
- tcpstat.tcps_cachedssthresh++;
- }
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
}
- no_valid_rt:
+
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
@@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
- else if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it would make an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
- struct rtentry *rt;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
- int offered;
+ u_int maxmtu;
+ u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ bzero(&tao, sizeof(tao));
if (tp) {
+ maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
+ romtu =
#ifdef INET6
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
+ isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
- rt = tcp_rtlookup(&inp->inp_inc);
- if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tcp_maxmtu(&inp->inp_inc);
+ if (!maxmtu)
+ maxmtu = romtu;
+ else
+ maxmtu = min(maxmtu, romtu);
+ if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
- taop = rmx_taop(rt->rt_rmx);
- offered = taop->tao_mssopt;
- mss = rt->rt_rmx.rmx_mtu -
+ mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
- if (offered)
- mss = min(mss, offered);
+ if (tcp_do_rfc1644) {
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt)
+ mss = min(mss, tao.tao_mssopt);
+ }
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
-struct rtentry *
-tcp_rtlookup(inc)
+u_long
+tcp_maxmtu(inc)
struct in_conninfo *inc;
{
- struct route *ro;
- struct rtentry *rt;
-
- ro = &inc->inc_route;
- rt = ro->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (inc->inc_faddr.s_addr != INADDR_ANY) {
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
- inc->inc_faddr;
- rtalloc(ro);
- rt = ro->ro_rt;
- }
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ sro.ro_rt = NULL;
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ RTFREE(sro.ro_rt);
}
- return rt;
+ return (maxmtu);
}
#ifdef INET6
-struct rtentry *
-tcp_rtlookup6(inc)
+u_long
+tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
- struct route_in6 *ro6;
- struct rtentry *rt;
-
- ro6 = &inc->inc6_route;
- rt = ro6->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- ro6->ro_dst.sin6_family = AF_INET6;
- ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- ro6->ro_dst.sin6_addr = inc->inc6_faddr;
- rtalloc((struct route *)ro6);
- rt = ro6->ro_rt;
- }
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ sro6.ro_rt = NULL;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
- return rt;
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
}
#endif /* INET6 */
@@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp)
#endif /*IPSEC*/
/*
- * Return a pointer to the cached information about the remote host.
- * The cached information is stored in the protocol specific part of
- * the route metrics.
- */
-struct rmxp_tao *
-tcp_gettaocache(inc)
- struct in_conninfo *inc;
-{
- struct rtentry *rt;
-
-#ifdef INET6
- if (inc->inc_isipv6)
- rt = tcp_rtlookup6(inc);
- else
-#endif /* INET6 */
- rt = tcp_rtlookup(inc);
-
- /* Make sure this is a host route and is up. */
- if (rt == NULL ||
- (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
- return NULL;
-
- return rmx_taop(rt->rt_rmx);
-}
-
-/*
- * Clear all the TAO cache entries, called from tcp_init.
- *
- * XXX
- * This routine is just an empty one, because we assume that the routing
- * routing tables are initialized at the same time when TCP, so there is
- * nothing in the cache left over.
- */
-static void
-tcp_cleartaocache()
-{
-}
-
-/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
* inp is locked, and is unlocked before returning.
@@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
- ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
- inp->in6p_route.ro_rt->rt_ifp : NULL);
- error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)
OpenPOWER on IntegriCloud