summaryrefslogtreecommitdiffstats
path: root/sys
diff options
context:
space:
mode:
authorandre <andre@FreeBSD.org>2003-11-20 20:07:39 +0000
committerandre <andre@FreeBSD.org>2003-11-20 20:07:39 +0000
commit6164d7c280688f20cf827e8374984c6e0175fab0 (patch)
treef947a08d66395dd498056038f0c360783fa281c7 /sys
parent6dca20de0718f19b3cdc5a7d5ebb71cd54b2374e (diff)
downloadFreeBSD-src-6164d7c280688f20cf827e8374984c6e0175fab0.zip
FreeBSD-src-6164d7c280688f20cf827e8374984c6e0175fab0.tar.gz
Introduce tcp_hostcache and remove the tcp specific metrics from
the routing table. Move all usage and references in the tcp stack from the routing table metrics to the tcp hostcache. It caches measured parameters of past tcp sessions to provide better initial start values for following connections from or to the same source or destination. Depending on the network parameters to/from the remote host this can lead to significant speedups for new tcp connections after the first one because they inherit and shortcut the learning curve. tcp_hostcache is designed for multiple concurrent access in SMP environments with high contention and is hash indexed by remote ip address. It removes significant locking requirements from the tcp stack with regard to the routing table. Reviewed by: sam (mentor), bms Reviewed by: -net, -current, core@kame.net (IPv6 parts) Approved by: re (scottl)
Diffstat (limited to 'sys')
-rw-r--r--sys/conf/files1
-rw-r--r--sys/net/if_faith.c13
-rw-r--r--sys/net/if_loop.c13
-rw-r--r--sys/net/route.h10
-rw-r--r--sys/net/rtsock.c38
-rw-r--r--sys/netinet/in_pcb.c97
-rw-r--r--sys/netinet/in_pcb.h19
-rw-r--r--sys/netinet/in_rmx.c3
-rw-r--r--sys/netinet/ip_divert.c9
-rw-r--r--sys/netinet/ip_fw2.c9
-rw-r--r--sys/netinet/ip_icmp.c57
-rw-r--r--sys/netinet/ip_input.c16
-rw-r--r--sys/netinet/ip_output.c25
-rw-r--r--sys/netinet/raw_ip.c2
-rw-r--r--sys/netinet/tcp_hostcache.c728
-rw-r--r--sys/netinet/tcp_input.c344
-rw-r--r--sys/netinet/tcp_output.c42
-rw-r--r--sys/netinet/tcp_reass.c344
-rw-r--r--sys/netinet/tcp_subr.c348
-rw-r--r--sys/netinet/tcp_syncache.c100
-rw-r--r--sys/netinet/tcp_timer.c7
-rw-r--r--sys/netinet/tcp_timewait.c348
-rw-r--r--sys/netinet/tcp_usrreq.c45
-rw-r--r--sys/netinet/tcp_var.h44
-rw-r--r--sys/netinet/udp_usrreq.c17
-rw-r--r--sys/netinet6/icmp6.c32
-rw-r--r--sys/netinet6/in6_pcb.c44
-rw-r--r--sys/netinet6/in6_rmx.c3
-rw-r--r--sys/netinet6/in6_src.c45
-rw-r--r--sys/netinet6/ip6_output.c27
-rw-r--r--sys/netinet6/udp6_output.c9
31 files changed, 1686 insertions, 1153 deletions
diff --git a/sys/conf/files b/sys/conf/files
index 63c378b..8eee001 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1457,6 +1457,7 @@ netinet/ip_mroute.c optional mrouting
netinet/ip_output.c optional inet
netinet/raw_ip.c optional inet
netinet/tcp_debug.c optional tcpdebug
+netinet/tcp_hostcache.c optional inet
netinet/tcp_input.c optional inet
netinet/tcp_output.c optional inet
netinet/tcp_subr.c optional inet
diff --git a/sys/net/if_faith.c b/sys/net/if_faith.c
index 07216b5..a8da4ad 100644
--- a/sys/net/if_faith.c
+++ b/sys/net/if_faith.c
@@ -270,17 +270,8 @@ faithrtrequest(cmd, rt, info)
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
-
- if (rt) {
- rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
- /*
- * For optimal performance, the send and receive buffers
- * should be at least twice the MTU plus a little more for
- * overhead.
- */
- rt->rt_rmx.rmx_recvpipe =
- rt->rt_rmx.rmx_sendpipe = 3 * FAITHMTU;
- }
+ if (rt)
+ rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*
diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c
index afe0a73..9a54af4 100644
--- a/sys/net/if_loop.c
+++ b/sys/net/if_loop.c
@@ -329,17 +329,8 @@ lortrequest(cmd, rt, info)
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
-
- if (rt) {
- rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
- /*
- * For optimal performance, the send and receive buffers
- * should be at least twice the MTU plus a little more for
- * overhead.
- */
- rt->rt_rmx.rmx_recvpipe =
- rt->rt_rmx.rmx_sendpipe = 3 * LOMTU;
- }
+ if (rt)
+ rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*
diff --git a/sys/net/route.h b/sys/net/route.h
index 8fff560..34c33eb 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -58,6 +58,12 @@ struct route {
* These numbers are used by reliable protocols for determining
* retransmission behavior and are included in the routing structure.
*/
+struct rt_metrics_lite {
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_expire; /* lifetime for route, e.g. redirect */
+ u_long rmx_pksent; /* packets sent using this route */
+};
+
struct rt_metrics {
u_long rmx_locks; /* Kernel must leave these values alone */
u_long rmx_mtu; /* MTU for this path */
@@ -104,10 +110,10 @@ struct rtentry {
long rt_refcnt; /* # held references */
u_long rt_flags; /* up/down?, host/net */
struct ifnet *rt_ifp; /* the answer: interface to use */
- struct ifaddr *rt_ifa; /* the answer: interface to use */
+ struct ifaddr *rt_ifa; /* the answer: interface address to use */
struct sockaddr *rt_genmask; /* for generation of cloned routes */
caddr_t rt_llinfo; /* pointer to link level info cache */
- struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */
+ struct rt_metrics_lite rt_rmx; /* metrics used by rx'ing protocols */
struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */
int (*rt_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 4fba1a2..3290c0c 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -87,7 +87,8 @@ static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *, struct socket *);
-static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *);
+static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics_lite *);
+static void rt_getmetrics(struct rt_metrics_lite *, struct rt_metrics *);
static void rt_dispatch(struct mbuf *, struct sockaddr *);
/*
@@ -355,9 +356,6 @@ route_output(m, so)
RT_LOCK(saved_nrt);
rt_setmetrics(rtm->rtm_inits,
&rtm->rtm_rmx, &saved_nrt->rt_rmx);
- saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
- saved_nrt->rt_rmx.rmx_locks |=
- (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
RT_REMREF(saved_nrt);
saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK];
RT_UNLOCK(saved_nrt);
@@ -428,7 +426,7 @@ route_output(m, so)
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm,
(struct walkarg *)0);
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_rmx = rt->rt_rmx;
+ rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
@@ -478,9 +476,7 @@ route_output(m, so)
rt->rt_genmask = info.rti_info[RTAX_GENMASK];
/* FALLTHROUGH */
case RTM_LOCK:
- rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
- rt->rt_rmx.rmx_locks |=
- (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
+ /* We don't support locks anymore */
break;
}
RT_UNLOCK(rt);
@@ -542,20 +538,28 @@ flush:
}
static void
-rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics *out)
+rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics_lite *out)
{
#define metric(f, e) if (which & (f)) out->e = in->e;
- metric(RTV_RPIPE, rmx_recvpipe);
- metric(RTV_SPIPE, rmx_sendpipe);
- metric(RTV_SSTHRESH, rmx_ssthresh);
- metric(RTV_RTT, rmx_rtt);
- metric(RTV_RTTVAR, rmx_rttvar);
- metric(RTV_HOPCOUNT, rmx_hopcount);
+ /*
+ * Only these are stored in the routing entry since introduction
+ * of tcp hostcache. The rest is ignored.
+ */
metric(RTV_MTU, rmx_mtu);
metric(RTV_EXPIRE, rmx_expire);
#undef metric
}
+static void
+rt_getmetrics(struct rt_metrics_lite *in, struct rt_metrics *out)
+{
+#define metric(e) out->e = in->e;
+ bzero(out, sizeof(*out));
+ metric(rmx_mtu);
+ metric(rmx_expire);
+#undef metric
+}
+
#define ROUNDUP(a) \
((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
@@ -948,8 +952,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_use = rt->rt_use;
- rtm->rtm_rmx = rt->rt_rmx;
+ rtm->rtm_use = rt->rt_rmx.rmx_pksent;
+ rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 11735ec..898c0d4 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -561,7 +561,6 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
if (error)
return (error);
}
-
if (!TAILQ_EMPTY(&in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
@@ -579,32 +578,20 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
&in_ifaddrhead)->ia_broadaddr)->sin_addr;
}
if (laddr.s_addr == INADDR_ANY) {
- register struct route *ro;
+ struct route sro;
+ sro.ro_rt = NULL;
ia = (struct in_ifaddr *)0;
/*
- * If route is known or can be allocated now,
- * our src addr is taken from the i/f, else punt.
- * Note that we should check the address family of the cached
- * destination, in case of sharing the cache with IPv6.
+ * If route is known our src addr is taken from the i/f,
+ * else punt.
*/
- ro = &inp->inp_route;
- if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
- ro->ro_dst.sa_family != AF_INET ||
- satosin(&ro->ro_dst)->sin_addr.s_addr != faddr.s_addr ||
- inp->inp_socket->so_options & SO_DONTROUTE)) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = (struct rtentry *)0;
- }
- if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
- (ro->ro_rt == (struct rtentry *)0 ||
- ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
- /* No route yet, so try to acquire one */
- bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = faddr;
- rtalloc(ro);
+ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) {
+ /* Find out route to destination */
+ sro.ro_dst.sa_family = AF_INET;
+ sro.ro_dst.sa_len = sizeof(struct sockaddr_in);
+ ((struct sockaddr_in *)&sro.ro_dst)->sin_addr = faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
}
/*
* If we found a route, use the address
@@ -612,8 +599,10 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
* unless it is the loopback (in case a route
* to our address on another net goes to loopback).
*/
- if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
- ia = ifatoia(ro->ro_rt->rt_ifa);
+ if (sro.ro_rt && !(sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
+ ia = ifatoia(sro.ro_rt->rt_ifa);
+ if (sro.ro_rt)
+ RTFREE(sro.ro_rt);
if (ia == 0) {
bzero(&sa, sizeof(sa));
sa.sin_addr = faddr;
@@ -706,8 +695,6 @@ in_pcbdetach(inp)
}
if (inp->inp_options)
(void)m_free(inp->inp_options);
- if (inp->inp_route.ro_rt)
- RTFREE(inp->inp_route.ro_rt);
ip_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
INP_LOCK_DESTROY(inp);
@@ -884,62 +871,6 @@ in_pcbpurgeif0(pcbinfo, ifp)
}
/*
- * Check for alternatives when higher level complains
- * about service problems. For now, invalidate cached
- * routing information. If the route was created dynamically
- * (by a redirect), time to try a default gateway again.
- */
-void
-in_losing(inp)
- struct inpcb *inp;
-{
- register struct rtentry *rt;
- struct rt_addrinfo info;
-
- INP_LOCK_ASSERT(inp);
-
- if ((rt = inp->inp_route.ro_rt)) {
- RT_LOCK(rt);
- inp->inp_route.ro_rt = NULL;
- bzero((caddr_t)&info, sizeof(info));
- info.rti_flags = rt->rt_flags;
- info.rti_info[RTAX_DST] = rt_key(rt);
- info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
- if (rt->rt_flags & RTF_DYNAMIC)
- rtexpunge(rt);
- RTFREE_LOCKED(rt);
- /*
- * A new route can be allocated
- * the next time output is attempted.
- */
- }
-}
-
-/*
- * After a routing change, flush old routing
- * and allocate a (hopefully) better one.
- */
-struct inpcb *
-in_rtchange(inp, errno)
- register struct inpcb *inp;
- int errno;
-{
- INP_LOCK_ASSERT(inp);
-
- if (inp->inp_route.ro_rt) {
- RTFREE(inp->inp_route.ro_rt);
- inp->inp_route.ro_rt = 0;
- /*
- * A new route can be allocated the next time
- * output is attempted.
- */
- }
- return inp;
-}
-
-/*
* Lookup a PCB based on the local address and port.
*/
struct inpcb *
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 8a6717c..5e93328 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -94,31 +94,22 @@ struct in_endpoints {
/*
* XXX
- * At some point struct route should possibly change to:
- * struct rtentry *rt
- * struct in_endpoints *ie;
+ * the defines for inc_* are hacks and should be changed to direct references
*/
struct in_conninfo {
u_int8_t inc_flags;
u_int8_t inc_len;
u_int16_t inc_pad; /* XXX alignment for in_endpoints */
- /* protocol dependent part; cached route */
+ /* protocol dependent part */
struct in_endpoints inc_ie;
- union {
- /* placeholder for routing entry */
- struct route inc4_route;
- struct route_in6 inc6_route;
- } inc_dependroute;
};
#define inc_isipv6 inc_flags /* temp compatibility */
#define inc_fport inc_ie.ie_fport
#define inc_lport inc_ie.ie_lport
#define inc_faddr inc_ie.ie_faddr
#define inc_laddr inc_ie.ie_laddr
-#define inc_route inc_dependroute.inc4_route
#define inc6_faddr inc_ie.ie6_faddr
#define inc6_laddr inc_ie.ie6_laddr
-#define inc6_route inc_dependroute.inc6_route
struct icmp6_filter;
@@ -157,7 +148,6 @@ struct inpcb {
#define inp_lport inp_inc.inc_lport
#define inp_faddr inp_inc.inc_faddr
#define inp_laddr inp_inc.inc_laddr
-#define inp_route inp_inc.inc_route
#define inp_ip_tos inp_depend4.inp4_ip_tos
#define inp_options inp_depend4.inp4_options
#define inp_moptions inp_depend4.inp4_moptions
@@ -182,7 +172,7 @@ struct inpcb {
#define in6p_faddr inp_inc.inc6_faddr
#define in6p_laddr inp_inc.inc6_laddr
-#define in6p_route inp_inc.inc6_route
+#define in6p_ip6_hlim inp_depend6.inp6_hlim
#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
#define in6p_ip6_nxt inp_ip_p
#define in6p_flowinfo inp_flow
@@ -347,9 +337,6 @@ extern int ipport_hifirstauto;
extern int ipport_hilastauto;
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
-void in_losing(struct inpcb *);
-struct inpcb *
- in_rtchange(struct inpcb *, int);
int in_pcballoc(struct socket *, struct inpcbinfo *, struct thread *);
int in_pcbbind(struct inpcb *, struct sockaddr *, struct thread *);
int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c
index 4625030..ea11792 100644
--- a/sys/netinet/in_rmx.c
+++ b/sys/netinet/in_rmx.c
@@ -98,8 +98,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
rt->rt_flags |= RTF_MULTICAST;
- if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
- rt->rt_ifp)
+ if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
ret = rn_addroute(v_arg, n_arg, head, treenodes);
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index 172021b..bd777dd 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -336,7 +336,7 @@ div_output(struct socket *so, struct mbuf *m,
ipstat.ips_rawout++; /* XXX */
error = ip_output((struct mbuf *)&divert_tag,
- inp->inp_options, &inp->inp_route,
+ inp->inp_options, NULL,
(so->so_options & SO_DONTROUTE) |
IP_ALLOWBROADCAST | IP_RAWOUTPUT,
inp->inp_moptions, NULL);
@@ -527,11 +527,8 @@ div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
faddr = ((struct sockaddr_in *)sa)->sin_addr;
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
- if (PRC_IS_REDIRECT(cmd)) {
- /* flush held routes */
- in_pcbnotifyall(&divcbinfo, faddr,
- inetctlerrmap[cmd], in_rtchange);
- }
+ if (PRC_IS_REDIRECT(cmd))
+ return;
}
static int
diff --git a/sys/netinet/ip_fw2.c b/sys/netinet/ip_fw2.c
index 5d3e3da..999d064 100644
--- a/sys/netinet/ip_fw2.c
+++ b/sys/netinet/ip_fw2.c
@@ -466,10 +466,13 @@ verify_rev_path(struct in_addr src, struct ifnet *ifp)
rtalloc_ign(&ro, RTF_CLONING);
}
- if ((ro.ro_rt == NULL) || (ifp == NULL) ||
- (ro.ro_rt->rt_ifp->if_index != ifp->if_index))
+ if (ro.ro_rt == NULL)
return 0;
-
+ if ((ifp == NULL) || (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+ RTFREE(ro.ro_rt);
return 1;
}
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index f94e7b9..b84d689 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -52,11 +52,15 @@
#include <net/route.h>
#include <netinet/in.h>
+#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
#ifdef IPSEC
@@ -395,7 +399,7 @@ icmp_input(m, off)
printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
#endif
icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
-#if 1
+
/*
* MTU discovery:
* If we got a needfrag and there is a host route to the
@@ -405,40 +409,37 @@ icmp_input(m, off)
* notice that the MTU has changed and adapt accordingly.
* If no new MTU was suggested, then we guess a new one
* less than the current value. If the new MTU is
- * unreasonably small (arbitrarily set at 296), then
- * we reset the MTU to the interface value and enable the
- * lock bit, indicating that we are no longer doing MTU
- * discovery.
+ * unreasonably small, then we don't update the MTU value.
+ *
+ * XXX: All this should be done in tcp_mtudisc() because
+ * the way we do it now, everyone can send us bogus ICMP
+ * MSGSIZE packets for any destination. By doing this far
+ * higher in the chain we have a matching tcp connection.
+ * Thus spoofing is much harder. However there is no easy
+ * non-hackish way to pass the new MTU up to tcp_mtudisc().
+ * Also see next XXX regarding IPv4 AH TCP.
*/
if (code == PRC_MSGSIZE) {
- struct rtentry *rt;
int mtu;
+ struct in_conninfo inc;
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 0; /* IPv4 */
+ inc.inc_faddr = icmpsrc.sin_addr;
+
+ mtu = ntohs(icp->icmp_nextmtu);
+ if (!mtu)
+ mtu = ip_next_mtu(mtu, 1);
+
+ if (mtu >= 256 + sizeof(struct tcpiphdr))
+ tcp_hc_updatemtu(&inc, mtu);
- rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
- RTF_CLONING);
- if (rt && (rt->rt_flags & RTF_HOST)
- && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
- mtu = ntohs(icp->icmp_nextmtu);
- if (!mtu)
- mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu,
- 1);
#ifdef DEBUG_MTUDISC
- printf("MTU for %s reduced to %d\n",
- inet_ntoa(icmpsrc.sin_addr), mtu);
+ printf("MTU for %s reduced to %d\n",
+ inet_ntoa(icmpsrc.sin_addr), mtu);
#endif
- if (mtu < 296) {
- /* rt->rt_rmx.rmx_mtu =
- rt->rt_ifp->if_mtu; */
- rt->rt_rmx.rmx_locks |= RTV_MTU;
- } else if (rt->rt_rmx.rmx_mtu > mtu) {
- rt->rt_rmx.rmx_mtu = mtu;
- }
- }
- if (rt)
- rtfree(rt);
}
-#endif
/*
* XXX if the packet contains [IPv4 AH TCP], we can't make a
* notification to TCP layer.
@@ -785,7 +786,6 @@ iptime()
return (htonl(t));
}
-#if 1
/*
* Return the next larger or smaller MTU plateau (table from RFC 1191)
* given current value MTU. If DIR is less than zero, a larger plateau
@@ -823,7 +823,6 @@ ip_next_mtu(mtu, dir)
}
}
}
-#endif
/*
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index df67d22..3d528f4 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -1612,22 +1612,22 @@ struct in_ifaddr *
ip_rtaddr(dst)
struct in_addr dst;
{
+ struct route sro;
struct sockaddr_in *sin;
struct in_ifaddr *ifa;
- struct route ro;
- bzero(&ro, sizeof(ro));
- sin = (struct sockaddr_in *)&ro.ro_dst;
+ sro.ro_rt = NULL;
+ sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = dst;
- rtalloc_ign(&ro, RTF_CLONING);
+ rtalloc_ign(&sro, RTF_CLONING);
- if (ro.ro_rt == 0)
+ if (sro.ro_rt == NULL)
return ((struct in_ifaddr *)0);
- ifa = ifatoia(ro.ro_rt->rt_ifa);
- RTFREE(ro.ro_rt);
+ ifa = ifatoia(sro.ro_rt->rt_ifa);
+ RTFREE(sro.ro_rt);
return ifa;
}
@@ -1879,7 +1879,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
struct route ro;
struct rtentry *rt;
- bzero(&ro, sizeof(ro));
+ ro.ro_rt = NULL;
sin = (struct sockaddr_in *)&ro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index cdf8b87..0a11524 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -302,13 +302,9 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
isbroadcast = 0; /* fool gcc */
} else {
/*
- * If this is the case, we probably don't want to allocate
- * a protocol-cloned route since we didn't get one from the
- * ULP. This lets TCP do its thing, while not burdening
- * forwarding or ICMP with the overhead of cloning a route.
- * Of course, we still want to do any cloning requested by
- * the link layer, as this is probably required in all cases
- * for correct operation (as it is for ARP).
+ * We want to do any cloning requested by the link layer,
+ * as this is probably required in all cases for correct
+ * operation (as it is for ARP).
*/
if (ro->ro_rt == 0)
rtalloc(ro);
@@ -319,7 +315,7 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
}
ia = ifatoia(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
- ro->ro_rt->rt_use++;
+ ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
if (ro->ro_rt->rt_flags & RTF_HOST)
@@ -931,16 +927,14 @@ spd_done:
ip_input((struct mbuf *)&tag);
goto done;
}
- /* Some of the logic for this was
+ /*
+ * Some of the logic for this was
* nicked from above.
- *
- * This rewrites the cached route in a local PCB.
- * Is this what we want to do?
*/
bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
ro_fwd->ro_rt = 0;
- rtalloc(ro_fwd);
+ rtalloc_ign(ro_fwd, RTF_CLONING);
if (ro_fwd->ro_rt == 0) {
ipstat.ips_noroute++;
@@ -950,7 +944,7 @@ spd_done:
ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
ifp = ro_fwd->ro_rt->rt_ifp;
- ro_fwd->ro_rt->rt_use++;
+ ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)
ro_fwd->ro_rt->rt_gateway;
@@ -1045,7 +1039,6 @@ pass:
* routes when the MTU is changed.
*/
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
- !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
(ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
}
@@ -1983,7 +1976,7 @@ ip_setmoptions(sopt, imop)
dst->sin_len = sizeof(*dst);
dst->sin_family = AF_INET;
dst->sin_addr = mreq.imr_multiaddr;
- rtalloc(&ro);
+ rtalloc_ign(&ro, RTF_CLONING);
if (ro.ro_rt == NULL) {
error = EADDRNOTAVAIL;
splx(s);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 632e00a..0a76a7f 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -302,7 +302,7 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
if (inp->inp_flags & INP_ONESBCAST)
flags |= IP_SENDONES;
- return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
+ return (ip_output(m, inp->inp_options, NULL, flags,
inp->inp_moptions, inp));
}
diff --git a/sys/netinet/tcp_hostcache.c b/sys/netinet/tcp_hostcache.c
new file mode 100644
index 0000000..461ce85
--- /dev/null
+++ b/sys/netinet/tcp_hostcache.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The tcp_hostcache moves the tcp specific cached metrics from the routing
+ * table into a dedicated structure indexed by the remote IP address. It
+ * keeps information on the measured tcp parameters of past tcp sessions
+ * to have better initial start values for following connections from the
+ * same source. Depending on the network parameters (delay, bandwidth, max
+ * MTU, congestion window) between local and remote site this can lead to
+ * significant speedups for new tcp connections after the first one.
+ *
+ * Due to this new tcp_hostcache all tcp specific metrics information in
+ * the routing table has been removed. The INPCB no longer keeps a pointer
+ * to the routing entry and protocol initiated route cloning has been
+ * removed as well. With these changes the routing table has gone back
+ * to being more lightweight and only carries information related to packet
+ * forwarding.
+ *
+ * Tcp_hostcache is designed for multiple concurrent access in SMP
+ * environments and high contention. All bucket rows have their own
+ * lock and thus multiple lookups and modifies can be done at the same
+ * time as long as they are in different bucket rows. If a request for
+ * insertion of a new record can't be satisfied it simply returns an
+ * empty structure. Nobody and nothing shall ever point directly to
+ * any entry in tcp_hostcache. All communication is done in an object
+ * oriented way and only functions of tcp_hostcache will manipulate hostcache
+ * entries. Otherwise we are unable to achieve good behaviour in concurrent
+ * access situations. Since tcp_hostcache is only caching information there
+ * are no fatal consequences if we either can't satisfy any particular request
+ * or have to drop/overwrite an existing entry because of bucket limit
+ * memory constraints.
+ */
+
+/*
+ * Many thanks to jlemon for basic structure of tcp_syncache which is being
+ * followed here.
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+
+#include <vm/uma.h>
+
+
+TAILQ_HEAD(hc_qhead, hc_metrics);
+
+struct hc_head {
+ struct hc_qhead hch_bucket;
+ u_int hch_length;
+ struct mtx hch_mtx;
+};
+
+struct hc_metrics {
+ /* housekeeping */
+ TAILQ_ENTRY(hc_metrics) rmx_q;
+ struct hc_head *rmx_head; /* head of bucket tail queue */
+ struct in_addr ip4; /* IP address */
+ struct in6_addr ip6; /* IP6 address */
+ /* endpoint specific values for tcp */
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_ssthresh; /* outbound gateway buffer limit */
+ u_long rmx_rtt; /* estimated round trip time */
+ u_long rmx_rttvar; /* estimated rtt variance */
+ u_long rmx_bandwidth; /* estimated bandwidth */
+ u_long rmx_cwnd; /* congestion window */
+ u_long rmx_sendpipe; /* outbound delay-bandwidth product */
+ u_long rmx_recvpipe; /* inbound delay-bandwidth product */
+ struct rmxp_tao rmx_tao; /* TAO cache for T/TCP */
+ /* tcp hostcache internal data */
+ int rmx_expire; /* lifetime for object */
+ u_long rmx_hits; /* number of hits */
+ u_long rmx_updates; /* number of updates */
+};
+
+/* Arbitrary values */
+#define TCP_HOSTCACHE_HASHSIZE 512
+#define TCP_HOSTCACHE_BUCKETLIMIT 30
+#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */
+#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */
+
+struct tcp_hostcache {
+ struct hc_head *hashbase;
+ uma_zone_t zone;
+ u_int hashsize;
+ u_int hashmask;
+ u_int bucket_limit;
+ u_int cache_count;
+ u_int cache_limit;
+ int expire;
+ int purgeall;
+};
+static struct tcp_hostcache tcp_hostcache;
+
+static struct callout tcp_hc_callout;
+
+static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
+static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
+static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
+static void tcp_hc_purge(void *);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+ &tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+ &tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
+ &tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
+ &tcp_hostcache.cache_count, 0, "Current number of entries in hostcache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
+ &tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
+ &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run");
+
+SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
+ sysctl_tcp_hc_list, "A", "List of all hostcache entries");
+
+
+static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
+
+#define HOSTCACHE_HASH(ip) \
+ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \
+ tcp_hostcache.hashmask)
+
+/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
+#define HOSTCACHE_HASH6(ip6) \
+ (((ip6)->s6_addr32[0] ^ \
+ (ip6)->s6_addr32[1] ^ \
+ (ip6)->s6_addr32[2] ^ \
+ (ip6)->s6_addr32[3]) & \
+ tcp_hostcache.hashmask)
+
+#define THC_LOCK(lp) mtx_lock(lp)
+#define THC_UNLOCK(lp) mtx_unlock(lp)
+
+/*
+ * Initialize the TCP hostcache: sizing defaults (overridable via loader
+ * tunables), the hash table with one mutex per bucket row, the UMA zone
+ * entries are allocated from, and the periodic purge callout.
+ */
+void
+tcp_hc_init(void)
+{
+	int i;
+
+	/*
+	 * Initialize hostcache structures with compile-time defaults.
+	 */
+	tcp_hostcache.cache_count = 0;
+	tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
+	tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
+	tcp_hostcache.cache_limit =
+	    tcp_hostcache.hashsize * tcp_hostcache.bucket_limit;
+	tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
+
+	/*
+	 * Let loader tunables override the defaults.
+	 * NOTE(review): cache_limit was computed from the default hashsize
+	 * and bucket_limit above; tuning those without also setting
+	 * cachelimit leaves cache_limit at the default product -- confirm
+	 * whether it should be recomputed after these fetches.
+	 */
+	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
+	    &tcp_hostcache.hashsize);
+	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
+	    &tcp_hostcache.cache_limit);
+	TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
+	    &tcp_hostcache.bucket_limit);
+	/* The hash size must be a power of two for HOSTCACHE_HASH*() to work. */
+	if (!powerof2(tcp_hostcache.hashsize)) {
+		printf("WARNING: hostcache hash size is not a power of 2.\n");
+		tcp_hostcache.hashsize = 512;	/* safe default */
+	}
+	tcp_hostcache.hashmask = tcp_hostcache.hashsize - 1;
+
+	/*
+	 * Allocate the hash table.
+	 */
+	tcp_hostcache.hashbase = (struct hc_head *)
+	    malloc(tcp_hostcache.hashsize * sizeof(struct hc_head),
+	    M_HOSTCACHE, M_WAITOK | M_ZERO);
+
+	/*
+	 * Initialize the hash buckets: empty tail queue, zero length and
+	 * one mutex protecting each bucket row.
+	 */
+	for (i = 0; i < tcp_hostcache.hashsize; i++) {
+		TAILQ_INIT(&tcp_hostcache.hashbase[i].hch_bucket);
+		tcp_hostcache.hashbase[i].hch_length = 0;
+		mtx_init(&tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
+		    NULL, MTX_DEF);
+	}
+
+	/*
+	 * Allocate the hostcache entries.
+	 */
+	tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	uma_zone_set_max(tcp_hostcache.zone, tcp_hostcache.cache_limit);
+
+	/*
+	 * Set up periodic cache cleanup.
+	 */
+	callout_init(&tcp_hc_callout, CALLOUT_MPSAFE);
+	callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0);
+}
+
+/*
+ * Internal function: look up an entry in the hostcache or return NULL.
+ *
+ * If an entry has been returned, the caller becomes responsible for
+ * unlocking the bucket row after he is done reading/modifying the entry.
+ */
+static struct hc_metrics *
+tcp_hc_lookup(struct in_conninfo *inc)
+{
+	int hash;
+	struct hc_head *hc_head;
+	struct hc_metrics *hc_entry;
+
+	KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
+
+	/*
+	 * Hash the foreign ip address.
+	 */
+	if (inc->inc_isipv6)
+		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
+	else
+		hash = HOSTCACHE_HASH(&inc->inc_faddr);
+
+	hc_head = &tcp_hostcache.hashbase[hash];
+
+	/*
+	 * Acquire the lock for this bucket row; we release the lock if we
+	 * don't find an entry, otherwise the caller has to unlock after he
+	 * is done.
+	 */
+	THC_LOCK(&hc_head->hch_mtx);
+
+	/*
+	 * Iterate through the entries in the bucket row looking for a
+	 * match on the foreign address; on a hit we return with the
+	 * bucket lock still held.
+	 */
+	TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
+		if (inc->inc_isipv6) {
+			if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
+			    sizeof(inc->inc6_faddr)) == 0)
+				return hc_entry;
+		} else {
+			if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
+			    sizeof(inc->inc_faddr)) == 0)
+				return hc_entry;
+		}
+	}
+
+	/*
+	 * We were unsuccessful and didn't find anything; drop the lock.
+	 */
+	THC_UNLOCK(&hc_head->hch_mtx);
+	return NULL;
+}
+
+/*
+ * Internal function: insert an entry into the hostcache or return NULL
+ * if unable to allocate a new one.
+ *
+ * If an entry has been returned, the caller becomes responsible for
+ * unlocking the bucket row after he is done reading/modifying the entry.
+ */
+static struct hc_metrics *
+tcp_hc_insert(struct in_conninfo *inc)
+{
+	int hash;
+	struct hc_head *hc_head;
+	struct hc_metrics *hc_entry;
+
+	KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
+
+	/*
+	 * Hash the foreign ip address.
+	 */
+	if (inc->inc_isipv6)
+		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
+	else
+		hash = HOSTCACHE_HASH(&inc->inc_faddr);
+
+	hc_head = &tcp_hostcache.hashbase[hash];
+
+	/*
+	 * Acquire the lock for this bucket row; we release the lock on
+	 * failure, otherwise the caller has to unlock after he is done.
+	 */
+	THC_LOCK(&hc_head->hch_mtx);
+
+	/*
+	 * If the bucket limit (or the overall cache limit) is reached,
+	 * reuse the least recently used element from the tail of this
+	 * bucket row.  The entry must be unlinked and the counters
+	 * decremented before the common re-insert path below, otherwise
+	 * TAILQ_INSERT_HEAD on a still-linked element corrupts the tail
+	 * queue and hch_length/cache_count are over-counted.
+	 * Maybe we drop something that is still "in-use" but we can
+	 * be "lossy".
+	 */
+	if (hc_head->hch_length >= tcp_hostcache.bucket_limit ||
+	    tcp_hostcache.cache_count >= tcp_hostcache.cache_limit) {
+		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
+		if (hc_entry == NULL) {
+			/* Cache limit reached but this bucket row is empty. */
+			THC_UNLOCK(&hc_head->hch_mtx);
+			return NULL;
+		}
+		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
+		tcp_hostcache.hashbase[hash].hch_length--;
+		tcp_hostcache.cache_count--;
+		tcpstat.tcps_hc_bucketoverflow++;
+	} else {
+		/*
+		 * Allocate a new entry, or balk if not possible.
+		 */
+		hc_entry = uma_zalloc(tcp_hostcache.zone, M_NOWAIT);
+		if (hc_entry == NULL) {
+			THC_UNLOCK(&hc_head->hch_mtx);
+			return NULL;
+		}
+	}
+
+	/*
+	 * Initialize basic information of hostcache entry.  Note the
+	 * copy direction: the foreign address from the conninfo is the
+	 * bcopy() source, the cache entry is the destination.
+	 */
+	bzero(hc_entry, sizeof(*hc_entry));
+	if (inc->inc_isipv6)
+		bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
+	else
+		hc_entry->ip4 = inc->inc_faddr;
+	hc_entry->rmx_head = hc_head;
+	hc_entry->rmx_expire = tcp_hostcache.expire;
+
+	/*
+	 * Put it upfront.
+	 */
+	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
+	tcp_hostcache.hashbase[hash].hch_length++;
+	tcp_hostcache.cache_count++;
+	tcpstat.tcps_hc_added++;
+
+	return hc_entry;
+}
+
+/*
+ * External function: look up an entry in the hostcache and fill out the
+ * supplied tcp metrics structure.  Fills in zeroes when no entry was
+ * found or a value is not set.
+ */
+void
+tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
+{
+	struct hc_metrics *hc_entry;
+
+	/*
+	 * Find the right bucket; on success the bucket row comes back
+	 * locked by tcp_hc_lookup().
+	 */
+	hc_entry = tcp_hc_lookup(inc);
+
+	/*
+	 * If we don't have an existing object, report all-zero metrics.
+	 */
+	if (hc_entry == NULL) {
+		bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
+		return;
+	}
+	hc_entry->rmx_hits++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
+	hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
+	hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
+	hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
+	hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
+	hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
+	hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
+	hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
+
+	/*
+	 * Unlock the bucket row that tcp_hc_lookup() left locked.
+	 */
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: look up an entry in the hostcache and return the
+ * discovered path mtu.  Returns zero if no entry is found or the value
+ * is not set.
+ */
+u_long
+tcp_hc_getmtu(struct in_conninfo *inc)
+{
+	struct hc_metrics *hc_entry;
+	u_long mtu;
+
+	/* On success the bucket row comes back locked. */
+	hc_entry = tcp_hc_lookup(inc);
+	if (hc_entry == NULL) {
+		return 0;
+	}
+	hc_entry->rmx_hits++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	mtu = hc_entry->rmx_mtu;
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+	return mtu;
+}
+
+/*
+ * External function: look up an entry in the hostcache and fill out the
+ * supplied t/tcp tao structure.  Fills in zeroes when no entry was found
+ * or a value is not set.
+ */
+void
+tcp_hc_gettao(struct in_conninfo *inc, struct rmxp_tao *tao)
+{
+	struct hc_metrics *hc_entry;
+
+	/* On success the bucket row comes back locked. */
+	hc_entry = tcp_hc_lookup(inc);
+	if (hc_entry == NULL) {
+		bzero(tao, sizeof(*tao));
+		return;
+	}
+	hc_entry->rmx_hits++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	/*
+	 * This is a read accessor: copy the cached tao OUT to the caller.
+	 * (bcopy() takes the source first; the previous argument order
+	 * overwrote the cache with the caller's uninitialized buffer.)
+	 */
+	bcopy(&hc_entry->rmx_tao, tao, sizeof(*tao));
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the mtu value of an entry in the hostcache.
+ * Creates a new entry if none was found.
+ */
+void
+tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
+{
+	struct hc_metrics *hc_entry;
+
+	/*
+	 * Find the right bucket; on success the bucket row comes back
+	 * locked.
+	 */
+	hc_entry = tcp_hc_lookup(inc);
+
+	/*
+	 * If we don't have an existing object, try to insert a new one.
+	 * tcp_hc_insert() also returns with the bucket row locked.
+	 */
+	if (hc_entry == NULL) {
+		hc_entry = tcp_hc_insert(inc);
+		if (hc_entry == NULL)
+			return;
+	}
+	hc_entry->rmx_updates++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	hc_entry->rmx_mtu = mtu;
+
+	/*
+	 * Move it to the front of the bucket row so we find it faster
+	 * next time.
+	 */
+	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+
+	/*
+	 * Unlock the bucket row.
+	 */
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the tcp metrics of an entry in the hostcache.
+ * Creates a new entry if none was found.  Each nonzero sample in 'hcml'
+ * is merged with the cached value by equal-weight averaging; zero samples
+ * leave the cached value untouched.
+ */
+void
+tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
+{
+	struct hc_metrics *hc_entry;
+
+	/* Look up, or insert, the entry; the bucket row comes back locked. */
+	hc_entry = tcp_hc_lookup(inc);
+	if (hc_entry == NULL) {
+		hc_entry = tcp_hc_insert(inc);
+		if (hc_entry == NULL)
+			return;
+	}
+	hc_entry->rmx_updates++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	if (hcml->rmx_rtt != 0) {
+		if (hc_entry->rmx_rtt == 0)
+			hc_entry->rmx_rtt = hcml->rmx_rtt;
+		else
+			hc_entry->rmx_rtt =
+			    (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
+		tcpstat.tcps_cachedrtt++;
+	}
+	if (hcml->rmx_rttvar != 0) {
+		if (hc_entry->rmx_rttvar == 0)
+			hc_entry->rmx_rttvar = hcml->rmx_rttvar;
+		else
+			hc_entry->rmx_rttvar =
+			    (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
+		tcpstat.tcps_cachedrttvar++;
+	}
+	if (hcml->rmx_ssthresh != 0) {
+		if (hc_entry->rmx_ssthresh == 0)
+			hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
+		else
+			hc_entry->rmx_ssthresh =
+			    (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
+		tcpstat.tcps_cachedssthresh++;
+	}
+	if (hcml->rmx_bandwidth != 0) {
+		if (hc_entry->rmx_bandwidth == 0)
+			hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
+		else
+			hc_entry->rmx_bandwidth =
+			    (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
+		/* tcpstat.tcps_cachedbandwidth++; */
+	}
+	if (hcml->rmx_cwnd != 0) {
+		if (hc_entry->rmx_cwnd == 0)
+			hc_entry->rmx_cwnd = hcml->rmx_cwnd;
+		else
+			hc_entry->rmx_cwnd =
+			    (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
+		/* tcpstat.tcps_cachedcwnd++; */
+	}
+	if (hcml->rmx_sendpipe != 0) {
+		if (hc_entry->rmx_sendpipe == 0)
+			hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
+		else
+			hc_entry->rmx_sendpipe =
+			    (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2;
+		/* tcpstat.tcps_cachedsendpipe++; */
+	}
+	if (hcml->rmx_recvpipe != 0) {
+		if (hc_entry->rmx_recvpipe == 0)
+			hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
+		else
+			hc_entry->rmx_recvpipe =
+			    (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2;
+		/* tcpstat.tcps_cachedrecvpipe++; */
+	}
+
+	/* Move to the front of the bucket row and drop the lock. */
+	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the t/tcp tao of an entry in the hostcache.
+ * Creates a new entry if none was found.  Which of the three tao fields
+ * is written is selected by 'field' (TCP_HC_TAO_CC, TCP_HC_TAO_CCSENT
+ * or TCP_HC_TAO_MSSOPT); unknown field values are silently ignored.
+ */
+void
+tcp_hc_updatetao(struct in_conninfo *inc, int field, tcp_cc ccount, u_short mss)
+{
+	struct hc_metrics *hc_entry;
+
+	/* Look up, or insert, the entry; the bucket row comes back locked. */
+	hc_entry = tcp_hc_lookup(inc);
+	if (hc_entry == NULL) {
+		hc_entry = tcp_hc_insert(inc);
+		if (hc_entry == NULL)
+			return;
+	}
+	hc_entry->rmx_updates++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	switch(field) {
+	case TCP_HC_TAO_CC:
+		hc_entry->rmx_tao.tao_cc = ccount;
+		break;
+
+	case TCP_HC_TAO_CCSENT:
+		hc_entry->rmx_tao.tao_ccsent = ccount;
+		break;
+
+	case TCP_HC_TAO_MSSOPT:
+		hc_entry->rmx_tao.tao_mssopt = mss;
+		break;
+	}
+
+	/* Move to the front of the bucket row and drop the lock. */
+	TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+	TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * Sysctl function: prints the list and values of all hostcache entries
+ * in unsorted order.
+ */
+static int
+sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
+{
+	int bufsize;
+	int linesize = 128;
+	char *p, *buf;
+	int len, i, error;
+	struct hc_metrics *hc_entry;
+
+	/*
+	 * Size the buffer from a snapshot of the entry count; one extra
+	 * line is reserved for the header.  The cache may grow after this
+	 * point, so the loop below additionally bounds-checks every write.
+	 */
+	bufsize = linesize * (tcp_hostcache.cache_count + 1);
+
+	p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
+
+	len = snprintf(p, linesize,
+	    "\nIP address        MTU  SSTRESH      RTT   RTTVAR BANDWIDTH "
+	    "    CWND SENDPIPE RECVPIPE HITS  UPD  EXP\n");
+	p += len;
+
+#define msec(u) (((u) + 500) / 1000)
+	for (i = 0; i < tcp_hostcache.hashsize; i++) {
+		THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+		TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket,
+		    rmx_q) {
+			/*
+			 * Stop before overflowing the buffer if entries were
+			 * added after the size snapshot above.
+			 */
+			if (bufsize - (p - buf) < linesize)
+				break;
+			len = snprintf(p, linesize,
+			    "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
+			    "%4lu %4lu %4i\n",
+			    hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
+#ifdef INET6
+			    ip6_sprintf(&hc_entry->ip6),
+#else
+			    "IPv6?",
+#endif
+			    hc_entry->rmx_mtu,
+			    hc_entry->rmx_ssthresh,
+			    msec(hc_entry->rmx_rtt *
+				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
+			    msec(hc_entry->rmx_rttvar *
+				(RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
+			    hc_entry->rmx_bandwidth * hz * 8,
+			    hc_entry->rmx_cwnd,
+			    hc_entry->rmx_sendpipe,
+			    hc_entry->rmx_recvpipe,
+			    hc_entry->rmx_hits,
+			    hc_entry->rmx_updates,
+			    hc_entry->rmx_expire);
+			/*
+			 * snprintf() returns the length the line would have
+			 * had; clamp it so a truncated line cannot advance
+			 * p beyond the bytes actually written.
+			 */
+			if (len >= linesize)
+				len = linesize - 1;
+			p += len;
+		}
+		THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+	}
+#undef msec
+	error = SYSCTL_OUT(req, buf, p - buf);
+	free(buf, M_TEMP);
+	return(error);
+}
+
+/*
+ * Expire and purge (old|all) entries in the tcp_hostcache.  Runs
+ * periodically from the callout.
+ */
+static void
+tcp_hc_purge(void *arg)
+{
+	struct hc_metrics *hc_entry, *hc_next;
+	int all = (intptr_t)arg;
+	int i;
+
+	if (tcp_hostcache.purgeall) {
+		all = 1;
+		tcp_hostcache.purgeall = 0;
+	}
+
+	for (i = 0; i < tcp_hostcache.hashsize; i++) {
+		THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+		/*
+		 * Fetch the next pointer before a possible removal: plain
+		 * TAILQ_FOREACH would dereference the freed entry to
+		 * advance the iteration.
+		 */
+		for (hc_entry =
+		    TAILQ_FIRST(&tcp_hostcache.hashbase[i].hch_bucket);
+		    hc_entry != NULL; hc_entry = hc_next) {
+			hc_next = TAILQ_NEXT(hc_entry, rmx_q);
+			if (all || hc_entry->rmx_expire <= 0) {
+				TAILQ_REMOVE(&tcp_hostcache.hashbase[i].hch_bucket,
+				    hc_entry, rmx_q);
+				uma_zfree(tcp_hostcache.zone, hc_entry);
+				tcp_hostcache.hashbase[i].hch_length--;
+				tcp_hostcache.cache_count--;
+			} else
+				hc_entry->rmx_expire -= TCP_HOSTCACHE_PRUNE;
+		}
+		THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+	}
+	callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz,
+	    tcp_hc_purge, NULL);
+}
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index a247138..eca5cb2 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
- (tp)->t_inpcb->in6p_route.ro_rt) \
- nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
@@ -358,8 +357,7 @@ tcp_input(m, off0)
int todrop, acked, ourfinisacked, needoutput = 0;
u_long tiwin;
struct tcpopt to; /* options in this segment */
- struct rmxp_tao *taop; /* pointer to our TAO cache entry */
- struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ struct rmxp_tao tao; /* our TAO cache entry */
int headlocked = 0;
struct sockaddr_in *next_hop = NULL;
int rstreason; /* For badport_bandlim accounting purposes */
@@ -389,6 +387,7 @@ tcp_input(m, off0)
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ bzero(&tao, sizeof(tao));
bzero((char *)&to, sizeof(to));
tcpstat.tcps_rcvtotal++;
@@ -707,11 +706,9 @@ findpcb:
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
- inc.inc6_route.ro_rt = NULL; /* XXX */
} else {
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
- inc.inc_route.ro_rt = NULL; /* XXX */
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
@@ -916,7 +913,7 @@ findpcb:
}
after_listen:
-/* XXX temp debugging */
+ /* XXX temp debugging */
/* should not happen - syncache should pick up these connections */
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
@@ -930,8 +927,9 @@ after_listen:
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
/*
- * Process options.
- * XXX this is tradtitional behavior, may need to be cleaned up.
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
if (thflags & TH_SYN) {
@@ -1179,10 +1177,8 @@ after_listen:
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
- if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
@@ -1195,7 +1191,7 @@ after_listen:
* Our new SYN, when it arrives, will serve as the
* needed ACK.
*/
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1225,7 +1221,7 @@ after_listen:
*/
if (to.to_flags & TOF_CCECHO) {
if (tp->cc_send != to.to_ccecho) {
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1246,8 +1242,8 @@ after_listen:
tp->rcv_scale = tp->request_r_scale;
}
/* Segment is acceptable, update cache if undefined. */
- if (taop->tao_ccsent == 0)
- taop->tao_ccsent = to.to_ccecho;
+ if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
@@ -1290,14 +1286,16 @@ after_listen:
tp->t_flags |= TF_ACKNOW;
callout_stop(tp->tt_rexmt);
if (to.to_flags & TOF_CC) {
- if (taop->tao_cc != 0 &&
- CC_GT(to.to_cc, taop->tao_cc)) {
+ if (tao.tao_cc != 0 &&
+ CC_GT(to.to_cc, tao.tao_cc)) {
/*
* update cache and make transition:
* SYN-SENT -> ESTABLISHED*
* SYN-SENT* -> FIN-WAIT-1*
*/
- taop->tao_cc = to.to_cc;
+ tao.tao_cc = to.to_cc;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
@@ -1313,8 +1311,12 @@ after_listen:
} else
tp->t_state = TCPS_SYN_RECEIVED;
} else {
- /* CC.NEW or no option => invalidate cache */
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ /* CC.NEW or no option => invalidate cache */
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
+ }
tp->t_state = TCPS_SYN_RECEIVED;
}
}
@@ -1682,13 +1684,14 @@ trimthenstep6:
}
/*
* Upon successful completion of 3-way handshake,
- * update cache.CC if it was undefined, pass any queued
- * data to the user, and advance state appropriately.
+ * update cache.CC, pass any queued data to the user,
+ * and advance state appropriately.
*/
- if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
- taop->tao_cc == 0)
- taop->tao_cc = tp->cc_recv;
-
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = tp->cc_recv;
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
+ tp->cc_recv, 0);
+ }
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
@@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
- * NOTE that this routine is only called when we process an incoming
- * segment, for outgoing segments only tcp_mssopt is called.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss(tp, offer)
struct tcpcb *tp;
int offer;
{
- register struct rtentry *rt;
- struct ifnet *ifp;
- register int rtt, mss;
+ int rtt, mss;
u_long bufsize;
+ u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- struct rmxp_tao *taop;
+ struct hc_metrics_lite metrics;
+ struct rmxp_tao tao;
int origoffer = offer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
+ bzero(&tao, sizeof(tao));
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
- rt = tcp_rtlookup(&inp->inp_inc);
- if (rt == NULL) {
- tp->t_maxopd = tp->t_maxseg =
- isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- return;
+ /* initialize */
+#ifdef INET6
+ if (isipv6) {
+ maxmtu = tcp_maxmtu6(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
+ } else
+#endif
+ {
+ maxmtu = tcp_maxmtu(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
}
- ifp = rt->rt_ifp;
so = inp->inp_socket;
- taop = rmx_taop(rt->rt_rmx);
/*
- * Offer == -1 means that we didn't receive SYN yet,
- * use cached value in that case;
+ * no route to sender, take default mss and return
*/
- if (offer == -1)
- offer = taop->tao_mssopt;
- /*
- * Offer == 0 means that there was no MSS on the SYN segment,
- * in this case we use tcp_mssdflt.
- */
- if (offer == 0)
- offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- else
- /*
- * Sanity check: make sure that maxopd will be large
- * enough to allow some data on segments even is the
- * all the option space is used (40bytes). Otherwise
- * funny things may happen in tcp_output.
- */
- offer = max(offer, 64);
- taop->tao_mssopt = offer;
+ if (maxmtu == 0)
+ return;
+
+ /* what have we got? */
+ switch (offer) {
+ case 0:
+ /*
+ * Offer == 0 means that there was no MSS on the SYN
+ * segment, in this case we use tcp_mssdflt.
+ */
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif
+ tcp_mssdflt;
+ break;
+
+ case -1:
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+ * use cached value in that case;
+ */
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt != 0)
+ offer = tao.tao_mssopt;
+ /* FALLTHROUGH */
+
+ default:
+ /*
+ * Sanity check: make sure that maxopd will be large
+		 * enough to allow some data on segments even if
+		 * all the option space is used (40 bytes).  Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_MSSOPT, 0, offer);
+ }
/*
- * While we're here, check if there's an initial rtt
- * or rttvar. Convert from the route-table units
- * to scaled multiples of the slow timeout timer.
+ * rmx information is now retrieved from tcp_hostcache
*/
- if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
- /*
- * XXX the lock bit for RTT indicates that the value
- * is also a minimum value; this is subject to time.
- */
- if (rt->rt_rmx.rmx_locks & RTV_RTT)
- tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
- tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
- tcpstat.tcps_usedrtt++;
- if (rt->rt_rmx.rmx_rttvar) {
- tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- tcpstat.tcps_usedrttvar++;
- } else {
- /* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
- }
- TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
- }
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
/*
- * if there's an mtu associated with the route, use it
+	 * if there's a discovered mtu in the tcp hostcache, use it
* else, use the link mtu.
*/
- if (rt->rt_rmx.rmx_mtu)
- mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ if (metrics.rmx_mtu)
+ mss = metrics.rmx_mtu - min_protoh;
else {
#ifdef INET6
- mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
- - min_protoh;
-#else
- mss = ifp->if_mtu - min_protoh;
-#endif
-#ifdef INET6
if (isipv6) {
- if (!in6_localaddr(&inp->in6p_faddr))
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in6_localaddr(&inp->in6p_faddr))
mss = min(mss, tcp_v6mssdflt);
} else
#endif
- if (!in_localaddr(inp->inp_faddr))
+ {
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in_localaddr(inp->inp_faddr))
mss = min(mss, tcp_mssdflt);
+ }
}
mss = min(mss, offer);
+
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
@@ -2749,6 +2753,7 @@ tcp_mss(tp, offer)
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
mss -= TCPOLEN_CC_APPA;
+ tp->t_maxseg = mss;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
@@ -2757,15 +2762,18 @@ tcp_mss(tp, offer)
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
+ tp->t_maxseg = mss;
+
/*
- * If there's a pipesize, change the socket buffer
- * to that size. Make the socket buffers an integral
- * number of mss units; if the mss is larger than
- * the socket buffer, decrease the mss.
+ * If there's a pipesize, change the socket buffer to that size,
+ * don't change if sb_hiwat is different than default (then it
+ * has been changed on purpose with setsockopt).
+ * Make the socket buffers an integral number of mss units;
+ * if the mss is larger than the socket buffer, decrease the mss.
*/
-#ifdef RTV_SPIPE
- if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
-#endif
+ if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ bufsize = metrics.rmx_sendpipe;
+ else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
@@ -2778,9 +2786,9 @@ tcp_mss(tp, offer)
}
tp->t_maxseg = mss;
-#ifdef RTV_RPIPE
- if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
-#endif
+ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ bufsize = metrics.rmx_recvpipe;
+ else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -2789,62 +2797,110 @@ tcp_mss(tp, offer)
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
}
+ /*
+ * While we're here, check the others too
+ */
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tcpstat.tcps_usedrtt++;
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+ * the slow start threshhold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+ if (metrics.rmx_bandwidth)
+ tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+ * not make cwnd bigger than remote receive window or our own
+ * send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so next connection doesn't
+ * overloads the path again.
+ *
+ * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
+ * We currently check only in syncache_socket for that.
*/
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(mss,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
+#endif
else
tp->snd_cwnd = mss * ss_fltsz;
-
- if (rt->rt_rmx.rmx_ssthresh) {
- /*
- * There's some sort of gateway or interface
- * buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
- * threshold to no less than 2*mss.
- */
- tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
- tcpstat.tcps_usedssthresh++;
- }
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
-tcp_mssopt(tp)
- struct tcpcb *tp;
+tcp_mssopt(inc)
+ struct in_conninfo *inc;
{
- struct rtentry *rt;
+ int mss = 0;
+ u_long maxmtu = 0;
+ u_long thcmtu = 0;
+ size_t min_protoh;
#ifdef INET6
- int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
- size_t min_protoh = isipv6 ?
- sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
- sizeof (struct tcpiphdr);
-#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
- if (isipv6)
- rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
- else
- rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
- if (rt == NULL)
- return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
+ KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
- return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
- rt->rt_ifp->if_mtu - min_protoh);
-#else
- return (rt->rt_ifp->if_mtu - min_protoh);
+ if (isipv6) {
+ mss = tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
#endif
+ {
+ mss = tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ return (mss);
}
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index a48ec4a..a8b8e53 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -125,11 +125,12 @@ tcp_output(struct tcpcb *tp)
#if 0
int maxburst = TCP_MAXBURST;
#endif
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
+ bzero(&tao, sizeof(tao));
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
@@ -232,7 +233,6 @@ again:
*/
len = (long)ulmin(so->so_snd.sb_cc, win) - off;
- taop = tcp_gettaocache(&tp->t_inpcb->inp_inc);
/*
* Lop off SYN bit if it has already been sent. However, if this
@@ -242,8 +242,10 @@ again:
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
flags &= ~TH_SYN;
off--, len++;
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
- (taop == NULL || taop->tao_ccsent == 0))
+ tao.tao_ccsent == 0)
return 0;
}
@@ -429,7 +431,7 @@ send:
opt[0] = TCPOPT_MAXSEG;
opt[1] = TCPOLEN_MAXSEG;
- mss = htons((u_short) tcp_mssopt(tp));
+ mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
@@ -872,10 +874,7 @@ send:
* Also, desired default hop limit might be changed via
* Neighbor Discovery.
*/
- ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
- tp->t_inpcb->in6p_route.ro_rt ?
- tp->t_inpcb->in6p_route.ro_rt->rt_ifp
- : NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/* TODO: IPv6 IP6TOS_ECT bit on */
#if defined(IPSEC) && !defined(FAST_IPSEC)
@@ -886,36 +885,27 @@ send:
}
#endif /*IPSEC*/
error = ip6_output(m,
- tp->t_inpcb->in6p_outputopts,
- &tp->t_inpcb->in6p_route,
+ tp->t_inpcb->in6p_outputopts, NULL,
(so->so_options & SO_DONTROUTE), NULL, NULL,
tp->t_inpcb);
} else
#endif /* INET6 */
{
- struct rtentry *rt;
ip->ip_len = m->m_pkthdr.len;
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6))
- ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
- tp->t_inpcb->in6p_route.ro_rt ?
- tp->t_inpcb->in6p_route.ro_rt->rt_ifp
- : NULL);
+ ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
#endif /* INET6 */
/*
- * See if we should do MTU discovery. We do it only if the following
- * are true:
- * 1) we have a valid route to the destination
- * 2) the MTU is not locked (if it is, then discovery has been
- * disabled)
+ * If we do path MTU discovery, then we set DF on every packet.
+ * This might not be the best thing to do according to RFC3390
+	 * Section 2.  However the tcp hostcache mitigates the problem
+ * so it affects only the first tcp connection with a host.
*/
- if (path_mtu_discovery
- && (rt = tp->t_inpcb->inp_route.ro_rt)
- && rt->rt_flags & RTF_UP
- && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
+ if (path_mtu_discovery)
ip->ip_off |= IP_DF;
- }
- error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
+
+ error = ip_output(m, tp->t_inpcb->inp_options, NULL,
(so->so_options & SO_DONTROUTE), 0, tp->t_inpcb);
}
if (error) {
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index a247138..eca5cb2 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
- (tp)->t_inpcb->in6p_route.ro_rt) \
- nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
@@ -358,8 +357,7 @@ tcp_input(m, off0)
int todrop, acked, ourfinisacked, needoutput = 0;
u_long tiwin;
struct tcpopt to; /* options in this segment */
- struct rmxp_tao *taop; /* pointer to our TAO cache entry */
- struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ struct rmxp_tao tao; /* our TAO cache entry */
int headlocked = 0;
struct sockaddr_in *next_hop = NULL;
int rstreason; /* For badport_bandlim accounting purposes */
@@ -389,6 +387,7 @@ tcp_input(m, off0)
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ bzero(&tao, sizeof(tao));
bzero((char *)&to, sizeof(to));
tcpstat.tcps_rcvtotal++;
@@ -707,11 +706,9 @@ findpcb:
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
- inc.inc6_route.ro_rt = NULL; /* XXX */
} else {
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
- inc.inc_route.ro_rt = NULL; /* XXX */
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
@@ -916,7 +913,7 @@ findpcb:
}
after_listen:
-/* XXX temp debugging */
+ /* XXX temp debugging */
/* should not happen - syncache should pick up these connections */
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
@@ -930,8 +927,9 @@ after_listen:
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
/*
- * Process options.
- * XXX this is tradtitional behavior, may need to be cleaned up.
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
if (thflags & TH_SYN) {
@@ -1179,10 +1177,8 @@ after_listen:
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
- if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
@@ -1195,7 +1191,7 @@ after_listen:
* Our new SYN, when it arrives, will serve as the
* needed ACK.
*/
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1225,7 +1221,7 @@ after_listen:
*/
if (to.to_flags & TOF_CCECHO) {
if (tp->cc_send != to.to_ccecho) {
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1246,8 +1242,8 @@ after_listen:
tp->rcv_scale = tp->request_r_scale;
}
/* Segment is acceptable, update cache if undefined. */
- if (taop->tao_ccsent == 0)
- taop->tao_ccsent = to.to_ccecho;
+ if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
@@ -1290,14 +1286,16 @@ after_listen:
tp->t_flags |= TF_ACKNOW;
callout_stop(tp->tt_rexmt);
if (to.to_flags & TOF_CC) {
- if (taop->tao_cc != 0 &&
- CC_GT(to.to_cc, taop->tao_cc)) {
+ if (tao.tao_cc != 0 &&
+ CC_GT(to.to_cc, tao.tao_cc)) {
/*
* update cache and make transition:
* SYN-SENT -> ESTABLISHED*
* SYN-SENT* -> FIN-WAIT-1*
*/
- taop->tao_cc = to.to_cc;
+ tao.tao_cc = to.to_cc;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
@@ -1313,8 +1311,12 @@ after_listen:
} else
tp->t_state = TCPS_SYN_RECEIVED;
} else {
- /* CC.NEW or no option => invalidate cache */
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ /* CC.NEW or no option => invalidate cache */
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
+ }
tp->t_state = TCPS_SYN_RECEIVED;
}
}
@@ -1682,13 +1684,14 @@ trimthenstep6:
}
/*
* Upon successful completion of 3-way handshake,
- * update cache.CC if it was undefined, pass any queued
- * data to the user, and advance state appropriately.
+ * update cache.CC, pass any queued data to the user,
+ * and advance state appropriately.
*/
- if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
- taop->tao_cc == 0)
- taop->tao_cc = tp->cc_recv;
-
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = tp->cc_recv;
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
+ tp->cc_recv, 0);
+ }
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
@@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
- * NOTE that this routine is only called when we process an incoming
- * segment, for outgoing segments only tcp_mssopt is called.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss(tp, offer)
struct tcpcb *tp;
int offer;
{
- register struct rtentry *rt;
- struct ifnet *ifp;
- register int rtt, mss;
+ int rtt, mss;
u_long bufsize;
+ u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- struct rmxp_tao *taop;
+ struct hc_metrics_lite metrics;
+ struct rmxp_tao tao;
int origoffer = offer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
+ bzero(&tao, sizeof(tao));
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
- rt = tcp_rtlookup(&inp->inp_inc);
- if (rt == NULL) {
- tp->t_maxopd = tp->t_maxseg =
- isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- return;
+ /* initialize */
+#ifdef INET6
+ if (isipv6) {
+ maxmtu = tcp_maxmtu6(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
+ } else
+#endif
+ {
+ maxmtu = tcp_maxmtu(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
}
- ifp = rt->rt_ifp;
so = inp->inp_socket;
- taop = rmx_taop(rt->rt_rmx);
/*
- * Offer == -1 means that we didn't receive SYN yet,
- * use cached value in that case;
+ * no route to sender, take default mss and return
*/
- if (offer == -1)
- offer = taop->tao_mssopt;
- /*
- * Offer == 0 means that there was no MSS on the SYN segment,
- * in this case we use tcp_mssdflt.
- */
- if (offer == 0)
- offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- else
- /*
- * Sanity check: make sure that maxopd will be large
- * enough to allow some data on segments even is the
- * all the option space is used (40bytes). Otherwise
- * funny things may happen in tcp_output.
- */
- offer = max(offer, 64);
- taop->tao_mssopt = offer;
+ if (maxmtu == 0)
+ return;
+
+ /* what have we got? */
+ switch (offer) {
+ case 0:
+ /*
+ * Offer == 0 means that there was no MSS on the SYN
+ * segment, in this case we use tcp_mssdflt.
+ */
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif
+ tcp_mssdflt;
+ break;
+
+ case -1:
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+ * use cached value in that case;
+ */
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt != 0)
+ offer = tao.tao_mssopt;
+ /* FALLTHROUGH */
+
+ default:
+ /*
+ * Sanity check: make sure that maxopd will be large
+ * enough to allow some data on segments even if
+ * all the option space is used (40bytes). Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_MSSOPT, 0, offer);
+ }
/*
- * While we're here, check if there's an initial rtt
- * or rttvar. Convert from the route-table units
- * to scaled multiples of the slow timeout timer.
+ * rmx information is now retrieved from tcp_hostcache
*/
- if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
- /*
- * XXX the lock bit for RTT indicates that the value
- * is also a minimum value; this is subject to time.
- */
- if (rt->rt_rmx.rmx_locks & RTV_RTT)
- tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
- tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
- tcpstat.tcps_usedrtt++;
- if (rt->rt_rmx.rmx_rttvar) {
- tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- tcpstat.tcps_usedrttvar++;
- } else {
- /* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
- }
- TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
- }
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
/*
- * if there's an mtu associated with the route, use it
+ * if there's a discovered mtu in the tcp hostcache, use it
* else, use the link mtu.
*/
- if (rt->rt_rmx.rmx_mtu)
- mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ if (metrics.rmx_mtu)
+ mss = metrics.rmx_mtu - min_protoh;
else {
#ifdef INET6
- mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
- - min_protoh;
-#else
- mss = ifp->if_mtu - min_protoh;
-#endif
-#ifdef INET6
if (isipv6) {
- if (!in6_localaddr(&inp->in6p_faddr))
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in6_localaddr(&inp->in6p_faddr))
mss = min(mss, tcp_v6mssdflt);
} else
#endif
- if (!in_localaddr(inp->inp_faddr))
+ {
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in_localaddr(inp->inp_faddr))
mss = min(mss, tcp_mssdflt);
+ }
}
mss = min(mss, offer);
+
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
@@ -2749,6 +2753,7 @@ tcp_mss(tp, offer)
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
mss -= TCPOLEN_CC_APPA;
+ tp->t_maxseg = mss;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
@@ -2757,15 +2762,18 @@ tcp_mss(tp, offer)
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
+ tp->t_maxseg = mss;
+
/*
- * If there's a pipesize, change the socket buffer
- * to that size. Make the socket buffers an integral
- * number of mss units; if the mss is larger than
- * the socket buffer, decrease the mss.
+ * If there's a pipesize, change the socket buffer to that size,
+ * don't change if sb_hiwat is different than default (then it
+ * has been changed on purpose with setsockopt).
+ * Make the socket buffers an integral number of mss units;
+ * if the mss is larger than the socket buffer, decrease the mss.
*/
-#ifdef RTV_SPIPE
- if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
-#endif
+ if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ bufsize = metrics.rmx_sendpipe;
+ else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
@@ -2778,9 +2786,9 @@ tcp_mss(tp, offer)
}
tp->t_maxseg = mss;
-#ifdef RTV_RPIPE
- if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
-#endif
+ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ bufsize = metrics.rmx_recvpipe;
+ else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -2789,62 +2797,110 @@ tcp_mss(tp, offer)
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
}
+ /*
+ * While we're here, check the others too
+ */
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tcpstat.tcps_usedrtt++;
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+ * the slow start threshold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+ if (metrics.rmx_bandwidth)
+ tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+ * not to make cwnd bigger than remote receive window or our own
+ * send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so next connection doesn't
+ * overload the path again.
+ *
+ * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
*/
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(mss,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
+#endif
else
tp->snd_cwnd = mss * ss_fltsz;
-
- if (rt->rt_rmx.rmx_ssthresh) {
- /*
- * There's some sort of gateway or interface
- * buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
- * threshold to no less than 2*mss.
- */
- tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
- tcpstat.tcps_usedssthresh++;
- }
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
-tcp_mssopt(tp)
- struct tcpcb *tp;
+tcp_mssopt(inc)
+ struct in_conninfo *inc;
{
- struct rtentry *rt;
+ int mss = 0;
+ u_long maxmtu = 0;
+ u_long thcmtu = 0;
+ size_t min_protoh;
#ifdef INET6
- int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
- size_t min_protoh = isipv6 ?
- sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
- sizeof (struct tcpiphdr);
-#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
- if (isipv6)
- rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
- else
- rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
- if (rt == NULL)
- return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
+ KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
- return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
- rt->rt_ifp->if_mtu - min_protoh);
-#else
- return (rt->rt_ifp->if_mtu - min_protoh);
+ if (isipv6) {
+ mss = tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
#endif
+ {
+ mss = tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ return (mss);
}
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 7ce06f6..dfd6de1 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
- tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
+ tcp_hc_init();
}
/*
@@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
- struct route *ro = 0;
- struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
- struct route_in6 *ro6 = 0;
- struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
- struct inpcb *inp;
+ struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
-#ifdef INET6
- if (isipv6)
- ro6 = &inp->in6p_route;
- else
-#endif /* INET6 */
- ro = &inp->inp_route;
- } else {
- inp = NULL;
-#ifdef INET6
- if (isipv6) {
- ro6 = &sro6;
- bzero(ro6, sizeof *ro6);
- } else
-#endif /* INET6 */
- {
- ro = &sro;
- bzero(ro, sizeof *ro);
- }
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
- ip6->ip6_hlim = in6_selecthlim(inp,
- ro6 && ro6->ro_rt ?
- ro6->ro_rt->rt_ifp :
- NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
- if (isipv6) {
- (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
- if (ro6 == &sro6 && ro6->ro_rt) {
- RTFREE(ro6->ro_rt);
- ro6->ro_rt = NULL;
- }
- } else
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
#endif /* INET6 */
- {
- (void) ip_output(m, NULL, ro, ipflags, NULL, inp);
- if (ro == &sro && ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- struct rtentry *rt;
- int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
- * 'Enough' is arbitrarily defined as the 16 samples.
- * 16 samples is enough for the srtt filter to converge
- * to within 5% of the correct value; fewer samples and
- * we could save a very bogus rtt.
- *
- * Don't update the default route's characteristics and don't
- * update anything that the user "locked".
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples is enough for the srtt filter to converge
+ * to within enough % of the correct value; fewer samples
+ * and we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
*/
- if (tp->t_rttupdated >= 16) {
- register u_long i = 0;
-#ifdef INET6
- if (isipv6) {
- struct sockaddr_in6 *sin6;
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
- if ((rt = inp->in6p_route.ro_rt) == NULL)
- goto no_valid_rt;
- sin6 = (struct sockaddr_in6 *)rt_key(rt);
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- goto no_valid_rt;
- }
- else
-#endif /* INET6 */
- if ((rt = inp->inp_route.ro_rt) == NULL ||
- ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
- == INADDR_ANY)
- goto no_valid_rt;
-
- if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
- i = tp->t_srtt *
- (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- if (rt->rt_rmx.rmx_rtt && i)
- /*
- * filter this update to half the old & half
- * the new values, converting scale.
- * See route.h and tcp_var.h for a
- * description of the scaling constants.
- */
- rt->rt_rmx.rmx_rtt =
- (rt->rt_rmx.rmx_rtt + i) / 2;
- else
- rt->rt_rmx.rmx_rtt = i;
- tcpstat.tcps_cachedrtt++;
- }
- if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
- i = tp->t_rttvar *
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- if (rt->rt_rmx.rmx_rttvar && i)
- rt->rt_rmx.rmx_rttvar =
- (rt->rt_rmx.rmx_rttvar + i) / 2;
- else
- rt->rt_rmx.rmx_rttvar = i;
- tcpstat.tcps_cachedrttvar++;
- }
+ bzero(&metrics, sizeof(metrics));
/*
- * The old comment here said:
- * update the pipelimit (ssthresh) if it has been updated
- * already or if a pipesize was specified & the threshhold
- * got below half the pipesize. I.e., wait for bad news
- * before we start updating, then update on both good
- * and bad news.
- *
- * But we want to save the ssthresh even if no pipesize is
- * specified explicitly in the route, because such
- * connections still have an implicit pipesize specified
- * by the global tcp_sendspace. In the absence of a reliable
- * way to calculate the pipesize, it will have to do.
+ * Update the ssthresh always when the conditions below
+ * are satisfied. This gives us better new start value
+ * for the congestion avoidance for new connections.
+ * ssthresh is only set if packet loss occurred on a session.
*/
- i = tp->snd_ssthresh;
- if (rt->rt_rmx.rmx_sendpipe != 0)
- dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
- else
- dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
- if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
- i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
- || dosavessthresh) {
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
- i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
- if (i < 2)
- i = 2;
- i *= (u_long)(tp->t_maxseg +
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
- if (rt->rt_rmx.rmx_ssthresh)
- rt->rt_rmx.rmx_ssthresh =
- (rt->rt_rmx.rmx_ssthresh + i) / 2;
- else
- rt->rt_rmx.rmx_ssthresh = i;
- tcpstat.tcps_cachedssthresh++;
- }
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
}
- no_valid_rt:
+
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
@@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
- else if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
- struct rtentry *rt;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
- int offered;
+ u_int maxmtu;
+ u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ bzero(&tao, sizeof(tao));
if (tp) {
+ maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
+ romtu =
#ifdef INET6
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
+ isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
- rt = tcp_rtlookup(&inp->inp_inc);
- if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tcp_maxmtu(&inp->inp_inc);
+ if (!maxmtu)
+ maxmtu = romtu;
+ else
+ maxmtu = min(maxmtu, romtu);
+ if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
- taop = rmx_taop(rt->rt_rmx);
- offered = taop->tao_mssopt;
- mss = rt->rt_rmx.rmx_mtu -
+ mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
- if (offered)
- mss = min(mss, offered);
+ if (tcp_do_rfc1644) {
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt)
+ mss = min(mss, tao.tao_mssopt);
+ }
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
-struct rtentry *
-tcp_rtlookup(inc)
+u_long
+tcp_maxmtu(inc)
struct in_conninfo *inc;
{
- struct route *ro;
- struct rtentry *rt;
-
- ro = &inc->inc_route;
- rt = ro->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (inc->inc_faddr.s_addr != INADDR_ANY) {
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
- inc->inc_faddr;
- rtalloc(ro);
- rt = ro->ro_rt;
- }
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ sro.ro_rt = NULL;
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ RTFREE(sro.ro_rt);
}
- return rt;
+ return (maxmtu);
}
#ifdef INET6
-struct rtentry *
-tcp_rtlookup6(inc)
+u_long
+tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
- struct route_in6 *ro6;
- struct rtentry *rt;
-
- ro6 = &inc->inc6_route;
- rt = ro6->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- ro6->ro_dst.sin6_family = AF_INET6;
- ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- ro6->ro_dst.sin6_addr = inc->inc6_faddr;
- rtalloc((struct route *)ro6);
- rt = ro6->ro_rt;
- }
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ sro6.ro_rt = NULL;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
- return rt;
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
}
#endif /* INET6 */
@@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp)
#endif /*IPSEC*/
/*
- * Return a pointer to the cached information about the remote host.
- * The cached information is stored in the protocol specific part of
- * the route metrics.
- */
-struct rmxp_tao *
-tcp_gettaocache(inc)
- struct in_conninfo *inc;
-{
- struct rtentry *rt;
-
-#ifdef INET6
- if (inc->inc_isipv6)
- rt = tcp_rtlookup6(inc);
- else
-#endif /* INET6 */
- rt = tcp_rtlookup(inc);
-
- /* Make sure this is a host route and is up. */
- if (rt == NULL ||
- (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
- return NULL;
-
- return rmx_taop(rt->rt_rmx);
-}
-
-/*
- * Clear all the TAO cache entries, called from tcp_init.
- *
- * XXX
- * This routine is just an empty one, because we assume that the routing
- * routing tables are initialized at the same time when TCP, so there is
- * nothing in the cache left over.
- */
-static void
-tcp_cleartaocache()
-{
-}
-
-/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
* inp is locked, and is unlocked before returning.
@@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
- ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
- inp->in6p_route.ro_rt->rt_ifp : NULL);
- error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 822ffeb..e2d96e9 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -202,29 +202,9 @@ static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
static void
syncache_free(struct syncache *sc)
{
- struct rtentry *rt;
-
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
-#ifdef INET6
- if (sc->sc_inc.inc_isipv6)
- rt = sc->sc_route6.ro_rt;
- else
-#endif
- rt = sc->sc_route.ro_rt;
- if (rt != NULL) {
- /*
- * If this is the only reference to a protocol cloned
- * route, remove it immediately.
- */
- if (rt->rt_flags & RTF_WASCLONED &&
- (sc->sc_flags & SCF_KEEPROUTE) == 0 &&
- rt->rt_refcnt == 1)
- rtrequest(RTM_DELETE, rt_key(rt),
- rt->rt_gateway, rt_mask(rt),
- rt->rt_flags, NULL);
- RTFREE(rt);
- }
+
uma_zfree(tcp_syncache.zone, sc);
}
@@ -644,8 +624,6 @@ syncache_socket(sc, lso, m)
if (oinp->in6p_outputopts)
inp->in6p_outputopts =
ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
- inp->in6p_route = sc->sc_route6;
- sc->sc_route6.ro_rt = NULL;
MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
M_SONAME, M_NOWAIT | M_ZERO);
@@ -675,8 +653,6 @@ syncache_socket(sc, lso, m)
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
- inp->inp_route = sc->sc_route;
- sc->sc_route.ro_rt = NULL;
MALLOC(sin, struct sockaddr_in *, sizeof *sin,
M_SONAME, M_NOWAIT | M_ZERO);
@@ -733,6 +709,10 @@ syncache_socket(sc, lso, m)
tp->cc_recv = sc->sc_cc_recv;
}
+ /*
+ * Set up MSS and get cached values from tcp_hostcache.
+ * This might overwrite some of the defaults we just set.
+ */
tcp_mss(tp, sc->sc_peer_mss);
/*
@@ -811,10 +791,9 @@ resetandabort:
#endif
m_freem(m); /* XXX only needed for above */
tcpstat.tcps_sc_aborted++;
- } else {
- sc->sc_flags |= SCF_KEEPROUTE;
+ } else
tcpstat.tcps_sc_completed++;
- }
+
if (sch == NULL)
syncache_free(sc);
else
@@ -849,13 +828,14 @@ syncache_add(inc, to, th, sop, m)
struct syncache *sc = NULL;
struct syncache_head *sch;
struct mbuf *ipopts = NULL;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
int i, win;
INP_INFO_WLOCK_ASSERT(&tcbinfo);
so = *sop;
tp = sototcpcb(so);
+ bzero(&tao, sizeof(tao));
/*
* Remember the IP options, if any.
@@ -949,13 +929,11 @@ syncache_add(inc, to, th, sop, m)
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
- sc->sc_route6.ro_rt = NULL;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
- sc->sc_route.ro_rt = NULL;
}
sc->sc_irs = th->th_seq;
sc->sc_flags = 0;
@@ -1027,17 +1005,19 @@ syncache_add(inc, to, th, sop, m)
* processing: drop SYN, process data and FIN.
* - otherwise do a normal 3-way handshake.
*/
- taop = tcp_gettaocache(&sc->sc_inc);
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&sc->sc_inc, &tao);
+
if ((to->to_flags & TOF_CC) != 0) {
if (((tp->t_flags & TF_NOPUSH) != 0) &&
- sc->sc_flags & SCF_CC &&
- taop != NULL && taop->tao_cc != 0 &&
- CC_GT(to->to_cc, taop->tao_cc)) {
+ sc->sc_flags & SCF_CC && tao.tao_cc != 0 &&
+ CC_GT(to->to_cc, tao.tao_cc)) {
sc->sc_rxtslot = 0;
so = syncache_socket(sc, *sop, m);
if (so != NULL) {
- sc->sc_flags |= SCF_KEEPROUTE;
- taop->tao_cc = to->to_cc;
+ tao.tao_cc = to->to_cc;
+ tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC,
+ tao.tao_cc, 0);
*sop = so;
}
syncache_free(sc);
@@ -1047,9 +1027,13 @@ syncache_add(inc, to, th, sop, m)
/*
* No CC option, but maybe CC.NEW: invalidate cached value.
*/
- if (taop != NULL)
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC,
+ tao.tao_cc, 0);
+ }
}
+
/*
* TAO test failed or there was no CC option,
* do a standard 3-way handshake.
@@ -1087,33 +1071,22 @@ syncache_respond(sc, m)
int optlen, error;
u_int16_t tlen, hlen, mssopt;
struct ip *ip = NULL;
- struct rtentry *rt;
struct tcphdr *th;
struct inpcb *inp;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
+ hlen =
#ifdef INET6
- if (sc->sc_inc.inc_isipv6) {
- rt = tcp_rtlookup6(&sc->sc_inc);
- if (rt != NULL)
- mssopt = rt->rt_ifp->if_mtu -
- (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
- else
- mssopt = tcp_v6mssdflt;
- hlen = sizeof(struct ip6_hdr);
- } else
+ (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
#endif
- {
- rt = tcp_rtlookup(&sc->sc_inc);
- if (rt != NULL)
- mssopt = rt->rt_ifp->if_mtu -
- (sizeof(struct ip) + sizeof(struct tcphdr));
- else
- mssopt = tcp_mssdflt;
- hlen = sizeof(struct ip);
- }
+ sizeof(struct ip);
+
+ KASSERT((&sc->sc_inc) != NULL, ("syncache_respond with NULL in_conninfo pointer"));
+
+	/* Determine the MSS we advertise to the other end of the connection */
+ mssopt = tcp_mssopt(&sc->sc_inc);
/* Compute the size of the TCP options. */
if (sc->sc_flags & SCF_NOOPT) {
@@ -1244,13 +1217,10 @@ syncache_respond(sc, m)
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
- struct route_in6 *ro6 = &sc->sc_route6;
-
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
- ip6->ip6_hlim = in6_selecthlim(NULL,
- ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
- error = ip6_output(m, NULL, ro6, 0, NULL, NULL, inp);
+ ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
+ error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
} else
#endif
{
@@ -1268,7 +1238,7 @@ syncache_respond(sc, m)
mtod(m, void *), th, 0);
}
#endif
- error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL,inp);
+ error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
}
INP_UNLOCK(inp);
return (error);
@@ -1435,13 +1405,11 @@ syncookie_lookup(inc, th, so)
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
- sc->sc_route6.ro_rt = NULL;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
- sc->sc_route.ro_rt = NULL;
}
sc->sc_irs = th->th_seq - 1;
sc->sc_iss = th->th_ack - 1;
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 1a253ab..1eeb66e 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -551,10 +551,8 @@ tcp_timer_rexmt(xtp)
if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
/*
- * If losing, let the lower level know and try for
- * a better route. Also, if we backed off this far,
- * our srtt estimate is probably bogus. Clobber it
- * so we'll take the next rtt measurement as our srtt;
+ * If we backed off this far, our srtt estimate is probably bogus.
+ * Clobber it so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current
* retransmit times until then.
*/
@@ -564,7 +562,6 @@ tcp_timer_rexmt(xtp)
in6_losing(tp->t_inpcb);
else
#endif
- in_losing(tp->t_inpcb);
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 7ce06f6..dfd6de1 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
- tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
+ tcp_hc_init();
}
/*
@@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
- struct route *ro = 0;
- struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
- struct route_in6 *ro6 = 0;
- struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
- struct inpcb *inp;
+ struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
-#ifdef INET6
- if (isipv6)
- ro6 = &inp->in6p_route;
- else
-#endif /* INET6 */
- ro = &inp->inp_route;
- } else {
- inp = NULL;
-#ifdef INET6
- if (isipv6) {
- ro6 = &sro6;
- bzero(ro6, sizeof *ro6);
- } else
-#endif /* INET6 */
- {
- ro = &sro;
- bzero(ro, sizeof *ro);
- }
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
- ip6->ip6_hlim = in6_selecthlim(inp,
- ro6 && ro6->ro_rt ?
- ro6->ro_rt->rt_ifp :
- NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
- if (isipv6) {
- (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
- if (ro6 == &sro6 && ro6->ro_rt) {
- RTFREE(ro6->ro_rt);
- ro6->ro_rt = NULL;
- }
- } else
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
#endif /* INET6 */
- {
- (void) ip_output(m, NULL, ro, ipflags, NULL, inp);
- if (ro == &sro && ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- struct rtentry *rt;
- int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
- * 'Enough' is arbitrarily defined as the 16 samples.
- * 16 samples is enough for the srtt filter to converge
- * to within 5% of the correct value; fewer samples and
- * we could save a very bogus rtt.
- *
- * Don't update the default route's characteristics and don't
- * update anything that the user "locked".
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples is enough for the srtt filter to converge
+	 * to within a reasonable percentage of the correct value; fewer samples
+ * and we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
*/
- if (tp->t_rttupdated >= 16) {
- register u_long i = 0;
-#ifdef INET6
- if (isipv6) {
- struct sockaddr_in6 *sin6;
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
- if ((rt = inp->in6p_route.ro_rt) == NULL)
- goto no_valid_rt;
- sin6 = (struct sockaddr_in6 *)rt_key(rt);
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- goto no_valid_rt;
- }
- else
-#endif /* INET6 */
- if ((rt = inp->inp_route.ro_rt) == NULL ||
- ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
- == INADDR_ANY)
- goto no_valid_rt;
-
- if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
- i = tp->t_srtt *
- (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- if (rt->rt_rmx.rmx_rtt && i)
- /*
- * filter this update to half the old & half
- * the new values, converting scale.
- * See route.h and tcp_var.h for a
- * description of the scaling constants.
- */
- rt->rt_rmx.rmx_rtt =
- (rt->rt_rmx.rmx_rtt + i) / 2;
- else
- rt->rt_rmx.rmx_rtt = i;
- tcpstat.tcps_cachedrtt++;
- }
- if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
- i = tp->t_rttvar *
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- if (rt->rt_rmx.rmx_rttvar && i)
- rt->rt_rmx.rmx_rttvar =
- (rt->rt_rmx.rmx_rttvar + i) / 2;
- else
- rt->rt_rmx.rmx_rttvar = i;
- tcpstat.tcps_cachedrttvar++;
- }
+ bzero(&metrics, sizeof(metrics));
/*
- * The old comment here said:
- * update the pipelimit (ssthresh) if it has been updated
- * already or if a pipesize was specified & the threshhold
- * got below half the pipesize. I.e., wait for bad news
- * before we start updating, then update on both good
- * and bad news.
- *
- * But we want to save the ssthresh even if no pipesize is
- * specified explicitly in the route, because such
- * connections still have an implicit pipesize specified
- * by the global tcp_sendspace. In the absence of a reliable
- * way to calculate the pipesize, it will have to do.
+ * Update the ssthresh always when the conditions below
+ * are satisfied. This gives us better new start value
+ * for the congestion avoidance for new connections.
	 * ssthresh is only set if packet loss occurred on a session.
*/
- i = tp->snd_ssthresh;
- if (rt->rt_rmx.rmx_sendpipe != 0)
- dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
- else
- dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
- if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
- i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
- || dosavessthresh) {
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
- i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
- if (i < 2)
- i = 2;
- i *= (u_long)(tp->t_maxseg +
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
- if (rt->rt_rmx.rmx_ssthresh)
- rt->rt_rmx.rmx_ssthresh =
- (rt->rt_rmx.rmx_ssthresh + i) / 2;
- else
- rt->rt_rmx.rmx_ssthresh = i;
- tcpstat.tcps_cachedssthresh++;
- }
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
}
- no_valid_rt:
+
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
@@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
- else if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
- struct rtentry *rt;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
- int offered;
+ u_int maxmtu;
+ u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ bzero(&tao, sizeof(tao));
if (tp) {
+ maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
+ romtu =
#ifdef INET6
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
+ isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
- rt = tcp_rtlookup(&inp->inp_inc);
- if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tcp_maxmtu(&inp->inp_inc);
+ if (!maxmtu)
+ maxmtu = romtu;
+ else
+ maxmtu = min(maxmtu, romtu);
+ if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
- taop = rmx_taop(rt->rt_rmx);
- offered = taop->tao_mssopt;
- mss = rt->rt_rmx.rmx_mtu -
+ mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
- if (offered)
- mss = min(mss, offered);
+ if (tcp_do_rfc1644) {
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt)
+ mss = min(mss, tao.tao_mssopt);
+ }
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
-struct rtentry *
-tcp_rtlookup(inc)
+u_long
+tcp_maxmtu(inc)
struct in_conninfo *inc;
{
- struct route *ro;
- struct rtentry *rt;
-
- ro = &inc->inc_route;
- rt = ro->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (inc->inc_faddr.s_addr != INADDR_ANY) {
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
- inc->inc_faddr;
- rtalloc(ro);
- rt = ro->ro_rt;
- }
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ sro.ro_rt = NULL;
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ RTFREE(sro.ro_rt);
}
- return rt;
+ return (maxmtu);
}
#ifdef INET6
-struct rtentry *
-tcp_rtlookup6(inc)
+u_long
+tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
- struct route_in6 *ro6;
- struct rtentry *rt;
-
- ro6 = &inc->inc6_route;
- rt = ro6->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- ro6->ro_dst.sin6_family = AF_INET6;
- ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- ro6->ro_dst.sin6_addr = inc->inc6_faddr;
- rtalloc((struct route *)ro6);
- rt = ro6->ro_rt;
- }
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ sro6.ro_rt = NULL;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
- return rt;
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
}
#endif /* INET6 */
@@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp)
#endif /*IPSEC*/
/*
- * Return a pointer to the cached information about the remote host.
- * The cached information is stored in the protocol specific part of
- * the route metrics.
- */
-struct rmxp_tao *
-tcp_gettaocache(inc)
- struct in_conninfo *inc;
-{
- struct rtentry *rt;
-
-#ifdef INET6
- if (inc->inc_isipv6)
- rt = tcp_rtlookup6(inc);
- else
-#endif /* INET6 */
- rt = tcp_rtlookup(inc);
-
- /* Make sure this is a host route and is up. */
- if (rt == NULL ||
- (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
- return NULL;
-
- return rmx_taop(rt->rt_rmx);
-}
-
-/*
- * Clear all the TAO cache entries, called from tcp_init.
- *
- * XXX
- * This routine is just an empty one, because we assume that the routing
- * routing tables are initialized at the same time when TCP, so there is
- * nothing in the cache left over.
- */
-static void
-tcp_cleartaocache()
-{
-}
-
-/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
* inp is locked, and is unlocked before returning.
@@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
- ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
- inp->in6p_route.ro_rt->rt_ifp : NULL);
- error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 7035227..17566c8 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -848,12 +848,13 @@ tcp_connect(tp, nam, td)
struct inpcb *inp = tp->t_inpcb, *oinp;
struct socket *so = inp->inp_socket;
struct tcptw *otw;
- struct rmxp_tao *taop;
- struct rmxp_tao tao_noncached;
+ struct rmxp_tao tao;
struct in_addr laddr;
u_short lport;
int error;
+ bzero(&tao, sizeof(tao));
+
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, (struct sockaddr *)0, td);
if (error)
@@ -902,20 +903,22 @@ tcp_connect(tp, nam, td)
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
*/
- if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
tp->cc_send = CC_INC(tcp_ccgen);
- if (taop->tao_ccsent != 0 &&
- CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
- taop->tao_ccsent = tp->cc_send;
+ if (tao.tao_ccsent != 0 &&
+ CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
+ tao.tao_ccsent = tp->cc_send;
} else {
- taop->tao_ccsent = 0;
+ tao.tao_ccsent = 0;
tp->t_flags |= TF_SENDCCNEW;
}
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
+ tao.tao_ccsent, 0);
+
return 0;
}
@@ -931,10 +934,11 @@ tcp6_connect(tp, nam, td)
struct tcptw *otw;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
struct in6_addr *addr6;
- struct rmxp_tao *taop;
- struct rmxp_tao tao_noncached;
+ struct rmxp_tao tao;
int error;
+ bzero(&tao, sizeof(tao));
+
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, td);
if (error)
@@ -991,19 +995,20 @@ tcp6_connect(tp, nam, td)
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
*/
- if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
tp->cc_send = CC_INC(tcp_ccgen);
- if (taop->tao_ccsent != 0 &&
- CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
- taop->tao_ccsent = tp->cc_send;
+ if (tao.tao_ccsent != 0 &&
+ CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
+ tao.tao_ccsent = tp->cc_send;
} else {
- taop->tao_ccsent = 0;
+ tao.tao_ccsent = 0;
tp->t_flags |= TF_SENDCCNEW;
}
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
+ tao.tao_ccsent, 0);
return 0;
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 2e5b3fa..ddcfd3c 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -213,8 +213,6 @@ struct syncache {
struct tcpcb *sc_tp; /* tcb for listening socket */
struct mbuf *sc_ipopts; /* source route */
struct in_conninfo sc_inc; /* addresses */
-#define sc_route sc_inc.inc_route
-#define sc_route6 sc_inc.inc6_route
u_int32_t sc_tsrecent;
tcp_cc sc_cc_send; /* holds CC or CCnew */
tcp_cc sc_cc_recv;
@@ -232,7 +230,6 @@ struct syncache {
#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
#define SCF_CC 0x08 /* negotiated CC */
#define SCF_UNREACH 0x10 /* icmp unreachable received */
-#define SCF_KEEPROUTE 0x20 /* keep cloned route */
TAILQ_ENTRY(syncache) sc_hash;
TAILQ_ENTRY(syncache) sc_timerq;
};
@@ -242,6 +239,17 @@ struct syncache_head {
u_int sch_length;
};
+struct hc_metrics_lite { /* must stay in sync with hc_metrics */
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_ssthresh; /* outbound gateway buffer limit */
+ u_long rmx_rtt; /* estimated round trip time */
+ u_long rmx_rttvar; /* estimated rtt variance */
+ u_long rmx_bandwidth; /* estimated bandwidth */
+ u_long rmx_cwnd; /* congestion window */
+ u_long rmx_sendpipe; /* outbound delay-bandwidth product */
+ u_long rmx_recvpipe; /* inbound delay-bandwidth product */
+};
+
struct tcptw {
struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */
tcp_seq snd_nxt;
@@ -260,8 +268,7 @@ struct tcptw {
};
/*
- * The TAO cache entry which is stored in the protocol family specific
- * portion of the route metrics.
+ * The TAO cache entry which is stored in the tcp hostcache.
*/
struct rmxp_tao {
tcp_cc tao_cc; /* latest CC in valid SYN */
@@ -274,7 +281,6 @@ struct rmxp_tao {
#define TAOF_UNDEF 0 /* we don't know yet */
#endif /* notyet */
};
-#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler)
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb)
@@ -401,6 +407,9 @@ struct tcpstat {
u_long tcps_sc_zonefail; /* zalloc() failed */
u_long tcps_sc_sendcookie; /* SYN cookie sent */
u_long tcps_sc_recvcookie; /* SYN cookie received */
+
+ u_long tcps_hc_added; /* entry added to hostcache */
+ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
};
/*
@@ -451,6 +460,7 @@ struct xtcpcb {
{ "pcblist", CTLTYPE_STRUCT }, \
{ "delacktime", CTLTYPE_INT }, \
{ "v6mssdflt", CTLTYPE_INT }, \
+ { "maxid", CTLTYPE_INT }, \
}
@@ -482,12 +492,12 @@ struct tcpcb *
tcp_drop(struct tcpcb *, int);
void tcp_drain(void);
void tcp_fasttimo(void);
-struct rmxp_tao *
- tcp_gettaocache(struct in_conninfo *);
void tcp_init(void);
void tcp_input(struct mbuf *, int);
+u_long tcp_maxmtu(struct in_conninfo *);
+u_long tcp_maxmtu6(struct in_conninfo *);
void tcp_mss(struct tcpcb *, int);
-int tcp_mssopt(struct tcpcb *);
+int tcp_mssopt(struct in_conninfo *);
struct inpcb *
tcp_drop_syn_sent(struct inpcb *, int);
struct inpcb *
@@ -500,8 +510,6 @@ struct inpcb *
void tcp_respond(struct tcpcb *, void *,
struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
int tcp_twrespond(struct tcptw *, struct socket *, struct mbuf *, int);
-struct rtentry *
- tcp_rtlookup(struct in_conninfo *);
void tcp_setpersist(struct tcpcb *);
void tcp_slowtimo(void);
struct tcptemp *
@@ -519,6 +527,20 @@ int syncache_add(struct in_conninfo *, struct tcpopt *,
struct tcphdr *, struct socket **, struct mbuf *);
void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
void syncache_badack(struct in_conninfo *);
+/*
+ * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
+ */
+void tcp_hc_init(void);
+void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
+u_long tcp_hc_getmtu(struct in_conninfo *);
+void tcp_hc_gettao(struct in_conninfo *, struct rmxp_tao *);
+void tcp_hc_updatemtu(struct in_conninfo *, u_long);
+void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
+void tcp_hc_updatetao(struct in_conninfo *, int, tcp_cc, u_short);
+/* update which tao field */
+#define TCP_HC_TAO_CC 0x1
+#define TCP_HC_TAO_CCSENT 0x2
+#define TCP_HC_TAO_MSSOPT 0x3
extern struct pr_usrreqs tcp_usrreqs;
extern u_long tcp_sendspace;
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index 60ec82b..62e6131 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -544,10 +544,17 @@ udp_ctlinput(cmd, sa, vip)
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
- if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -873,7 +880,7 @@ udp_output(inp, m, addr, control, td)
((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
udpstat.udps_opackets++;
- error = ip_output(m, inp->inp_options, &inp->inp_route, ipflags,
+ error = ip_output(m, inp->inp_options, NULL, ipflags,
inp->inp_moptions, inp);
return (error);
diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c
index 997474e..6baa2db 100644
--- a/sys/netinet6/icmp6.c
+++ b/sys/netinet6/icmp6.c
@@ -94,6 +94,7 @@
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
+#include <netinet/tcp_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
@@ -1105,8 +1106,7 @@ icmp6_mtudisc_update(ip6cp, validated)
struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */
u_int mtu = ntohl(icmp6->icmp6_mtu);
- struct rtentry *rt = NULL;
- struct sockaddr_in6 sin6;
+ struct in_conninfo inc;
#if 0
/*
@@ -1131,31 +1131,19 @@ icmp6_mtudisc_update(ip6cp, validated)
if (!validated)
return;
- bzero(&sin6, sizeof(sin6));
- sin6.sin6_family = PF_INET6;
- sin6.sin6_len = sizeof(struct sockaddr_in6);
- sin6.sin6_addr = *dst;
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 1; /* IPv6 */
+ inc.inc6_faddr = *dst;
/* XXX normally, this won't happen */
if (IN6_IS_ADDR_LINKLOCAL(dst)) {
- sin6.sin6_addr.s6_addr16[1] =
+ inc.inc6_faddr.s6_addr16[1] =
htons(m->m_pkthdr.rcvif->if_index);
}
- /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */
- rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING);
-
- if (rt && (rt->rt_flags & RTF_HOST) &&
- !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
- if (mtu < IPV6_MMTU) {
- /* xxx */
- rt->rt_rmx.rmx_locks |= RTV_MTU;
- } else if (mtu < rt->rt_ifp->if_mtu &&
- rt->rt_rmx.rmx_mtu > mtu) {
- icmp6stat.icp6s_pmtuchg++;
- rt->rt_rmx.rmx_mtu = mtu;
- }
+
+ if (mtu >= IPV6_MMTU) {
+ tcp_hc_updatemtu(&inc, mtu);
+ icmp6stat.icp6s_pmtuchg++;
}
- if (rt)
- rtfree(rt);
}
/*
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 5c7f1f2..b3d58e8 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -337,8 +337,7 @@ in6_pcbladdr(inp, nam, plocal_addr6)
* Is it the intended behavior?
*/
*plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts,
- inp->in6p_moptions,
- &inp->in6p_route,
+ inp->in6p_moptions, NULL,
&inp->in6p_laddr, &error);
if (*plocal_addr6 == 0) {
if (error == 0)
@@ -351,10 +350,6 @@ in6_pcbladdr(inp, nam, plocal_addr6)
* and exit to caller, that will do the lookup.
*/
}
-
- if (inp->in6p_route.ro_rt)
- ifp = inp->in6p_route.ro_rt->rt_ifp;
-
return (0);
}
@@ -447,8 +442,6 @@ in6_pcbdetach(inp)
ip6_freepcbopts(inp->in6p_outputopts);
ip6_freemoptions(inp->in6p_moptions);
- if (inp->in6p_route.ro_rt)
- RTFREE(inp->in6p_route.ro_rt);
/* Check and free IPv4 related resources in case of mapped addr */
if (inp->inp_options)
(void)m_free(inp->inp_options);
@@ -830,26 +823,10 @@ void
in6_losing(in6p)
struct inpcb *in6p;
{
- struct rtentry *rt;
- struct rt_addrinfo info;
-
- if ((rt = in6p->in6p_route.ro_rt) != NULL) {
- RT_LOCK(rt);
- in6p->in6p_route.ro_rt = NULL;
- bzero((caddr_t)&info, sizeof(info));
- info.rti_flags = rt->rt_flags;
- info.rti_info[RTAX_DST] = rt_key(rt);
- info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
- if (rt->rt_flags & RTF_DYNAMIC)
- rtexpunge(rt);
- RTFREE_LOCKED(rt);
- /*
- * A new route can be allocated
- * the next time output is attempted.
- */
- }
+ /*
+ * We don't store route pointers in the routing table anymore
+ */
+ return;
}
/*
@@ -861,14 +838,9 @@ in6_rtchange(inp, errno)
struct inpcb *inp;
int errno;
{
- if (inp->in6p_route.ro_rt) {
- RTFREE(inp->in6p_route.ro_rt);
- inp->in6p_route.ro_rt = 0;
- /*
- * A new route can be allocated the next time
- * output is attempted.
- */
- }
+ /*
+ * We don't store route pointers in the routing table anymore
+ */
return inp;
}
diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c
index 09526b2..b68852d 100644
--- a/sys/netinet6/in6_rmx.c
+++ b/sys/netinet6/in6_rmx.c
@@ -141,8 +141,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
}
}
- if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
- && rt->rt_ifp)
+ if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
ret = rn_addroute(v_arg, n_arg, head, treenodes);
diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c
index d584956..88ace1c 100644
--- a/sys/netinet6/in6_src.c
+++ b/sys/netinet6/in6_src.c
@@ -211,7 +211,6 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp)
!= 0) {
return (NULL);
}
-
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
@@ -449,12 +448,19 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
struct route_in6 *ro;
struct ifnet **retifp;
{
- int error, clone;
+ int error;
+ struct route_in6 sro;
struct rtentry *rt = NULL;
- clone = IN6_IS_ADDR_MULTICAST(&dstsock->sin6_addr) ? 0 : 1;
+ if (ro == NULL) {
+ bzero(&sro, sizeof(sro));
+ ro = &sro;
+ }
+
if ((error = in6_selectroute(dstsock, opts, mopts, ro, retifp,
- &rt, clone)) != 0) {
+ &rt, 0)) != 0) {
+ if (rt && rt == sro.ro_rt)
+ RTFREE(rt);
return (error);
}
@@ -476,7 +482,11 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
* We thus reject the case here.
*/
if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
- return (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+ int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+
+ if (rt && rt == sro.ro_rt)
+ RTFREE(rt);
+ return (flags);
}
/*
@@ -489,6 +499,8 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp)
*retifp = rt->rt_ifa->ifa_ifp;
+ if (rt && rt == sro.ro_rt)
+ RTFREE(rt);
return (0);
}
@@ -623,6 +635,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone)
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
+
if (clone) {
rtalloc((struct route *)ro);
} else {
@@ -695,7 +708,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone)
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
-*/
+ */
int
in6_selecthlim(in6p, ifp)
struct in6pcb *in6p;
@@ -705,8 +718,24 @@ in6_selecthlim(in6p, ifp)
return (in6p->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
- else
- return (ip6_defhlim);
+ else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
+ struct route_in6 ro6;
+ struct ifnet *lifp;
+
+ bzero(&ro6, sizeof(ro6));
+ ro6.ro_dst.sin6_family = AF_INET6;
+ ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ ro6.ro_dst.sin6_addr = in6p->in6p_faddr;
+ rtalloc((struct route *)&ro6);
+ if (ro6.ro_rt) {
+ lifp = ro6.ro_rt->rt_ifp;
+ RTFREE(ro6.ro_rt);
+ if (lifp)
+ return (ND_IFINFO(lifp)->chlim);
+ } else
+ return (ip6_defhlim);
+ }
+ return (ip6_defhlim);
}
/*
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index b95b197..3072851 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -96,6 +96,7 @@
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
@@ -661,7 +662,7 @@ skip_ipsec2:;
/* XXX rt not locked */
ia = ifatoia6(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
- ro->ro_rt->rt_use++;
+ ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway;
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
@@ -757,7 +758,7 @@ skip_ipsec2:;
}
ia = ifatoia6(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
- ro->ro_rt->rt_use++;
+ ro->ro_rt->rt_rmx.rmx_pksent++;
RT_UNLOCK(ro->ro_rt);
}
@@ -1387,11 +1388,20 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp)
}
if (ro_pmtu->ro_rt) {
u_int32_t ifmtu;
+ struct in_conninfo inc;
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 1; /* IPv6 */
+ inc.inc6_faddr = *dst;
if (ifp == NULL)
ifp = ro_pmtu->ro_rt->rt_ifp;
ifmtu = IN6_LINKMTU(ifp);
- mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
+ mtu = tcp_hc_getmtu(&inc);
+ if (mtu)
+ mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
+ else
+ mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifmtu;
else if (mtu < IPV6_MMTU) {
@@ -1415,8 +1425,7 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp)
* field isn't locked).
*/
mtu = ifmtu;
- if (!(ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU))
- ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
+ ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = IN6_LINKMTU(ifp);
@@ -1993,7 +2002,9 @@ do { \
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
- struct route_in6 *ro = (struct route_in6 *)&in6p->in6p_route;
+ struct route_in6 sro;
+
+ bzero(&sro, sizeof(sro));
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
@@ -2002,8 +2013,10 @@ do { \
* routing, or optional information to specify
* the outgoing interface.
*/
- error = ip6_getpmtu(ro, NULL, NULL,
+ error = ip6_getpmtu(&sro, NULL, NULL,
&in6p->in6p_faddr, &pmtu, NULL);
+ if (sro.ro_rt)
+ RTFREE(sro.ro_rt);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
diff --git a/sys/netinet6/udp6_output.c b/sys/netinet6/udp6_output.c
index 36a7fba..d905e84 100644
--- a/sys/netinet6/udp6_output.c
+++ b/sys/netinet6/udp6_output.c
@@ -203,8 +203,7 @@ udp6_output(in6p, m, addr6, control, td)
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
laddr = in6_selectsrc(sin6, in6p->in6p_outputopts,
- in6p->in6p_moptions,
- &in6p->in6p_route,
+ in6p->in6p_moptions, NULL,
&in6p->in6p_laddr, &error);
} else
laddr = &in6p->in6p_laddr; /* XXX */
@@ -277,9 +276,7 @@ udp6_output(in6p, m, addr6, control, td)
ip6->ip6_plen = htons((u_short)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
- ip6->ip6_hlim = in6_selecthlim(in6p,
- in6p->in6p_route.ro_rt ?
- in6p->in6p_route.ro_rt->rt_ifp : NULL);
+ ip6->ip6_hlim = in6_selecthlim(in6p, NULL);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
@@ -297,7 +294,7 @@ udp6_output(in6p, m, addr6, control, td)
goto release;
}
#endif /* IPSEC */
- error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route,
+ error = ip6_output(m, in6p->in6p_outputopts, NULL,
flags, in6p->in6p_moptions, NULL, in6p);
break;
case AF_INET:
OpenPOWER on IntegriCloud