summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--sys/conf/files1
-rw-r--r--sys/net/if_faith.c13
-rw-r--r--sys/net/if_loop.c13
-rw-r--r--sys/net/route.h10
-rw-r--r--sys/net/rtsock.c38
-rw-r--r--sys/netinet/in_pcb.c97
-rw-r--r--sys/netinet/in_pcb.h19
-rw-r--r--sys/netinet/in_rmx.c3
-rw-r--r--sys/netinet/ip_divert.c9
-rw-r--r--sys/netinet/ip_fw2.c9
-rw-r--r--sys/netinet/ip_icmp.c57
-rw-r--r--sys/netinet/ip_input.c16
-rw-r--r--sys/netinet/ip_output.c25
-rw-r--r--sys/netinet/raw_ip.c2
-rw-r--r--sys/netinet/tcp_hostcache.c728
-rw-r--r--sys/netinet/tcp_input.c344
-rw-r--r--sys/netinet/tcp_output.c42
-rw-r--r--sys/netinet/tcp_reass.c344
-rw-r--r--sys/netinet/tcp_subr.c348
-rw-r--r--sys/netinet/tcp_syncache.c100
-rw-r--r--sys/netinet/tcp_timer.c7
-rw-r--r--sys/netinet/tcp_timewait.c348
-rw-r--r--sys/netinet/tcp_usrreq.c45
-rw-r--r--sys/netinet/tcp_var.h44
-rw-r--r--sys/netinet/udp_usrreq.c17
-rw-r--r--sys/netinet6/icmp6.c32
-rw-r--r--sys/netinet6/in6_pcb.c44
-rw-r--r--sys/netinet6/in6_rmx.c3
-rw-r--r--sys/netinet6/in6_src.c45
-rw-r--r--sys/netinet6/ip6_output.c27
-rw-r--r--sys/netinet6/udp6_output.c9
31 files changed, 1686 insertions, 1153 deletions
diff --git a/sys/conf/files b/sys/conf/files
index 63c378b..8eee001 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -1457,6 +1457,7 @@ netinet/ip_mroute.c optional mrouting
netinet/ip_output.c optional inet
netinet/raw_ip.c optional inet
netinet/tcp_debug.c optional tcpdebug
+netinet/tcp_hostcache.c optional inet
netinet/tcp_input.c optional inet
netinet/tcp_output.c optional inet
netinet/tcp_subr.c optional inet
diff --git a/sys/net/if_faith.c b/sys/net/if_faith.c
index 07216b5..a8da4ad 100644
--- a/sys/net/if_faith.c
+++ b/sys/net/if_faith.c
@@ -270,17 +270,8 @@ faithrtrequest(cmd, rt, info)
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
-
- if (rt) {
- rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
- /*
- * For optimal performance, the send and receive buffers
- * should be at least twice the MTU plus a little more for
- * overhead.
- */
- rt->rt_rmx.rmx_recvpipe =
- rt->rt_rmx.rmx_sendpipe = 3 * FAITHMTU;
- }
+ if (rt)
+ rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*
diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c
index afe0a73..9a54af4 100644
--- a/sys/net/if_loop.c
+++ b/sys/net/if_loop.c
@@ -329,17 +329,8 @@ lortrequest(cmd, rt, info)
struct rt_addrinfo *info;
{
RT_LOCK_ASSERT(rt);
-
- if (rt) {
- rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */
- /*
- * For optimal performance, the send and receive buffers
- * should be at least twice the MTU plus a little more for
- * overhead.
- */
- rt->rt_rmx.rmx_recvpipe =
- rt->rt_rmx.rmx_sendpipe = 3 * LOMTU;
- }
+ if (rt)
+ rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
}
/*
diff --git a/sys/net/route.h b/sys/net/route.h
index 8fff560..34c33eb 100644
--- a/sys/net/route.h
+++ b/sys/net/route.h
@@ -58,6 +58,12 @@ struct route {
* These numbers are used by reliable protocols for determining
* retransmission behavior and are included in the routing structure.
*/
+struct rt_metrics_lite {
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_expire; /* lifetime for route, e.g. redirect */
+ u_long rmx_pksent; /* packets sent using this route */
+};
+
struct rt_metrics {
u_long rmx_locks; /* Kernel must leave these values alone */
u_long rmx_mtu; /* MTU for this path */
@@ -104,10 +110,10 @@ struct rtentry {
long rt_refcnt; /* # held references */
u_long rt_flags; /* up/down?, host/net */
struct ifnet *rt_ifp; /* the answer: interface to use */
- struct ifaddr *rt_ifa; /* the answer: interface to use */
+ struct ifaddr *rt_ifa; /* the answer: interface address to use */
struct sockaddr *rt_genmask; /* for generation of cloned routes */
caddr_t rt_llinfo; /* pointer to link level info cache */
- struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */
+ struct rt_metrics_lite rt_rmx; /* metrics used by rx'ing protocols */
struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */
int (*rt_output)(struct ifnet *, struct mbuf *, struct sockaddr *,
struct rtentry *);
diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c
index 4fba1a2..3290c0c 100644
--- a/sys/net/rtsock.c
+++ b/sys/net/rtsock.c
@@ -87,7 +87,8 @@ static int sysctl_dumpentry(struct radix_node *rn, void *vw);
static int sysctl_iflist(int af, struct walkarg *w);
static int sysctl_ifmalist(int af, struct walkarg *w);
static int route_output(struct mbuf *, struct socket *);
-static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *);
+static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics_lite *);
+static void rt_getmetrics(struct rt_metrics_lite *, struct rt_metrics *);
static void rt_dispatch(struct mbuf *, struct sockaddr *);
/*
@@ -355,9 +356,6 @@ route_output(m, so)
RT_LOCK(saved_nrt);
rt_setmetrics(rtm->rtm_inits,
&rtm->rtm_rmx, &saved_nrt->rt_rmx);
- saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
- saved_nrt->rt_rmx.rmx_locks |=
- (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
RT_REMREF(saved_nrt);
saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK];
RT_UNLOCK(saved_nrt);
@@ -428,7 +426,7 @@ route_output(m, so)
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm,
(struct walkarg *)0);
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_rmx = rt->rt_rmx;
+ rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_addrs = info.rti_addrs;
break;
@@ -478,9 +476,7 @@ route_output(m, so)
rt->rt_genmask = info.rti_info[RTAX_GENMASK];
/* FALLTHROUGH */
case RTM_LOCK:
- rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits);
- rt->rt_rmx.rmx_locks |=
- (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks);
+ /* We don't support locks anymore */
break;
}
RT_UNLOCK(rt);
@@ -542,20 +538,28 @@ flush:
}
static void
-rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics *out)
+rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics_lite *out)
{
#define metric(f, e) if (which & (f)) out->e = in->e;
- metric(RTV_RPIPE, rmx_recvpipe);
- metric(RTV_SPIPE, rmx_sendpipe);
- metric(RTV_SSTHRESH, rmx_ssthresh);
- metric(RTV_RTT, rmx_rtt);
- metric(RTV_RTTVAR, rmx_rttvar);
- metric(RTV_HOPCOUNT, rmx_hopcount);
+ /*
+ * Only these are stored in the routing entry since introduction
+ * of tcp hostcache. The rest is ignored.
+ */
metric(RTV_MTU, rmx_mtu);
metric(RTV_EXPIRE, rmx_expire);
#undef metric
}
+static void
+rt_getmetrics(struct rt_metrics_lite *in, struct rt_metrics *out)
+{
+#define metric(e) out->e = in->e;
+ bzero(out, sizeof(*out));
+ metric(rmx_mtu);
+ metric(rmx_expire);
+#undef metric
+}
+
#define ROUNDUP(a) \
((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))
@@ -948,8 +952,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw)
struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem;
rtm->rtm_flags = rt->rt_flags;
- rtm->rtm_use = rt->rt_use;
- rtm->rtm_rmx = rt->rt_rmx;
+ rtm->rtm_use = rt->rt_rmx.rmx_pksent;
+ rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx);
rtm->rtm_index = rt->rt_ifp->if_index;
rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0;
rtm->rtm_addrs = info.rti_addrs;
diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c
index 11735ec..898c0d4 100644
--- a/sys/netinet/in_pcb.c
+++ b/sys/netinet/in_pcb.c
@@ -561,7 +561,6 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
if (error)
return (error);
}
-
if (!TAILQ_EMPTY(&in_ifaddrhead)) {
/*
* If the destination address is INADDR_ANY,
@@ -579,32 +578,20 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
&in_ifaddrhead)->ia_broadaddr)->sin_addr;
}
if (laddr.s_addr == INADDR_ANY) {
- register struct route *ro;
+ struct route sro;
+ sro.ro_rt = NULL;
ia = (struct in_ifaddr *)0;
/*
- * If route is known or can be allocated now,
- * our src addr is taken from the i/f, else punt.
- * Note that we should check the address family of the cached
- * destination, in case of sharing the cache with IPv6.
+ * If route is known our src addr is taken from the i/f,
+ * else punt.
*/
- ro = &inp->inp_route;
- if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 ||
- ro->ro_dst.sa_family != AF_INET ||
- satosin(&ro->ro_dst)->sin_addr.s_addr != faddr.s_addr ||
- inp->inp_socket->so_options & SO_DONTROUTE)) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = (struct rtentry *)0;
- }
- if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/
- (ro->ro_rt == (struct rtentry *)0 ||
- ro->ro_rt->rt_ifp == (struct ifnet *)0)) {
- /* No route yet, so try to acquire one */
- bzero(&ro->ro_dst, sizeof(struct sockaddr_in));
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = faddr;
- rtalloc(ro);
+ if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) {
+ /* Find out route to destination */
+ sro.ro_dst.sa_family = AF_INET;
+ sro.ro_dst.sa_len = sizeof(struct sockaddr_in);
+ ((struct sockaddr_in *)&sro.ro_dst)->sin_addr = faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
}
/*
* If we found a route, use the address
@@ -612,8 +599,10 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td)
* unless it is the loopback (in case a route
* to our address on another net goes to loopback).
*/
- if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
- ia = ifatoia(ro->ro_rt->rt_ifa);
+ if (sro.ro_rt && !(sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK))
+ ia = ifatoia(sro.ro_rt->rt_ifa);
+ if (sro.ro_rt)
+ RTFREE(sro.ro_rt);
if (ia == 0) {
bzero(&sa, sizeof(sa));
sa.sin_addr = faddr;
@@ -706,8 +695,6 @@ in_pcbdetach(inp)
}
if (inp->inp_options)
(void)m_free(inp->inp_options);
- if (inp->inp_route.ro_rt)
- RTFREE(inp->inp_route.ro_rt);
ip_freemoptions(inp->inp_moptions);
inp->inp_vflag = 0;
INP_LOCK_DESTROY(inp);
@@ -884,62 +871,6 @@ in_pcbpurgeif0(pcbinfo, ifp)
}
/*
- * Check for alternatives when higher level complains
- * about service problems. For now, invalidate cached
- * routing information. If the route was created dynamically
- * (by a redirect), time to try a default gateway again.
- */
-void
-in_losing(inp)
- struct inpcb *inp;
-{
- register struct rtentry *rt;
- struct rt_addrinfo info;
-
- INP_LOCK_ASSERT(inp);
-
- if ((rt = inp->inp_route.ro_rt)) {
- RT_LOCK(rt);
- inp->inp_route.ro_rt = NULL;
- bzero((caddr_t)&info, sizeof(info));
- info.rti_flags = rt->rt_flags;
- info.rti_info[RTAX_DST] = rt_key(rt);
- info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
- if (rt->rt_flags & RTF_DYNAMIC)
- rtexpunge(rt);
- RTFREE_LOCKED(rt);
- /*
- * A new route can be allocated
- * the next time output is attempted.
- */
- }
-}
-
-/*
- * After a routing change, flush old routing
- * and allocate a (hopefully) better one.
- */
-struct inpcb *
-in_rtchange(inp, errno)
- register struct inpcb *inp;
- int errno;
-{
- INP_LOCK_ASSERT(inp);
-
- if (inp->inp_route.ro_rt) {
- RTFREE(inp->inp_route.ro_rt);
- inp->inp_route.ro_rt = 0;
- /*
- * A new route can be allocated the next time
- * output is attempted.
- */
- }
- return inp;
-}
-
-/*
* Lookup a PCB based on the local address and port.
*/
struct inpcb *
diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h
index 8a6717c..5e93328 100644
--- a/sys/netinet/in_pcb.h
+++ b/sys/netinet/in_pcb.h
@@ -94,31 +94,22 @@ struct in_endpoints {
/*
* XXX
- * At some point struct route should possibly change to:
- * struct rtentry *rt
- * struct in_endpoints *ie;
+ * the defines for inc_* are hacks and should be changed to direct references
*/
struct in_conninfo {
u_int8_t inc_flags;
u_int8_t inc_len;
u_int16_t inc_pad; /* XXX alignment for in_endpoints */
- /* protocol dependent part; cached route */
+ /* protocol dependent part */
struct in_endpoints inc_ie;
- union {
- /* placeholder for routing entry */
- struct route inc4_route;
- struct route_in6 inc6_route;
- } inc_dependroute;
};
#define inc_isipv6 inc_flags /* temp compatability */
#define inc_fport inc_ie.ie_fport
#define inc_lport inc_ie.ie_lport
#define inc_faddr inc_ie.ie_faddr
#define inc_laddr inc_ie.ie_laddr
-#define inc_route inc_dependroute.inc4_route
#define inc6_faddr inc_ie.ie6_faddr
#define inc6_laddr inc_ie.ie6_laddr
-#define inc6_route inc_dependroute.inc6_route
struct icmp6_filter;
@@ -157,7 +148,6 @@ struct inpcb {
#define inp_lport inp_inc.inc_lport
#define inp_faddr inp_inc.inc_faddr
#define inp_laddr inp_inc.inc_laddr
-#define inp_route inp_inc.inc_route
#define inp_ip_tos inp_depend4.inp4_ip_tos
#define inp_options inp_depend4.inp4_options
#define inp_moptions inp_depend4.inp4_moptions
@@ -182,7 +172,7 @@ struct inpcb {
#define in6p_faddr inp_inc.inc6_faddr
#define in6p_laddr inp_inc.inc6_laddr
-#define in6p_route inp_inc.inc6_route
+#define in6p_ip6_hlim inp_depend6.inp6_hlim
#define in6p_hops inp_depend6.inp6_hops /* default hop limit */
#define in6p_ip6_nxt inp_ip_p
#define in6p_flowinfo inp_flow
@@ -347,9 +337,6 @@ extern int ipport_hifirstauto;
extern int ipport_hilastauto;
void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *);
-void in_losing(struct inpcb *);
-struct inpcb *
- in_rtchange(struct inpcb *, int);
int in_pcballoc(struct socket *, struct inpcbinfo *, struct thread *);
int in_pcbbind(struct inpcb *, struct sockaddr *, struct thread *);
int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *,
diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c
index 4625030..ea11792 100644
--- a/sys/netinet/in_rmx.c
+++ b/sys/netinet/in_rmx.c
@@ -98,8 +98,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr)))
rt->rt_flags |= RTF_MULTICAST;
- if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) &&
- rt->rt_ifp)
+ if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu;
ret = rn_addroute(v_arg, n_arg, head, treenodes);
diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c
index 172021b..bd777dd 100644
--- a/sys/netinet/ip_divert.c
+++ b/sys/netinet/ip_divert.c
@@ -336,7 +336,7 @@ div_output(struct socket *so, struct mbuf *m,
ipstat.ips_rawout++; /* XXX */
error = ip_output((struct mbuf *)&divert_tag,
- inp->inp_options, &inp->inp_route,
+ inp->inp_options, NULL,
(so->so_options & SO_DONTROUTE) |
IP_ALLOWBROADCAST | IP_RAWOUTPUT,
inp->inp_moptions, NULL);
@@ -527,11 +527,8 @@ div_ctlinput(int cmd, struct sockaddr *sa, void *vip)
faddr = ((struct sockaddr_in *)sa)->sin_addr;
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
- if (PRC_IS_REDIRECT(cmd)) {
- /* flush held routes */
- in_pcbnotifyall(&divcbinfo, faddr,
- inetctlerrmap[cmd], in_rtchange);
- }
+ if (PRC_IS_REDIRECT(cmd))
+ return;
}
static int
diff --git a/sys/netinet/ip_fw2.c b/sys/netinet/ip_fw2.c
index 5d3e3da..999d064 100644
--- a/sys/netinet/ip_fw2.c
+++ b/sys/netinet/ip_fw2.c
@@ -466,10 +466,13 @@ verify_rev_path(struct in_addr src, struct ifnet *ifp)
rtalloc_ign(&ro, RTF_CLONING);
}
- if ((ro.ro_rt == NULL) || (ifp == NULL) ||
- (ro.ro_rt->rt_ifp->if_index != ifp->if_index))
+ if (ro.ro_rt == NULL)
return 0;
-
+ if ((ifp == NULL) || (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) {
+ RTFREE(ro.ro_rt);
+ return 0;
+ }
+ RTFREE(ro.ro_rt);
return 1;
}
diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c
index f94e7b9..b84d689 100644
--- a/sys/netinet/ip_icmp.c
+++ b/sys/netinet/ip_icmp.c
@@ -52,11 +52,15 @@
#include <net/route.h>
#include <netinet/in.h>
+#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
#include <netinet/icmp_var.h>
#ifdef IPSEC
@@ -395,7 +399,7 @@ icmp_input(m, off)
printf("deliver to protocol %d\n", icp->icmp_ip.ip_p);
#endif
icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
-#if 1
+
/*
* MTU discovery:
* If we got a needfrag and there is a host route to the
@@ -405,40 +409,37 @@ icmp_input(m, off)
* notice that the MTU has changed and adapt accordingly.
* If no new MTU was suggested, then we guess a new one
* less than the current value. If the new MTU is
- * unreasonably small (arbitrarily set at 296), then
- * we reset the MTU to the interface value and enable the
- * lock bit, indicating that we are no longer doing MTU
- * discovery.
+ * unreasonably small, then we don't update the MTU value.
+ *
+ * XXX: All this should be done in tcp_mtudisc() because
+ * the way we do it now, everyone can send us bogus ICMP
+ * MSGSIZE packets for any destination. By doing this far
+ * higher in the chain we have a matching tcp connection.
+ * Thus spoofing is much harder. However there is no easy
+ * non-hackish way to pass the new MTU up to tcp_mtudisc().
+ * Also see next XXX regarding IPv4 AH TCP.
*/
if (code == PRC_MSGSIZE) {
- struct rtentry *rt;
int mtu;
+ struct in_conninfo inc;
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 0; /* IPv4 */
+ inc.inc_faddr = icmpsrc.sin_addr;
+
+ mtu = ntohs(icp->icmp_nextmtu);
+ if (!mtu)
+ mtu = ip_next_mtu(mtu, 1);
+
+ if (mtu >= 256 + sizeof(struct tcpiphdr))
+ tcp_hc_updatemtu(&inc, mtu);
- rt = rtalloc1((struct sockaddr *)&icmpsrc, 0,
- RTF_CLONING);
- if (rt && (rt->rt_flags & RTF_HOST)
- && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
- mtu = ntohs(icp->icmp_nextmtu);
- if (!mtu)
- mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu,
- 1);
#ifdef DEBUG_MTUDISC
- printf("MTU for %s reduced to %d\n",
- inet_ntoa(icmpsrc.sin_addr), mtu);
+ printf("MTU for %s reduced to %d\n",
+ inet_ntoa(icmpsrc.sin_addr), mtu);
#endif
- if (mtu < 296) {
- /* rt->rt_rmx.rmx_mtu =
- rt->rt_ifp->if_mtu; */
- rt->rt_rmx.rmx_locks |= RTV_MTU;
- } else if (rt->rt_rmx.rmx_mtu > mtu) {
- rt->rt_rmx.rmx_mtu = mtu;
- }
- }
- if (rt)
- rtfree(rt);
}
-#endif
/*
* XXX if the packet contains [IPv4 AH TCP], we can't make a
* notification to TCP layer.
@@ -785,7 +786,6 @@ iptime()
return (htonl(t));
}
-#if 1
/*
* Return the next larger or smaller MTU plateau (table from RFC 1191)
* given current value MTU. If DIR is less than zero, a larger plateau
@@ -823,7 +823,6 @@ ip_next_mtu(mtu, dir)
}
}
}
-#endif
/*
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index df67d22..3d528f4 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -1612,22 +1612,22 @@ struct in_ifaddr *
ip_rtaddr(dst)
struct in_addr dst;
{
+ struct route sro;
struct sockaddr_in *sin;
struct in_ifaddr *ifa;
- struct route ro;
- bzero(&ro, sizeof(ro));
- sin = (struct sockaddr_in *)&ro.ro_dst;
+ sro.ro_rt = NULL;
+ sin = (struct sockaddr_in *)&sro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
sin->sin_addr = dst;
- rtalloc_ign(&ro, RTF_CLONING);
+ rtalloc_ign(&sro, RTF_CLONING);
- if (ro.ro_rt == 0)
+ if (sro.ro_rt == NULL)
return ((struct in_ifaddr *)0);
- ifa = ifatoia(ro.ro_rt->rt_ifa);
- RTFREE(ro.ro_rt);
+ ifa = ifatoia(sro.ro_rt->rt_ifa);
+ RTFREE(sro.ro_rt);
return ifa;
}
@@ -1879,7 +1879,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop)
struct route ro;
struct rtentry *rt;
- bzero(&ro, sizeof(ro));
+ ro.ro_rt = NULL;
sin = (struct sockaddr_in *)&ro.ro_dst;
sin->sin_family = AF_INET;
sin->sin_len = sizeof(*sin);
diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c
index cdf8b87..0a11524 100644
--- a/sys/netinet/ip_output.c
+++ b/sys/netinet/ip_output.c
@@ -302,13 +302,9 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
isbroadcast = 0; /* fool gcc */
} else {
/*
- * If this is the case, we probably don't want to allocate
- * a protocol-cloned route since we didn't get one from the
- * ULP. This lets TCP do its thing, while not burdening
- * forwarding or ICMP with the overhead of cloning a route.
- * Of course, we still want to do any cloning requested by
- * the link layer, as this is probably required in all cases
- * for correct operation (as it is for ARP).
+ * We want to do any cloning requested by the link layer,
+ * as this is probably required in all cases for correct
+ * operation (as it is for ARP).
*/
if (ro->ro_rt == 0)
rtalloc(ro);
@@ -319,7 +315,7 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro,
}
ia = ifatoia(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
- ro->ro_rt->rt_use++;
+ ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway;
if (ro->ro_rt->rt_flags & RTF_HOST)
@@ -931,16 +927,14 @@ spd_done:
ip_input((struct mbuf *)&tag);
goto done;
}
- /* Some of the logic for this was
+ /*
+ * Some of the logic for this was
* nicked from above.
- *
- * This rewrites the cached route in a local PCB.
- * Is this what we want to do?
*/
bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst));
ro_fwd->ro_rt = 0;
- rtalloc(ro_fwd);
+ rtalloc_ign(ro_fwd, RTF_CLONING);
if (ro_fwd->ro_rt == 0) {
ipstat.ips_noroute++;
@@ -950,7 +944,7 @@ spd_done:
ia = ifatoia(ro_fwd->ro_rt->rt_ifa);
ifp = ro_fwd->ro_rt->rt_ifp;
- ro_fwd->ro_rt->rt_use++;
+ ro_fwd->ro_rt->rt_rmx.rmx_pksent++;
if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in *)
ro_fwd->ro_rt->rt_gateway;
@@ -1045,7 +1039,6 @@ pass:
* routes when the MTU is changed.
*/
if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
- !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
(ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) {
ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu;
}
@@ -1983,7 +1976,7 @@ ip_setmoptions(sopt, imop)
dst->sin_len = sizeof(*dst);
dst->sin_family = AF_INET;
dst->sin_addr = mreq.imr_multiaddr;
- rtalloc(&ro);
+ rtalloc_ign(&ro, RTF_CLONING);
if (ro.ro_rt == NULL) {
error = EADDRNOTAVAIL;
splx(s);
diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c
index 632e00a..0a76a7f 100644
--- a/sys/netinet/raw_ip.c
+++ b/sys/netinet/raw_ip.c
@@ -302,7 +302,7 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst)
if (inp->inp_flags & INP_ONESBCAST)
flags |= IP_SENDONES;
- return (ip_output(m, inp->inp_options, &inp->inp_route, flags,
+ return (ip_output(m, inp->inp_options, NULL, flags,
inp->inp_moptions, inp));
}
diff --git a/sys/netinet/tcp_hostcache.c b/sys/netinet/tcp_hostcache.c
new file mode 100644
index 0000000..461ce85
--- /dev/null
+++ b/sys/netinet/tcp_hostcache.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * The tcp_hostcache moves the tcp specific cached metrics from the routing
+ * table into a dedicated structure indexed by the remote IP address. It
+ * keeps information on the measured tcp parameters of past tcp sessions
+ * to have better initial start values for following connections from the
+ * same source. Depending on the network parameters (delay, bandwidth, max
+ * MTU, congestion window) between local and remote site this can lead to
+ * significant speedups for new tcp connections after the first one.
+ *
+ * Due to this new tcp_hostcache all tcp specific metrics information in
+ * the routing table has been removed. The INPCB no longer keeps a pointer
+ * to the routing entry and protocol initiated route cloning has been
+ * removed as well. With these changes the routing table has gone back
+ * to being more lightweight and only carries information related to packet
+ * forwarding.
+ *
+ * Tcp_hostcache is designed for multiple concurrent access in SMP
+ * environments and high contention. All bucket rows have their own
+ * lock and thus multiple lookups and modifies can be done at the same
+ * time as long as they are in different bucket rows. If a request for
+ * insertion of a new record can't be satisfied it simply returns an
+ * empty structure. Nobody and nothing shall ever point directly to
+ * any entry in tcp_hostcache. All communication is done in an object
+ * oriented way and only functions of tcp_hostcache will manipulate hostcache
+ * entries. Otherwise we are unable to achieve good behaviour in concurrent
+ * access situations. Since tcp_hostcache is only caching information there
+ * are no fatal consequences if we either can't satisfy any particular request
+ * or have to drop/overwrite an existing entry because of bucket limit
+ * memory constraints.
+ */
+
+/*
+ * Many thanks to jlemon for basic structure of tcp_syncache which is being
+ * followed here.
+ */
+
+#include "opt_inet6.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/malloc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_var.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#include <netinet6/ip6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_var.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+
+#include <vm/uma.h>
+
+
+TAILQ_HEAD(hc_qhead, hc_metrics);
+
+struct hc_head {
+ struct hc_qhead hch_bucket;
+ u_int hch_length;
+ struct mtx hch_mtx;
+};
+
+struct hc_metrics {
+ /* housekeeping */
+ TAILQ_ENTRY(hc_metrics) rmx_q;
+ struct hc_head *rmx_head; /* head of bucket tail queue */
+ struct in_addr ip4; /* IP address */
+ struct in6_addr ip6; /* IP6 address */
+ /* endpoint specific values for tcp */
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_ssthresh; /* outbound gateway buffer limit */
+ u_long rmx_rtt; /* estimated round trip time */
+ u_long rmx_rttvar; /* estimated rtt variance */
+ u_long rmx_bandwidth; /* estimated bandwidth */
+ u_long rmx_cwnd; /* congestion window */
+ u_long rmx_sendpipe; /* outbound delay-bandwidth product */
+ u_long rmx_recvpipe; /* inbound delay-bandwidth product */
+ struct rmxp_tao rmx_tao; /* TAO cache for T/TCP */
+ /* tcp hostcache internal data */
+ int rmx_expire; /* lifetime for object */
+ u_long rmx_hits; /* number of hits */
+ u_long rmx_updates; /* number of updates */
+};
+
+/* Arbitrary values */
+#define TCP_HOSTCACHE_HASHSIZE 512
+#define TCP_HOSTCACHE_BUCKETLIMIT 30
+#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */
+#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */
+
+struct tcp_hostcache {
+ struct hc_head *hashbase;
+ uma_zone_t zone;
+ u_int hashsize;
+ u_int hashmask;
+ u_int bucket_limit;
+ u_int cache_count;
+ u_int cache_limit;
+ int expire;
+ int purgeall;
+};
+static struct tcp_hostcache tcp_hostcache;
+
+static struct callout tcp_hc_callout;
+
+static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *);
+static struct hc_metrics *tcp_hc_insert(struct in_conninfo *);
+static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS);
+static void tcp_hc_purge(void *);
+
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN,
+ &tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN,
+ &tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN,
+ &tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD,
+ &tcp_hostcache.cache_count, 0, "Current number of entries in hostcache");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW,
+ &tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries");
+
+SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, purge, CTLFLAG_RW,
+ &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run");
+
+SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list,
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0,
+ sysctl_tcp_hc_list, "A", "List of all hostcache entries");
+
+
+static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache");
+
+#define HOSTCACHE_HASH(ip) \
+ (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \
+ tcp_hostcache.hashmask)
+
+/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */
+#define HOSTCACHE_HASH6(ip6) \
+ (((ip6)->s6_addr32[0] ^ \
+ (ip6)->s6_addr32[1] ^ \
+ (ip6)->s6_addr32[2] ^ \
+ (ip6)->s6_addr32[3]) & \
+ tcp_hostcache.hashmask)
+
+#define THC_LOCK(lp) mtx_lock(lp)
+#define THC_UNLOCK(lp) mtx_unlock(lp)
+
+void
+tcp_hc_init(void)
+{
+ int i;
+
+ /*
+ * Initialize hostcache structures
+ */
+ tcp_hostcache.cache_count = 0;
+ tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE;
+ tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT;
+ tcp_hostcache.cache_limit =
+ tcp_hostcache.hashsize * tcp_hostcache.bucket_limit;
+ tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE;
+
+ TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize",
+ &tcp_hostcache.hashsize);
+ TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit",
+ &tcp_hostcache.cache_limit);
+ TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit",
+ &tcp_hostcache.bucket_limit);
+ if (!powerof2(tcp_hostcache.hashsize)) {
+ printf("WARNING: hostcache hash size is not a power of 2.\n");
+ tcp_hostcache.hashsize = 512; /* safe default */
+ }
+ tcp_hostcache.hashmask = tcp_hostcache.hashsize - 1;
+
+ /*
+ * Allocate the hash table
+ */
+ tcp_hostcache.hashbase = (struct hc_head *)
+ malloc(tcp_hostcache.hashsize * sizeof(struct hc_head),
+ M_HOSTCACHE, M_WAITOK | M_ZERO);
+
+ /*
+ * Initialize the hash buckets
+ */
+ for (i = 0; i < tcp_hostcache.hashsize; i++) {
+ TAILQ_INIT(&tcp_hostcache.hashbase[i].hch_bucket);
+ tcp_hostcache.hashbase[i].hch_length = 0;
+ mtx_init(&tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry",
+ NULL, MTX_DEF);
+ }
+
+ /*
+ * Allocate the hostcache entries.
+ */
+ tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcp_hostcache.zone, tcp_hostcache.cache_limit);
+
+ /*
+ * Set up periodic cache cleanup.
+ */
+ callout_init(&tcp_hc_callout, CALLOUT_MPSAFE);
+ callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0);
+}
+
+/*
+ * Internal function: lookup an entry in the hostcache or return NULL.
+ *
+ * If an entry has been returned, the caller becomes responsible for
+ * unlocking the bucket row after he is done reading/modifying the entry.
+ */
+static struct hc_metrics *
+tcp_hc_lookup(struct in_conninfo *inc)
+{
+ int hash;
+ struct hc_head *hc_head;
+ struct hc_metrics *hc_entry;
+
+ KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer"));
+
+ /*
+ * Hash the foreign ip address.
+ */
+ if (inc->inc_isipv6)
+ hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
+ else
+ hash = HOSTCACHE_HASH(&inc->inc_faddr);
+
+ hc_head = &tcp_hostcache.hashbase[hash];
+
+ /*
+	 * acquire lock for this bucket row
+ * we release the lock if we don't find an entry,
+ * otherwise the caller has to unlock after he is done
+ */
+ THC_LOCK(&hc_head->hch_mtx);
+
+ /*
+ * circle through entries in bucket row looking for a match
+ */
+ TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) {
+ if (inc->inc_isipv6) {
+ if (memcmp(&inc->inc6_faddr, &hc_entry->ip6,
+ sizeof(inc->inc6_faddr)) == 0)
+ return hc_entry;
+ } else {
+ if (memcmp(&inc->inc_faddr, &hc_entry->ip4,
+ sizeof(inc->inc_faddr)) == 0)
+ return hc_entry;
+ }
+ }
+
+ /*
+ * We were unsuccessful and didn't find anything
+ */
+ THC_UNLOCK(&hc_head->hch_mtx);
+ return NULL;
+}
+
+/*
+ * Internal function: insert an entry into the hostcache or return NULL
+ * if unable to allocate a new one.
+ *
+ * If an entry has been returned, the caller becomes responsible for
+ * unlocking the bucket row after he is done reading/modifying the entry.
+ */
+static struct hc_metrics *
+tcp_hc_insert(struct in_conninfo *inc)
+{
+	int hash;
+	struct hc_head *hc_head;
+	struct hc_metrics *hc_entry;
+
+	KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer"));
+
+	/*
+	 * Hash the foreign ip address
+	 */
+	if (inc->inc_isipv6)
+		hash = HOSTCACHE_HASH6(&inc->inc6_faddr);
+	else
+		hash = HOSTCACHE_HASH(&inc->inc_faddr);
+
+	hc_head = &tcp_hostcache.hashbase[hash];
+
+	/*
+	 * acquire lock for this bucket row
+	 * we release the lock if we don't find an entry,
+	 * otherwise the caller has to unlock after he is done
+	 */
+	THC_LOCK(&hc_head->hch_mtx);
+
+	/*
+	 * If the bucket limit is reached reuse the least used element
+	 */
+	if (hc_head->hch_length >= tcp_hostcache.bucket_limit ||
+	    tcp_hostcache.cache_count >= tcp_hostcache.cache_limit)
+		hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead);
+	else
+		hc_entry = NULL;
+	if (hc_entry != NULL) {
+		/*
+		 * Instead of dropping the least used element and
+		 * reallocating it right away, just reuse it.  We may
+		 * recycle an entry that is still "in-use" but we can
+		 * afford to be "lossy".  Unlink it and adjust the
+		 * counters here; it is re-inserted at the head below.
+		 */
+		TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q);
+		tcp_hostcache.hashbase[hash].hch_length--;
+		tcp_hostcache.cache_count--;
+		tcpstat.tcps_hc_bucketoverflow++;
+	} else {
+		/*
+		 * Allocate a new entry, or balk if not possible
+		 */
+		hc_entry = uma_zalloc(tcp_hostcache.zone, M_NOWAIT);
+		if (hc_entry == NULL) {
+			THC_UNLOCK(&hc_head->hch_mtx);
+			return NULL;
+		}
+	}
+
+	/*
+	 * Initialize basic information of hostcache entry
+	 */
+	bzero(hc_entry, sizeof(*hc_entry));
+	if (inc->inc_isipv6)
+		bcopy(&inc->inc6_faddr, &hc_entry->ip6, sizeof(hc_entry->ip6));
+	else
+		hc_entry->ip4 = inc->inc_faddr;
+	hc_entry->rmx_head = hc_head;
+	hc_entry->rmx_expire = tcp_hostcache.expire;
+
+	/*
+	 * Put it upfront
+	 */
+	TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q);
+	tcp_hostcache.hashbase[hash].hch_length++;
+	tcp_hostcache.cache_count++;
+	tcpstat.tcps_hc_added++;
+
+	return hc_entry;
+}
+
+/*
+ * External function: lookup an entry in the hostcache and fill out the
+ * supplied tcp metrics structure. Fills in null when no entry was found
+ * or a value is not set.
+ */
+void
+tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite)
+{
+ struct hc_metrics *hc_entry;
+
+ /*
+ * Find the right bucket
+ */
+ hc_entry = tcp_hc_lookup(inc);
+
+ /*
+ * If we don't have an existing object
+ */
+ if (hc_entry == NULL) {
+ bzero(hc_metrics_lite, sizeof(*hc_metrics_lite));
+ return;
+ }
+ hc_entry->rmx_hits++;
+ hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+ hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu;
+ hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh;
+ hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt;
+ hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar;
+ hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth;
+ hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd;
+ hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe;
+ hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe;
+
+ /*
+ * unlock bucket row
+ */
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: lookup an entry in the hostcache and return the
+ * discovered path mtu. Returns zero if no entry is found or the value is not set.
+ */
+u_long
+tcp_hc_getmtu(struct in_conninfo *inc)
+{
+ struct hc_metrics *hc_entry;
+ u_long mtu;
+
+ hc_entry = tcp_hc_lookup(inc);
+ if (hc_entry == NULL) {
+ return 0;
+ }
+ hc_entry->rmx_hits++;
+ hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+ mtu = hc_entry->rmx_mtu;
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+ return mtu;
+}
+
+/*
+ * External function: lookup an entry in the hostcache and fill out the
+ * supplied t/tcp tao structure. Fills in null when no entry was found
+ * or a value is not set.
+ */
+void
+tcp_hc_gettao(struct in_conninfo *inc, struct rmxp_tao *tao)
+{
+	struct hc_metrics *hc_entry;
+
+	hc_entry = tcp_hc_lookup(inc);
+	if (hc_entry == NULL) {
+		bzero(tao, sizeof(*tao));
+		return;
+	}
+	hc_entry->rmx_hits++;
+	hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+	bcopy(&hc_entry->rmx_tao, tao, sizeof(*tao));
+	THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the mtu value of an entry in the hostcache.
+ * Creates a new entry if none was found.
+ */
+void
+tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu)
+{
+ struct hc_metrics *hc_entry;
+
+ /*
+ * Find the right bucket
+ */
+ hc_entry = tcp_hc_lookup(inc);
+
+ /*
+ * If we don't have an existing object try to insert a new one
+ */
+ if (hc_entry == NULL) {
+ hc_entry = tcp_hc_insert(inc);
+ if (hc_entry == NULL)
+ return;
+ }
+ hc_entry->rmx_updates++;
+ hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+ hc_entry->rmx_mtu = mtu;
+
+ /*
+ * put it upfront so we find it faster next time
+ */
+ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+
+ /*
+ * unlock bucket row
+ */
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the tcp metrics of an entry in the hostcache.
+ * Creates a new entry if none was found.
+ */
+void
+tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml)
+{
+ struct hc_metrics *hc_entry;
+
+ hc_entry = tcp_hc_lookup(inc);
+ if (hc_entry == NULL) {
+ hc_entry = tcp_hc_insert(inc);
+ if (hc_entry == NULL)
+ return;
+ }
+ hc_entry->rmx_updates++;
+ hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+ if (hcml->rmx_rtt != 0) {
+ if (hc_entry->rmx_rtt == 0)
+ hc_entry->rmx_rtt = hcml->rmx_rtt;
+ else
+ hc_entry->rmx_rtt =
+ (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2;
+ tcpstat.tcps_cachedrtt++;
+ }
+ if (hcml->rmx_rttvar != 0) {
+ if (hc_entry->rmx_rttvar == 0)
+ hc_entry->rmx_rttvar = hcml->rmx_rttvar;
+ else
+ hc_entry->rmx_rttvar =
+ (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2;
+ tcpstat.tcps_cachedrttvar++;
+ }
+ if (hcml->rmx_ssthresh != 0) {
+ if (hc_entry->rmx_ssthresh == 0)
+ hc_entry->rmx_ssthresh = hcml->rmx_ssthresh;
+ else
+ hc_entry->rmx_ssthresh =
+ (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2;
+ tcpstat.tcps_cachedssthresh++;
+ }
+ if (hcml->rmx_bandwidth != 0) {
+ if (hc_entry->rmx_bandwidth == 0)
+ hc_entry->rmx_bandwidth = hcml->rmx_bandwidth;
+ else
+ hc_entry->rmx_bandwidth =
+ (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2;
+ /* tcpstat.tcps_cachedbandwidth++; */
+ }
+ if (hcml->rmx_cwnd != 0) {
+ if (hc_entry->rmx_cwnd == 0)
+ hc_entry->rmx_cwnd = hcml->rmx_cwnd;
+ else
+ hc_entry->rmx_cwnd =
+ (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2;
+ /* tcpstat.tcps_cachedcwnd++; */
+ }
+ if (hcml->rmx_sendpipe != 0) {
+ if (hc_entry->rmx_sendpipe == 0)
+ hc_entry->rmx_sendpipe = hcml->rmx_sendpipe;
+ else
+ hc_entry->rmx_sendpipe =
+ (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2;
+ /* tcpstat.tcps_cachedsendpipe++; */
+ }
+ if (hcml->rmx_recvpipe != 0) {
+ if (hc_entry->rmx_recvpipe == 0)
+ hc_entry->rmx_recvpipe = hcml->rmx_recvpipe;
+ else
+ hc_entry->rmx_recvpipe =
+ (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2;
+ /* tcpstat.tcps_cachedrecvpipe++; */
+ }
+
+ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * External function: update the t/tcp tao of an entry in the hostcache.
+ * Creates a new entry if none was found.
+ */
+void
+tcp_hc_updatetao(struct in_conninfo *inc, int field, tcp_cc ccount, u_short mss)
+{
+ struct hc_metrics *hc_entry;
+
+ hc_entry = tcp_hc_lookup(inc);
+ if (hc_entry == NULL) {
+ hc_entry = tcp_hc_insert(inc);
+ if (hc_entry == NULL)
+ return;
+ }
+ hc_entry->rmx_updates++;
+ hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */
+
+ switch(field) {
+ case TCP_HC_TAO_CC:
+ hc_entry->rmx_tao.tao_cc = ccount;
+ break;
+
+ case TCP_HC_TAO_CCSENT:
+ hc_entry->rmx_tao.tao_ccsent = ccount;
+ break;
+
+ case TCP_HC_TAO_MSSOPT:
+ hc_entry->rmx_tao.tao_mssopt = mss;
+ break;
+ }
+
+ TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q);
+ THC_UNLOCK(&hc_entry->rmx_head->hch_mtx);
+}
+
+/*
+ * Sysctl function: prints the list and values of all hostcache entries in
+ * unsorted order.
+ */
+static int
+sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS)
+{
+ int bufsize;
+ int linesize = 128;
+ char *p, *buf;
+ int len, i, error;
+ struct hc_metrics *hc_entry;
+
+ bufsize = linesize * (tcp_hostcache.cache_count + 1);
+
+ p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
+
+ len = snprintf(p, linesize,
+ "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH "
+ " CWND SENDPIPE RECVPIPE HITS UPD EXP\n");
+ p += len;
+
+#define msec(u) (((u) + 500) / 1000)
+ for (i = 0; i < tcp_hostcache.hashsize; i++) {
+ THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+ TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket,
+ rmx_q) {
+ len = snprintf(p, linesize,
+ "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu "
+ "%4lu %4lu %4i\n",
+ hc_entry->ip4.s_addr ? inet_ntoa(hc_entry->ip4) :
+#ifdef INET6
+ ip6_sprintf(&hc_entry->ip6),
+#else
+ "IPv6?",
+#endif
+ hc_entry->rmx_mtu,
+ hc_entry->rmx_ssthresh,
+ msec(hc_entry->rmx_rtt *
+ (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
+ msec(hc_entry->rmx_rttvar *
+ (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))),
+ hc_entry->rmx_bandwidth * hz * 8,
+ hc_entry->rmx_cwnd,
+ hc_entry->rmx_sendpipe,
+ hc_entry->rmx_recvpipe,
+ hc_entry->rmx_hits,
+ hc_entry->rmx_updates,
+ hc_entry->rmx_expire);
+ p += len;
+ }
+ THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+ }
+#undef msec
+ error = SYSCTL_OUT(req, buf, p - buf);
+ free(buf, M_TEMP);
+ return(error);
+}
+
+/*
+ * Expire and purge (old|all) entries in the tcp_hostcache. Runs periodically
+ * from the callout.
+ */
+static void
+tcp_hc_purge(void *arg)
+{
+	struct hc_metrics *hc_entry, *hc_next;
+	int all = (intptr_t)arg;
+	int i;
+
+	if (tcp_hostcache.purgeall) {
+		all = 1;
+		tcp_hostcache.purgeall = 0;
+	}
+
+	for (i = 0; i < tcp_hostcache.hashsize; i++) {
+		THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+		for (hc_entry = TAILQ_FIRST(&tcp_hostcache.hashbase[i].hch_bucket);
+		    hc_entry != NULL; hc_entry = hc_next) {
+			hc_next = TAILQ_NEXT(hc_entry, rmx_q); /* entry may be freed */
+			if (all || hc_entry->rmx_expire <= 0) {
+				TAILQ_REMOVE(&tcp_hostcache.hashbase[i].hch_bucket, hc_entry, rmx_q);
+				uma_zfree(tcp_hostcache.zone, hc_entry);
+				tcp_hostcache.hashbase[i].hch_length--;
+				tcp_hostcache.cache_count--;
+			} else
+				hc_entry->rmx_expire -= TCP_HOSTCACHE_PRUNE;
+		}
+		THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx);
+	}
+	callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0);
+}
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index a247138..eca5cb2 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
- (tp)->t_inpcb->in6p_route.ro_rt) \
- nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
@@ -358,8 +357,7 @@ tcp_input(m, off0)
int todrop, acked, ourfinisacked, needoutput = 0;
u_long tiwin;
struct tcpopt to; /* options in this segment */
- struct rmxp_tao *taop; /* pointer to our TAO cache entry */
- struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ struct rmxp_tao tao; /* our TAO cache entry */
int headlocked = 0;
struct sockaddr_in *next_hop = NULL;
int rstreason; /* For badport_bandlim accounting purposes */
@@ -389,6 +387,7 @@ tcp_input(m, off0)
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ bzero(&tao, sizeof(tao));
bzero((char *)&to, sizeof(to));
tcpstat.tcps_rcvtotal++;
@@ -707,11 +706,9 @@ findpcb:
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
- inc.inc6_route.ro_rt = NULL; /* XXX */
} else {
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
- inc.inc_route.ro_rt = NULL; /* XXX */
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
@@ -916,7 +913,7 @@ findpcb:
}
after_listen:
-/* XXX temp debugging */
+ /* XXX temp debugging */
/* should not happen - syncache should pick up these connections */
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
@@ -930,8 +927,9 @@ after_listen:
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
/*
- * Process options.
- * XXX this is tradtitional behavior, may need to be cleaned up.
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
if (thflags & TH_SYN) {
@@ -1179,10 +1177,8 @@ after_listen:
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
- if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
@@ -1195,7 +1191,7 @@ after_listen:
* Our new SYN, when it arrives, will serve as the
* needed ACK.
*/
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1225,7 +1221,7 @@ after_listen:
*/
if (to.to_flags & TOF_CCECHO) {
if (tp->cc_send != to.to_ccecho) {
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1246,8 +1242,8 @@ after_listen:
tp->rcv_scale = tp->request_r_scale;
}
/* Segment is acceptable, update cache if undefined. */
- if (taop->tao_ccsent == 0)
- taop->tao_ccsent = to.to_ccecho;
+ if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
@@ -1290,14 +1286,16 @@ after_listen:
tp->t_flags |= TF_ACKNOW;
callout_stop(tp->tt_rexmt);
if (to.to_flags & TOF_CC) {
- if (taop->tao_cc != 0 &&
- CC_GT(to.to_cc, taop->tao_cc)) {
+ if (tao.tao_cc != 0 &&
+ CC_GT(to.to_cc, tao.tao_cc)) {
/*
* update cache and make transition:
* SYN-SENT -> ESTABLISHED*
* SYN-SENT* -> FIN-WAIT-1*
*/
- taop->tao_cc = to.to_cc;
+ tao.tao_cc = to.to_cc;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
@@ -1313,8 +1311,12 @@ after_listen:
} else
tp->t_state = TCPS_SYN_RECEIVED;
} else {
- /* CC.NEW or no option => invalidate cache */
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ /* CC.NEW or no option => invalidate cache */
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
+ }
tp->t_state = TCPS_SYN_RECEIVED;
}
}
@@ -1682,13 +1684,14 @@ trimthenstep6:
}
/*
* Upon successful completion of 3-way handshake,
- * update cache.CC if it was undefined, pass any queued
- * data to the user, and advance state appropriately.
+ * update cache.CC, pass any queued data to the user,
+ * and advance state appropriately.
*/
- if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
- taop->tao_cc == 0)
- taop->tao_cc = tp->cc_recv;
-
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = tp->cc_recv;
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
+ tp->cc_recv, 0);
+ }
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
@@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
- * NOTE that this routine is only called when we process an incoming
- * segment, for outgoing segments only tcp_mssopt is called.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss(tp, offer)
struct tcpcb *tp;
int offer;
{
- register struct rtentry *rt;
- struct ifnet *ifp;
- register int rtt, mss;
+ int rtt, mss;
u_long bufsize;
+ u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- struct rmxp_tao *taop;
+ struct hc_metrics_lite metrics;
+ struct rmxp_tao tao;
int origoffer = offer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
+ bzero(&tao, sizeof(tao));
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
- rt = tcp_rtlookup(&inp->inp_inc);
- if (rt == NULL) {
- tp->t_maxopd = tp->t_maxseg =
- isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- return;
+ /* initialize */
+#ifdef INET6
+ if (isipv6) {
+ maxmtu = tcp_maxmtu6(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
+ } else
+#endif
+ {
+ maxmtu = tcp_maxmtu(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
}
- ifp = rt->rt_ifp;
so = inp->inp_socket;
- taop = rmx_taop(rt->rt_rmx);
/*
- * Offer == -1 means that we didn't receive SYN yet,
- * use cached value in that case;
+ * no route to sender, take default mss and return
*/
- if (offer == -1)
- offer = taop->tao_mssopt;
- /*
- * Offer == 0 means that there was no MSS on the SYN segment,
- * in this case we use tcp_mssdflt.
- */
- if (offer == 0)
- offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- else
- /*
- * Sanity check: make sure that maxopd will be large
- * enough to allow some data on segments even is the
- * all the option space is used (40bytes). Otherwise
- * funny things may happen in tcp_output.
- */
- offer = max(offer, 64);
- taop->tao_mssopt = offer;
+ if (maxmtu == 0)
+ return;
+
+ /* what have we got? */
+ switch (offer) {
+ case 0:
+ /*
+ * Offer == 0 means that there was no MSS on the SYN
+ * segment, in this case we use tcp_mssdflt.
+ */
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif
+ tcp_mssdflt;
+ break;
+
+ case -1:
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+ * use cached value in that case;
+ */
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt != 0)
+ offer = tao.tao_mssopt;
+ /* FALLTHROUGH */
+
+ default:
+ /*
+ * Sanity check: make sure that maxopd will be large
+		 * enough to allow some data on segments even if all
+		 * the option space is used (40 bytes). Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_MSSOPT, 0, offer);
+ }
/*
- * While we're here, check if there's an initial rtt
- * or rttvar. Convert from the route-table units
- * to scaled multiples of the slow timeout timer.
+ * rmx information is now retrieved from tcp_hostcache
*/
- if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
- /*
- * XXX the lock bit for RTT indicates that the value
- * is also a minimum value; this is subject to time.
- */
- if (rt->rt_rmx.rmx_locks & RTV_RTT)
- tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
- tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
- tcpstat.tcps_usedrtt++;
- if (rt->rt_rmx.rmx_rttvar) {
- tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- tcpstat.tcps_usedrttvar++;
- } else {
- /* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
- }
- TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
- }
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
/*
- * if there's an mtu associated with the route, use it
+	 * if there's a discovered mtu in the tcp hostcache, use it
* else, use the link mtu.
*/
- if (rt->rt_rmx.rmx_mtu)
- mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ if (metrics.rmx_mtu)
+ mss = metrics.rmx_mtu - min_protoh;
else {
#ifdef INET6
- mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
- - min_protoh;
-#else
- mss = ifp->if_mtu - min_protoh;
-#endif
-#ifdef INET6
if (isipv6) {
- if (!in6_localaddr(&inp->in6p_faddr))
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in6_localaddr(&inp->in6p_faddr))
mss = min(mss, tcp_v6mssdflt);
} else
#endif
- if (!in_localaddr(inp->inp_faddr))
+ {
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in_localaddr(inp->inp_faddr))
mss = min(mss, tcp_mssdflt);
+ }
}
mss = min(mss, offer);
+
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
@@ -2749,6 +2753,7 @@ tcp_mss(tp, offer)
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
mss -= TCPOLEN_CC_APPA;
+ tp->t_maxseg = mss;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
@@ -2757,15 +2762,18 @@ tcp_mss(tp, offer)
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
+ tp->t_maxseg = mss;
+
/*
- * If there's a pipesize, change the socket buffer
- * to that size. Make the socket buffers an integral
- * number of mss units; if the mss is larger than
- * the socket buffer, decrease the mss.
+ * If there's a pipesize, change the socket buffer to that size,
+ * don't change if sb_hiwat is different than default (then it
+ * has been changed on purpose with setsockopt).
+ * Make the socket buffers an integral number of mss units;
+ * if the mss is larger than the socket buffer, decrease the mss.
*/
-#ifdef RTV_SPIPE
- if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
-#endif
+ if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ bufsize = metrics.rmx_sendpipe;
+ else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
@@ -2778,9 +2786,9 @@ tcp_mss(tp, offer)
}
tp->t_maxseg = mss;
-#ifdef RTV_RPIPE
- if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
-#endif
+ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ bufsize = metrics.rmx_recvpipe;
+ else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -2789,62 +2797,110 @@ tcp_mss(tp, offer)
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
}
+ /*
+ * While we're here, check the others too
+ */
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tcpstat.tcps_usedrtt++;
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+ * the slow start threshhold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+ if (metrics.rmx_bandwidth)
+ tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+	 * not to make cwnd bigger than the remote receive window or our own
+ * send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so next connection doesn't
+	 * overload the path again.
+ *
+	 * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
*/
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(mss,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
+#endif
else
tp->snd_cwnd = mss * ss_fltsz;
-
- if (rt->rt_rmx.rmx_ssthresh) {
- /*
- * There's some sort of gateway or interface
- * buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
- * threshold to no less than 2*mss.
- */
- tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
- tcpstat.tcps_usedssthresh++;
- }
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
-tcp_mssopt(tp)
- struct tcpcb *tp;
+tcp_mssopt(inc)
+ struct in_conninfo *inc;
{
- struct rtentry *rt;
+ int mss = 0;
+ u_long maxmtu = 0;
+ u_long thcmtu = 0;
+ size_t min_protoh;
#ifdef INET6
- int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
- size_t min_protoh = isipv6 ?
- sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
- sizeof (struct tcpiphdr);
-#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
- if (isipv6)
- rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
- else
- rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
- if (rt == NULL)
- return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
+ KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
- return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
- rt->rt_ifp->if_mtu - min_protoh);
-#else
- return (rt->rt_ifp->if_mtu - min_protoh);
+ if (isipv6) {
+ mss = tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
#endif
+ {
+ mss = tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ return (mss);
}
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index a48ec4a..a8b8e53 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -125,11 +125,12 @@ tcp_output(struct tcpcb *tp)
#if 0
int maxburst = TCP_MAXBURST;
#endif
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
int isipv6;
+ bzero(&tao, sizeof(tao));
isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif
@@ -232,7 +233,6 @@ again:
*/
len = (long)ulmin(so->so_snd.sb_cc, win) - off;
- taop = tcp_gettaocache(&tp->t_inpcb->inp_inc);
/*
* Lop off SYN bit if it has already been sent. However, if this
@@ -242,8 +242,10 @@ again:
if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
flags &= ~TH_SYN;
off--, len++;
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao);
if (len > 0 && tp->t_state == TCPS_SYN_SENT &&
- (taop == NULL || taop->tao_ccsent == 0))
+ tao.tao_ccsent == 0)
return 0;
}
@@ -429,7 +431,7 @@ send:
opt[0] = TCPOPT_MAXSEG;
opt[1] = TCPOLEN_MAXSEG;
- mss = htons((u_short) tcp_mssopt(tp));
+ mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc));
(void)memcpy(opt + 2, &mss, sizeof(mss));
optlen = TCPOLEN_MAXSEG;
@@ -872,10 +874,7 @@ send:
* Also, desired default hop limit might be changed via
* Neighbor Discovery.
*/
- ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb,
- tp->t_inpcb->in6p_route.ro_rt ?
- tp->t_inpcb->in6p_route.ro_rt->rt_ifp
- : NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
/* TODO: IPv6 IP6TOS_ECT bit on */
#if defined(IPSEC) && !defined(FAST_IPSEC)
@@ -886,36 +885,27 @@ send:
}
#endif /*IPSEC*/
error = ip6_output(m,
- tp->t_inpcb->in6p_outputopts,
- &tp->t_inpcb->in6p_route,
+ tp->t_inpcb->in6p_outputopts, NULL,
(so->so_options & SO_DONTROUTE), NULL, NULL,
tp->t_inpcb);
} else
#endif /* INET6 */
{
- struct rtentry *rt;
ip->ip_len = m->m_pkthdr.len;
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6))
- ip->ip_ttl = in6_selecthlim(tp->t_inpcb,
- tp->t_inpcb->in6p_route.ro_rt ?
- tp->t_inpcb->in6p_route.ro_rt->rt_ifp
- : NULL);
+ ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL);
#endif /* INET6 */
/*
- * See if we should do MTU discovery. We do it only if the following
- * are true:
- * 1) we have a valid route to the destination
- * 2) the MTU is not locked (if it is, then discovery has been
- * disabled)
+ * If we do path MTU discovery, then we set DF on every packet.
+ * This might not be the best thing to do according to RFC3390
+	 * Section 2. However the tcp hostcache mitigates the problem
+ * so it affects only the first tcp connection with a host.
*/
- if (path_mtu_discovery
- && (rt = tp->t_inpcb->inp_route.ro_rt)
- && rt->rt_flags & RTF_UP
- && !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
+ if (path_mtu_discovery)
ip->ip_off |= IP_DF;
- }
- error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route,
+
+ error = ip_output(m, tp->t_inpcb->inp_options, NULL,
(so->so_options & SO_DONTROUTE), 0, tp->t_inpcb);
}
if (error) {
diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c
index a247138..eca5cb2 100644
--- a/sys/netinet/tcp_reass.c
+++ b/sys/netinet/tcp_reass.c
@@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *,
#define ND6_HINT(tp) \
do { \
if ((tp) && (tp)->t_inpcb && \
- ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \
- (tp)->t_inpcb->in6p_route.ro_rt) \
- nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \
+ ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \
+ nd6_nud_hint(NULL, NULL, 0); \
} while (0)
#else
#define ND6_HINT(tp)
@@ -358,8 +357,7 @@ tcp_input(m, off0)
int todrop, acked, ourfinisacked, needoutput = 0;
u_long tiwin;
struct tcpopt to; /* options in this segment */
- struct rmxp_tao *taop; /* pointer to our TAO cache entry */
- struct rmxp_tao tao_noncached; /* in case there's no cached entry */
+ struct rmxp_tao tao; /* our TAO cache entry */
int headlocked = 0;
struct sockaddr_in *next_hop = NULL;
int rstreason; /* For badport_bandlim accounting purposes */
@@ -389,6 +387,7 @@ tcp_input(m, off0)
#ifdef INET6
isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
#endif
+ bzero(&tao, sizeof(tao));
bzero((char *)&to, sizeof(to));
tcpstat.tcps_rcvtotal++;
@@ -707,11 +706,9 @@ findpcb:
if (isipv6) {
inc.inc6_faddr = ip6->ip6_src;
inc.inc6_laddr = ip6->ip6_dst;
- inc.inc6_route.ro_rt = NULL; /* XXX */
} else {
inc.inc_faddr = ip->ip_src;
inc.inc_laddr = ip->ip_dst;
- inc.inc_route.ro_rt = NULL; /* XXX */
}
inc.inc_fport = th->th_sport;
inc.inc_lport = th->th_dport;
@@ -916,7 +913,7 @@ findpcb:
}
after_listen:
-/* XXX temp debugging */
+ /* XXX temp debugging */
/* should not happen - syncache should pick up these connections */
if (tp->t_state == TCPS_LISTEN)
panic("tcp_input: TCPS_LISTEN");
@@ -930,8 +927,9 @@ after_listen:
callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp);
/*
- * Process options.
- * XXX this is tradtitional behavior, may need to be cleaned up.
+ * Process options only when we get SYN/ACK back. The SYN case
+ * for incoming connections is handled in tcp_syncache.
+ * XXX this is traditional behavior, may need to be cleaned up.
*/
tcp_dooptions(&to, optp, optlen, thflags & TH_SYN);
if (thflags & TH_SYN) {
@@ -1179,10 +1177,8 @@ after_listen:
* continue processing rest of data/controls, beginning with URG
*/
case TCPS_SYN_SENT:
- if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
if ((thflags & TH_ACK) &&
(SEQ_LEQ(th->th_ack, tp->iss) ||
@@ -1195,7 +1191,7 @@ after_listen:
* Our new SYN, when it arrives, will serve as the
* needed ACK.
*/
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1225,7 +1221,7 @@ after_listen:
*/
if (to.to_flags & TOF_CCECHO) {
if (tp->cc_send != to.to_ccecho) {
- if (taop->tao_ccsent != 0)
+ if (tao.tao_ccsent != 0)
goto drop;
else {
rstreason = BANDLIM_UNLIMITED;
@@ -1246,8 +1242,8 @@ after_listen:
tp->rcv_scale = tp->request_r_scale;
}
/* Segment is acceptable, update cache if undefined. */
- if (taop->tao_ccsent == 0)
- taop->tao_ccsent = to.to_ccecho;
+ if (tao.tao_ccsent == 0 && tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0);
tp->rcv_adv += tp->rcv_wnd;
tp->snd_una++; /* SYN is acked */
@@ -1290,14 +1286,16 @@ after_listen:
tp->t_flags |= TF_ACKNOW;
callout_stop(tp->tt_rexmt);
if (to.to_flags & TOF_CC) {
- if (taop->tao_cc != 0 &&
- CC_GT(to.to_cc, taop->tao_cc)) {
+ if (tao.tao_cc != 0 &&
+ CC_GT(to.to_cc, tao.tao_cc)) {
/*
* update cache and make transition:
* SYN-SENT -> ESTABLISHED*
* SYN-SENT* -> FIN-WAIT-1*
*/
- taop->tao_cc = to.to_cc;
+ tao.tao_cc = to.to_cc;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
tp->t_starttime = ticks;
if (tp->t_flags & TF_NEEDFIN) {
tp->t_state = TCPS_FIN_WAIT_1;
@@ -1313,8 +1311,12 @@ after_listen:
} else
tp->t_state = TCPS_SYN_RECEIVED;
} else {
- /* CC.NEW or no option => invalidate cache */
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ /* CC.NEW or no option => invalidate cache */
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_CC, to.to_cc, 0);
+ }
tp->t_state = TCPS_SYN_RECEIVED;
}
}
@@ -1682,13 +1684,14 @@ trimthenstep6:
}
/*
* Upon successful completion of 3-way handshake,
- * update cache.CC if it was undefined, pass any queued
- * data to the user, and advance state appropriately.
+ * update cache.CC, pass any queued data to the user,
+ * and advance state appropriately.
*/
- if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL &&
- taop->tao_cc == 0)
- taop->tao_cc = tp->cc_recv;
-
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = tp->cc_recv;
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC,
+ tp->cc_recv, 0);
+ }
/*
* Make transitions:
* SYN-RECEIVED -> ESTABLISHED
@@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt)
* are present. Store the upper limit of the length of options plus
* data in maxopd.
*
- * NOTE that this routine is only called when we process an incoming
- * segment, for outgoing segments only tcp_mssopt is called.
*
* In case of T/TCP, we call this routine during implicit connection
* setup as well (offer = -1), to initialize maxseg from the cached
* MSS of our peer.
+ *
+ * NOTE that this routine is only called when we process an incoming
+ * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt().
*/
void
tcp_mss(tp, offer)
struct tcpcb *tp;
int offer;
{
- register struct rtentry *rt;
- struct ifnet *ifp;
- register int rtt, mss;
+ int rtt, mss;
u_long bufsize;
+ u_long maxmtu;
struct inpcb *inp = tp->t_inpcb;
struct socket *so;
- struct rmxp_tao *taop;
+ struct hc_metrics_lite metrics;
+ struct rmxp_tao tao;
int origoffer = offer;
#ifdef INET6
int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
@@ -2637,96 +2641,96 @@ tcp_mss(tp, offer)
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
sizeof (struct tcpiphdr);
#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ const size_t min_protoh = sizeof(struct tcpiphdr);
#endif
+ bzero(&tao, sizeof(tao));
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
- rt = tcp_rtlookup(&inp->inp_inc);
- if (rt == NULL) {
- tp->t_maxopd = tp->t_maxseg =
- isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- return;
+ /* initialize */
+#ifdef INET6
+ if (isipv6) {
+ maxmtu = tcp_maxmtu6(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt;
+ } else
+#endif
+ {
+ maxmtu = tcp_maxmtu(&inp->inp_inc);
+ tp->t_maxopd = tp->t_maxseg = tcp_mssdflt;
}
- ifp = rt->rt_ifp;
so = inp->inp_socket;
- taop = rmx_taop(rt->rt_rmx);
/*
- * Offer == -1 means that we didn't receive SYN yet,
- * use cached value in that case;
+ * no route to sender, take default mss and return
*/
- if (offer == -1)
- offer = taop->tao_mssopt;
- /*
- * Offer == 0 means that there was no MSS on the SYN segment,
- * in this case we use tcp_mssdflt.
- */
- if (offer == 0)
- offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
- else
- /*
- * Sanity check: make sure that maxopd will be large
- * enough to allow some data on segments even is the
- * all the option space is used (40bytes). Otherwise
- * funny things may happen in tcp_output.
- */
- offer = max(offer, 64);
- taop->tao_mssopt = offer;
+ if (maxmtu == 0)
+ return;
+
+ /* what have we got? */
+ switch (offer) {
+ case 0:
+ /*
+ * Offer == 0 means that there was no MSS on the SYN
+ * segment, in this case we use tcp_mssdflt.
+ */
+ offer =
+#ifdef INET6
+ isipv6 ? tcp_v6mssdflt :
+#endif
+ tcp_mssdflt;
+ break;
+
+ case -1:
+ /*
+ * Offer == -1 means that we didn't receive SYN yet,
+ * use cached value in that case;
+ */
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt != 0)
+ offer = tao.tao_mssopt;
+ /* FALLTHROUGH */
+
+ default:
+ /*
+ * Sanity check: make sure that maxopd will be large
+		 * enough to allow some data on segments even if all
+		 * the option space is used (40 bytes). Otherwise
+ * funny things may happen in tcp_output.
+ */
+ offer = max(offer, 64);
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc,
+ TCP_HC_TAO_MSSOPT, 0, offer);
+ }
/*
- * While we're here, check if there's an initial rtt
- * or rttvar. Convert from the route-table units
- * to scaled multiples of the slow timeout timer.
+ * rmx information is now retrieved from tcp_hostcache
*/
- if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) {
- /*
- * XXX the lock bit for RTT indicates that the value
- * is also a minimum value; this is subject to time.
- */
- if (rt->rt_rmx.rmx_locks & RTV_RTT)
- tp->t_rttmin = rtt / (RTM_RTTUNIT / hz);
- tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
- tcpstat.tcps_usedrtt++;
- if (rt->rt_rmx.rmx_rttvar) {
- tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- tcpstat.tcps_usedrttvar++;
- } else {
- /* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
- }
- TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
- }
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
/*
- * if there's an mtu associated with the route, use it
+	 * if there's a discovered MTU in the tcp hostcache, use it
* else, use the link mtu.
*/
- if (rt->rt_rmx.rmx_mtu)
- mss = rt->rt_rmx.rmx_mtu - min_protoh;
+ if (metrics.rmx_mtu)
+ mss = metrics.rmx_mtu - min_protoh;
else {
#ifdef INET6
- mss = (isipv6 ? IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu)
- - min_protoh;
-#else
- mss = ifp->if_mtu - min_protoh;
-#endif
-#ifdef INET6
if (isipv6) {
- if (!in6_localaddr(&inp->in6p_faddr))
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in6_localaddr(&inp->in6p_faddr))
mss = min(mss, tcp_v6mssdflt);
} else
#endif
- if (!in_localaddr(inp->inp_faddr))
+ {
+ mss = maxmtu - min_protoh;
+ if (!path_mtu_discovery &&
+ !in_localaddr(inp->inp_faddr))
mss = min(mss, tcp_mssdflt);
+ }
}
mss = min(mss, offer);
+
/*
* maxopd stores the maximum length of data AND options
* in a segment; maxseg is the amount of data in a normal
@@ -2749,6 +2753,7 @@ tcp_mss(tp, offer)
(origoffer == -1 ||
(tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC))
mss -= TCPOLEN_CC_APPA;
+ tp->t_maxseg = mss;
#if (MCLBYTES & (MCLBYTES - 1)) == 0
if (mss > MCLBYTES)
@@ -2757,15 +2762,18 @@ tcp_mss(tp, offer)
if (mss > MCLBYTES)
mss = mss / MCLBYTES * MCLBYTES;
#endif
+ tp->t_maxseg = mss;
+
/*
- * If there's a pipesize, change the socket buffer
- * to that size. Make the socket buffers an integral
- * number of mss units; if the mss is larger than
- * the socket buffer, decrease the mss.
+ * If there's a pipesize, change the socket buffer to that size,
+	 * don't change if sb_hiwat is different from the default (then it
+ * has been changed on purpose with setsockopt).
+ * Make the socket buffers an integral number of mss units;
+ * if the mss is larger than the socket buffer, decrease the mss.
*/
-#ifdef RTV_SPIPE
- if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0)
-#endif
+ if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe)
+ bufsize = metrics.rmx_sendpipe;
+ else
bufsize = so->so_snd.sb_hiwat;
if (bufsize < mss)
mss = bufsize;
@@ -2778,9 +2786,9 @@ tcp_mss(tp, offer)
}
tp->t_maxseg = mss;
-#ifdef RTV_RPIPE
- if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0)
-#endif
+ if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe)
+ bufsize = metrics.rmx_recvpipe;
+ else
bufsize = so->so_rcv.sb_hiwat;
if (bufsize > mss) {
bufsize = roundup(bufsize, mss);
@@ -2789,62 +2797,110 @@ tcp_mss(tp, offer)
if (bufsize > so->so_rcv.sb_hiwat)
(void)sbreserve(&so->so_rcv, bufsize, so, NULL);
}
+ /*
+ * While we're here, check the others too
+ */
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ tcpstat.tcps_usedrtt++;
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ tcpstat.tcps_usedrttvar++;
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+		 * the slow start threshold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
+ tcpstat.tcps_usedssthresh++;
+ }
+ if (metrics.rmx_bandwidth)
+ tp->snd_bandwidth = metrics.rmx_bandwidth;
/*
* Set the slow-start flight size depending on whether this
* is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+	 * not to make cwnd bigger than the remote receive window or our own
+ * send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so next connection doesn't
+	 * overload the path again.
+ *
+	 * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
*/
+#define TCP_METRICS_CWND
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(mss,
+ min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
if (tcp_do_rfc3390)
tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
+#ifdef INET6
else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
(!isipv6 && in_localaddr(inp->inp_faddr)))
tp->snd_cwnd = mss * ss_fltsz_local;
+#endif
else
tp->snd_cwnd = mss * ss_fltsz;
-
- if (rt->rt_rmx.rmx_ssthresh) {
- /*
- * There's some sort of gateway or interface
- * buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
- * threshold to no less than 2*mss.
- */
- tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh);
- tcpstat.tcps_usedssthresh++;
- }
}
/*
* Determine the MSS option to send on an outgoing SYN.
*/
int
-tcp_mssopt(tp)
- struct tcpcb *tp;
+tcp_mssopt(inc)
+ struct in_conninfo *inc;
{
- struct rtentry *rt;
+ int mss = 0;
+ u_long maxmtu = 0;
+ u_long thcmtu = 0;
+ size_t min_protoh;
#ifdef INET6
- int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
- size_t min_protoh = isipv6 ?
- sizeof (struct ip6_hdr) + sizeof (struct tcphdr) :
- sizeof (struct tcpiphdr);
-#else
- const int isipv6 = 0;
- const size_t min_protoh = sizeof (struct tcpiphdr);
+ int isipv6 = inc->inc_isipv6 ? 1 : 0;
#endif
- if (isipv6)
- rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc);
- else
- rt = tcp_rtlookup(&tp->t_inpcb->inp_inc);
- if (rt == NULL)
- return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt);
+ KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer"));
#ifdef INET6
- return (isipv6 ? IN6_LINKMTU(rt->rt_ifp) :
- rt->rt_ifp->if_mtu - min_protoh);
-#else
- return (rt->rt_ifp->if_mtu - min_protoh);
+ if (isipv6) {
+ mss = tcp_v6mssdflt;
+ maxmtu = tcp_maxmtu6(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ } else
#endif
+ {
+ mss = tcp_mssdflt;
+ maxmtu = tcp_maxmtu(inc);
+ thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */
+ min_protoh = sizeof(struct tcpiphdr);
+ }
+ if (maxmtu && thcmtu)
+ mss = min(maxmtu, thcmtu) - min_protoh;
+ else if (maxmtu || thcmtu)
+ mss = max(maxmtu, thcmtu) - min_protoh;
+
+ return (mss);
}
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 7ce06f6..dfd6de1 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
- tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
+ tcp_hc_init();
}
/*
@@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
- struct route *ro = 0;
- struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
- struct route_in6 *ro6 = 0;
- struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
- struct inpcb *inp;
+ struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
-#ifdef INET6
- if (isipv6)
- ro6 = &inp->in6p_route;
- else
-#endif /* INET6 */
- ro = &inp->inp_route;
- } else {
- inp = NULL;
-#ifdef INET6
- if (isipv6) {
- ro6 = &sro6;
- bzero(ro6, sizeof *ro6);
- } else
-#endif /* INET6 */
- {
- ro = &sro;
- bzero(ro, sizeof *ro);
- }
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
- ip6->ip6_hlim = in6_selecthlim(inp,
- ro6 && ro6->ro_rt ?
- ro6->ro_rt->rt_ifp :
- NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
- if (isipv6) {
- (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
- if (ro6 == &sro6 && ro6->ro_rt) {
- RTFREE(ro6->ro_rt);
- ro6->ro_rt = NULL;
- }
- } else
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
#endif /* INET6 */
- {
- (void) ip_output(m, NULL, ro, ipflags, NULL, inp);
- if (ro == &sro && ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- struct rtentry *rt;
- int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
- * 'Enough' is arbitrarily defined as the 16 samples.
- * 16 samples is enough for the srtt filter to converge
- * to within 5% of the correct value; fewer samples and
- * we could save a very bogus rtt.
- *
- * Don't update the default route's characteristics and don't
- * update anything that the user "locked".
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples is enough for the srtt filter to converge
+	 * close enough to the correct value; fewer samples
+ * and we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
*/
- if (tp->t_rttupdated >= 16) {
- register u_long i = 0;
-#ifdef INET6
- if (isipv6) {
- struct sockaddr_in6 *sin6;
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
- if ((rt = inp->in6p_route.ro_rt) == NULL)
- goto no_valid_rt;
- sin6 = (struct sockaddr_in6 *)rt_key(rt);
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- goto no_valid_rt;
- }
- else
-#endif /* INET6 */
- if ((rt = inp->inp_route.ro_rt) == NULL ||
- ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
- == INADDR_ANY)
- goto no_valid_rt;
-
- if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
- i = tp->t_srtt *
- (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- if (rt->rt_rmx.rmx_rtt && i)
- /*
- * filter this update to half the old & half
- * the new values, converting scale.
- * See route.h and tcp_var.h for a
- * description of the scaling constants.
- */
- rt->rt_rmx.rmx_rtt =
- (rt->rt_rmx.rmx_rtt + i) / 2;
- else
- rt->rt_rmx.rmx_rtt = i;
- tcpstat.tcps_cachedrtt++;
- }
- if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
- i = tp->t_rttvar *
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- if (rt->rt_rmx.rmx_rttvar && i)
- rt->rt_rmx.rmx_rttvar =
- (rt->rt_rmx.rmx_rttvar + i) / 2;
- else
- rt->rt_rmx.rmx_rttvar = i;
- tcpstat.tcps_cachedrttvar++;
- }
+ bzero(&metrics, sizeof(metrics));
/*
- * The old comment here said:
- * update the pipelimit (ssthresh) if it has been updated
- * already or if a pipesize was specified & the threshhold
- * got below half the pipesize. I.e., wait for bad news
- * before we start updating, then update on both good
- * and bad news.
- *
- * But we want to save the ssthresh even if no pipesize is
- * specified explicitly in the route, because such
- * connections still have an implicit pipesize specified
- * by the global tcp_sendspace. In the absence of a reliable
- * way to calculate the pipesize, it will have to do.
+ * Update the ssthresh always when the conditions below
+ * are satisfied. This gives us better new start value
+ * for the congestion avoidance for new connections.
+	 * ssthresh is only set if packet loss occurred on a session.
*/
- i = tp->snd_ssthresh;
- if (rt->rt_rmx.rmx_sendpipe != 0)
- dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
- else
- dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
- if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
- i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
- || dosavessthresh) {
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
- i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
- if (i < 2)
- i = 2;
- i *= (u_long)(tp->t_maxseg +
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
- if (rt->rt_rmx.rmx_ssthresh)
- rt->rt_rmx.rmx_ssthresh =
- (rt->rt_rmx.rmx_ssthresh + i) / 2;
- else
- rt->rt_rmx.rmx_ssthresh = i;
- tcpstat.tcps_cachedssthresh++;
- }
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
}
- no_valid_rt:
+
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
@@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
- else if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
- struct rtentry *rt;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
- int offered;
+ u_int maxmtu;
+ u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ bzero(&tao, sizeof(tao));
if (tp) {
+ maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
+ romtu =
#ifdef INET6
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
+ isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
- rt = tcp_rtlookup(&inp->inp_inc);
- if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tcp_maxmtu(&inp->inp_inc);
+ if (!maxmtu)
+ maxmtu = romtu;
+ else
+ maxmtu = min(maxmtu, romtu);
+ if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
- taop = rmx_taop(rt->rt_rmx);
- offered = taop->tao_mssopt;
- mss = rt->rt_rmx.rmx_mtu -
+ mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
- if (offered)
- mss = min(mss, offered);
+ if (tcp_do_rfc1644) {
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt)
+ mss = min(mss, tao.tao_mssopt);
+ }
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
-struct rtentry *
-tcp_rtlookup(inc)
+u_long
+tcp_maxmtu(inc)
struct in_conninfo *inc;
{
- struct route *ro;
- struct rtentry *rt;
-
- ro = &inc->inc_route;
- rt = ro->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (inc->inc_faddr.s_addr != INADDR_ANY) {
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
- inc->inc_faddr;
- rtalloc(ro);
- rt = ro->ro_rt;
- }
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ sro.ro_rt = NULL;
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ RTFREE(sro.ro_rt);
}
- return rt;
+ return (maxmtu);
}
#ifdef INET6
-struct rtentry *
-tcp_rtlookup6(inc)
+u_long
+tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
- struct route_in6 *ro6;
- struct rtentry *rt;
-
- ro6 = &inc->inc6_route;
- rt = ro6->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- ro6->ro_dst.sin6_family = AF_INET6;
- ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- ro6->ro_dst.sin6_addr = inc->inc6_faddr;
- rtalloc((struct route *)ro6);
- rt = ro6->ro_rt;
- }
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ sro6.ro_rt = NULL;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
- return rt;
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
}
#endif /* INET6 */
@@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp)
#endif /*IPSEC*/
/*
- * Return a pointer to the cached information about the remote host.
- * The cached information is stored in the protocol specific part of
- * the route metrics.
- */
-struct rmxp_tao *
-tcp_gettaocache(inc)
- struct in_conninfo *inc;
-{
- struct rtentry *rt;
-
-#ifdef INET6
- if (inc->inc_isipv6)
- rt = tcp_rtlookup6(inc);
- else
-#endif /* INET6 */
- rt = tcp_rtlookup(inc);
-
- /* Make sure this is a host route and is up. */
- if (rt == NULL ||
- (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
- return NULL;
-
- return rmx_taop(rt->rt_rmx);
-}
-
-/*
- * Clear all the TAO cache entries, called from tcp_init.
- *
- * XXX
- * This routine is just an empty one, because we assume that the routing
- * routing tables are initialized at the same time when TCP, so there is
- * nothing in the cache left over.
- */
-static void
-tcp_cleartaocache()
-{
-}
-
-/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
* inp is locked, and is unlocked before returning.
@@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
- ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
- inp->in6p_route.ro_rt->rt_ifp : NULL);
- error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
index 822ffeb..e2d96e9 100644
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -202,29 +202,9 @@ static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache");
static void
syncache_free(struct syncache *sc)
{
- struct rtentry *rt;
-
if (sc->sc_ipopts)
(void) m_free(sc->sc_ipopts);
-#ifdef INET6
- if (sc->sc_inc.inc_isipv6)
- rt = sc->sc_route6.ro_rt;
- else
-#endif
- rt = sc->sc_route.ro_rt;
- if (rt != NULL) {
- /*
- * If this is the only reference to a protocol cloned
- * route, remove it immediately.
- */
- if (rt->rt_flags & RTF_WASCLONED &&
- (sc->sc_flags & SCF_KEEPROUTE) == 0 &&
- rt->rt_refcnt == 1)
- rtrequest(RTM_DELETE, rt_key(rt),
- rt->rt_gateway, rt_mask(rt),
- rt->rt_flags, NULL);
- RTFREE(rt);
- }
+
uma_zfree(tcp_syncache.zone, sc);
}
@@ -644,8 +624,6 @@ syncache_socket(sc, lso, m)
if (oinp->in6p_outputopts)
inp->in6p_outputopts =
ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT);
- inp->in6p_route = sc->sc_route6;
- sc->sc_route6.ro_rt = NULL;
MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6,
M_SONAME, M_NOWAIT | M_ZERO);
@@ -675,8 +653,6 @@ syncache_socket(sc, lso, m)
inp->inp_options = sc->sc_ipopts;
sc->sc_ipopts = NULL;
}
- inp->inp_route = sc->sc_route;
- sc->sc_route.ro_rt = NULL;
MALLOC(sin, struct sockaddr_in *, sizeof *sin,
M_SONAME, M_NOWAIT | M_ZERO);
@@ -733,6 +709,10 @@ syncache_socket(sc, lso, m)
tp->cc_recv = sc->sc_cc_recv;
}
+ /*
+ * Set up MSS and get cached values from tcp_hostcache.
+ * This might overwrite some of the defaults we just set.
+ */
tcp_mss(tp, sc->sc_peer_mss);
/*
@@ -811,10 +791,9 @@ resetandabort:
#endif
m_freem(m); /* XXX only needed for above */
tcpstat.tcps_sc_aborted++;
- } else {
- sc->sc_flags |= SCF_KEEPROUTE;
+ } else
tcpstat.tcps_sc_completed++;
- }
+
if (sch == NULL)
syncache_free(sc);
else
@@ -849,13 +828,14 @@ syncache_add(inc, to, th, sop, m)
struct syncache *sc = NULL;
struct syncache_head *sch;
struct mbuf *ipopts = NULL;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
int i, win;
INP_INFO_WLOCK_ASSERT(&tcbinfo);
so = *sop;
tp = sototcpcb(so);
+ bzero(&tao, sizeof(tao));
/*
* Remember the IP options, if any.
@@ -949,13 +929,11 @@ syncache_add(inc, to, th, sop, m)
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
- sc->sc_route6.ro_rt = NULL;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
- sc->sc_route.ro_rt = NULL;
}
sc->sc_irs = th->th_seq;
sc->sc_flags = 0;
@@ -1027,17 +1005,19 @@ syncache_add(inc, to, th, sop, m)
* processing: drop SYN, process data and FIN.
* - otherwise do a normal 3-way handshake.
*/
- taop = tcp_gettaocache(&sc->sc_inc);
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&sc->sc_inc, &tao);
+
if ((to->to_flags & TOF_CC) != 0) {
if (((tp->t_flags & TF_NOPUSH) != 0) &&
- sc->sc_flags & SCF_CC &&
- taop != NULL && taop->tao_cc != 0 &&
- CC_GT(to->to_cc, taop->tao_cc)) {
+ sc->sc_flags & SCF_CC && tao.tao_cc != 0 &&
+ CC_GT(to->to_cc, tao.tao_cc)) {
sc->sc_rxtslot = 0;
so = syncache_socket(sc, *sop, m);
if (so != NULL) {
- sc->sc_flags |= SCF_KEEPROUTE;
- taop->tao_cc = to->to_cc;
+ tao.tao_cc = to->to_cc;
+ tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC,
+ tao.tao_cc, 0);
*sop = so;
}
syncache_free(sc);
@@ -1047,9 +1027,13 @@ syncache_add(inc, to, th, sop, m)
/*
* No CC option, but maybe CC.NEW: invalidate cached value.
*/
- if (taop != NULL)
- taop->tao_cc = 0;
+ if (tcp_do_rfc1644) {
+ tao.tao_cc = 0;
+ tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC,
+ tao.tao_cc, 0);
+ }
}
+
/*
* TAO test failed or there was no CC option,
* do a standard 3-way handshake.
@@ -1087,33 +1071,22 @@ syncache_respond(sc, m)
int optlen, error;
u_int16_t tlen, hlen, mssopt;
struct ip *ip = NULL;
- struct rtentry *rt;
struct tcphdr *th;
struct inpcb *inp;
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
+ hlen =
#ifdef INET6
- if (sc->sc_inc.inc_isipv6) {
- rt = tcp_rtlookup6(&sc->sc_inc);
- if (rt != NULL)
- mssopt = rt->rt_ifp->if_mtu -
- (sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
- else
- mssopt = tcp_v6mssdflt;
- hlen = sizeof(struct ip6_hdr);
- } else
+ (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) :
#endif
- {
- rt = tcp_rtlookup(&sc->sc_inc);
- if (rt != NULL)
- mssopt = rt->rt_ifp->if_mtu -
- (sizeof(struct ip) + sizeof(struct tcphdr));
- else
- mssopt = tcp_mssdflt;
- hlen = sizeof(struct ip);
- }
+ sizeof(struct ip);
+
+ KASSERT((&sc->sc_inc) != NULL, ("syncache_respond with NULL in_conninfo pointer"));
+
+	/* Determine MSS we advertise to other end of connection */
+ mssopt = tcp_mssopt(&sc->sc_inc);
/* Compute the size of the TCP options. */
if (sc->sc_flags & SCF_NOOPT) {
@@ -1244,13 +1217,10 @@ syncache_respond(sc, m)
#ifdef INET6
if (sc->sc_inc.inc_isipv6) {
- struct route_in6 *ro6 = &sc->sc_route6;
-
th->th_sum = 0;
th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
- ip6->ip6_hlim = in6_selecthlim(NULL,
- ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
- error = ip6_output(m, NULL, ro6, 0, NULL, NULL, inp);
+ ip6->ip6_hlim = in6_selecthlim(NULL, NULL);
+ error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp);
} else
#endif
{
@@ -1268,7 +1238,7 @@ syncache_respond(sc, m)
mtod(m, void *), th, 0);
}
#endif
- error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL,inp);
+ error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp);
}
INP_UNLOCK(inp);
return (error);
@@ -1435,13 +1405,11 @@ syncookie_lookup(inc, th, so)
if (inc->inc_isipv6) {
sc->sc_inc.inc6_faddr = inc->inc6_faddr;
sc->sc_inc.inc6_laddr = inc->inc6_laddr;
- sc->sc_route6.ro_rt = NULL;
} else
#endif
{
sc->sc_inc.inc_faddr = inc->inc_faddr;
sc->sc_inc.inc_laddr = inc->inc_laddr;
- sc->sc_route.ro_rt = NULL;
}
sc->sc_irs = th->th_seq - 1;
sc->sc_iss = th->th_ack - 1;
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 1a253ab..1eeb66e 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -551,10 +551,8 @@ tcp_timer_rexmt(xtp)
if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3))
tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
/*
- * If losing, let the lower level know and try for
- * a better route. Also, if we backed off this far,
- * our srtt estimate is probably bogus. Clobber it
- * so we'll take the next rtt measurement as our srtt;
+ * If we backed off this far, our srtt estimate is probably bogus.
+ * Clobber it so we'll take the next rtt measurement as our srtt;
* move the current srtt into rttvar to keep the current
* retransmit times until then.
*/
@@ -564,7 +562,6 @@ tcp_timer_rexmt(xtp)
in6_losing(tp->t_inpcb);
else
#endif
- in_losing(tp->t_inpcb);
tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
tp->t_srtt = 0;
}
diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c
index 7ce06f6..dfd6de1 100644
--- a/sys/netinet/tcp_timewait.c
+++ b/sys/netinet/tcp_timewait.c
@@ -76,6 +76,7 @@
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
#endif
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
@@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
&tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
-static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
static void tcp_discardcb(struct tcpcb *);
@@ -215,7 +215,6 @@ tcp_init()
int hashsize = TCBHASHSIZE;
tcp_ccgen = 1;
- tcp_cleartaocache();
tcp_delacktime = TCPTV_DELACK;
tcp_keepinit = TCPTV_KEEP_INIT;
@@ -262,6 +261,7 @@ tcp_init()
uma_zone_set_max(tcptw_zone, maxsockets / 5);
tcp_timer_init();
syncache_init();
+ tcp_hc_init();
}
/*
@@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
{
register int tlen;
int win = 0;
- struct route *ro = 0;
- struct route sro;
struct ip *ip;
struct tcphdr *nth;
#ifdef INET6
- struct route_in6 *ro6 = 0;
- struct route_in6 sro6;
struct ip6_hdr *ip6;
int isipv6;
#endif /* INET6 */
int ipflags = 0;
- struct inpcb *inp;
+ struct inpcb *inp = NULL;
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
@@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
win = (long)TCP_MAXWIN << tp->rcv_scale;
}
-#ifdef INET6
- if (isipv6)
- ro6 = &inp->in6p_route;
- else
-#endif /* INET6 */
- ro = &inp->inp_route;
- } else {
- inp = NULL;
-#ifdef INET6
- if (isipv6) {
- ro6 = &sro6;
- bzero(ro6, sizeof *ro6);
- } else
-#endif /* INET6 */
- {
- ro = &sro;
- bzero(ro, sizeof *ro);
- }
}
if (m == 0) {
m = m_gethdr(M_DONTWAIT, MT_HEADER);
@@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
nth->th_sum = in6_cksum(m, IPPROTO_TCP,
sizeof(struct ip6_hdr),
tlen - sizeof(struct ip6_hdr));
- ip6->ip6_hlim = in6_selecthlim(inp,
- ro6 && ro6->ro_rt ?
- ro6->ro_rt->rt_ifp :
- NULL);
+ ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL);
} else
#endif /* INET6 */
{
@@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags)
tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
#endif
#ifdef INET6
- if (isipv6) {
- (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp);
- if (ro6 == &sro6 && ro6->ro_rt) {
- RTFREE(ro6->ro_rt);
- ro6->ro_rt = NULL;
- }
- } else
+ if (isipv6)
+ (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp);
+ else
#endif /* INET6 */
- {
- (void) ip_output(m, NULL, ro, ipflags, NULL, inp);
- if (ro == &sro && ro->ro_rt) {
- RTFREE(ro->ro_rt);
- ro->ro_rt = NULL;
- }
- }
+ (void) ip_output(m, NULL, NULL, ipflags, NULL, inp);
}
/*
@@ -647,8 +612,6 @@ tcp_discardcb(tp)
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- struct rtentry *rt;
- int dosavessthresh;
/*
* Make sure that all of our timers are stopped before we
@@ -663,89 +626,34 @@ tcp_discardcb(tp)
/*
* If we got enough samples through the srtt filter,
* save the rtt and rttvar in the routing entry.
- * 'Enough' is arbitrarily defined as the 16 samples.
- * 16 samples is enough for the srtt filter to converge
- * to within 5% of the correct value; fewer samples and
- * we could save a very bogus rtt.
- *
- * Don't update the default route's characteristics and don't
- * update anything that the user "locked".
+ * 'Enough' is arbitrarily defined as 4 rtt samples.
+ * 4 samples is enough for the srtt filter to converge
- * to within a reasonable percentage of the correct value; fewer samples
+ * and we could save a bogus rtt. The danger is not high
+ * as tcp quickly recovers from everything.
+ * XXX: Works very well but needs some more statistics!
*/
- if (tp->t_rttupdated >= 16) {
- register u_long i = 0;
-#ifdef INET6
- if (isipv6) {
- struct sockaddr_in6 *sin6;
+ if (tp->t_rttupdated >= 4) {
+ struct hc_metrics_lite metrics;
+ u_long ssthresh;
- if ((rt = inp->in6p_route.ro_rt) == NULL)
- goto no_valid_rt;
- sin6 = (struct sockaddr_in6 *)rt_key(rt);
- if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
- goto no_valid_rt;
- }
- else
-#endif /* INET6 */
- if ((rt = inp->inp_route.ro_rt) == NULL ||
- ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr
- == INADDR_ANY)
- goto no_valid_rt;
-
- if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
- i = tp->t_srtt *
- (RTM_RTTUNIT / (hz * TCP_RTT_SCALE));
- if (rt->rt_rmx.rmx_rtt && i)
- /*
- * filter this update to half the old & half
- * the new values, converting scale.
- * See route.h and tcp_var.h for a
- * description of the scaling constants.
- */
- rt->rt_rmx.rmx_rtt =
- (rt->rt_rmx.rmx_rtt + i) / 2;
- else
- rt->rt_rmx.rmx_rtt = i;
- tcpstat.tcps_cachedrtt++;
- }
- if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
- i = tp->t_rttvar *
- (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE));
- if (rt->rt_rmx.rmx_rttvar && i)
- rt->rt_rmx.rmx_rttvar =
- (rt->rt_rmx.rmx_rttvar + i) / 2;
- else
- rt->rt_rmx.rmx_rttvar = i;
- tcpstat.tcps_cachedrttvar++;
- }
+ bzero(&metrics, sizeof(metrics));
/*
- * The old comment here said:
- * update the pipelimit (ssthresh) if it has been updated
- * already or if a pipesize was specified & the threshhold
- * got below half the pipesize. I.e., wait for bad news
- * before we start updating, then update on both good
- * and bad news.
- *
- * But we want to save the ssthresh even if no pipesize is
- * specified explicitly in the route, because such
- * connections still have an implicit pipesize specified
- * by the global tcp_sendspace. In the absence of a reliable
- * way to calculate the pipesize, it will have to do.
+ * Update the ssthresh always when the conditions below
+ * are satisfied. This gives us better new start value
+ * for the congestion avoidance for new connections.
- * ssthresh is only set if packet loss occurred on a session.
*/
- i = tp->snd_ssthresh;
- if (rt->rt_rmx.rmx_sendpipe != 0)
- dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
- else
- dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
- if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
- i != 0 && rt->rt_rmx.rmx_ssthresh != 0)
- || dosavessthresh) {
+ ssthresh = tp->snd_ssthresh;
+ if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
- i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
- if (i < 2)
- i = 2;
- i *= (u_long)(tp->t_maxseg +
+ ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg;
+ if (ssthresh < 2)
+ ssthresh = 2;
+ ssthresh *= (u_long)(tp->t_maxseg +
#ifdef INET6
(isipv6 ? sizeof (struct ip6_hdr) +
sizeof (struct tcphdr) :
@@ -755,15 +663,21 @@ tcp_discardcb(tp)
)
#endif
);
- if (rt->rt_rmx.rmx_ssthresh)
- rt->rt_rmx.rmx_ssthresh =
- (rt->rt_rmx.rmx_ssthresh + i) / 2;
- else
- rt->rt_rmx.rmx_ssthresh = i;
- tcpstat.tcps_cachedssthresh++;
- }
+ } else
+ ssthresh = 0;
+ metrics.rmx_ssthresh = ssthresh;
+
+ metrics.rmx_rtt = tp->t_srtt;
+ metrics.rmx_rttvar = tp->t_rttvar;
+ /* XXX: This wraps if the pipe is more than 4 Gbit per second */
+ metrics.rmx_bandwidth = tp->snd_bandwidth;
+ metrics.rmx_cwnd = tp->snd_cwnd;
+ metrics.rmx_sendpipe = 0;
+ metrics.rmx_recvpipe = 0;
+
+ tcp_hc_update(&inp->inp_inc, &metrics);
}
- no_valid_rt:
+
/* free the reassembly queue, if any */
while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
@@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip)
notify = tcp_drop_syn_sent;
else if (cmd == PRC_MSGSIZE)
notify = tcp_mtudisc;
- else if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno)
int errno;
{
struct tcpcb *tp = intotcpcb(inp);
- struct rtentry *rt;
- struct rmxp_tao *taop;
+ struct rmxp_tao tao;
struct socket *so = inp->inp_socket;
- int offered;
+ u_int maxmtu;
+ u_int romtu;
int mss;
#ifdef INET6
int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
+ bzero(&tao, sizeof(tao));
if (tp) {
+ maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */
+ romtu =
#ifdef INET6
- if (isipv6)
- rt = tcp_rtlookup6(&inp->inp_inc);
- else
+ isipv6 ? tcp_maxmtu6(&inp->inp_inc) :
#endif /* INET6 */
- rt = tcp_rtlookup(&inp->inp_inc);
- if (!rt || !rt->rt_rmx.rmx_mtu) {
+ tcp_maxmtu(&inp->inp_inc);
+ if (!maxmtu)
+ maxmtu = romtu;
+ else
+ maxmtu = min(maxmtu, romtu);
+ if (!maxmtu) {
tp->t_maxopd = tp->t_maxseg =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno)
tcp_mssdflt;
return inp;
}
- taop = rmx_taop(rt->rt_rmx);
- offered = taop->tao_mssopt;
- mss = rt->rt_rmx.rmx_mtu -
+ mss = maxmtu -
#ifdef INET6
(isipv6 ?
sizeof(struct ip6_hdr) + sizeof(struct tcphdr) :
@@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno)
#endif /* INET6 */
;
- if (offered)
- mss = min(mss, offered);
+ if (tcp_do_rfc1644) {
+ tcp_hc_gettao(&inp->inp_inc, &tao);
+ if (tao.tao_mssopt)
+ mss = min(mss, tao.tao_mssopt);
+ }
/*
* XXX - The above conditional probably violates the TCP
* spec. The problem is that, since we don't know the
@@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno)
* is called by TCP routines that access the rmx structure and by tcp_mss
* to get the interface MTU.
*/
-struct rtentry *
-tcp_rtlookup(inc)
+u_long
+tcp_maxmtu(inc)
struct in_conninfo *inc;
{
- struct route *ro;
- struct rtentry *rt;
-
- ro = &inc->inc_route;
- rt = ro->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (inc->inc_faddr.s_addr != INADDR_ANY) {
- ro->ro_dst.sa_family = AF_INET;
- ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
- ((struct sockaddr_in *) &ro->ro_dst)->sin_addr =
- inc->inc_faddr;
- rtalloc(ro);
- rt = ro->ro_rt;
- }
+ struct route sro;
+ struct sockaddr_in *dst;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer"));
+
+ sro.ro_rt = NULL;
+ if (inc->inc_faddr.s_addr != INADDR_ANY) {
+ dst = (struct sockaddr_in *)&sro.ro_dst;
+ dst->sin_family = AF_INET;
+ dst->sin_len = sizeof(*dst);
+ dst->sin_addr = inc->inc_faddr;
+ rtalloc_ign(&sro, RTF_CLONING);
+ }
+ if (sro.ro_rt != NULL) {
+ ifp = sro.ro_rt->rt_ifp;
+ if (sro.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = ifp->if_mtu;
+ else
+ maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu);
+ RTFREE(sro.ro_rt);
}
- return rt;
+ return (maxmtu);
}
#ifdef INET6
-struct rtentry *
-tcp_rtlookup6(inc)
+u_long
+tcp_maxmtu6(inc)
struct in_conninfo *inc;
{
- struct route_in6 *ro6;
- struct rtentry *rt;
-
- ro6 = &inc->inc6_route;
- rt = ro6->ro_rt;
- if (rt == NULL || !(rt->rt_flags & RTF_UP)) {
- /* No route yet, so try to acquire one */
- if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
- ro6->ro_dst.sin6_family = AF_INET6;
- ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6);
- ro6->ro_dst.sin6_addr = inc->inc6_faddr;
- rtalloc((struct route *)ro6);
- rt = ro6->ro_rt;
- }
+ struct route_in6 sro6;
+ struct ifnet *ifp;
+ u_long maxmtu = 0;
+
+ KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer"));
+
+ sro6.ro_rt = NULL;
+ if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) {
+ sro6.ro_dst.sin6_family = AF_INET6;
+ sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ sro6.ro_dst.sin6_addr = inc->inc6_faddr;
+ rtalloc_ign((struct route *)&sro6, RTF_CLONING);
}
- return rt;
+ if (sro6.ro_rt != NULL) {
+ ifp = sro6.ro_rt->rt_ifp;
+ if (sro6.ro_rt->rt_rmx.rmx_mtu == 0)
+ maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp);
+ else
+ maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu,
+ IN6_LINKMTU(sro6.ro_rt->rt_ifp));
+ RTFREE(sro6.ro_rt);
+ }
+
+ return (maxmtu);
}
#endif /* INET6 */
@@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp)
#endif /*IPSEC*/
/*
- * Return a pointer to the cached information about the remote host.
- * The cached information is stored in the protocol specific part of
- * the route metrics.
- */
-struct rmxp_tao *
-tcp_gettaocache(inc)
- struct in_conninfo *inc;
-{
- struct rtentry *rt;
-
-#ifdef INET6
- if (inc->inc_isipv6)
- rt = tcp_rtlookup6(inc);
- else
-#endif /* INET6 */
- rt = tcp_rtlookup(inc);
-
- /* Make sure this is a host route and is up. */
- if (rt == NULL ||
- (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST))
- return NULL;
-
- return rmx_taop(rt->rt_rmx);
-}
-
-/*
- * Clear all the TAO cache entries, called from tcp_init.
- *
- * XXX
- * This routine is just an empty one, because we assume that the routing
- * routing tables are initialized at the same time when TCP, so there is
- * nothing in the cache left over.
- */
-static void
-tcp_cleartaocache()
-{
-}
-
-/*
* Move a TCP connection into TIME_WAIT state.
* tcbinfo is unlocked.
* inp is locked, and is unlocked before returning.
@@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
if (isipv6) {
th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
sizeof(struct tcphdr) + optlen);
- ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
- inp->in6p_route.ro_rt->rt_ifp : NULL);
- error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ ip6->ip6_hlim = in6_selecthlim(inp, NULL);
+ error = ip6_output(m, inp->in6p_outputopts, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
} else
#endif
@@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc,
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
ip->ip_len = m->m_pkthdr.len;
- error = ip_output(m, inp->inp_options, &inp->inp_route,
+ error = ip_output(m, inp->inp_options, NULL,
(tw->tw_so_options & SO_DONTROUTE), NULL, inp);
}
if (flags & TH_ACK)
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index 7035227..17566c8 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -848,12 +848,13 @@ tcp_connect(tp, nam, td)
struct inpcb *inp = tp->t_inpcb, *oinp;
struct socket *so = inp->inp_socket;
struct tcptw *otw;
- struct rmxp_tao *taop;
- struct rmxp_tao tao_noncached;
+ struct rmxp_tao tao;
struct in_addr laddr;
u_short lport;
int error;
+ bzero(&tao, sizeof(tao));
+
if (inp->inp_lport == 0) {
error = in_pcbbind(inp, (struct sockaddr *)0, td);
if (error)
@@ -902,20 +903,22 @@ tcp_connect(tp, nam, td)
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
*/
- if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
tp->cc_send = CC_INC(tcp_ccgen);
- if (taop->tao_ccsent != 0 &&
- CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
- taop->tao_ccsent = tp->cc_send;
+ if (tao.tao_ccsent != 0 &&
+ CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
+ tao.tao_ccsent = tp->cc_send;
} else {
- taop->tao_ccsent = 0;
+ tao.tao_ccsent = 0;
tp->t_flags |= TF_SENDCCNEW;
}
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
+ tao.tao_ccsent, 0);
+
return 0;
}
@@ -931,10 +934,11 @@ tcp6_connect(tp, nam, td)
struct tcptw *otw;
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
struct in6_addr *addr6;
- struct rmxp_tao *taop;
- struct rmxp_tao tao_noncached;
+ struct rmxp_tao tao;
int error;
+ bzero(&tao, sizeof(tao));
+
if (inp->inp_lport == 0) {
error = in6_pcbbind(inp, (struct sockaddr *)0, td);
if (error)
@@ -991,19 +995,20 @@ tcp6_connect(tp, nam, td)
* Generate a CC value for this connection and
* check whether CC or CCnew should be used.
*/
- if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) {
- taop = &tao_noncached;
- bzero(taop, sizeof(*taop));
- }
+ if (tcp_do_rfc1644)
+ tcp_hc_gettao(&inp->inp_inc, &tao);
tp->cc_send = CC_INC(tcp_ccgen);
- if (taop->tao_ccsent != 0 &&
- CC_GEQ(tp->cc_send, taop->tao_ccsent)) {
- taop->tao_ccsent = tp->cc_send;
+ if (tao.tao_ccsent != 0 &&
+ CC_GEQ(tp->cc_send, tao.tao_ccsent)) {
+ tao.tao_ccsent = tp->cc_send;
} else {
- taop->tao_ccsent = 0;
+ tao.tao_ccsent = 0;
tp->t_flags |= TF_SENDCCNEW;
}
+ if (tcp_do_rfc1644)
+ tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT,
+ tao.tao_ccsent, 0);
return 0;
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 2e5b3fa..ddcfd3c 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -213,8 +213,6 @@ struct syncache {
struct tcpcb *sc_tp; /* tcb for listening socket */
struct mbuf *sc_ipopts; /* source route */
struct in_conninfo sc_inc; /* addresses */
-#define sc_route sc_inc.inc_route
-#define sc_route6 sc_inc.inc6_route
u_int32_t sc_tsrecent;
tcp_cc sc_cc_send; /* holds CC or CCnew */
tcp_cc sc_cc_recv;
@@ -232,7 +230,6 @@ struct syncache {
#define SCF_TIMESTAMP 0x04 /* negotiated timestamps */
#define SCF_CC 0x08 /* negotiated CC */
#define SCF_UNREACH 0x10 /* icmp unreachable received */
-#define SCF_KEEPROUTE 0x20 /* keep cloned route */
TAILQ_ENTRY(syncache) sc_hash;
TAILQ_ENTRY(syncache) sc_timerq;
};
@@ -242,6 +239,17 @@ struct syncache_head {
u_int sch_length;
};
+struct hc_metrics_lite { /* must stay in sync with hc_metrics */
+ u_long rmx_mtu; /* MTU for this path */
+ u_long rmx_ssthresh; /* outbound gateway buffer limit */
+ u_long rmx_rtt; /* estimated round trip time */
+ u_long rmx_rttvar; /* estimated rtt variance */
+ u_long rmx_bandwidth; /* estimated bandwidth */
+ u_long rmx_cwnd; /* congestion window */
+ u_long rmx_sendpipe; /* outbound delay-bandwidth product */
+ u_long rmx_recvpipe; /* inbound delay-bandwidth product */
+};
+
struct tcptw {
struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */
tcp_seq snd_nxt;
@@ -260,8 +268,7 @@ struct tcptw {
};
/*
- * The TAO cache entry which is stored in the protocol family specific
- * portion of the route metrics.
+ * The TAO cache entry which is stored in the tcp hostcache.
*/
struct rmxp_tao {
tcp_cc tao_cc; /* latest CC in valid SYN */
@@ -274,7 +281,6 @@ struct rmxp_tao {
#define TAOF_UNDEF 0 /* we don't know yet */
#endif /* notyet */
};
-#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler)
#define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb)
#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb)
@@ -401,6 +407,9 @@ struct tcpstat {
u_long tcps_sc_zonefail; /* zalloc() failed */
u_long tcps_sc_sendcookie; /* SYN cookie sent */
u_long tcps_sc_recvcookie; /* SYN cookie received */
+
+ u_long tcps_hc_added; /* entry added to hostcache */
+ u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */
};
/*
@@ -451,6 +460,7 @@ struct xtcpcb {
{ "pcblist", CTLTYPE_STRUCT }, \
{ "delacktime", CTLTYPE_INT }, \
{ "v6mssdflt", CTLTYPE_INT }, \
+ { "maxid", CTLTYPE_INT }, \
}
@@ -482,12 +492,12 @@ struct tcpcb *
tcp_drop(struct tcpcb *, int);
void tcp_drain(void);
void tcp_fasttimo(void);
-struct rmxp_tao *
- tcp_gettaocache(struct in_conninfo *);
void tcp_init(void);
void tcp_input(struct mbuf *, int);
+u_long tcp_maxmtu(struct in_conninfo *);
+u_long tcp_maxmtu6(struct in_conninfo *);
void tcp_mss(struct tcpcb *, int);
-int tcp_mssopt(struct tcpcb *);
+int tcp_mssopt(struct in_conninfo *);
struct inpcb *
tcp_drop_syn_sent(struct inpcb *, int);
struct inpcb *
@@ -500,8 +510,6 @@ struct inpcb *
void tcp_respond(struct tcpcb *, void *,
struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int);
int tcp_twrespond(struct tcptw *, struct socket *, struct mbuf *, int);
-struct rtentry *
- tcp_rtlookup(struct in_conninfo *);
void tcp_setpersist(struct tcpcb *);
void tcp_slowtimo(void);
struct tcptemp *
@@ -519,6 +527,20 @@ int syncache_add(struct in_conninfo *, struct tcpopt *,
struct tcphdr *, struct socket **, struct mbuf *);
void syncache_chkrst(struct in_conninfo *, struct tcphdr *);
void syncache_badack(struct in_conninfo *);
+/*
+ * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo)
+ */
+void tcp_hc_init(void);
+void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *);
+u_long tcp_hc_getmtu(struct in_conninfo *);
+void tcp_hc_gettao(struct in_conninfo *, struct rmxp_tao *);
+void tcp_hc_updatemtu(struct in_conninfo *, u_long);
+void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *);
+void tcp_hc_updatetao(struct in_conninfo *, int, tcp_cc, u_short);
+/* update which tao field */
+#define TCP_HC_TAO_CC 0x1
+#define TCP_HC_TAO_CCSENT 0x2
+#define TCP_HC_TAO_MSSOPT 0x3
extern struct pr_usrreqs tcp_usrreqs;
extern u_long tcp_sendspace;
diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c
index 60ec82b..62e6131 100644
--- a/sys/netinet/udp_usrreq.c
+++ b/sys/netinet/udp_usrreq.c
@@ -544,10 +544,17 @@ udp_ctlinput(cmd, sa, vip)
if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
return;
- if (PRC_IS_REDIRECT(cmd)) {
- ip = 0;
- notify = in_rtchange;
- } else if (cmd == PRC_HOSTDEAD)
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ if (cmd == PRC_HOSTDEAD)
ip = 0;
else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
return;
@@ -873,7 +880,7 @@ udp_output(inp, m, addr, control, td)
((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */
udpstat.udps_opackets++;
- error = ip_output(m, inp->inp_options, &inp->inp_route, ipflags,
+ error = ip_output(m, inp->inp_options, NULL, ipflags,
inp->inp_moptions, inp);
return (error);
diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c
index 997474e..6baa2db 100644
--- a/sys/netinet6/icmp6.c
+++ b/sys/netinet6/icmp6.c
@@ -94,6 +94,7 @@
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
+#include <netinet/tcp_var.h>
#include <netinet6/in6_ifattach.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
@@ -1105,8 +1106,7 @@ icmp6_mtudisc_update(ip6cp, validated)
struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6;
struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */
u_int mtu = ntohl(icmp6->icmp6_mtu);
- struct rtentry *rt = NULL;
- struct sockaddr_in6 sin6;
+ struct in_conninfo inc;
#if 0
/*
@@ -1131,31 +1131,19 @@ icmp6_mtudisc_update(ip6cp, validated)
if (!validated)
return;
- bzero(&sin6, sizeof(sin6));
- sin6.sin6_family = PF_INET6;
- sin6.sin6_len = sizeof(struct sockaddr_in6);
- sin6.sin6_addr = *dst;
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 1; /* IPv6 */
+ inc.inc6_faddr = *dst;
/* XXX normally, this won't happen */
if (IN6_IS_ADDR_LINKLOCAL(dst)) {
- sin6.sin6_addr.s6_addr16[1] =
+ inc.inc6_faddr.s6_addr16[1] =
htons(m->m_pkthdr.rcvif->if_index);
}
- /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */
- rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING);
-
- if (rt && (rt->rt_flags & RTF_HOST) &&
- !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
- if (mtu < IPV6_MMTU) {
- /* xxx */
- rt->rt_rmx.rmx_locks |= RTV_MTU;
- } else if (mtu < rt->rt_ifp->if_mtu &&
- rt->rt_rmx.rmx_mtu > mtu) {
- icmp6stat.icp6s_pmtuchg++;
- rt->rt_rmx.rmx_mtu = mtu;
- }
+
+ if (mtu >= IPV6_MMTU) {
+ tcp_hc_updatemtu(&inc, mtu);
+ icmp6stat.icp6s_pmtuchg++;
}
- if (rt)
- rtfree(rt);
}
/*
diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c
index 5c7f1f2..b3d58e8 100644
--- a/sys/netinet6/in6_pcb.c
+++ b/sys/netinet6/in6_pcb.c
@@ -337,8 +337,7 @@ in6_pcbladdr(inp, nam, plocal_addr6)
* Is it the intended behavior?
*/
*plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts,
- inp->in6p_moptions,
- &inp->in6p_route,
+ inp->in6p_moptions, NULL,
&inp->in6p_laddr, &error);
if (*plocal_addr6 == 0) {
if (error == 0)
@@ -351,10 +350,6 @@ in6_pcbladdr(inp, nam, plocal_addr6)
* and exit to caller, that will do the lookup.
*/
}
-
- if (inp->in6p_route.ro_rt)
- ifp = inp->in6p_route.ro_rt->rt_ifp;
-
return (0);
}
@@ -447,8 +442,6 @@ in6_pcbdetach(inp)
ip6_freepcbopts(inp->in6p_outputopts);
ip6_freemoptions(inp->in6p_moptions);
- if (inp->in6p_route.ro_rt)
- RTFREE(inp->in6p_route.ro_rt);
/* Check and free IPv4 related resources in case of mapped addr */
if (inp->inp_options)
(void)m_free(inp->inp_options);
@@ -830,26 +823,10 @@ void
in6_losing(in6p)
struct inpcb *in6p;
{
- struct rtentry *rt;
- struct rt_addrinfo info;
-
- if ((rt = in6p->in6p_route.ro_rt) != NULL) {
- RT_LOCK(rt);
- in6p->in6p_route.ro_rt = NULL;
- bzero((caddr_t)&info, sizeof(info));
- info.rti_flags = rt->rt_flags;
- info.rti_info[RTAX_DST] = rt_key(rt);
- info.rti_info[RTAX_GATEWAY] = rt->rt_gateway;
- info.rti_info[RTAX_NETMASK] = rt_mask(rt);
- rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0);
- if (rt->rt_flags & RTF_DYNAMIC)
- rtexpunge(rt);
- RTFREE_LOCKED(rt);
- /*
- * A new route can be allocated
- * the next time output is attempted.
- */
- }
+ /*
+ * We don't store route pointers in the routing table anymore
+ */
+ return;
}
/*
@@ -861,14 +838,9 @@ in6_rtchange(inp, errno)
struct inpcb *inp;
int errno;
{
- if (inp->in6p_route.ro_rt) {
- RTFREE(inp->in6p_route.ro_rt);
- inp->in6p_route.ro_rt = 0;
- /*
- * A new route can be allocated the next time
- * output is attempted.
- */
- }
+ /*
+ * We don't store route pointers in the routing table anymore
+ */
return inp;
}
diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c
index 09526b2..b68852d 100644
--- a/sys/netinet6/in6_rmx.c
+++ b/sys/netinet6/in6_rmx.c
@@ -141,8 +141,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head,
}
}
- if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU)
- && rt->rt_ifp)
+ if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp)
rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp);
ret = rn_addroute(v_arg, n_arg, head, treenodes);
diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c
index d584956..88ace1c 100644
--- a/sys/netinet6/in6_src.c
+++ b/sys/netinet6/in6_src.c
@@ -211,7 +211,6 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp)
!= 0) {
return (NULL);
}
-
/*
* determine the appropriate zone id of the source based on
* the zone of the destination and the outgoing interface.
@@ -449,12 +448,19 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
struct route_in6 *ro;
struct ifnet **retifp;
{
- int error, clone;
+ int error;
+ struct route_in6 sro;
struct rtentry *rt = NULL;
- clone = IN6_IS_ADDR_MULTICAST(&dstsock->sin6_addr) ? 0 : 1;
+ if (ro == NULL) {
+ bzero(&sro, sizeof(sro));
+ ro = &sro;
+ }
+
if ((error = in6_selectroute(dstsock, opts, mopts, ro, retifp,
- &rt, clone)) != 0) {
+ &rt, 0)) != 0) {
+ if (rt && rt == sro.ro_rt)
+ RTFREE(rt);
return (error);
}
@@ -476,7 +482,11 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
* We thus reject the case here.
*/
if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) {
- return (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+ int flags = (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH);
+
+ if (rt && rt == sro.ro_rt)
+ RTFREE(rt);
+ return (flags);
}
/*
@@ -489,6 +499,8 @@ in6_selectif(dstsock, opts, mopts, ro, retifp)
if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp)
*retifp = rt->rt_ifa->ifa_ifp;
+ if (rt && rt == sro.ro_rt)
+ RTFREE(rt);
return (0);
}
@@ -623,6 +635,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone)
sa6 = (struct sockaddr_in6 *)&ro->ro_dst;
*sa6 = *dstsock;
sa6->sin6_scope_id = 0;
+
if (clone) {
rtalloc((struct route *)ro);
} else {
@@ -695,7 +708,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone)
* 2. (If the outgoing interface is detected) the current
* hop limit of the interface specified by router advertisement.
* 3. The system default hoplimit.
-*/
+ */
int
in6_selecthlim(in6p, ifp)
struct in6pcb *in6p;
@@ -705,8 +718,24 @@ in6_selecthlim(in6p, ifp)
return (in6p->in6p_hops);
else if (ifp)
return (ND_IFINFO(ifp)->chlim);
- else
- return (ip6_defhlim);
+ else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) {
+ struct route_in6 ro6;
+ struct ifnet *lifp;
+
+ bzero(&ro6, sizeof(ro6));
+ ro6.ro_dst.sin6_family = AF_INET6;
+ ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6);
+ ro6.ro_dst.sin6_addr = in6p->in6p_faddr;
+ rtalloc((struct route *)&ro6);
+ if (ro6.ro_rt) {
+ lifp = ro6.ro_rt->rt_ifp;
+ RTFREE(ro6.ro_rt);
+ if (lifp)
+ return (ND_IFINFO(lifp)->chlim);
+ } else
+ return (ip6_defhlim);
+ }
+ return (ip6_defhlim);
}
/*
diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c
index b95b197..3072851 100644
--- a/sys/netinet6/ip6_output.c
+++ b/sys/netinet6/ip6_output.c
@@ -96,6 +96,7 @@
#include <netinet/icmp6.h>
#include <netinet6/ip6_var.h>
#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
#include <netinet6/nd6.h>
#ifdef IPSEC
@@ -661,7 +662,7 @@ skip_ipsec2:;
/* XXX rt not locked */
ia = ifatoia6(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
- ro->ro_rt->rt_use++;
+ ro->ro_rt->rt_rmx.rmx_pksent++;
if (ro->ro_rt->rt_flags & RTF_GATEWAY)
dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway;
m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */
@@ -757,7 +758,7 @@ skip_ipsec2:;
}
ia = ifatoia6(ro->ro_rt->rt_ifa);
ifp = ro->ro_rt->rt_ifp;
- ro->ro_rt->rt_use++;
+ ro->ro_rt->rt_rmx.rmx_pksent++;
RT_UNLOCK(ro->ro_rt);
}
@@ -1387,11 +1388,20 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp)
}
if (ro_pmtu->ro_rt) {
u_int32_t ifmtu;
+ struct in_conninfo inc;
+
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 1; /* IPv6 */
+ inc.inc6_faddr = *dst;
if (ifp == NULL)
ifp = ro_pmtu->ro_rt->rt_ifp;
ifmtu = IN6_LINKMTU(ifp);
- mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
+ mtu = tcp_hc_getmtu(&inc);
+ if (mtu)
+ mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu);
+ else
+ mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu;
if (mtu == 0)
mtu = ifmtu;
else if (mtu < IPV6_MMTU) {
@@ -1415,8 +1425,7 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp)
* field isn't locked).
*/
mtu = ifmtu;
- if (!(ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU))
- ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
+ ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu;
}
} else if (ifp) {
mtu = IN6_LINKMTU(ifp);
@@ -1993,7 +2002,9 @@ do { \
{
u_long pmtu = 0;
struct ip6_mtuinfo mtuinfo;
- struct route_in6 *ro = (struct route_in6 *)&in6p->in6p_route;
+ struct route_in6 sro;
+
+ bzero(&sro, sizeof(sro));
if (!(so->so_state & SS_ISCONNECTED))
return (ENOTCONN);
@@ -2002,8 +2013,10 @@ do { \
* routing, or optional information to specify
* the outgoing interface.
*/
- error = ip6_getpmtu(ro, NULL, NULL,
+ error = ip6_getpmtu(&sro, NULL, NULL,
&in6p->in6p_faddr, &pmtu, NULL);
+ if (sro.ro_rt)
+ RTFREE(sro.ro_rt);
if (error)
break;
if (pmtu > IPV6_MAXPACKET)
diff --git a/sys/netinet6/udp6_output.c b/sys/netinet6/udp6_output.c
index 36a7fba..d905e84 100644
--- a/sys/netinet6/udp6_output.c
+++ b/sys/netinet6/udp6_output.c
@@ -203,8 +203,7 @@ udp6_output(in6p, m, addr6, control, td)
if (!IN6_IS_ADDR_V4MAPPED(faddr)) {
laddr = in6_selectsrc(sin6, in6p->in6p_outputopts,
- in6p->in6p_moptions,
- &in6p->in6p_route,
+ in6p->in6p_moptions, NULL,
&in6p->in6p_laddr, &error);
} else
laddr = &in6p->in6p_laddr; /* XXX */
@@ -277,9 +276,7 @@ udp6_output(in6p, m, addr6, control, td)
ip6->ip6_plen = htons((u_short)plen);
#endif
ip6->ip6_nxt = IPPROTO_UDP;
- ip6->ip6_hlim = in6_selecthlim(in6p,
- in6p->in6p_route.ro_rt ?
- in6p->in6p_route.ro_rt->rt_ifp : NULL);
+ ip6->ip6_hlim = in6_selecthlim(in6p, NULL);
ip6->ip6_src = *laddr;
ip6->ip6_dst = *faddr;
@@ -297,7 +294,7 @@ udp6_output(in6p, m, addr6, control, td)
goto release;
}
#endif /* IPSEC */
- error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route,
+ error = ip6_output(m, in6p->in6p_outputopts, NULL,
flags, in6p->in6p_moptions, NULL, in6p);
break;
case AF_INET:
OpenPOWER on IntegriCloud