diff options
31 files changed, 1686 insertions, 1153 deletions
diff --git a/sys/conf/files b/sys/conf/files index 63c378b..8eee001 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1457,6 +1457,7 @@ netinet/ip_mroute.c optional mrouting netinet/ip_output.c optional inet netinet/raw_ip.c optional inet netinet/tcp_debug.c optional tcpdebug +netinet/tcp_hostcache.c optional inet netinet/tcp_input.c optional inet netinet/tcp_output.c optional inet netinet/tcp_subr.c optional inet diff --git a/sys/net/if_faith.c b/sys/net/if_faith.c index 07216b5..a8da4ad 100644 --- a/sys/net/if_faith.c +++ b/sys/net/if_faith.c @@ -270,17 +270,8 @@ faithrtrequest(cmd, rt, info) struct rt_addrinfo *info; { RT_LOCK_ASSERT(rt); - - if (rt) { - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ - /* - * For optimal performance, the send and receive buffers - * should be at least twice the MTU plus a little more for - * overhead. - */ - rt->rt_rmx.rmx_recvpipe = - rt->rt_rmx.rmx_sendpipe = 3 * FAITHMTU; - } + if (rt) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; } /* diff --git a/sys/net/if_loop.c b/sys/net/if_loop.c index afe0a73..9a54af4 100644 --- a/sys/net/if_loop.c +++ b/sys/net/if_loop.c @@ -329,17 +329,8 @@ lortrequest(cmd, rt, info) struct rt_addrinfo *info; { RT_LOCK_ASSERT(rt); - - if (rt) { - rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; /* for ISO */ - /* - * For optimal performance, the send and receive buffers - * should be at least twice the MTU plus a little more for - * overhead. - */ - rt->rt_rmx.rmx_recvpipe = - rt->rt_rmx.rmx_sendpipe = 3 * LOMTU; - } + if (rt) + rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; } /* diff --git a/sys/net/route.h b/sys/net/route.h index 8fff560..34c33eb 100644 --- a/sys/net/route.h +++ b/sys/net/route.h @@ -58,6 +58,12 @@ struct route { * These numbers are used by reliable protocols for determining * retransmission behavior and are included in the routing structure. */ +struct rt_metrics_lite { + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_expire; /* lifetime for route, e.g. 
redirect */ + u_long rmx_pksent; /* packets sent using this route */ +}; + struct rt_metrics { u_long rmx_locks; /* Kernel must leave these values alone */ u_long rmx_mtu; /* MTU for this path */ @@ -104,10 +110,10 @@ struct rtentry { long rt_refcnt; /* # held references */ u_long rt_flags; /* up/down?, host/net */ struct ifnet *rt_ifp; /* the answer: interface to use */ - struct ifaddr *rt_ifa; /* the answer: interface to use */ + struct ifaddr *rt_ifa; /* the answer: interface address to use */ struct sockaddr *rt_genmask; /* for generation of cloned routes */ caddr_t rt_llinfo; /* pointer to link level info cache */ - struct rt_metrics rt_rmx; /* metrics used by rx'ing protocols */ + struct rt_metrics_lite rt_rmx; /* metrics used by rx'ing protocols */ struct rtentry *rt_gwroute; /* implied entry for gatewayed routes */ int (*rt_output)(struct ifnet *, struct mbuf *, struct sockaddr *, struct rtentry *); diff --git a/sys/net/rtsock.c b/sys/net/rtsock.c index 4fba1a2..3290c0c 100644 --- a/sys/net/rtsock.c +++ b/sys/net/rtsock.c @@ -87,7 +87,8 @@ static int sysctl_dumpentry(struct radix_node *rn, void *vw); static int sysctl_iflist(int af, struct walkarg *w); static int sysctl_ifmalist(int af, struct walkarg *w); static int route_output(struct mbuf *, struct socket *); -static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics *); +static void rt_setmetrics(u_long, struct rt_metrics *, struct rt_metrics_lite *); +static void rt_getmetrics(struct rt_metrics_lite *, struct rt_metrics *); static void rt_dispatch(struct mbuf *, struct sockaddr *); /* @@ -355,9 +356,6 @@ route_output(m, so) RT_LOCK(saved_nrt); rt_setmetrics(rtm->rtm_inits, &rtm->rtm_rmx, &saved_nrt->rt_rmx); - saved_nrt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); - saved_nrt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); RT_REMREF(saved_nrt); saved_nrt->rt_genmask = info.rti_info[RTAX_GENMASK]; RT_UNLOCK(saved_nrt); @@ -428,7 +426,7 @@ route_output(m, so) 
(void)rt_msg2(rtm->rtm_type, &info, (caddr_t)rtm, (struct walkarg *)0); rtm->rtm_flags = rt->rt_flags; - rtm->rtm_rmx = rt->rt_rmx; + rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); rtm->rtm_addrs = info.rti_addrs; break; @@ -478,9 +476,7 @@ route_output(m, so) rt->rt_genmask = info.rti_info[RTAX_GENMASK]; /* FALLTHROUGH */ case RTM_LOCK: - rt->rt_rmx.rmx_locks &= ~(rtm->rtm_inits); - rt->rt_rmx.rmx_locks |= - (rtm->rtm_inits & rtm->rtm_rmx.rmx_locks); + /* We don't support locks anymore */ break; } RT_UNLOCK(rt); @@ -542,20 +538,28 @@ flush: } static void -rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics *out) +rt_setmetrics(u_long which, struct rt_metrics *in, struct rt_metrics_lite *out) { #define metric(f, e) if (which & (f)) out->e = in->e; - metric(RTV_RPIPE, rmx_recvpipe); - metric(RTV_SPIPE, rmx_sendpipe); - metric(RTV_SSTHRESH, rmx_ssthresh); - metric(RTV_RTT, rmx_rtt); - metric(RTV_RTTVAR, rmx_rttvar); - metric(RTV_HOPCOUNT, rmx_hopcount); + /* + * Only these are stored in the routing entry since introduction + * of tcp hostcache. The rest is ignored. + */ metric(RTV_MTU, rmx_mtu); metric(RTV_EXPIRE, rmx_expire); #undef metric } +static void +rt_getmetrics(struct rt_metrics_lite *in, struct rt_metrics *out) +{ +#define metric(e) out->e = in->e; + bzero(out, sizeof(*out)); + metric(rmx_mtu); + metric(rmx_expire); +#undef metric +} + #define ROUNDUP(a) \ ((a) > 0 ? 
(1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long)) @@ -948,8 +952,8 @@ sysctl_dumpentry(struct radix_node *rn, void *vw) struct rt_msghdr *rtm = (struct rt_msghdr *)w->w_tmem; rtm->rtm_flags = rt->rt_flags; - rtm->rtm_use = rt->rt_use; - rtm->rtm_rmx = rt->rt_rmx; + rtm->rtm_use = rt->rt_rmx.rmx_pksent; + rt_getmetrics(&rt->rt_rmx, &rtm->rtm_rmx); rtm->rtm_index = rt->rt_ifp->if_index; rtm->rtm_errno = rtm->rtm_pid = rtm->rtm_seq = 0; rtm->rtm_addrs = info.rti_addrs; diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c index 11735ec..898c0d4 100644 --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -561,7 +561,6 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td) if (error) return (error); } - if (!TAILQ_EMPTY(&in_ifaddrhead)) { /* * If the destination address is INADDR_ANY, @@ -579,32 +578,20 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td) &in_ifaddrhead)->ia_broadaddr)->sin_addr; } if (laddr.s_addr == INADDR_ANY) { - register struct route *ro; + struct route sro; + sro.ro_rt = NULL; ia = (struct in_ifaddr *)0; /* - * If route is known or can be allocated now, - * our src addr is taken from the i/f, else punt. - * Note that we should check the address family of the cached - * destination, in case of sharing the cache with IPv6. + * If route is known our src addr is taken from the i/f, + * else punt. 
*/ - ro = &inp->inp_route; - if (ro->ro_rt && ((ro->ro_rt->rt_flags & RTF_UP) == 0 || - ro->ro_dst.sa_family != AF_INET || - satosin(&ro->ro_dst)->sin_addr.s_addr != faddr.s_addr || - inp->inp_socket->so_options & SO_DONTROUTE)) { - RTFREE(ro->ro_rt); - ro->ro_rt = (struct rtentry *)0; - } - if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0 && /*XXX*/ - (ro->ro_rt == (struct rtentry *)0 || - ro->ro_rt->rt_ifp == (struct ifnet *)0)) { - /* No route yet, so try to acquire one */ - bzero(&ro->ro_dst, sizeof(struct sockaddr_in)); - ro->ro_dst.sa_family = AF_INET; - ro->ro_dst.sa_len = sizeof(struct sockaddr_in); - ((struct sockaddr_in *)&ro->ro_dst)->sin_addr = faddr; - rtalloc(ro); + if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) { + /* Find out route to destination */ + sro.ro_dst.sa_family = AF_INET; + sro.ro_dst.sa_len = sizeof(struct sockaddr_in); + ((struct sockaddr_in *)&sro.ro_dst)->sin_addr = faddr; + rtalloc_ign(&sro, RTF_CLONING); } /* * If we found a route, use the address @@ -612,8 +599,10 @@ in_pcbconnect_setup(inp, nam, laddrp, lportp, faddrp, fportp, oinpp, td) * unless it is the loopback (in case a route * to our address on another net goes to loopback). */ - if (ro->ro_rt && !(ro->ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) - ia = ifatoia(ro->ro_rt->rt_ifa); + if (sro.ro_rt && !(sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK)) + ia = ifatoia(sro.ro_rt->rt_ifa); + if (sro.ro_rt) + RTFREE(sro.ro_rt); if (ia == 0) { bzero(&sa, sizeof(sa)); sa.sin_addr = faddr; @@ -706,8 +695,6 @@ in_pcbdetach(inp) } if (inp->inp_options) (void)m_free(inp->inp_options); - if (inp->inp_route.ro_rt) - RTFREE(inp->inp_route.ro_rt); ip_freemoptions(inp->inp_moptions); inp->inp_vflag = 0; INP_LOCK_DESTROY(inp); @@ -884,62 +871,6 @@ in_pcbpurgeif0(pcbinfo, ifp) } /* - * Check for alternatives when higher level complains - * about service problems. For now, invalidate cached - * routing information. 
If the route was created dynamically - * (by a redirect), time to try a default gateway again. - */ -void -in_losing(inp) - struct inpcb *inp; -{ - register struct rtentry *rt; - struct rt_addrinfo info; - - INP_LOCK_ASSERT(inp); - - if ((rt = inp->inp_route.ro_rt)) { - RT_LOCK(rt); - inp->inp_route.ro_rt = NULL; - bzero((caddr_t)&info, sizeof(info)); - info.rti_flags = rt->rt_flags; - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rt_mask(rt); - rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); - if (rt->rt_flags & RTF_DYNAMIC) - rtexpunge(rt); - RTFREE_LOCKED(rt); - /* - * A new route can be allocated - * the next time output is attempted. - */ - } -} - -/* - * After a routing change, flush old routing - * and allocate a (hopefully) better one. - */ -struct inpcb * -in_rtchange(inp, errno) - register struct inpcb *inp; - int errno; -{ - INP_LOCK_ASSERT(inp); - - if (inp->inp_route.ro_rt) { - RTFREE(inp->inp_route.ro_rt); - inp->inp_route.ro_rt = 0; - /* - * A new route can be allocated the next time - * output is attempted. - */ - } - return inp; -} - -/* * Lookup a PCB based on the local address and port. 
*/ struct inpcb * diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h index 8a6717c..5e93328 100644 --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -94,31 +94,22 @@ struct in_endpoints { /* * XXX - * At some point struct route should possibly change to: - * struct rtentry *rt - * struct in_endpoints *ie; + * the defines for inc_* are hacks and should be changed to direct references */ struct in_conninfo { u_int8_t inc_flags; u_int8_t inc_len; u_int16_t inc_pad; /* XXX alignment for in_endpoints */ - /* protocol dependent part; cached route */ + /* protocol dependent part */ struct in_endpoints inc_ie; - union { - /* placeholder for routing entry */ - struct route inc4_route; - struct route_in6 inc6_route; - } inc_dependroute; }; #define inc_isipv6 inc_flags /* temp compatability */ #define inc_fport inc_ie.ie_fport #define inc_lport inc_ie.ie_lport #define inc_faddr inc_ie.ie_faddr #define inc_laddr inc_ie.ie_laddr -#define inc_route inc_dependroute.inc4_route #define inc6_faddr inc_ie.ie6_faddr #define inc6_laddr inc_ie.ie6_laddr -#define inc6_route inc_dependroute.inc6_route struct icmp6_filter; @@ -157,7 +148,6 @@ struct inpcb { #define inp_lport inp_inc.inc_lport #define inp_faddr inp_inc.inc_faddr #define inp_laddr inp_inc.inc_laddr -#define inp_route inp_inc.inc_route #define inp_ip_tos inp_depend4.inp4_ip_tos #define inp_options inp_depend4.inp4_options #define inp_moptions inp_depend4.inp4_moptions @@ -182,7 +172,7 @@ struct inpcb { #define in6p_faddr inp_inc.inc6_faddr #define in6p_laddr inp_inc.inc6_laddr -#define in6p_route inp_inc.inc6_route +#define in6p_ip6_hlim inp_depend6.inp6_hlim #define in6p_hops inp_depend6.inp6_hops /* default hop limit */ #define in6p_ip6_nxt inp_ip_p #define in6p_flowinfo inp_flow @@ -347,9 +337,6 @@ extern int ipport_hifirstauto; extern int ipport_hilastauto; void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); -void in_losing(struct inpcb *); -struct inpcb * - in_rtchange(struct inpcb *, int); int 
in_pcballoc(struct socket *, struct inpcbinfo *, struct thread *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct thread *); int in_pcbbind_setup(struct inpcb *, struct sockaddr *, in_addr_t *, diff --git a/sys/netinet/in_rmx.c b/sys/netinet/in_rmx.c index 4625030..ea11792 100644 --- a/sys/netinet/in_rmx.c +++ b/sys/netinet/in_rmx.c @@ -98,8 +98,7 @@ in_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) rt->rt_flags |= RTF_MULTICAST; - if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) && - rt->rt_ifp) + if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) rt->rt_rmx.rmx_mtu = rt->rt_ifp->if_mtu; ret = rn_addroute(v_arg, n_arg, head, treenodes); diff --git a/sys/netinet/ip_divert.c b/sys/netinet/ip_divert.c index 172021b..bd777dd 100644 --- a/sys/netinet/ip_divert.c +++ b/sys/netinet/ip_divert.c @@ -336,7 +336,7 @@ div_output(struct socket *so, struct mbuf *m, ipstat.ips_rawout++; /* XXX */ error = ip_output((struct mbuf *)&divert_tag, - inp->inp_options, &inp->inp_route, + inp->inp_options, NULL, (so->so_options & SO_DONTROUTE) | IP_ALLOWBROADCAST | IP_RAWOUTPUT, inp->inp_moptions, NULL); @@ -527,11 +527,8 @@ div_ctlinput(int cmd, struct sockaddr *sa, void *vip) faddr = ((struct sockaddr_in *)sa)->sin_addr; if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; - if (PRC_IS_REDIRECT(cmd)) { - /* flush held routes */ - in_pcbnotifyall(&divcbinfo, faddr, - inetctlerrmap[cmd], in_rtchange); - } + if (PRC_IS_REDIRECT(cmd)) + return; } static int diff --git a/sys/netinet/ip_fw2.c b/sys/netinet/ip_fw2.c index 5d3e3da..999d064 100644 --- a/sys/netinet/ip_fw2.c +++ b/sys/netinet/ip_fw2.c @@ -466,10 +466,13 @@ verify_rev_path(struct in_addr src, struct ifnet *ifp) rtalloc_ign(&ro, RTF_CLONING); } - if ((ro.ro_rt == NULL) || (ifp == NULL) || - (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) + if (ro.ro_rt == NULL) return 0; - + if ((ifp == NULL) || (ro.ro_rt->rt_ifp->if_index != ifp->if_index)) { 
+ RTFREE(ro.ro_rt); + return 0; + } + RTFREE(ro.ro_rt); return 1; } diff --git a/sys/netinet/ip_icmp.c b/sys/netinet/ip_icmp.c index f94e7b9..b84d689 100644 --- a/sys/netinet/ip_icmp.c +++ b/sys/netinet/ip_icmp.c @@ -52,11 +52,15 @@ #include <net/route.h> #include <netinet/in.h> +#include <netinet/in_pcb.h> #include <netinet/in_systm.h> #include <netinet/in_var.h> #include <netinet/ip.h> #include <netinet/ip_icmp.h> #include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#include <netinet/tcpip.h> #include <netinet/icmp_var.h> #ifdef IPSEC @@ -395,7 +399,7 @@ icmp_input(m, off) printf("deliver to protocol %d\n", icp->icmp_ip.ip_p); #endif icmpsrc.sin_addr = icp->icmp_ip.ip_dst; -#if 1 + /* * MTU discovery: * If we got a needfrag and there is a host route to the @@ -405,40 +409,37 @@ icmp_input(m, off) * notice that the MTU has changed and adapt accordingly. * If no new MTU was suggested, then we guess a new one * less than the current value. If the new MTU is - * unreasonably small (arbitrarily set at 296), then - * we reset the MTU to the interface value and enable the - * lock bit, indicating that we are no longer doing MTU - * discovery. + * unreasonably small, then we don't update the MTU value. + * + * XXX: All this should be done in tcp_mtudisc() because + * the way we do it now, everyone can send us bogus ICMP + * MSGSIZE packets for any destination. By doing this far + * higher in the chain we have a matching tcp connection. + * Thus spoofing is much harder. However there is no easy + * non-hackish way to pass the new MTU up to tcp_mtudisc(). + * Also see next XXX regarding IPv4 AH TCP. 
*/ if (code == PRC_MSGSIZE) { - struct rtentry *rt; int mtu; + struct in_conninfo inc; + + bzero(&inc, sizeof(inc)); + inc.inc_flags = 0; /* IPv4 */ + inc.inc_faddr = icmpsrc.sin_addr; + + mtu = ntohs(icp->icmp_nextmtu); + if (!mtu) + mtu = ip_next_mtu(mtu, 1); + + if (mtu >= 256 + sizeof(struct tcpiphdr)) + tcp_hc_updatemtu(&inc, mtu); - rt = rtalloc1((struct sockaddr *)&icmpsrc, 0, - RTF_CLONING); - if (rt && (rt->rt_flags & RTF_HOST) - && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { - mtu = ntohs(icp->icmp_nextmtu); - if (!mtu) - mtu = ip_next_mtu(rt->rt_rmx.rmx_mtu, - 1); #ifdef DEBUG_MTUDISC - printf("MTU for %s reduced to %d\n", - inet_ntoa(icmpsrc.sin_addr), mtu); + printf("MTU for %s reduced to %d\n", + inet_ntoa(icmpsrc.sin_addr), mtu); #endif - if (mtu < 296) { - /* rt->rt_rmx.rmx_mtu = - rt->rt_ifp->if_mtu; */ - rt->rt_rmx.rmx_locks |= RTV_MTU; - } else if (rt->rt_rmx.rmx_mtu > mtu) { - rt->rt_rmx.rmx_mtu = mtu; - } - } - if (rt) - rtfree(rt); } -#endif /* * XXX if the packet contains [IPv4 AH TCP], we can't make a * notification to TCP layer. @@ -785,7 +786,6 @@ iptime() return (htonl(t)); } -#if 1 /* * Return the next larger or smaller MTU plateau (table from RFC 1191) * given current value MTU. 
If DIR is less than zero, a larger plateau @@ -823,7 +823,6 @@ ip_next_mtu(mtu, dir) } } } -#endif /* diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index df67d22..3d528f4 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1612,22 +1612,22 @@ struct in_ifaddr * ip_rtaddr(dst) struct in_addr dst; { + struct route sro; struct sockaddr_in *sin; struct in_ifaddr *ifa; - struct route ro; - bzero(&ro, sizeof(ro)); - sin = (struct sockaddr_in *)&ro.ro_dst; + sro.ro_rt = NULL; + sin = (struct sockaddr_in *)&sro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); sin->sin_addr = dst; - rtalloc_ign(&ro, RTF_CLONING); + rtalloc_ign(&sro, RTF_CLONING); - if (ro.ro_rt == 0) + if (sro.ro_rt == NULL) return ((struct in_ifaddr *)0); - ifa = ifatoia(ro.ro_rt->rt_ifa); - RTFREE(ro.ro_rt); + ifa = ifatoia(sro.ro_rt->rt_ifa); + RTFREE(sro.ro_rt); return ifa; } @@ -1879,7 +1879,7 @@ ip_forward(struct mbuf *m, int srcrt, struct sockaddr_in *next_hop) struct route ro; struct rtentry *rt; - bzero(&ro, sizeof(ro)); + ro.ro_rt = NULL; sin = (struct sockaddr_in *)&ro.ro_dst; sin->sin_family = AF_INET; sin->sin_len = sizeof(*sin); diff --git a/sys/netinet/ip_output.c b/sys/netinet/ip_output.c index cdf8b87..0a11524 100644 --- a/sys/netinet/ip_output.c +++ b/sys/netinet/ip_output.c @@ -302,13 +302,9 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, isbroadcast = 0; /* fool gcc */ } else { /* - * If this is the case, we probably don't want to allocate - * a protocol-cloned route since we didn't get one from the - * ULP. This lets TCP do its thing, while not burdening - * forwarding or ICMP with the overhead of cloning a route. - * Of course, we still want to do any cloning requested by - * the link layer, as this is probably required in all cases - * for correct operation (as it is for ARP). 
+ * We want to do any cloning requested by the link layer, + * as this is probably required in all cases for correct + * operation (as it is for ARP). */ if (ro->ro_rt == 0) rtalloc(ro); @@ -319,7 +315,7 @@ ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, } ia = ifatoia(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; + ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *)ro->ro_rt->rt_gateway; if (ro->ro_rt->rt_flags & RTF_HOST) @@ -931,16 +927,14 @@ spd_done: ip_input((struct mbuf *)&tag); goto done; } - /* Some of the logic for this was + /* + * Some of the logic for this was * nicked from above. - * - * This rewrites the cached route in a local PCB. - * Is this what we want to do? */ bcopy(dst, &ro_fwd->ro_dst, sizeof(*dst)); ro_fwd->ro_rt = 0; - rtalloc(ro_fwd); + rtalloc_ign(ro_fwd, RTF_CLONING); if (ro_fwd->ro_rt == 0) { ipstat.ips_noroute++; @@ -950,7 +944,7 @@ spd_done: ia = ifatoia(ro_fwd->ro_rt->rt_ifa); ifp = ro_fwd->ro_rt->rt_ifp; - ro_fwd->ro_rt->rt_use++; + ro_fwd->ro_rt->rt_rmx.rmx_pksent++; if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY) dst = (struct sockaddr_in *) ro_fwd->ro_rt->rt_gateway; @@ -1045,7 +1039,6 @@ pass: * routes when the MTU is changed. 
*/ if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) && - !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) && (ro->ro_rt->rt_rmx.rmx_mtu > ifp->if_mtu)) { ro->ro_rt->rt_rmx.rmx_mtu = ifp->if_mtu; } @@ -1983,7 +1976,7 @@ ip_setmoptions(sopt, imop) dst->sin_len = sizeof(*dst); dst->sin_family = AF_INET; dst->sin_addr = mreq.imr_multiaddr; - rtalloc(&ro); + rtalloc_ign(&ro, RTF_CLONING); if (ro.ro_rt == NULL) { error = EADDRNOTAVAIL; splx(s); diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c index 632e00a..0a76a7f 100644 --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -302,7 +302,7 @@ rip_output(struct mbuf *m, struct socket *so, u_long dst) if (inp->inp_flags & INP_ONESBCAST) flags |= IP_SENDONES; - return (ip_output(m, inp->inp_options, &inp->inp_route, flags, + return (ip_output(m, inp->inp_options, NULL, flags, inp->inp_moptions, inp)); } diff --git a/sys/netinet/tcp_hostcache.c b/sys/netinet/tcp_hostcache.c new file mode 100644 index 0000000..461ce85 --- /dev/null +++ b/sys/netinet/tcp_hostcache.c @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2002 Andre Oppermann, Internet Business Solutions AG + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * The tcp_hostcache moves the tcp specific cached metrics from the routing + * table into a dedicated structure indexed by the remote IP address. It + * keeps information on the measured tcp parameters of past tcp sessions + * to have better initial start values for following connections from the + * same source. Depending on the network parameters (delay, bandwidth, max + * MTU, congestion window) between local and remote site this can lead to + * significant speedups for new tcp connections after the first one. + * + * Due to this new tcp_hostcache all tcp specific metrics information in + * the routing table has been removed. The INPCB no longer keeps a pointer + * to the routing entry and protocol initiated route cloning has been + * removed as well. With these changes the routing table has gone back + * to being more lightweight and only carries information related to packet + * forwarding. + * + * Tcp_hostcache is designed for multiple concurrent access in SMP + * environments and high contention. All bucket rows have their own + * lock and thus multiple lookups and modifies can be done at the same + * time as long as they are in different bucket rows.
If a request for + * insertion of a new record can't be satisfied it simply returns an + * empty structure. Nobody and nothing shall ever point directly to + * any entry in tcp_hostcache. All communication is done in an object + * oriented way and only funtions of tcp_hostcache will manipulate hostcache + * entries. Otherwise we are unable to achieve good behaviour in concurrent + * access situations. Since tcp_hostcache is only caching information there + * are no fatal consequences if we either can't satisfy any particular request + * or have to drop/overwrite an existing entry because of bucket limit + * memory constrains. + */ + +/* + * Many thanks to jlemon for basic structure of tcp_syncache which is being + * followed here. + */ + +#include "opt_inet6.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/malloc.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/sysctl.h> + +#include <net/if.h> + +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/in_var.h> +#include <netinet/in_pcb.h> +#include <netinet/ip_var.h> +#ifdef INET6 +#include <netinet/ip6.h> +#include <netinet6/ip6_var.h> +#endif +#include <netinet/tcp.h> +#include <netinet/tcp_var.h> +#ifdef INET6 +#include <netinet6/tcp6_var.h> +#endif + +#include <vm/uma.h> + + +TAILQ_HEAD(hc_qhead, hc_metrics); + +struct hc_head { + struct hc_qhead hch_bucket; + u_int hch_length; + struct mtx hch_mtx; +}; + +struct hc_metrics { + /* housekeeping */ + TAILQ_ENTRY(hc_metrics) rmx_q; + struct hc_head *rmx_head; /* head of bucket tail queue */ + struct in_addr ip4; /* IP address */ + struct in6_addr ip6; /* IP6 address */ + /* endpoint specific values for tcp */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + 
u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ + struct rmxp_tao rmx_tao; /* TAO cache for T/TCP */ + /* tcp hostcache internal data */ + int rmx_expire; /* lifetime for object */ + u_long rmx_hits; /* number of hits */ + u_long rmx_updates; /* number of updates */ +}; + +/* Arbitrary values */ +#define TCP_HOSTCACHE_HASHSIZE 512 +#define TCP_HOSTCACHE_BUCKETLIMIT 30 +#define TCP_HOSTCACHE_EXPIRE 60*60 /* one hour */ +#define TCP_HOSTCACHE_PRUNE 5*60 /* every 5 minutes */ + +struct tcp_hostcache { + struct hc_head *hashbase; + uma_zone_t zone; + u_int hashsize; + u_int hashmask; + u_int bucket_limit; + u_int cache_count; + u_int cache_limit; + int expire; + int purgeall; +}; +static struct tcp_hostcache tcp_hostcache; + +static struct callout tcp_hc_callout; + +static struct hc_metrics *tcp_hc_lookup(struct in_conninfo *); +static struct hc_metrics *tcp_hc_insert(struct in_conninfo *); +static int sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS); +static void tcp_hc_purge(void *); + +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hostcache, CTLFLAG_RW, 0, "TCP Host cache"); + +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, cachelimit, CTLFLAG_RDTUN, + &tcp_hostcache.cache_limit, 0, "Overall entry limit for hostcache"); + +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, hashsize, CTLFLAG_RDTUN, + &tcp_hostcache.hashsize, 0, "Size of TCP hostcache hashtable"); + +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, bucketlimit, CTLFLAG_RDTUN, + &tcp_hostcache.bucket_limit, 0, "Per-bucket hash limit for hostcache"); + +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, count, CTLFLAG_RD, + &tcp_hostcache.cache_count, 0, "Current number of entries in hostcache"); + +SYSCTL_INT(_net_inet_tcp_hostcache, OID_AUTO, expire, CTLFLAG_RW, + &tcp_hostcache.expire, 0, "Expire time of TCP hostcache entries"); + +SYSCTL_INT(_net_inet_tcp_hostcache, 
OID_AUTO, purge, CTLFLAG_RW, + &tcp_hostcache.purgeall, 0, "Expire all entires on next purge run"); + +SYSCTL_PROC(_net_inet_tcp_hostcache, OID_AUTO, list, + CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_SKIP, 0, 0, + sysctl_tcp_hc_list, "A", "List of all hostcache entries"); + + +static MALLOC_DEFINE(M_HOSTCACHE, "hostcache", "TCP hostcache"); + +#define HOSTCACHE_HASH(ip) \ + (((ip)->s_addr ^ ((ip)->s_addr >> 7) ^ ((ip)->s_addr >> 17)) & \ + tcp_hostcache.hashmask) + +/* XXX: What is the recommended hash to get good entropy for IPv6 addresses? */ +#define HOSTCACHE_HASH6(ip6) \ + (((ip6)->s6_addr32[0] ^ \ + (ip6)->s6_addr32[1] ^ \ + (ip6)->s6_addr32[2] ^ \ + (ip6)->s6_addr32[3]) & \ + tcp_hostcache.hashmask) + +#define THC_LOCK(lp) mtx_lock(lp) +#define THC_UNLOCK(lp) mtx_unlock(lp) + +void +tcp_hc_init(void) +{ + int i; + + /* + * Initialize hostcache structures + */ + tcp_hostcache.cache_count = 0; + tcp_hostcache.hashsize = TCP_HOSTCACHE_HASHSIZE; + tcp_hostcache.bucket_limit = TCP_HOSTCACHE_BUCKETLIMIT; + tcp_hostcache.cache_limit = + tcp_hostcache.hashsize * tcp_hostcache.bucket_limit; + tcp_hostcache.expire = TCP_HOSTCACHE_EXPIRE; + + TUNABLE_INT_FETCH("net.inet.tcp.hostcache.hashsize", + &tcp_hostcache.hashsize); + TUNABLE_INT_FETCH("net.inet.tcp.hostcache.cachelimit", + &tcp_hostcache.cache_limit); + TUNABLE_INT_FETCH("net.inet.tcp.hostcache.bucketlimit", + &tcp_hostcache.bucket_limit); + if (!powerof2(tcp_hostcache.hashsize)) { + printf("WARNING: hostcache hash size is not a power of 2.\n"); + tcp_hostcache.hashsize = 512; /* safe default */ + } + tcp_hostcache.hashmask = tcp_hostcache.hashsize - 1; + + /* + * Allocate the hash table + */ + tcp_hostcache.hashbase = (struct hc_head *) + malloc(tcp_hostcache.hashsize * sizeof(struct hc_head), + M_HOSTCACHE, M_WAITOK | M_ZERO); + + /* + * Initialize the hash buckets + */ + for (i = 0; i < tcp_hostcache.hashsize; i++) { + TAILQ_INIT(&tcp_hostcache.hashbase[i].hch_bucket); + tcp_hostcache.hashbase[i].hch_length = 0; 
+ mtx_init(&tcp_hostcache.hashbase[i].hch_mtx, "tcp_hc_entry", + NULL, MTX_DEF); + } + + /* + * Allocate the hostcache entries. + */ + tcp_hostcache.zone = uma_zcreate("hostcache", sizeof(struct hc_metrics), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcp_hostcache.zone, tcp_hostcache.cache_limit); + + /* + * Set up periodic cache cleanup. + */ + callout_init(&tcp_hc_callout, CALLOUT_MPSAFE); + callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0); +} + +/* + * Internal function: lookup an entry in the hostcache or return NULL. + * + * If an entry has been returned, the caller becomes responsible for + * unlocking the bucket row after he is done reading/modifying the entry. + */ +static struct hc_metrics * +tcp_hc_lookup(struct in_conninfo *inc) +{ + int hash; + struct hc_head *hc_head; + struct hc_metrics *hc_entry; + + KASSERT(inc != NULL, ("tcp_hc_lookup with NULL in_conninfo pointer")); + + /* + * Hash the foreign ip address. + */ + if (inc->inc_isipv6) + hash = HOSTCACHE_HASH6(&inc->inc6_faddr); + else + hash = HOSTCACHE_HASH(&inc->inc_faddr); + + hc_head = &tcp_hostcache.hashbase[hash]; + + /* + * aquire lock for this bucket row + * we release the lock if we don't find an entry, + * otherwise the caller has to unlock after he is done + */ + THC_LOCK(&hc_head->hch_mtx); + + /* + * circle through entries in bucket row looking for a match + */ + TAILQ_FOREACH(hc_entry, &hc_head->hch_bucket, rmx_q) { + if (inc->inc_isipv6) { + if (memcmp(&inc->inc6_faddr, &hc_entry->ip6, + sizeof(inc->inc6_faddr)) == 0) + return hc_entry; + } else { + if (memcmp(&inc->inc_faddr, &hc_entry->ip4, + sizeof(inc->inc_faddr)) == 0) + return hc_entry; + } + } + + /* + * We were unsuccessful and didn't find anything + */ + THC_UNLOCK(&hc_head->hch_mtx); + return NULL; +} + +/* + * Internal function: insert an entry into the hostcache or return NULL + * if unable to allocate a new one. 
+ * + * If an entry has been returned, the caller becomes responsible for + * unlocking the bucket row after he is done reading/modifying the entry. + */ +static struct hc_metrics * +tcp_hc_insert(struct in_conninfo *inc) +{ + int hash; + struct hc_head *hc_head; + struct hc_metrics *hc_entry; + + KASSERT(inc != NULL, ("tcp_hc_insert with NULL in_conninfo pointer")); + + /* + * Hash the foreign ip address + */ + if (inc->inc_isipv6) + hash = HOSTCACHE_HASH6(&inc->inc6_faddr); + else + hash = HOSTCACHE_HASH(&inc->inc_faddr); + + hc_head = &tcp_hostcache.hashbase[hash]; + + /* + * aquire lock for this bucket row + * we release the lock if we don't find an entry, + * otherwise the caller has to unlock after he is done + */ + THC_LOCK(&hc_head->hch_mtx); + + /* + * If the bucket limit is reached reuse the least used element + */ + if (hc_head->hch_length >= tcp_hostcache.bucket_limit || + tcp_hostcache.cache_count >= tcp_hostcache.cache_limit) { + hc_entry = TAILQ_LAST(&hc_head->hch_bucket, hc_qhead); + /* + * At first we were dropping the last element, just to + * reaquire it in the next two lines again which ain't + * very efficient. Instead just reuse the least used element. + * maybe we drop something that is still "in-use" but we can + * be "lossy". 
+ */ +#if 0 + TAILQ_REMOVE(&hc_head->hch_bucket, hc_entry, rmx_q); + uma_zfree(tcp_hostcache.zone, hc_entry); + tcp_hostcache.hashbase[hash].hch_length--; + tcp_hostcache.cache_count--; +#endif + tcpstat.tcps_hc_bucketoverflow++; + } else { + /* + * Allocate a new entry, or balk if not possible + */ + hc_entry = uma_zalloc(tcp_hostcache.zone, M_NOWAIT); + if (hc_entry == NULL) { + THC_UNLOCK(&hc_head->hch_mtx); + return NULL; + } + } + + /* + * Initialize basic information of hostcache entry + */ + bzero(hc_entry, sizeof(*hc_entry)); + if (inc->inc_isipv6) + bcopy(&hc_entry->ip6, &inc->inc6_faddr, sizeof(hc_entry->ip6)); + else + hc_entry->ip4 = inc->inc_faddr; + hc_entry->rmx_head = hc_head; + hc_entry->rmx_expire = tcp_hostcache.expire; + + /* + * Put it upfront + */ + TAILQ_INSERT_HEAD(&hc_head->hch_bucket, hc_entry, rmx_q); + tcp_hostcache.hashbase[hash].hch_length++; + tcp_hostcache.cache_count++; + tcpstat.tcps_hc_added++; + + return hc_entry; +} + +/* + * External function: lookup an entry in the hostcache and fill out the + * supplied tcp metrics structure. Fills in null when no entry was found + * or a value is not set. 
+ */ +void +tcp_hc_get(struct in_conninfo *inc, struct hc_metrics_lite *hc_metrics_lite) +{ + struct hc_metrics *hc_entry; + + /* + * Find the right bucket + */ + hc_entry = tcp_hc_lookup(inc); + + /* + * If we don't have an existing object + */ + if (hc_entry == NULL) { + bzero(hc_metrics_lite, sizeof(*hc_metrics_lite)); + return; + } + hc_entry->rmx_hits++; + hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + + hc_metrics_lite->rmx_mtu = hc_entry->rmx_mtu; + hc_metrics_lite->rmx_ssthresh = hc_entry->rmx_ssthresh; + hc_metrics_lite->rmx_rtt = hc_entry->rmx_rtt; + hc_metrics_lite->rmx_rttvar = hc_entry->rmx_rttvar; + hc_metrics_lite->rmx_bandwidth = hc_entry->rmx_bandwidth; + hc_metrics_lite->rmx_cwnd = hc_entry->rmx_cwnd; + hc_metrics_lite->rmx_sendpipe = hc_entry->rmx_sendpipe; + hc_metrics_lite->rmx_recvpipe = hc_entry->rmx_recvpipe; + + /* + * unlock bucket row + */ + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * External function: lookup an entry in the hostcache and return the + * discovered path mtu. Returns null if no entry found or value not is set. + */ +u_long +tcp_hc_getmtu(struct in_conninfo *inc) +{ + struct hc_metrics *hc_entry; + u_long mtu; + + hc_entry = tcp_hc_lookup(inc); + if (hc_entry == NULL) { + return 0; + } + hc_entry->rmx_hits++; + hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + + mtu = hc_entry->rmx_mtu; + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); + return mtu; +} + +/* + * External function: lookup an entry in the hostcache and fill out the + * supplied t/tcp tao structure. Fills in null when no entry was found + * or a value is not set. 
+ */ +void +tcp_hc_gettao(struct in_conninfo *inc, struct rmxp_tao *tao) +{ + struct hc_metrics *hc_entry; + + hc_entry = tcp_hc_lookup(inc); + if (hc_entry == NULL) { + bzero(tao, sizeof(*tao)); + return; + } + hc_entry->rmx_hits++; + hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + + bcopy(tao, &hc_entry->rmx_tao, sizeof(*tao)); + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * External function: update the mtu value of an entry in the hostcache. + * Creates a new entry if none was found. + */ +void +tcp_hc_updatemtu(struct in_conninfo *inc, u_long mtu) +{ + struct hc_metrics *hc_entry; + + /* + * Find the right bucket + */ + hc_entry = tcp_hc_lookup(inc); + + /* + * If we don't have an existing object try to insert a new one + */ + if (hc_entry == NULL) { + hc_entry = tcp_hc_insert(inc); + if (hc_entry == NULL) + return; + } + hc_entry->rmx_updates++; + hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + + hc_entry->rmx_mtu = mtu; + + /* + * put it upfront so we find it faster next time + */ + TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + + /* + * unlock bucket row + */ + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * External function: update the tcp metrics of an entry in the hostcache. + * Creates a new entry if none was found. 
+ */ +void +tcp_hc_update(struct in_conninfo *inc, struct hc_metrics_lite *hcml) +{ + struct hc_metrics *hc_entry; + + hc_entry = tcp_hc_lookup(inc); + if (hc_entry == NULL) { + hc_entry = tcp_hc_insert(inc); + if (hc_entry == NULL) + return; + } + hc_entry->rmx_updates++; + hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + + if (hcml->rmx_rtt != 0) { + if (hc_entry->rmx_rtt == 0) + hc_entry->rmx_rtt = hcml->rmx_rtt; + else + hc_entry->rmx_rtt = + (hc_entry->rmx_rtt + hcml->rmx_rtt) / 2; + tcpstat.tcps_cachedrtt++; + } + if (hcml->rmx_rttvar != 0) { + if (hc_entry->rmx_rttvar == 0) + hc_entry->rmx_rttvar = hcml->rmx_rttvar; + else + hc_entry->rmx_rttvar = + (hc_entry->rmx_rttvar + hcml->rmx_rttvar) / 2; + tcpstat.tcps_cachedrttvar++; + } + if (hcml->rmx_ssthresh != 0) { + if (hc_entry->rmx_ssthresh == 0) + hc_entry->rmx_ssthresh = hcml->rmx_ssthresh; + else + hc_entry->rmx_ssthresh = + (hc_entry->rmx_ssthresh + hcml->rmx_ssthresh) / 2; + tcpstat.tcps_cachedssthresh++; + } + if (hcml->rmx_bandwidth != 0) { + if (hc_entry->rmx_bandwidth == 0) + hc_entry->rmx_bandwidth = hcml->rmx_bandwidth; + else + hc_entry->rmx_bandwidth = + (hc_entry->rmx_bandwidth + hcml->rmx_bandwidth) / 2; + /* tcpstat.tcps_cachedbandwidth++; */ + } + if (hcml->rmx_cwnd != 0) { + if (hc_entry->rmx_cwnd == 0) + hc_entry->rmx_cwnd = hcml->rmx_cwnd; + else + hc_entry->rmx_cwnd = + (hc_entry->rmx_cwnd + hcml->rmx_cwnd) / 2; + /* tcpstat.tcps_cachedcwnd++; */ + } + if (hcml->rmx_sendpipe != 0) { + if (hc_entry->rmx_sendpipe == 0) + hc_entry->rmx_sendpipe = hcml->rmx_sendpipe; + else + hc_entry->rmx_sendpipe = + (hc_entry->rmx_sendpipe + hcml->rmx_sendpipe) /2; + /* tcpstat.tcps_cachedsendpipe++; */ + } + if (hcml->rmx_recvpipe != 0) { + if (hc_entry->rmx_recvpipe == 0) + hc_entry->rmx_recvpipe = hcml->rmx_recvpipe; + else + hc_entry->rmx_recvpipe = + (hc_entry->rmx_recvpipe + hcml->rmx_recvpipe) /2; + /* tcpstat.tcps_cachedrecvpipe++; */ + } + + 
TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * External function: update the t/tcp tao of an entry in the hostcache. + * Creates a new entry if none was found. + */ +void +tcp_hc_updatetao(struct in_conninfo *inc, int field, tcp_cc ccount, u_short mss) +{ + struct hc_metrics *hc_entry; + + hc_entry = tcp_hc_lookup(inc); + if (hc_entry == NULL) { + hc_entry = tcp_hc_insert(inc); + if (hc_entry == NULL) + return; + } + hc_entry->rmx_updates++; + hc_entry->rmx_expire = tcp_hostcache.expire; /* start over again */ + + switch(field) { + case TCP_HC_TAO_CC: + hc_entry->rmx_tao.tao_cc = ccount; + break; + + case TCP_HC_TAO_CCSENT: + hc_entry->rmx_tao.tao_ccsent = ccount; + break; + + case TCP_HC_TAO_MSSOPT: + hc_entry->rmx_tao.tao_mssopt = mss; + break; + } + + TAILQ_REMOVE(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + TAILQ_INSERT_HEAD(&hc_entry->rmx_head->hch_bucket, hc_entry, rmx_q); + THC_UNLOCK(&hc_entry->rmx_head->hch_mtx); +} + +/* + * Sysctl function: prints the list and values of all hostcache entries in + * unsorted order. + */ +static int +sysctl_tcp_hc_list(SYSCTL_HANDLER_ARGS) +{ + int bufsize; + int linesize = 128; + char *p, *buf; + int len, i, error; + struct hc_metrics *hc_entry; + + bufsize = linesize * (tcp_hostcache.cache_count + 1); + + p = buf = (char *)malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO); + + len = snprintf(p, linesize, + "\nIP address MTU SSTRESH RTT RTTVAR BANDWIDTH " + " CWND SENDPIPE RECVPIPE HITS UPD EXP\n"); + p += len; + +#define msec(u) (((u) + 500) / 1000) + for (i = 0; i < tcp_hostcache.hashsize; i++) { + THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket, + rmx_q) { + len = snprintf(p, linesize, + "%-15s %5lu %8lu %6lums %6lums %9lu %8lu %8lu %8lu " + "%4lu %4lu %4i\n", + hc_entry->ip4.s_addr ? 
inet_ntoa(hc_entry->ip4) : +#ifdef INET6 + ip6_sprintf(&hc_entry->ip6), +#else + "IPv6?", +#endif + hc_entry->rmx_mtu, + hc_entry->rmx_ssthresh, + msec(hc_entry->rmx_rtt * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), + msec(hc_entry->rmx_rttvar * + (RTM_RTTUNIT / (hz * TCP_RTT_SCALE))), + hc_entry->rmx_bandwidth * hz * 8, + hc_entry->rmx_cwnd, + hc_entry->rmx_sendpipe, + hc_entry->rmx_recvpipe, + hc_entry->rmx_hits, + hc_entry->rmx_updates, + hc_entry->rmx_expire); + p += len; + } + THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx); + } +#undef msec + error = SYSCTL_OUT(req, buf, p - buf); + free(buf, M_TEMP); + return(error); +} + +/* + * Expire and purge (old|all) entries in the tcp_hostcache. Runs periodically + * from the callout. + */ +static void +tcp_hc_purge(void *arg) +{ + struct hc_metrics *hc_entry; + int all = (intptr_t)arg; + int i; + + if (tcp_hostcache.purgeall) { + all = 1; + tcp_hostcache.purgeall = 0; + } + + for (i = 0; i < tcp_hostcache.hashsize; i++) { + THC_LOCK(&tcp_hostcache.hashbase[i].hch_mtx); + TAILQ_FOREACH(hc_entry, &tcp_hostcache.hashbase[i].hch_bucket, + rmx_q) { + if (all || hc_entry->rmx_expire <= 0) { + TAILQ_REMOVE(&tcp_hostcache.hashbase[i].hch_bucket, + hc_entry, rmx_q); + uma_zfree(tcp_hostcache.zone, hc_entry); + tcp_hostcache.hashbase[i].hch_length--; + tcp_hostcache.cache_count--; + } else + hc_entry->rmx_expire -= TCP_HOSTCACHE_PRUNE; + } + THC_UNLOCK(&tcp_hostcache.hashbase[i].hch_mtx); + } + callout_reset(&tcp_hc_callout, TCP_HOSTCACHE_PRUNE * hz, tcp_hc_purge, 0); +} diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index a247138..eca5cb2 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *, #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ - ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ - (tp)->t_inpcb->in6p_route.ro_rt) \ - nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ + ((tp)->t_inpcb->inp_vflag & 
INP_IPV6) != 0) \ + nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) @@ -358,8 +357,7 @@ tcp_input(m, off0) int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ - struct rmxp_tao *taop; /* pointer to our TAO cache entry */ - struct rmxp_tao tao_noncached; /* in case there's no cached entry */ + struct rmxp_tao tao; /* our TAO cache entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ @@ -389,6 +387,7 @@ tcp_input(m, off0) #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif + bzero(&tao, sizeof(tao)); bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; @@ -707,11 +706,9 @@ findpcb: if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; - inc.inc6_route.ro_rt = NULL; /* XXX */ } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; - inc.inc_route.ro_rt = NULL; /* XXX */ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; @@ -916,7 +913,7 @@ findpcb: } after_listen: -/* XXX temp debugging */ + /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); @@ -930,8 +927,9 @@ after_listen: callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* - * Process options. - * XXX this is tradtitional behavior, may need to be cleaned up. + * Process options only when we get SYN/ACK back. The SYN case + * for incoming connections is handled in tcp_syncache. + * XXX this is traditional behavior, may need to be cleaned up. 
*/ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); if (thflags & TH_SYN) { @@ -1179,10 +1177,8 @@ after_listen: * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: - if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { - taop = &tao_noncached; - bzero(taop, sizeof(*taop)); - } + if (tcp_do_rfc1644) + tcp_hc_gettao(&inp->inp_inc, &tao); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || @@ -1195,7 +1191,7 @@ after_listen: * Our new SYN, when it arrives, will serve as the * needed ACK. */ - if (taop->tao_ccsent != 0) + if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; @@ -1225,7 +1221,7 @@ after_listen: */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { - if (taop->tao_ccsent != 0) + if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; @@ -1246,8 +1242,8 @@ after_listen: tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ - if (taop->tao_ccsent == 0) - taop->tao_ccsent = to.to_ccecho; + if (tao.tao_ccsent == 0 && tcp_do_rfc1644) + tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0); tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ @@ -1290,14 +1286,16 @@ after_listen: tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { - if (taop->tao_cc != 0 && - CC_GT(to.to_cc, taop->tao_cc)) { + if (tao.tao_cc != 0 && + CC_GT(to.to_cc, tao.tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ - taop->tao_cc = to.to_cc; + tao.tao_cc = to.to_cc; + tcp_hc_updatetao(&inp->inp_inc, + TCP_HC_TAO_CC, to.to_cc, 0); tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; @@ -1313,8 +1311,12 @@ after_listen: } else tp->t_state = TCPS_SYN_RECEIVED; } else { - /* CC.NEW or no option => invalidate cache */ - taop->tao_cc = 0; + if (tcp_do_rfc1644) { + /* CC.NEW or no option => invalidate cache */ + 
tao.tao_cc = 0; + tcp_hc_updatetao(&inp->inp_inc, + TCP_HC_TAO_CC, to.to_cc, 0); + } tp->t_state = TCPS_SYN_RECEIVED; } } @@ -1682,13 +1684,14 @@ trimthenstep6: } /* * Upon successful completion of 3-way handshake, - * update cache.CC if it was undefined, pass any queued - * data to the user, and advance state appropriately. + * update cache.CC, pass any queued data to the user, + * and advance state appropriately. */ - if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && - taop->tao_cc == 0) - taop->tao_cc = tp->cc_recv; - + if (tcp_do_rfc1644) { + tao.tao_cc = tp->cc_recv; + tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, + tp->cc_recv, 0); + } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED @@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt) * are present. Store the upper limit of the length of options plus * data in maxopd. * - * NOTE that this routine is only called when we process an incoming - * segment, for outgoing segments only tcp_mssopt is called. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. + * + * NOTE that this routine is only called when we process an incoming + * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { - register struct rtentry *rt; - struct ifnet *ifp; - register int rtt, mss; + int rtt, mss; u_long bufsize; + u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; - struct rmxp_tao *taop; + struct hc_metrics_lite metrics; + struct rmxp_tao tao; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 
1 : 0; @@ -2637,96 +2641,96 @@ tcp_mss(tp, offer) sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else - const int isipv6 = 0; - const size_t min_protoh = sizeof (struct tcpiphdr); + const size_t min_protoh = sizeof(struct tcpiphdr); #endif + bzero(&tao, sizeof(tao)); - if (isipv6) - rt = tcp_rtlookup6(&inp->inp_inc); - else - rt = tcp_rtlookup(&inp->inp_inc); - if (rt == NULL) { - tp->t_maxopd = tp->t_maxseg = - isipv6 ? tcp_v6mssdflt : tcp_mssdflt; - return; + /* initialize */ +#ifdef INET6 + if (isipv6) { + maxmtu = tcp_maxmtu6(&inp->inp_inc); + tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; + } else +#endif + { + maxmtu = tcp_maxmtu(&inp->inp_inc); + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } - ifp = rt->rt_ifp; so = inp->inp_socket; - taop = rmx_taop(rt->rt_rmx); /* - * Offer == -1 means that we didn't receive SYN yet, - * use cached value in that case; + * no route to sender, take default mss and return */ - if (offer == -1) - offer = taop->tao_mssopt; - /* - * Offer == 0 means that there was no MSS on the SYN segment, - * in this case we use tcp_mssdflt. - */ - if (offer == 0) - offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; - else - /* - * Sanity check: make sure that maxopd will be large - * enough to allow some data on segments even is the - * all the option space is used (40bytes). Otherwise - * funny things may happen in tcp_output. - */ - offer = max(offer, 64); - taop->tao_mssopt = offer; + if (maxmtu == 0) + return; + + /* what have we got? */ + switch (offer) { + case 0: + /* + * Offer == 0 means that there was no MSS on the SYN + * segment, in this case we use tcp_mssdflt. + */ + offer = +#ifdef INET6 + isipv6 ? 
tcp_v6mssdflt : +#endif + tcp_mssdflt; + break; + + case -1: + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (tcp_do_rfc1644) + tcp_hc_gettao(&inp->inp_inc, &tao); + if (tao.tao_mssopt != 0) + offer = tao.tao_mssopt; + /* FALLTHROUGH */ + + default: + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even if the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + if (tcp_do_rfc1644) + tcp_hc_updatetao(&inp->inp_inc, + TCP_HC_TAO_MSSOPT, 0, offer); + } /* - * While we're here, check if there's an initial rtt - * or rttvar. Convert from the route-table units - * to scaled multiples of the slow timeout timer. + * rmx information is now retrieved from tcp_hostcache */ - if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { - /* - * XXX the lock bit for RTT indicates that the value - * is also a minimum value; this is subject to time. - */ - if (rt->rt_rmx.rmx_locks & RTV_RTT) - tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); - tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); - tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - tcpstat.tcps_usedrtt++; - if (rt->rt_rmx.rmx_rttvar) { - tp->t_rttvar = rt->rt_rmx.rmx_rttvar / - (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); - tcpstat.tcps_usedrttvar++; - } else { - /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; - } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); - } + tcp_hc_get(&inp->inp_inc, &metrics); + /* - * if there's an mtu associated with the route, use it + * if there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ - if (rt->rt_rmx.rmx_mtu) - mss = rt->rt_rmx.rmx_mtu - min_protoh; + if (metrics.rmx_mtu) + mss = metrics.rmx_mtu - min_protoh; else { #ifdef INET6 - mss = (isipv6 ? 
IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu) - - min_protoh; -#else - mss = ifp->if_mtu - min_protoh; -#endif -#ifdef INET6 if (isipv6) { - if (!in6_localaddr(&inp->in6p_faddr)) + mss = maxmtu - min_protoh; + if (!path_mtu_discovery && + !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif - if (!in_localaddr(inp->inp_faddr)) + { + mss = maxmtu - min_protoh; + if (!path_mtu_discovery && + !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); + } } mss = min(mss, offer); + /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal @@ -2749,6 +2753,7 @@ tcp_mss(tp, offer) (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; + tp->t_maxseg = mss; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) @@ -2757,15 +2762,18 @@ tcp_mss(tp, offer) if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif + tp->t_maxseg = mss; + /* - * If there's a pipesize, change the socket buffer - * to that size. Make the socket buffers an integral - * number of mss units; if the mss is larger than - * the socket buffer, decrease the mss. + * If there's a pipesize, change the socket buffer to that size, + * don't change if sb_hiwat is different than default (then it + * has been changed on purpose with setsockopt). + * Make the socket buffers an integral number of mss units; + * if the mss is larger than the socket buffer, decrease the mss. 
*/ -#ifdef RTV_SPIPE - if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) -#endif + if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) + bufsize = metrics.rmx_sendpipe; + else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; @@ -2778,9 +2786,9 @@ tcp_mss(tp, offer) } tp->t_maxseg = mss; -#ifdef RTV_RPIPE - if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) -#endif + if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) + bufsize = metrics.rmx_recvpipe; + else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); @@ -2789,62 +2797,110 @@ tcp_mss(tp, offer) if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve(&so->so_rcv, bufsize, so, NULL); } + /* + * While we're here, check the others too + */ + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + tp->t_srtt = rtt; + tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + tcpstat.tcps_usedrtt++; + if (metrics.rmx_rttvar) { + tp->t_rttvar = metrics.rmx_rttvar; + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + if (metrics.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } + if (metrics.rmx_bandwidth) + tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not make cwnd bigger than remote receive window or our own + * send socket buffer. 
Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overloads the path again. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. + * We currently check only in syncache_socket for that. */ +#define TCP_METRICS_CWND +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(mss, + min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); +#ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) tp->snd_cwnd = mss * ss_fltsz_local; +#endif else tp->snd_cwnd = mss * ss_fltsz; - - if (rt->rt_rmx.rmx_ssthresh) { - /* - * There's some sort of gateway or interface - * buffer limit on the path. Use this to set - * the slow start threshhold, but set the - * threshold to no less than 2*mss. - */ - tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); - tcpstat.tcps_usedssthresh++; - } } /* * Determine the MSS option to send on an outgoing SYN. */ int -tcp_mssopt(tp) - struct tcpcb *tp; +tcp_mssopt(inc) + struct in_conninfo *inc; { - struct rtentry *rt; + int mss = 0; + u_long maxmtu = 0; + u_long thcmtu = 0; + size_t min_protoh; #ifdef INET6 - int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; - size_t min_protoh = isipv6 ? - sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : - sizeof (struct tcpiphdr); -#else - const int isipv6 = 0; - const size_t min_protoh = sizeof (struct tcpiphdr); + int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif - if (isipv6) - rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); - else - rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); - if (rt == NULL) - return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt); + KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 - return (isipv6 ? 
IN6_LINKMTU(rt->rt_ifp) : - rt->rt_ifp->if_mtu - min_protoh); -#else - return (rt->rt_ifp->if_mtu - min_protoh); + if (isipv6) { + mss = tcp_v6mssdflt; + maxmtu = tcp_maxmtu6(inc); + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ + min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + } else #endif + { + mss = tcp_mssdflt; + maxmtu = tcp_maxmtu(inc); + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ + min_protoh = sizeof(struct tcpiphdr); + } + if (maxmtu && thcmtu) + mss = min(maxmtu, thcmtu) - min_protoh; + else if (maxmtu || thcmtu) + mss = max(maxmtu, thcmtu) - min_protoh; + + return (mss); } diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index a48ec4a..a8b8e53 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -125,11 +125,12 @@ tcp_output(struct tcpcb *tp) #if 0 int maxburst = TCP_MAXBURST; #endif - struct rmxp_tao *taop; + struct rmxp_tao tao; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; + bzero(&tao, sizeof(tao)); isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif @@ -232,7 +233,6 @@ again: */ len = (long)ulmin(so->so_snd.sb_cc, win) - off; - taop = tcp_gettaocache(&tp->t_inpcb->inp_inc); /* * Lop off SYN bit if it has already been sent. However, if this @@ -242,8 +242,10 @@ again: if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { flags &= ~TH_SYN; off--, len++; + if (tcp_do_rfc1644) + tcp_hc_gettao(&tp->t_inpcb->inp_inc, &tao); if (len > 0 && tp->t_state == TCPS_SYN_SENT && - (taop == NULL || taop->tao_ccsent == 0)) + tao.tao_ccsent == 0) return 0; } @@ -429,7 +431,7 @@ send: opt[0] = TCPOPT_MAXSEG; opt[1] = TCPOLEN_MAXSEG; - mss = htons((u_short) tcp_mssopt(tp)); + mss = htons((u_short) tcp_mssopt(&tp->t_inpcb->inp_inc)); (void)memcpy(opt + 2, &mss, sizeof(mss)); optlen = TCPOLEN_MAXSEG; @@ -872,10 +874,7 @@ send: * Also, desired default hop limit might be changed via * Neighbor Discovery. */ - ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, - tp->t_inpcb->in6p_route.ro_rt ? 
- tp->t_inpcb->in6p_route.ro_rt->rt_ifp - : NULL); + ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); /* TODO: IPv6 IP6TOS_ECT bit on */ #if defined(IPSEC) && !defined(FAST_IPSEC) @@ -886,36 +885,27 @@ send: } #endif /*IPSEC*/ error = ip6_output(m, - tp->t_inpcb->in6p_outputopts, - &tp->t_inpcb->in6p_route, + tp->t_inpcb->in6p_outputopts, NULL, (so->so_options & SO_DONTROUTE), NULL, NULL, tp->t_inpcb); } else #endif /* INET6 */ { - struct rtentry *rt; ip->ip_len = m->m_pkthdr.len; #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) - ip->ip_ttl = in6_selecthlim(tp->t_inpcb, - tp->t_inpcb->in6p_route.ro_rt ? - tp->t_inpcb->in6p_route.ro_rt->rt_ifp - : NULL); + ip->ip_ttl = in6_selecthlim(tp->t_inpcb, NULL); #endif /* INET6 */ /* - * See if we should do MTU discovery. We do it only if the following - * are true: - * 1) we have a valid route to the destination - * 2) the MTU is not locked (if it is, then discovery has been - * disabled) + * If we do path MTU discovery, then we set DF on every packet. + * This might not be the best thing to do according to RFC3390 + * Section 2. However the tcp hostcache migitates the problem + * so it affects only the first tcp connection with a host. 
*/ - if (path_mtu_discovery - && (rt = tp->t_inpcb->inp_route.ro_rt) - && rt->rt_flags & RTF_UP - && !(rt->rt_rmx.rmx_locks & RTV_MTU)) { + if (path_mtu_discovery) ip->ip_off |= IP_DF; - } - error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, + + error = ip_output(m, tp->t_inpcb->inp_options, NULL, (so->so_options & SO_DONTROUTE), 0, tp->t_inpcb); } if (error) { diff --git a/sys/netinet/tcp_reass.c b/sys/netinet/tcp_reass.c index a247138..eca5cb2 100644 --- a/sys/netinet/tcp_reass.c +++ b/sys/netinet/tcp_reass.c @@ -154,9 +154,8 @@ static int tcp_timewait(struct tcptw *, struct tcpopt *, #define ND6_HINT(tp) \ do { \ if ((tp) && (tp)->t_inpcb && \ - ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0 && \ - (tp)->t_inpcb->in6p_route.ro_rt) \ - nd6_nud_hint((tp)->t_inpcb->in6p_route.ro_rt, NULL, 0); \ + ((tp)->t_inpcb->inp_vflag & INP_IPV6) != 0) \ + nd6_nud_hint(NULL, NULL, 0); \ } while (0) #else #define ND6_HINT(tp) @@ -358,8 +357,7 @@ tcp_input(m, off0) int todrop, acked, ourfinisacked, needoutput = 0; u_long tiwin; struct tcpopt to; /* options in this segment */ - struct rmxp_tao *taop; /* pointer to our TAO cache entry */ - struct rmxp_tao tao_noncached; /* in case there's no cached entry */ + struct rmxp_tao tao; /* our TAO cache entry */ int headlocked = 0; struct sockaddr_in *next_hop = NULL; int rstreason; /* For badport_bandlim accounting purposes */ @@ -389,6 +387,7 @@ tcp_input(m, off0) #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 
1 : 0; #endif + bzero(&tao, sizeof(tao)); bzero((char *)&to, sizeof(to)); tcpstat.tcps_rcvtotal++; @@ -707,11 +706,9 @@ findpcb: if (isipv6) { inc.inc6_faddr = ip6->ip6_src; inc.inc6_laddr = ip6->ip6_dst; - inc.inc6_route.ro_rt = NULL; /* XXX */ } else { inc.inc_faddr = ip->ip_src; inc.inc_laddr = ip->ip_dst; - inc.inc_route.ro_rt = NULL; /* XXX */ } inc.inc_fport = th->th_sport; inc.inc_lport = th->th_dport; @@ -916,7 +913,7 @@ findpcb: } after_listen: -/* XXX temp debugging */ + /* XXX temp debugging */ /* should not happen - syncache should pick up these connections */ if (tp->t_state == TCPS_LISTEN) panic("tcp_input: TCPS_LISTEN"); @@ -930,8 +927,9 @@ after_listen: callout_reset(tp->tt_keep, tcp_keepidle, tcp_timer_keep, tp); /* - * Process options. - * XXX this is tradtitional behavior, may need to be cleaned up. + * Process options only when we get SYN/ACK back. The SYN case + * for incoming connections is handled in tcp_syncache. + * XXX this is traditional behavior, may need to be cleaned up. */ tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); if (thflags & TH_SYN) { @@ -1179,10 +1177,8 @@ after_listen: * continue processing rest of data/controls, beginning with URG */ case TCPS_SYN_SENT: - if ((taop = tcp_gettaocache(&inp->inp_inc)) == NULL) { - taop = &tao_noncached; - bzero(taop, sizeof(*taop)); - } + if (tcp_do_rfc1644) + tcp_hc_gettao(&inp->inp_inc, &tao); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || @@ -1195,7 +1191,7 @@ after_listen: * Our new SYN, when it arrives, will serve as the * needed ACK. 
*/ - if (taop->tao_ccsent != 0) + if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; @@ -1225,7 +1221,7 @@ after_listen: */ if (to.to_flags & TOF_CCECHO) { if (tp->cc_send != to.to_ccecho) { - if (taop->tao_ccsent != 0) + if (tao.tao_ccsent != 0) goto drop; else { rstreason = BANDLIM_UNLIMITED; @@ -1246,8 +1242,8 @@ after_listen: tp->rcv_scale = tp->request_r_scale; } /* Segment is acceptable, update cache if undefined. */ - if (taop->tao_ccsent == 0) - taop->tao_ccsent = to.to_ccecho; + if (tao.tao_ccsent == 0 && tcp_do_rfc1644) + tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, to.to_ccecho, 0); tp->rcv_adv += tp->rcv_wnd; tp->snd_una++; /* SYN is acked */ @@ -1290,14 +1286,16 @@ after_listen: tp->t_flags |= TF_ACKNOW; callout_stop(tp->tt_rexmt); if (to.to_flags & TOF_CC) { - if (taop->tao_cc != 0 && - CC_GT(to.to_cc, taop->tao_cc)) { + if (tao.tao_cc != 0 && + CC_GT(to.to_cc, tao.tao_cc)) { /* * update cache and make transition: * SYN-SENT -> ESTABLISHED* * SYN-SENT* -> FIN-WAIT-1* */ - taop->tao_cc = to.to_cc; + tao.tao_cc = to.to_cc; + tcp_hc_updatetao(&inp->inp_inc, + TCP_HC_TAO_CC, to.to_cc, 0); tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { tp->t_state = TCPS_FIN_WAIT_1; @@ -1313,8 +1311,12 @@ after_listen: } else tp->t_state = TCPS_SYN_RECEIVED; } else { - /* CC.NEW or no option => invalidate cache */ - taop->tao_cc = 0; + if (tcp_do_rfc1644) { + /* CC.NEW or no option => invalidate cache */ + tao.tao_cc = 0; + tcp_hc_updatetao(&inp->inp_inc, + TCP_HC_TAO_CC, to.to_cc, 0); + } tp->t_state = TCPS_SYN_RECEIVED; } } @@ -1682,13 +1684,14 @@ trimthenstep6: } /* * Upon successful completion of 3-way handshake, - * update cache.CC if it was undefined, pass any queued - * data to the user, and advance state appropriately. + * update cache.CC, pass any queued data to the user, + * and advance state appropriately. 
*/ - if ((taop = tcp_gettaocache(&inp->inp_inc)) != NULL && - taop->tao_cc == 0) - taop->tao_cc = tp->cc_recv; - + if (tcp_do_rfc1644) { + tao.tao_cc = tp->cc_recv; + tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CC, + tp->cc_recv, 0); + } /* * Make transitions: * SYN-RECEIVED -> ESTABLISHED @@ -2611,25 +2614,26 @@ tcp_xmit_timer(tp, rtt) * are present. Store the upper limit of the length of options plus * data in maxopd. * - * NOTE that this routine is only called when we process an incoming - * segment, for outgoing segments only tcp_mssopt is called. * * In case of T/TCP, we call this routine during implicit connection * setup as well (offer = -1), to initialize maxseg from the cached * MSS of our peer. + * + * NOTE that this routine is only called when we process an incoming + * segment. Outgoing SYN/ACK MSS settings are handled in tcp_mssopt(). */ void tcp_mss(tp, offer) struct tcpcb *tp; int offer; { - register struct rtentry *rt; - struct ifnet *ifp; - register int rtt, mss; + int rtt, mss; u_long bufsize; + u_long maxmtu; struct inpcb *inp = tp->t_inpcb; struct socket *so; - struct rmxp_tao *taop; + struct hc_metrics_lite metrics; + struct rmxp_tao tao; int origoffer = offer; #ifdef INET6 int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; @@ -2637,96 +2641,96 @@ tcp_mss(tp, offer) sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : sizeof (struct tcpiphdr); #else - const int isipv6 = 0; - const size_t min_protoh = sizeof (struct tcpiphdr); + const size_t min_protoh = sizeof(struct tcpiphdr); #endif + bzero(&tao, sizeof(tao)); - if (isipv6) - rt = tcp_rtlookup6(&inp->inp_inc); - else - rt = tcp_rtlookup(&inp->inp_inc); - if (rt == NULL) { - tp->t_maxopd = tp->t_maxseg = - isipv6 ? 
tcp_v6mssdflt : tcp_mssdflt; - return; + /* initialize */ +#ifdef INET6 + if (isipv6) { + maxmtu = tcp_maxmtu6(&inp->inp_inc); + tp->t_maxopd = tp->t_maxseg = tcp_v6mssdflt; + } else +#endif + { + maxmtu = tcp_maxmtu(&inp->inp_inc); + tp->t_maxopd = tp->t_maxseg = tcp_mssdflt; } - ifp = rt->rt_ifp; so = inp->inp_socket; - taop = rmx_taop(rt->rt_rmx); /* - * Offer == -1 means that we didn't receive SYN yet, - * use cached value in that case; + * no route to sender, take default mss and return */ - if (offer == -1) - offer = taop->tao_mssopt; - /* - * Offer == 0 means that there was no MSS on the SYN segment, - * in this case we use tcp_mssdflt. - */ - if (offer == 0) - offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; - else - /* - * Sanity check: make sure that maxopd will be large - * enough to allow some data on segments even is the - * all the option space is used (40bytes). Otherwise - * funny things may happen in tcp_output. - */ - offer = max(offer, 64); - taop->tao_mssopt = offer; + if (maxmtu == 0) + return; + + /* what have we got? */ + switch (offer) { + case 0: + /* + * Offer == 0 means that there was no MSS on the SYN + * segment, in this case we use tcp_mssdflt. + */ + offer = +#ifdef INET6 + isipv6 ? tcp_v6mssdflt : +#endif + tcp_mssdflt; + break; + + case -1: + /* + * Offer == -1 means that we didn't receive SYN yet, + * use cached value in that case; + */ + if (tcp_do_rfc1644) + tcp_hc_gettao(&inp->inp_inc, &tao); + if (tao.tao_mssopt != 0) + offer = tao.tao_mssopt; + /* FALLTHROUGH */ + + default: + /* + * Sanity check: make sure that maxopd will be large + * enough to allow some data on segments even if the + * all the option space is used (40bytes). Otherwise + * funny things may happen in tcp_output. + */ + offer = max(offer, 64); + if (tcp_do_rfc1644) + tcp_hc_updatetao(&inp->inp_inc, + TCP_HC_TAO_MSSOPT, 0, offer); + } /* - * While we're here, check if there's an initial rtt - * or rttvar. 
Convert from the route-table units - * to scaled multiples of the slow timeout timer. + * rmx information is now retrieved from tcp_hostcache */ - if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt)) { - /* - * XXX the lock bit for RTT indicates that the value - * is also a minimum value; this is subject to time. - */ - if (rt->rt_rmx.rmx_locks & RTV_RTT) - tp->t_rttmin = rtt / (RTM_RTTUNIT / hz); - tp->t_srtt = rtt / (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); - tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; - tcpstat.tcps_usedrtt++; - if (rt->rt_rmx.rmx_rttvar) { - tp->t_rttvar = rt->rt_rmx.rmx_rttvar / - (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); - tcpstat.tcps_usedrttvar++; - } else { - /* default variation is +- 1 rtt */ - tp->t_rttvar = - tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; - } - TCPT_RANGESET(tp->t_rxtcur, - ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, - tp->t_rttmin, TCPTV_REXMTMAX); - } + tcp_hc_get(&inp->inp_inc, &metrics); + /* - * if there's an mtu associated with the route, use it + * if there's a discovered mtu int tcp hostcache, use it * else, use the link mtu. */ - if (rt->rt_rmx.rmx_mtu) - mss = rt->rt_rmx.rmx_mtu - min_protoh; + if (metrics.rmx_mtu) + mss = metrics.rmx_mtu - min_protoh; else { #ifdef INET6 - mss = (isipv6 ? 
IN6_LINKMTU(rt->rt_ifp) : ifp->if_mtu) - - min_protoh; -#else - mss = ifp->if_mtu - min_protoh; -#endif -#ifdef INET6 if (isipv6) { - if (!in6_localaddr(&inp->in6p_faddr)) + mss = maxmtu - min_protoh; + if (!path_mtu_discovery && + !in6_localaddr(&inp->in6p_faddr)) mss = min(mss, tcp_v6mssdflt); } else #endif - if (!in_localaddr(inp->inp_faddr)) + { + mss = maxmtu - min_protoh; + if (!path_mtu_discovery && + !in_localaddr(inp->inp_faddr)) mss = min(mss, tcp_mssdflt); + } } mss = min(mss, offer); + /* * maxopd stores the maximum length of data AND options * in a segment; maxseg is the amount of data in a normal @@ -2749,6 +2753,7 @@ tcp_mss(tp, offer) (origoffer == -1 || (tp->t_flags & TF_RCVD_CC) == TF_RCVD_CC)) mss -= TCPOLEN_CC_APPA; + tp->t_maxseg = mss; #if (MCLBYTES & (MCLBYTES - 1)) == 0 if (mss > MCLBYTES) @@ -2757,15 +2762,18 @@ tcp_mss(tp, offer) if (mss > MCLBYTES) mss = mss / MCLBYTES * MCLBYTES; #endif + tp->t_maxseg = mss; + /* - * If there's a pipesize, change the socket buffer - * to that size. Make the socket buffers an integral - * number of mss units; if the mss is larger than - * the socket buffer, decrease the mss. + * If there's a pipesize, change the socket buffer to that size, + * don't change if sb_hiwat is different than default (then it + * has been changed on purpose with setsockopt). + * Make the socket buffers an integral number of mss units; + * if the mss is larger than the socket buffer, decrease the mss. 
*/ -#ifdef RTV_SPIPE - if ((bufsize = rt->rt_rmx.rmx_sendpipe) == 0) -#endif + if ((so->so_snd.sb_hiwat == tcp_sendspace) && metrics.rmx_sendpipe) + bufsize = metrics.rmx_sendpipe; + else bufsize = so->so_snd.sb_hiwat; if (bufsize < mss) mss = bufsize; @@ -2778,9 +2786,9 @@ tcp_mss(tp, offer) } tp->t_maxseg = mss; -#ifdef RTV_RPIPE - if ((bufsize = rt->rt_rmx.rmx_recvpipe) == 0) -#endif + if ((so->so_rcv.sb_hiwat == tcp_recvspace) && metrics.rmx_recvpipe) + bufsize = metrics.rmx_recvpipe; + else bufsize = so->so_rcv.sb_hiwat; if (bufsize > mss) { bufsize = roundup(bufsize, mss); @@ -2789,62 +2797,110 @@ tcp_mss(tp, offer) if (bufsize > so->so_rcv.sb_hiwat) (void)sbreserve(&so->so_rcv, bufsize, so, NULL); } + /* + * While we're here, check the others too + */ + if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) { + tp->t_srtt = rtt; + tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE; + tcpstat.tcps_usedrtt++; + if (metrics.rmx_rttvar) { + tp->t_rttvar = metrics.rmx_rttvar; + tcpstat.tcps_usedrttvar++; + } else { + /* default variation is +- 1 rtt */ + tp->t_rttvar = + tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE; + } + TCPT_RANGESET(tp->t_rxtcur, + ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1, + tp->t_rttmin, TCPTV_REXMTMAX); + } + if (metrics.rmx_ssthresh) { + /* + * There's some sort of gateway or interface + * buffer limit on the path. Use this to set + * the slow start threshhold, but set the + * threshold to no less than 2*mss. + */ + tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh); + tcpstat.tcps_usedssthresh++; + } + if (metrics.rmx_bandwidth) + tp->snd_bandwidth = metrics.rmx_bandwidth; /* * Set the slow-start flight size depending on whether this * is a local network or not. + * + * Extend this so we cache the cwnd too and retrieve it here. + * Make cwnd even bigger than RFC3390 suggests but only if we + * have previous experience with the remote host. Be careful + * not make cwnd bigger than remote receive window or our own + * send socket buffer. 
Maybe put some additional upper bound + * on the retrieved cwnd. Should do incremental updates to + * hostcache when cwnd collapses so next connection doesn't + * overloads the path again. + * + * RFC3390 says only do this if SYN or SYN/ACK didn't got lost. + * We currently check only in syncache_socket for that. */ +#define TCP_METRICS_CWND +#ifdef TCP_METRICS_CWND + if (metrics.rmx_cwnd) + tp->snd_cwnd = max(mss, + min(metrics.rmx_cwnd / 2, + min(tp->snd_wnd, so->so_snd.sb_hiwat))); + else +#endif if (tcp_do_rfc3390) tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380)); +#ifdef INET6 else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) || (!isipv6 && in_localaddr(inp->inp_faddr))) tp->snd_cwnd = mss * ss_fltsz_local; +#endif else tp->snd_cwnd = mss * ss_fltsz; - - if (rt->rt_rmx.rmx_ssthresh) { - /* - * There's some sort of gateway or interface - * buffer limit on the path. Use this to set - * the slow start threshhold, but set the - * threshold to no less than 2*mss. - */ - tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); - tcpstat.tcps_usedssthresh++; - } } /* * Determine the MSS option to send on an outgoing SYN. */ int -tcp_mssopt(tp) - struct tcpcb *tp; +tcp_mssopt(inc) + struct in_conninfo *inc; { - struct rtentry *rt; + int mss = 0; + u_long maxmtu = 0; + u_long thcmtu = 0; + size_t min_protoh; #ifdef INET6 - int isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; - size_t min_protoh = isipv6 ? - sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : - sizeof (struct tcpiphdr); -#else - const int isipv6 = 0; - const size_t min_protoh = sizeof (struct tcpiphdr); + int isipv6 = inc->inc_isipv6 ? 1 : 0; #endif - if (isipv6) - rt = tcp_rtlookup6(&tp->t_inpcb->inp_inc); - else - rt = tcp_rtlookup(&tp->t_inpcb->inp_inc); - if (rt == NULL) - return (isipv6 ? tcp_v6mssdflt : tcp_mssdflt); + KASSERT(inc != NULL, ("tcp_mssopt with NULL in_conninfo pointer")); #ifdef INET6 - return (isipv6 ? 
IN6_LINKMTU(rt->rt_ifp) : - rt->rt_ifp->if_mtu - min_protoh); -#else - return (rt->rt_ifp->if_mtu - min_protoh); + if (isipv6) { + mss = tcp_v6mssdflt; + maxmtu = tcp_maxmtu6(inc); + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ + min_protoh = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + } else #endif + { + mss = tcp_mssdflt; + maxmtu = tcp_maxmtu(inc); + thcmtu = tcp_hc_getmtu(inc); /* IPv4 and IPv6 */ + min_protoh = sizeof(struct tcpiphdr); + } + if (maxmtu && thcmtu) + mss = min(maxmtu, thcmtu) - min_protoh; + else if (maxmtu || thcmtu) + mss = max(maxmtu, thcmtu) - min_protoh; + + return (mss); } diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 7ce06f6..dfd6de1 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -76,6 +76,7 @@ #include <netinet/ip_var.h> #ifdef INET6 #include <netinet6/ip6_var.h> +#include <netinet6/nd6.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> @@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); -static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_discardcb(struct tcpcb *); @@ -215,7 +215,6 @@ tcp_init() int hashsize = TCBHASHSIZE; tcp_ccgen = 1; - tcp_cleartaocache(); tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; @@ -262,6 +261,7 @@ tcp_init() uma_zone_set_max(tcptw_zone, maxsockets / 5); tcp_timer_init(); syncache_init(); + tcp_hc_init(); } /* @@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) { register int tlen; int win = 0; - struct route *ro = 0; - struct route sro; struct ip *ip; struct tcphdr *nth; #ifdef INET6 - struct route_in6 *ro6 = 0; - struct route_in6 sro6; struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; - struct inpcb *inp; + struct inpcb *inp = NULL; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both 
NULL")); @@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } -#ifdef INET6 - if (isipv6) - ro6 = &inp->in6p_route; - else -#endif /* INET6 */ - ro = &inp->inp_route; - } else { - inp = NULL; -#ifdef INET6 - if (isipv6) { - ro6 = &sro6; - bzero(ro6, sizeof *ro6); - } else -#endif /* INET6 */ - { - ro = &sro; - bzero(ro, sizeof *ro); - } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); @@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); - ip6->ip6_hlim = in6_selecthlim(inp, - ro6 && ro6->ro_rt ? - ro6->ro_rt->rt_ifp : - NULL); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { @@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 - if (isipv6) { - (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp); - if (ro6 == &sro6 && ro6->ro_rt) { - RTFREE(ro6->ro_rt); - ro6->ro_rt = NULL; - } - } else + if (isipv6) + (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); + else #endif /* INET6 */ - { - (void) ip_output(m, NULL, ro, ipflags, NULL, inp); - if (ro == &sro && ro->ro_rt) { - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } - } + (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* @@ -647,8 +612,6 @@ tcp_discardcb(tp) #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - struct rtentry *rt; - int dosavessthresh; /* * Make sure that all of our timers are stopped before we @@ -663,89 +626,34 @@ tcp_discardcb(tp) /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. - * 'Enough' is arbitrarily defined as the 16 samples. 
- * 16 samples is enough for the srtt filter to converge - * to within 5% of the correct value; fewer samples and - * we could save a very bogus rtt. - * - * Don't update the default route's characteristics and don't - * update anything that the user "locked". + * 'Enough' is arbitrarily defined as 4 rtt samples. + * 4 samples is enough for the srtt filter to converge + * to within enough % of the correct value; fewer samples + * and we could save a bogus rtt. The danger is not high + * as tcp quickly recovers from everything. + * XXX: Works very well but needs some more statistics! */ - if (tp->t_rttupdated >= 16) { - register u_long i = 0; -#ifdef INET6 - if (isipv6) { - struct sockaddr_in6 *sin6; + if (tp->t_rttupdated >= 4) { + struct hc_metrics_lite metrics; + u_long ssthresh; - if ((rt = inp->in6p_route.ro_rt) == NULL) - goto no_valid_rt; - sin6 = (struct sockaddr_in6 *)rt_key(rt); - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - goto no_valid_rt; - } - else -#endif /* INET6 */ - if ((rt = inp->inp_route.ro_rt) == NULL || - ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr - == INADDR_ANY) - goto no_valid_rt; - - if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { - i = tp->t_srtt * - (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); - if (rt->rt_rmx.rmx_rtt && i) - /* - * filter this update to half the old & half - * the new values, converting scale. - * See route.h and tcp_var.h for a - * description of the scaling constants. 
- */ - rt->rt_rmx.rmx_rtt = - (rt->rt_rmx.rmx_rtt + i) / 2; - else - rt->rt_rmx.rmx_rtt = i; - tcpstat.tcps_cachedrtt++; - } - if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { - i = tp->t_rttvar * - (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); - if (rt->rt_rmx.rmx_rttvar && i) - rt->rt_rmx.rmx_rttvar = - (rt->rt_rmx.rmx_rttvar + i) / 2; - else - rt->rt_rmx.rmx_rttvar = i; - tcpstat.tcps_cachedrttvar++; - } + bzero(&metrics, sizeof(metrics)); /* - * The old comment here said: - * update the pipelimit (ssthresh) if it has been updated - * already or if a pipesize was specified & the threshhold - * got below half the pipesize. I.e., wait for bad news - * before we start updating, then update on both good - * and bad news. - * - * But we want to save the ssthresh even if no pipesize is - * specified explicitly in the route, because such - * connections still have an implicit pipesize specified - * by the global tcp_sendspace. In the absence of a reliable - * way to calculate the pipesize, it will have to do. + * Update the ssthresh always when the conditions below + * are satisfied. This gives us better new start value + * for the congestion avoidance for new connections. + * ssthresh is only set if packet loss occured on a session. */ - i = tp->snd_ssthresh; - if (rt->rt_rmx.rmx_sendpipe != 0) - dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); - else - dosavessthresh = (i < so->so_snd.sb_hiwat / 2); - if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && - i != 0 && rt->rt_rmx.rmx_ssthresh != 0) - || dosavessthresh) { + ssthresh = tp->snd_ssthresh; + if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ - i = (i + tp->t_maxseg / 2) / tp->t_maxseg; - if (i < 2) - i = 2; - i *= (u_long)(tp->t_maxseg + + ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; + if (ssthresh < 2) + ssthresh = 2; + ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? 
sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : @@ -755,15 +663,21 @@ tcp_discardcb(tp) ) #endif ); - if (rt->rt_rmx.rmx_ssthresh) - rt->rt_rmx.rmx_ssthresh = - (rt->rt_rmx.rmx_ssthresh + i) / 2; - else - rt->rt_rmx.rmx_ssthresh = i; - tcpstat.tcps_cachedssthresh++; - } + } else + ssthresh = 0; + metrics.rmx_ssthresh = ssthresh; + + metrics.rmx_rtt = tp->t_srtt; + metrics.rmx_rttvar = tp->t_rttvar; + /* XXX: This wraps if the pipe is more than 4 Gbit per second */ + metrics.rmx_bandwidth = tp->snd_bandwidth; + metrics.rmx_cwnd = tp->snd_cwnd; + metrics.rmx_sendpipe = 0; + metrics.rmx_recvpipe = 0; + + tcp_hc_update(&inp->inp_inc, &metrics); } - no_valid_rt: + /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); @@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip) notify = tcp_drop_syn_sent; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; - else if (PRC_IS_REDIRECT(cmd)) { - ip = 0; - notify = in_rtchange; - } else if (cmd == PRC_HOSTDEAD) + /* + * Redirects don't need to be handled up here. + */ + else if (PRC_IS_REDIRECT(cmd)) + return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections. + */ + else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; @@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno) int errno; { struct tcpcb *tp = intotcpcb(inp); - struct rtentry *rt; - struct rmxp_tao *taop; + struct rmxp_tao tao; struct socket *so = inp->inp_socket; - int offered; + u_int maxmtu; + u_int romtu; int mss; #ifdef INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + bzero(&tao, sizeof(tao)); if (tp) { + maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ + romtu = #ifdef INET6 - if (isipv6) - rt = tcp_rtlookup6(&inp->inp_inc); - else + isipv6 ? 
tcp_maxmtu6(&inp->inp_inc) : #endif /* INET6 */ - rt = tcp_rtlookup(&inp->inp_inc); - if (!rt || !rt->rt_rmx.rmx_mtu) { + tcp_maxmtu(&inp->inp_inc); + if (!maxmtu) + maxmtu = romtu; + else + maxmtu = min(maxmtu, romtu); + if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : @@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno) tcp_mssdflt; return inp; } - taop = rmx_taop(rt->rt_rmx); - offered = taop->tao_mssopt; - mss = rt->rt_rmx.rmx_mtu - + mss = maxmtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : @@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno) #endif /* INET6 */ ; - if (offered) - mss = min(mss, offered); + if (tcp_do_rfc1644) { + tcp_hc_gettao(&inp->inp_inc, &tao); + if (tao.tao_mssopt) + mss = min(mss, tao.tao_mssopt); + } /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the @@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno) * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. 
*/ -struct rtentry * -tcp_rtlookup(inc) +u_long +tcp_maxmtu(inc) struct in_conninfo *inc; { - struct route *ro; - struct rtentry *rt; - - ro = &inc->inc_route; - rt = ro->ro_rt; - if (rt == NULL || !(rt->rt_flags & RTF_UP)) { - /* No route yet, so try to acquire one */ - if (inc->inc_faddr.s_addr != INADDR_ANY) { - ro->ro_dst.sa_family = AF_INET; - ro->ro_dst.sa_len = sizeof(struct sockaddr_in); - ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = - inc->inc_faddr; - rtalloc(ro); - rt = ro->ro_rt; - } + struct route sro; + struct sockaddr_in *dst; + struct ifnet *ifp; + u_long maxmtu = 0; + + KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); + + sro.ro_rt = NULL; + if (inc->inc_faddr.s_addr != INADDR_ANY) { + dst = (struct sockaddr_in *)&sro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = inc->inc_faddr; + rtalloc_ign(&sro, RTF_CLONING); + } + if (sro.ro_rt != NULL) { + ifp = sro.ro_rt->rt_ifp; + if (sro.ro_rt->rt_rmx.rmx_mtu == 0) + maxmtu = ifp->if_mtu; + else + maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + RTFREE(sro.ro_rt); } - return rt; + return (maxmtu); } #ifdef INET6 -struct rtentry * -tcp_rtlookup6(inc) +u_long +tcp_maxmtu6(inc) struct in_conninfo *inc; { - struct route_in6 *ro6; - struct rtentry *rt; - - ro6 = &inc->inc6_route; - rt = ro6->ro_rt; - if (rt == NULL || !(rt->rt_flags & RTF_UP)) { - /* No route yet, so try to acquire one */ - if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { - ro6->ro_dst.sin6_family = AF_INET6; - ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); - ro6->ro_dst.sin6_addr = inc->inc6_faddr; - rtalloc((struct route *)ro6); - rt = ro6->ro_rt; - } + struct route_in6 sro6; + struct ifnet *ifp; + u_long maxmtu = 0; + + KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); + + sro6.ro_rt = NULL; + if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { + sro6.ro_dst.sin6_family = AF_INET6; + sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); + 
sro6.ro_dst.sin6_addr = inc->inc6_faddr; + rtalloc_ign((struct route *)&sro6, RTF_CLONING); } - return rt; + if (sro6.ro_rt != NULL) { + ifp = sro6.ro_rt->rt_ifp; + if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) + maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); + else + maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, + IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + RTFREE(sro6.ro_rt); + } + + return (maxmtu); } #endif /* INET6 */ @@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp) #endif /*IPSEC*/ /* - * Return a pointer to the cached information about the remote host. - * The cached information is stored in the protocol specific part of - * the route metrics. - */ -struct rmxp_tao * -tcp_gettaocache(inc) - struct in_conninfo *inc; -{ - struct rtentry *rt; - -#ifdef INET6 - if (inc->inc_isipv6) - rt = tcp_rtlookup6(inc); - else -#endif /* INET6 */ - rt = tcp_rtlookup(inc); - - /* Make sure this is a host route and is up. */ - if (rt == NULL || - (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) - return NULL; - - return rmx_taop(rt->rt_rmx); -} - -/* - * Clear all the TAO cache entries, called from tcp_init. - * - * XXX - * This routine is just an empty one, because we assume that the routing - * routing tables are initialized at the same time when TCP, so there is - * nothing in the cache left over. - */ -static void -tcp_cleartaocache() -{ -} - -/* * Move a TCP connection into TIME_WAIT state. * tcbinfo is unlocked. * inp is locked, and is unlocked before returning. @@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc, if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); - ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ? 
- inp->in6p_route.ro_rt->rt_ifp : NULL); - error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route, + ip6->ip6_hlim = in6_selecthlim(inp, NULL); + error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif @@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc, m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; - error = ip_output(m, inp->inp_options, &inp->inp_route, + error = ip_output(m, inp->inp_options, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, inp); } if (flags & TH_ACK) diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c index 822ffeb..e2d96e9 100644 --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -202,29 +202,9 @@ static MALLOC_DEFINE(M_SYNCACHE, "syncache", "TCP syncache"); static void syncache_free(struct syncache *sc) { - struct rtentry *rt; - if (sc->sc_ipopts) (void) m_free(sc->sc_ipopts); -#ifdef INET6 - if (sc->sc_inc.inc_isipv6) - rt = sc->sc_route6.ro_rt; - else -#endif - rt = sc->sc_route.ro_rt; - if (rt != NULL) { - /* - * If this is the only reference to a protocol cloned - * route, remove it immediately. 
- */ - if (rt->rt_flags & RTF_WASCLONED && - (sc->sc_flags & SCF_KEEPROUTE) == 0 && - rt->rt_refcnt == 1) - rtrequest(RTM_DELETE, rt_key(rt), - rt->rt_gateway, rt_mask(rt), - rt->rt_flags, NULL); - RTFREE(rt); - } + uma_zfree(tcp_syncache.zone, sc); } @@ -644,8 +624,6 @@ syncache_socket(sc, lso, m) if (oinp->in6p_outputopts) inp->in6p_outputopts = ip6_copypktopts(oinp->in6p_outputopts, M_NOWAIT); - inp->in6p_route = sc->sc_route6; - sc->sc_route6.ro_rt = NULL; MALLOC(sin6, struct sockaddr_in6 *, sizeof *sin6, M_SONAME, M_NOWAIT | M_ZERO); @@ -675,8 +653,6 @@ syncache_socket(sc, lso, m) inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; } - inp->inp_route = sc->sc_route; - sc->sc_route.ro_rt = NULL; MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, M_NOWAIT | M_ZERO); @@ -733,6 +709,10 @@ syncache_socket(sc, lso, m) tp->cc_recv = sc->sc_cc_recv; } + /* + * Set up MSS and get cached values from tcp_hostcache. + * This might overwrite some of the defaults we just set. + */ tcp_mss(tp, sc->sc_peer_mss); /* @@ -811,10 +791,9 @@ resetandabort: #endif m_freem(m); /* XXX only needed for above */ tcpstat.tcps_sc_aborted++; - } else { - sc->sc_flags |= SCF_KEEPROUTE; + } else tcpstat.tcps_sc_completed++; - } + if (sch == NULL) syncache_free(sc); else @@ -849,13 +828,14 @@ syncache_add(inc, to, th, sop, m) struct syncache *sc = NULL; struct syncache_head *sch; struct mbuf *ipopts = NULL; - struct rmxp_tao *taop; + struct rmxp_tao tao; int i, win; INP_INFO_WLOCK_ASSERT(&tcbinfo); so = *sop; tp = sototcpcb(so); + bzero(&tao, sizeof(tao)); /* * Remember the IP options, if any. 
@@ -949,13 +929,11 @@ syncache_add(inc, to, th, sop, m) if (inc->inc_isipv6) { sc->sc_inc.inc6_faddr = inc->inc6_faddr; sc->sc_inc.inc6_laddr = inc->inc6_laddr; - sc->sc_route6.ro_rt = NULL; } else #endif { sc->sc_inc.inc_faddr = inc->inc_faddr; sc->sc_inc.inc_laddr = inc->inc_laddr; - sc->sc_route.ro_rt = NULL; } sc->sc_irs = th->th_seq; sc->sc_flags = 0; @@ -1027,17 +1005,19 @@ syncache_add(inc, to, th, sop, m) * processing: drop SYN, process data and FIN. * - otherwise do a normal 3-way handshake. */ - taop = tcp_gettaocache(&sc->sc_inc); + if (tcp_do_rfc1644) + tcp_hc_gettao(&sc->sc_inc, &tao); + if ((to->to_flags & TOF_CC) != 0) { if (((tp->t_flags & TF_NOPUSH) != 0) && - sc->sc_flags & SCF_CC && - taop != NULL && taop->tao_cc != 0 && - CC_GT(to->to_cc, taop->tao_cc)) { + sc->sc_flags & SCF_CC && tao.tao_cc != 0 && + CC_GT(to->to_cc, tao.tao_cc)) { sc->sc_rxtslot = 0; so = syncache_socket(sc, *sop, m); if (so != NULL) { - sc->sc_flags |= SCF_KEEPROUTE; - taop->tao_cc = to->to_cc; + tao.tao_cc = to->to_cc; + tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC, + tao.tao_cc, 0); *sop = so; } syncache_free(sc); @@ -1047,9 +1027,13 @@ syncache_add(inc, to, th, sop, m) /* * No CC option, but maybe CC.NEW: invalidate cached value. */ - if (taop != NULL) - taop->tao_cc = 0; + if (tcp_do_rfc1644) { + tao.tao_cc = 0; + tcp_hc_updatetao(&sc->sc_inc, TCP_HC_TAO_CC, + tao.tao_cc, 0); + } } + /* * TAO test failed or there was no CC option, * do a standard 3-way handshake. 
@@ -1087,33 +1071,22 @@ syncache_respond(sc, m) int optlen, error; u_int16_t tlen, hlen, mssopt; struct ip *ip = NULL; - struct rtentry *rt; struct tcphdr *th; struct inpcb *inp; #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif + hlen = #ifdef INET6 - if (sc->sc_inc.inc_isipv6) { - rt = tcp_rtlookup6(&sc->sc_inc); - if (rt != NULL) - mssopt = rt->rt_ifp->if_mtu - - (sizeof(struct ip6_hdr) + sizeof(struct tcphdr)); - else - mssopt = tcp_v6mssdflt; - hlen = sizeof(struct ip6_hdr); - } else + (sc->sc_inc.inc_isipv6) ? sizeof(struct ip6_hdr) : #endif - { - rt = tcp_rtlookup(&sc->sc_inc); - if (rt != NULL) - mssopt = rt->rt_ifp->if_mtu - - (sizeof(struct ip) + sizeof(struct tcphdr)); - else - mssopt = tcp_mssdflt; - hlen = sizeof(struct ip); - } + sizeof(struct ip); + + KASSERT((&sc->sc_inc) != NULL, ("syncache_respond with NULL in_conninfo pointer")); + + /* Determine MSS we advertize to other end of connection */ + mssopt = tcp_mssopt(&sc->sc_inc); /* Compute the size of the TCP options. */ if (sc->sc_flags & SCF_NOOPT) { @@ -1244,13 +1217,10 @@ syncache_respond(sc, m) #ifdef INET6 if (sc->sc_inc.inc_isipv6) { - struct route_in6 *ro6 = &sc->sc_route6; - th->th_sum = 0; th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen); - ip6->ip6_hlim = in6_selecthlim(NULL, - ro6->ro_rt ? 
ro6->ro_rt->rt_ifp : NULL); - error = ip6_output(m, NULL, ro6, 0, NULL, NULL, inp); + ip6->ip6_hlim = in6_selecthlim(NULL, NULL); + error = ip6_output(m, NULL, NULL, 0, NULL, NULL, inp); } else #endif { @@ -1268,7 +1238,7 @@ syncache_respond(sc, m) mtod(m, void *), th, 0); } #endif - error = ip_output(m, sc->sc_ipopts, &sc->sc_route, 0, NULL,inp); + error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, inp); } INP_UNLOCK(inp); return (error); @@ -1435,13 +1405,11 @@ syncookie_lookup(inc, th, so) if (inc->inc_isipv6) { sc->sc_inc.inc6_faddr = inc->inc6_faddr; sc->sc_inc.inc6_laddr = inc->inc6_laddr; - sc->sc_route6.ro_rt = NULL; } else #endif { sc->sc_inc.inc_faddr = inc->inc_faddr; sc->sc_inc.inc_laddr = inc->inc_laddr; - sc->sc_route.ro_rt = NULL; } sc->sc_irs = th->th_seq - 1; sc->sc_iss = th->th_ack - 1; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 1a253ab..1eeb66e 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -551,10 +551,8 @@ tcp_timer_rexmt(xtp) if ((tp->t_state == TCPS_SYN_SENT) && (tp->t_rxtshift == 3)) tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC); /* - * If losing, let the lower level know and try for - * a better route. Also, if we backed off this far, - * our srtt estimate is probably bogus. Clobber it - * so we'll take the next rtt measurement as our srtt; + * If we backed off this far, our srtt estimate is probably bogus. + * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current * retransmit times until then. 
*/ @@ -564,7 +562,6 @@ tcp_timer_rexmt(xtp) in6_losing(tp->t_inpcb); else #endif - in_losing(tp->t_inpcb); tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); tp->t_srtt = 0; } diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c index 7ce06f6..dfd6de1 100644 --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -76,6 +76,7 @@ #include <netinet/ip_var.h> #ifdef INET6 #include <netinet6/ip6_var.h> +#include <netinet6/nd6.h> #endif #include <netinet/tcp.h> #include <netinet/tcp_fsm.h> @@ -177,7 +178,6 @@ static int tcp_inflight_stab = 20; SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets"); -static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); static void tcp_discardcb(struct tcpcb *); @@ -215,7 +215,6 @@ tcp_init() int hashsize = TCBHASHSIZE; tcp_ccgen = 1; - tcp_cleartaocache(); tcp_delacktime = TCPTV_DELACK; tcp_keepinit = TCPTV_KEEP_INIT; @@ -262,6 +261,7 @@ tcp_init() uma_zone_set_max(tcptw_zone, maxsockets / 5); tcp_timer_init(); syncache_init(); + tcp_hc_init(); } /* @@ -367,18 +367,14 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) { register int tlen; int win = 0; - struct route *ro = 0; - struct route sro; struct ip *ip; struct tcphdr *nth; #ifdef INET6 - struct route_in6 *ro6 = 0; - struct route_in6 sro6; struct ip6_hdr *ip6; int isipv6; #endif /* INET6 */ int ipflags = 0; - struct inpcb *inp; + struct inpcb *inp = NULL; KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); @@ -398,24 +394,6 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) if (win > (long)TCP_MAXWIN << tp->rcv_scale) win = (long)TCP_MAXWIN << tp->rcv_scale; } -#ifdef INET6 - if (isipv6) - ro6 = &inp->in6p_route; - else -#endif /* INET6 */ - ro = &inp->inp_route; - } else { - inp = NULL; -#ifdef INET6 - if (isipv6) { - ro6 = &sro6; - bzero(ro6, sizeof *ro6); - } else -#endif /* INET6 */ - { - ro = &sro; - bzero(ro, sizeof 
*ro); - } } if (m == 0) { m = m_gethdr(M_DONTWAIT, MT_HEADER); @@ -516,10 +494,7 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) nth->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen - sizeof(struct ip6_hdr)); - ip6->ip6_hlim = in6_selecthlim(inp, - ro6 && ro6->ro_rt ? - ro6->ro_rt->rt_ifp : - NULL); + ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL, NULL); } else #endif /* INET6 */ { @@ -533,21 +508,11 @@ tcp_respond(tp, ipgen, th, m, ack, seq, flags) tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0); #endif #ifdef INET6 - if (isipv6) { - (void) ip6_output(m, NULL, ro6, ipflags, NULL, NULL, inp); - if (ro6 == &sro6 && ro6->ro_rt) { - RTFREE(ro6->ro_rt); - ro6->ro_rt = NULL; - } - } else + if (isipv6) + (void) ip6_output(m, NULL, NULL, ipflags, NULL, NULL, inp); + else #endif /* INET6 */ - { - (void) ip_output(m, NULL, ro, ipflags, NULL, inp); - if (ro == &sro && ro->ro_rt) { - RTFREE(ro->ro_rt); - ro->ro_rt = NULL; - } - } + (void) ip_output(m, NULL, NULL, ipflags, NULL, inp); } /* @@ -647,8 +612,6 @@ tcp_discardcb(tp) #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - struct rtentry *rt; - int dosavessthresh; /* * Make sure that all of our timers are stopped before we @@ -663,89 +626,34 @@ tcp_discardcb(tp) /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. - * 'Enough' is arbitrarily defined as the 16 samples. - * 16 samples is enough for the srtt filter to converge - * to within 5% of the correct value; fewer samples and - * we could save a very bogus rtt. - * - * Don't update the default route's characteristics and don't - * update anything that the user "locked". + * 'Enough' is arbitrarily defined as 4 rtt samples. + * 4 samples is enough for the srtt filter to converge + * to within enough % of the correct value; fewer samples + * and we could save a bogus rtt. The danger is not high + * as tcp quickly recovers from everything. 
+ * XXX: Works very well but needs some more statistics! */ - if (tp->t_rttupdated >= 16) { - register u_long i = 0; -#ifdef INET6 - if (isipv6) { - struct sockaddr_in6 *sin6; + if (tp->t_rttupdated >= 4) { + struct hc_metrics_lite metrics; + u_long ssthresh; - if ((rt = inp->in6p_route.ro_rt) == NULL) - goto no_valid_rt; - sin6 = (struct sockaddr_in6 *)rt_key(rt); - if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) - goto no_valid_rt; - } - else -#endif /* INET6 */ - if ((rt = inp->inp_route.ro_rt) == NULL || - ((struct sockaddr_in *)rt_key(rt))->sin_addr.s_addr - == INADDR_ANY) - goto no_valid_rt; - - if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) { - i = tp->t_srtt * - (RTM_RTTUNIT / (hz * TCP_RTT_SCALE)); - if (rt->rt_rmx.rmx_rtt && i) - /* - * filter this update to half the old & half - * the new values, converting scale. - * See route.h and tcp_var.h for a - * description of the scaling constants. - */ - rt->rt_rmx.rmx_rtt = - (rt->rt_rmx.rmx_rtt + i) / 2; - else - rt->rt_rmx.rmx_rtt = i; - tcpstat.tcps_cachedrtt++; - } - if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) { - i = tp->t_rttvar * - (RTM_RTTUNIT / (hz * TCP_RTTVAR_SCALE)); - if (rt->rt_rmx.rmx_rttvar && i) - rt->rt_rmx.rmx_rttvar = - (rt->rt_rmx.rmx_rttvar + i) / 2; - else - rt->rt_rmx.rmx_rttvar = i; - tcpstat.tcps_cachedrttvar++; - } + bzero(&metrics, sizeof(metrics)); /* - * The old comment here said: - * update the pipelimit (ssthresh) if it has been updated - * already or if a pipesize was specified & the threshhold - * got below half the pipesize. I.e., wait for bad news - * before we start updating, then update on both good - * and bad news. - * - * But we want to save the ssthresh even if no pipesize is - * specified explicitly in the route, because such - * connections still have an implicit pipesize specified - * by the global tcp_sendspace. In the absence of a reliable - * way to calculate the pipesize, it will have to do. 
+ * Update the ssthresh always when the conditions below + * are satisfied. This gives us better new start value + * for the congestion avoidance for new connections. + * ssthresh is only set if packet loss occured on a session. */ - i = tp->snd_ssthresh; - if (rt->rt_rmx.rmx_sendpipe != 0) - dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2); - else - dosavessthresh = (i < so->so_snd.sb_hiwat / 2); - if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 && - i != 0 && rt->rt_rmx.rmx_ssthresh != 0) - || dosavessthresh) { + ssthresh = tp->snd_ssthresh; + if (ssthresh != 0 && ssthresh < so->so_snd.sb_hiwat / 2) { /* * convert the limit from user data bytes to * packets then to packet data bytes. */ - i = (i + tp->t_maxseg / 2) / tp->t_maxseg; - if (i < 2) - i = 2; - i *= (u_long)(tp->t_maxseg + + ssthresh = (ssthresh + tp->t_maxseg / 2) / tp->t_maxseg; + if (ssthresh < 2) + ssthresh = 2; + ssthresh *= (u_long)(tp->t_maxseg + #ifdef INET6 (isipv6 ? sizeof (struct ip6_hdr) + sizeof (struct tcphdr) : @@ -755,15 +663,21 @@ tcp_discardcb(tp) ) #endif ); - if (rt->rt_rmx.rmx_ssthresh) - rt->rt_rmx.rmx_ssthresh = - (rt->rt_rmx.rmx_ssthresh + i) / 2; - else - rt->rt_rmx.rmx_ssthresh = i; - tcpstat.tcps_cachedssthresh++; - } + } else + ssthresh = 0; + metrics.rmx_ssthresh = ssthresh; + + metrics.rmx_rtt = tp->t_srtt; + metrics.rmx_rttvar = tp->t_rttvar; + /* XXX: This wraps if the pipe is more than 4 Gbit per second */ + metrics.rmx_bandwidth = tp->snd_bandwidth; + metrics.rmx_cwnd = tp->snd_cwnd; + metrics.rmx_sendpipe = 0; + metrics.rmx_recvpipe = 0; + + tcp_hc_update(&inp->inp_inc, &metrics); } - no_valid_rt: + /* free the reassembly queue, if any */ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); @@ -1138,10 +1052,17 @@ tcp_ctlinput(cmd, sa, vip) notify = tcp_drop_syn_sent; else if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc; - else if (PRC_IS_REDIRECT(cmd)) { - ip = 0; - notify = in_rtchange; - } else if (cmd == PRC_HOSTDEAD) + /* + * Redirects don't need 
to be handled up here. + */ + else if (PRC_IS_REDIRECT(cmd)) + return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections. + */ + else if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; @@ -1379,23 +1300,28 @@ tcp_mtudisc(inp, errno) int errno; { struct tcpcb *tp = intotcpcb(inp); - struct rtentry *rt; - struct rmxp_tao *taop; + struct rmxp_tao tao; struct socket *so = inp->inp_socket; - int offered; + u_int maxmtu; + u_int romtu; int mss; #ifdef INET6 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ + bzero(&tao, sizeof(tao)); if (tp) { + maxmtu = tcp_hc_getmtu(&inp->inp_inc); /* IPv4 and IPv6 */ + romtu = #ifdef INET6 - if (isipv6) - rt = tcp_rtlookup6(&inp->inp_inc); - else + isipv6 ? tcp_maxmtu6(&inp->inp_inc) : #endif /* INET6 */ - rt = tcp_rtlookup(&inp->inp_inc); - if (!rt || !rt->rt_rmx.rmx_mtu) { + tcp_maxmtu(&inp->inp_inc); + if (!maxmtu) + maxmtu = romtu; + else + maxmtu = min(maxmtu, romtu); + if (!maxmtu) { tp->t_maxopd = tp->t_maxseg = #ifdef INET6 isipv6 ? tcp_v6mssdflt : @@ -1403,9 +1329,7 @@ tcp_mtudisc(inp, errno) tcp_mssdflt; return inp; } - taop = rmx_taop(rt->rt_rmx); - offered = taop->tao_mssopt; - mss = rt->rt_rmx.rmx_mtu - + mss = maxmtu - #ifdef INET6 (isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) : @@ -1416,8 +1340,11 @@ tcp_mtudisc(inp, errno) #endif /* INET6 */ ; - if (offered) - mss = min(mss, offered); + if (tcp_do_rfc1644) { + tcp_hc_gettao(&inp->inp_inc, &tao); + if (tao.tao_mssopt) + mss = min(mss, tao.tao_mssopt); + } /* * XXX - The above conditional probably violates the TCP * spec. The problem is that, since we don't know the @@ -1471,50 +1398,65 @@ tcp_mtudisc(inp, errno) * is called by TCP routines that access the rmx structure and by tcp_mss * to get the interface MTU. 
*/ -struct rtentry * -tcp_rtlookup(inc) +u_long +tcp_maxmtu(inc) struct in_conninfo *inc; { - struct route *ro; - struct rtentry *rt; - - ro = &inc->inc_route; - rt = ro->ro_rt; - if (rt == NULL || !(rt->rt_flags & RTF_UP)) { - /* No route yet, so try to acquire one */ - if (inc->inc_faddr.s_addr != INADDR_ANY) { - ro->ro_dst.sa_family = AF_INET; - ro->ro_dst.sa_len = sizeof(struct sockaddr_in); - ((struct sockaddr_in *) &ro->ro_dst)->sin_addr = - inc->inc_faddr; - rtalloc(ro); - rt = ro->ro_rt; - } + struct route sro; + struct sockaddr_in *dst; + struct ifnet *ifp; + u_long maxmtu = 0; + + KASSERT(inc != NULL, ("tcp_maxmtu with NULL in_conninfo pointer")); + + sro.ro_rt = NULL; + if (inc->inc_faddr.s_addr != INADDR_ANY) { + dst = (struct sockaddr_in *)&sro.ro_dst; + dst->sin_family = AF_INET; + dst->sin_len = sizeof(*dst); + dst->sin_addr = inc->inc_faddr; + rtalloc_ign(&sro, RTF_CLONING); + } + if (sro.ro_rt != NULL) { + ifp = sro.ro_rt->rt_ifp; + if (sro.ro_rt->rt_rmx.rmx_mtu == 0) + maxmtu = ifp->if_mtu; + else + maxmtu = min(sro.ro_rt->rt_rmx.rmx_mtu, ifp->if_mtu); + RTFREE(sro.ro_rt); } - return rt; + return (maxmtu); } #ifdef INET6 -struct rtentry * -tcp_rtlookup6(inc) +u_long +tcp_maxmtu6(inc) struct in_conninfo *inc; { - struct route_in6 *ro6; - struct rtentry *rt; - - ro6 = &inc->inc6_route; - rt = ro6->ro_rt; - if (rt == NULL || !(rt->rt_flags & RTF_UP)) { - /* No route yet, so try to acquire one */ - if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { - ro6->ro_dst.sin6_family = AF_INET6; - ro6->ro_dst.sin6_len = sizeof(struct sockaddr_in6); - ro6->ro_dst.sin6_addr = inc->inc6_faddr; - rtalloc((struct route *)ro6); - rt = ro6->ro_rt; - } + struct route_in6 sro6; + struct ifnet *ifp; + u_long maxmtu = 0; + + KASSERT(inc != NULL, ("tcp_maxmtu6 with NULL in_conninfo pointer")); + + sro6.ro_rt = NULL; + if (!IN6_IS_ADDR_UNSPECIFIED(&inc->inc6_faddr)) { + sro6.ro_dst.sin6_family = AF_INET6; + sro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); + 
sro6.ro_dst.sin6_addr = inc->inc6_faddr; + rtalloc_ign((struct route *)&sro6, RTF_CLONING); } - return rt; + if (sro6.ro_rt != NULL) { + ifp = sro6.ro_rt->rt_ifp; + if (sro6.ro_rt->rt_rmx.rmx_mtu == 0) + maxmtu = IN6_LINKMTU(sro6.ro_rt->rt_ifp); + else + maxmtu = min(sro6.ro_rt->rt_rmx.rmx_mtu, + IN6_LINKMTU(sro6.ro_rt->rt_ifp)); + RTFREE(sro6.ro_rt); + } + + return (maxmtu); } #endif /* INET6 */ @@ -1563,45 +1505,6 @@ ipsec_hdrsiz_tcp(tp) #endif /*IPSEC*/ /* - * Return a pointer to the cached information about the remote host. - * The cached information is stored in the protocol specific part of - * the route metrics. - */ -struct rmxp_tao * -tcp_gettaocache(inc) - struct in_conninfo *inc; -{ - struct rtentry *rt; - -#ifdef INET6 - if (inc->inc_isipv6) - rt = tcp_rtlookup6(inc); - else -#endif /* INET6 */ - rt = tcp_rtlookup(inc); - - /* Make sure this is a host route and is up. */ - if (rt == NULL || - (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) - return NULL; - - return rmx_taop(rt->rt_rmx); -} - -/* - * Clear all the TAO cache entries, called from tcp_init. - * - * XXX - * This routine is just an empty one, because we assume that the routing - * routing tables are initialized at the same time when TCP, so there is - * nothing in the cache left over. - */ -static void -tcp_cleartaocache() -{ -} - -/* * Move a TCP connection into TIME_WAIT state. * tcbinfo is unlocked. * inp is locked, and is unlocked before returning. @@ -1822,9 +1725,8 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc, if (isipv6) { th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), sizeof(struct tcphdr) + optlen); - ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ? 
- inp->in6p_route.ro_rt->rt_ifp : NULL); - error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route, + ip6->ip6_hlim = in6_selecthlim(inp, NULL); + error = ip6_output(m, inp->in6p_outputopts, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); } else #endif @@ -1834,7 +1736,7 @@ tcp_twrespond(struct tcptw *tw, struct socket *so, struct mbuf *msrc, m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); ip->ip_len = m->m_pkthdr.len; - error = ip_output(m, inp->inp_options, &inp->inp_route, + error = ip_output(m, inp->inp_options, NULL, (tw->tw_so_options & SO_DONTROUTE), NULL, inp); } if (flags & TH_ACK) diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index 7035227..17566c8 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -848,12 +848,13 @@ tcp_connect(tp, nam, td) struct inpcb *inp = tp->t_inpcb, *oinp; struct socket *so = inp->inp_socket; struct tcptw *otw; - struct rmxp_tao *taop; - struct rmxp_tao tao_noncached; + struct rmxp_tao tao; struct in_addr laddr; u_short lport; int error; + bzero(&tao, sizeof(tao)); + if (inp->inp_lport == 0) { error = in_pcbbind(inp, (struct sockaddr *)0, td); if (error) @@ -902,20 +903,22 @@ tcp_connect(tp, nam, td) * Generate a CC value for this connection and * check whether CC or CCnew should be used. 
*/ - if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { - taop = &tao_noncached; - bzero(taop, sizeof(*taop)); - } + if (tcp_do_rfc1644) + tcp_hc_gettao(&inp->inp_inc, &tao); tp->cc_send = CC_INC(tcp_ccgen); - if (taop->tao_ccsent != 0 && - CC_GEQ(tp->cc_send, taop->tao_ccsent)) { - taop->tao_ccsent = tp->cc_send; + if (tao.tao_ccsent != 0 && + CC_GEQ(tp->cc_send, tao.tao_ccsent)) { + tao.tao_ccsent = tp->cc_send; } else { - taop->tao_ccsent = 0; + tao.tao_ccsent = 0; tp->t_flags |= TF_SENDCCNEW; } + if (tcp_do_rfc1644) + tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, + tao.tao_ccsent, 0); + return 0; } @@ -931,10 +934,11 @@ tcp6_connect(tp, nam, td) struct tcptw *otw; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam; struct in6_addr *addr6; - struct rmxp_tao *taop; - struct rmxp_tao tao_noncached; + struct rmxp_tao tao; int error; + bzero(&tao, sizeof(tao)); + if (inp->inp_lport == 0) { error = in6_pcbbind(inp, (struct sockaddr *)0, td); if (error) @@ -991,19 +995,20 @@ tcp6_connect(tp, nam, td) * Generate a CC value for this connection and * check whether CC or CCnew should be used. 
*/ - if ((taop = tcp_gettaocache(&tp->t_inpcb->inp_inc)) == NULL) { - taop = &tao_noncached; - bzero(taop, sizeof(*taop)); - } + if (tcp_do_rfc1644) + tcp_hc_gettao(&inp->inp_inc, &tao); tp->cc_send = CC_INC(tcp_ccgen); - if (taop->tao_ccsent != 0 && - CC_GEQ(tp->cc_send, taop->tao_ccsent)) { - taop->tao_ccsent = tp->cc_send; + if (tao.tao_ccsent != 0 && + CC_GEQ(tp->cc_send, tao.tao_ccsent)) { + tao.tao_ccsent = tp->cc_send; } else { - taop->tao_ccsent = 0; + tao.tao_ccsent = 0; tp->t_flags |= TF_SENDCCNEW; } + if (tcp_do_rfc1644) + tcp_hc_updatetao(&inp->inp_inc, TCP_HC_TAO_CCSENT, + tao.tao_ccsent, 0); return 0; } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 2e5b3fa..ddcfd3c 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -213,8 +213,6 @@ struct syncache { struct tcpcb *sc_tp; /* tcb for listening socket */ struct mbuf *sc_ipopts; /* source route */ struct in_conninfo sc_inc; /* addresses */ -#define sc_route sc_inc.inc_route -#define sc_route6 sc_inc.inc6_route u_int32_t sc_tsrecent; tcp_cc sc_cc_send; /* holds CC or CCnew */ tcp_cc sc_cc_recv; @@ -232,7 +230,6 @@ struct syncache { #define SCF_TIMESTAMP 0x04 /* negotiated timestamps */ #define SCF_CC 0x08 /* negotiated CC */ #define SCF_UNREACH 0x10 /* icmp unreachable received */ -#define SCF_KEEPROUTE 0x20 /* keep cloned route */ TAILQ_ENTRY(syncache) sc_hash; TAILQ_ENTRY(syncache) sc_timerq; }; @@ -242,6 +239,17 @@ struct syncache_head { u_int sch_length; }; +struct hc_metrics_lite { /* must stay in sync with hc_metrics */ + u_long rmx_mtu; /* MTU for this path */ + u_long rmx_ssthresh; /* outbound gateway buffer limit */ + u_long rmx_rtt; /* estimated round trip time */ + u_long rmx_rttvar; /* estimated rtt variance */ + u_long rmx_bandwidth; /* estimated bandwidth */ + u_long rmx_cwnd; /* congestion window */ + u_long rmx_sendpipe; /* outbound delay-bandwidth product */ + u_long rmx_recvpipe; /* inbound delay-bandwidth product */ +}; + struct tcptw { struct inpcb 
*tw_inpcb; /* XXX back pointer to internet pcb */ tcp_seq snd_nxt; @@ -260,8 +268,7 @@ struct tcptw { }; /* - * The TAO cache entry which is stored in the protocol family specific - * portion of the route metrics. + * The TAO cache entry which is stored in the tcp hostcache. */ struct rmxp_tao { tcp_cc tao_cc; /* latest CC in valid SYN */ @@ -274,7 +281,6 @@ struct rmxp_tao { #define TAOF_UNDEF 0 /* we don't know yet */ #endif /* notyet */ }; -#define rmx_taop(r) ((struct rmxp_tao *)(r).rmx_filler) #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) #define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) @@ -401,6 +407,9 @@ struct tcpstat { u_long tcps_sc_zonefail; /* zalloc() failed */ u_long tcps_sc_sendcookie; /* SYN cookie sent */ u_long tcps_sc_recvcookie; /* SYN cookie received */ + + u_long tcps_hc_added; /* entry added to hostcache */ + u_long tcps_hc_bucketoverflow; /* hostcache per bucket limit hit */ }; /* @@ -451,6 +460,7 @@ struct xtcpcb { { "pcblist", CTLTYPE_STRUCT }, \ { "delacktime", CTLTYPE_INT }, \ { "v6mssdflt", CTLTYPE_INT }, \ + { "maxid", CTLTYPE_INT }, \ } @@ -482,12 +492,12 @@ struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void); void tcp_fasttimo(void); -struct rmxp_tao * - tcp_gettaocache(struct in_conninfo *); void tcp_init(void); void tcp_input(struct mbuf *, int); +u_long tcp_maxmtu(struct in_conninfo *); +u_long tcp_maxmtu6(struct in_conninfo *); void tcp_mss(struct tcpcb *, int); -int tcp_mssopt(struct tcpcb *); +int tcp_mssopt(struct in_conninfo *); struct inpcb * tcp_drop_syn_sent(struct inpcb *, int); struct inpcb * @@ -500,8 +510,6 @@ struct inpcb * void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); int tcp_twrespond(struct tcptw *, struct socket *, struct mbuf *, int); -struct rtentry * - tcp_rtlookup(struct in_conninfo *); void tcp_setpersist(struct tcpcb *); void tcp_slowtimo(void); struct tcptemp * @@ -519,6 +527,20 @@ int syncache_add(struct in_conninfo *, struct 
tcpopt *, struct tcphdr *, struct socket **, struct mbuf *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *); void syncache_badack(struct in_conninfo *); +/* + * All tcp_hc_* functions are IPv4 and IPv6 (via in_conninfo) + */ +void tcp_hc_init(void); +void tcp_hc_get(struct in_conninfo *, struct hc_metrics_lite *); +u_long tcp_hc_getmtu(struct in_conninfo *); +void tcp_hc_gettao(struct in_conninfo *, struct rmxp_tao *); +void tcp_hc_updatemtu(struct in_conninfo *, u_long); +void tcp_hc_update(struct in_conninfo *, struct hc_metrics_lite *); +void tcp_hc_updatetao(struct in_conninfo *, int, tcp_cc, u_short); +/* update which tao field */ +#define TCP_HC_TAO_CC 0x1 +#define TCP_HC_TAO_CCSENT 0x2 +#define TCP_HC_TAO_MSSOPT 0x3 extern struct pr_usrreqs tcp_usrreqs; extern u_long tcp_sendspace; diff --git a/sys/netinet/udp_usrreq.c b/sys/netinet/udp_usrreq.c index 60ec82b..62e6131 100644 --- a/sys/netinet/udp_usrreq.c +++ b/sys/netinet/udp_usrreq.c @@ -544,10 +544,17 @@ udp_ctlinput(cmd, sa, vip) if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) return; - if (PRC_IS_REDIRECT(cmd)) { - ip = 0; - notify = in_rtchange; - } else if (cmd == PRC_HOSTDEAD) + /* + * Redirects don't need to be handled up here. + */ + if (PRC_IS_REDIRECT(cmd)) + return; + /* + * Hostdead is ugly because it goes linearly through all PCBs. + * XXX: We never get this from ICMP, otherwise it makes an + * excellent DoS attack on machines with many connections. 
+ */ + if (cmd == PRC_HOSTDEAD) ip = 0; else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0) return; @@ -873,7 +880,7 @@ udp_output(inp, m, addr, control, td) ((struct ip *)ui)->ip_tos = inp->inp_ip_tos; /* XXX */ udpstat.udps_opackets++; - error = ip_output(m, inp->inp_options, &inp->inp_route, ipflags, + error = ip_output(m, inp->inp_options, NULL, ipflags, inp->inp_moptions, inp); return (error); diff --git a/sys/netinet6/icmp6.c b/sys/netinet6/icmp6.c index 997474e..6baa2db 100644 --- a/sys/netinet6/icmp6.c +++ b/sys/netinet6/icmp6.c @@ -94,6 +94,7 @@ #include <netinet/in_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> +#include <netinet/tcp_var.h> #include <netinet6/in6_ifattach.h> #include <netinet6/in6_pcb.h> #include <netinet6/ip6protosw.h> @@ -1105,8 +1106,7 @@ icmp6_mtudisc_update(ip6cp, validated) struct icmp6_hdr *icmp6 = ip6cp->ip6c_icmp6; struct mbuf *m = ip6cp->ip6c_m; /* will be necessary for scope issue */ u_int mtu = ntohl(icmp6->icmp6_mtu); - struct rtentry *rt = NULL; - struct sockaddr_in6 sin6; + struct in_conninfo inc; #if 0 /* @@ -1131,31 +1131,19 @@ icmp6_mtudisc_update(ip6cp, validated) if (!validated) return; - bzero(&sin6, sizeof(sin6)); - sin6.sin6_family = PF_INET6; - sin6.sin6_len = sizeof(struct sockaddr_in6); - sin6.sin6_addr = *dst; + bzero(&inc, sizeof(inc)); + inc.inc_flags = 1; /* IPv6 */ + inc.inc6_faddr = *dst; /* XXX normally, this won't happen */ if (IN6_IS_ADDR_LINKLOCAL(dst)) { - sin6.sin6_addr.s6_addr16[1] = + inc.inc6_faddr.s6_addr16[1] = htons(m->m_pkthdr.rcvif->if_index); } - /* sin6.sin6_scope_id = XXX: should be set if DST is a scoped addr */ - rt = rtalloc1((struct sockaddr *)&sin6, 0, RTF_CLONING); - - if (rt && (rt->rt_flags & RTF_HOST) && - !(rt->rt_rmx.rmx_locks & RTV_MTU)) { - if (mtu < IPV6_MMTU) { - /* xxx */ - rt->rt_rmx.rmx_locks |= RTV_MTU; - } else if (mtu < rt->rt_ifp->if_mtu && - rt->rt_rmx.rmx_mtu > mtu) { - icmp6stat.icp6s_pmtuchg++; - rt->rt_rmx.rmx_mtu = mtu; - } + + if (mtu >= 
IPV6_MMTU) { + tcp_hc_updatemtu(&inc, mtu); + icmp6stat.icp6s_pmtuchg++; } - if (rt) - rtfree(rt); } /* diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c index 5c7f1f2..b3d58e8 100644 --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -337,8 +337,7 @@ in6_pcbladdr(inp, nam, plocal_addr6) * Is it the intended behavior? */ *plocal_addr6 = in6_selectsrc(sin6, inp->in6p_outputopts, - inp->in6p_moptions, - &inp->in6p_route, + inp->in6p_moptions, NULL, &inp->in6p_laddr, &error); if (*plocal_addr6 == 0) { if (error == 0) @@ -351,10 +350,6 @@ in6_pcbladdr(inp, nam, plocal_addr6) * and exit to caller, that will do the lookup. */ } - - if (inp->in6p_route.ro_rt) - ifp = inp->in6p_route.ro_rt->rt_ifp; - return (0); } @@ -447,8 +442,6 @@ in6_pcbdetach(inp) ip6_freepcbopts(inp->in6p_outputopts); ip6_freemoptions(inp->in6p_moptions); - if (inp->in6p_route.ro_rt) - RTFREE(inp->in6p_route.ro_rt); /* Check and free IPv4 related resources in case of mapped addr */ if (inp->inp_options) (void)m_free(inp->inp_options); @@ -830,26 +823,10 @@ void in6_losing(in6p) struct inpcb *in6p; { - struct rtentry *rt; - struct rt_addrinfo info; - - if ((rt = in6p->in6p_route.ro_rt) != NULL) { - RT_LOCK(rt); - in6p->in6p_route.ro_rt = NULL; - bzero((caddr_t)&info, sizeof(info)); - info.rti_flags = rt->rt_flags; - info.rti_info[RTAX_DST] = rt_key(rt); - info.rti_info[RTAX_GATEWAY] = rt->rt_gateway; - info.rti_info[RTAX_NETMASK] = rt_mask(rt); - rt_missmsg(RTM_LOSING, &info, rt->rt_flags, 0); - if (rt->rt_flags & RTF_DYNAMIC) - rtexpunge(rt); - RTFREE_LOCKED(rt); - /* - * A new route can be allocated - * the next time output is attempted. - */ - } + /* + * We don't store route pointers in the routing table anymore + */ + return; } /* @@ -861,14 +838,9 @@ in6_rtchange(inp, errno) struct inpcb *inp; int errno; { - if (inp->in6p_route.ro_rt) { - RTFREE(inp->in6p_route.ro_rt); - inp->in6p_route.ro_rt = 0; - /* - * A new route can be allocated the next time - * output is attempted. 
- */ - } + /* + * We don't store route pointers in the routing table anymore + */ return inp; } diff --git a/sys/netinet6/in6_rmx.c b/sys/netinet6/in6_rmx.c index 09526b2..b68852d 100644 --- a/sys/netinet6/in6_rmx.c +++ b/sys/netinet6/in6_rmx.c @@ -141,8 +141,7 @@ in6_addroute(void *v_arg, void *n_arg, struct radix_node_head *head, } } - if (!rt->rt_rmx.rmx_mtu && !(rt->rt_rmx.rmx_locks & RTV_MTU) - && rt->rt_ifp) + if (!rt->rt_rmx.rmx_mtu && rt->rt_ifp) rt->rt_rmx.rmx_mtu = IN6_LINKMTU(rt->rt_ifp); ret = rn_addroute(v_arg, n_arg, head, treenodes); diff --git a/sys/netinet6/in6_src.c b/sys/netinet6/in6_src.c index d584956..88ace1c 100644 --- a/sys/netinet6/in6_src.c +++ b/sys/netinet6/in6_src.c @@ -211,7 +211,6 @@ in6_selectsrc(dstsock, opts, mopts, ro, laddr, errorp) != 0) { return (NULL); } - /* * determine the appropriate zone id of the source based on * the zone of the destination and the outgoing interface. @@ -449,12 +448,19 @@ in6_selectif(dstsock, opts, mopts, ro, retifp) struct route_in6 *ro; struct ifnet **retifp; { - int error, clone; + int error; + struct route_in6 sro; struct rtentry *rt = NULL; - clone = IN6_IS_ADDR_MULTICAST(&dstsock->sin6_addr) ? 0 : 1; + if (ro == NULL) { + bzero(&sro, sizeof(sro)); + ro = &sro; + } + if ((error = in6_selectroute(dstsock, opts, mopts, ro, retifp, - &rt, clone)) != 0) { + &rt, 0)) != 0) { + if (rt && rt == sro.ro_rt) + RTFREE(rt); return (error); } @@ -476,7 +482,11 @@ in6_selectif(dstsock, opts, mopts, ro, retifp) * We thus reject the case here. */ if (rt && (rt->rt_flags & (RTF_REJECT | RTF_BLACKHOLE))) { - return (rt->rt_flags & RTF_HOST ? EHOSTUNREACH : ENETUNREACH); + int flags = (rt->rt_flags & RTF_HOST ? 
EHOSTUNREACH : ENETUNREACH); + + if (rt && rt == sro.ro_rt) + RTFREE(rt); + return (flags); } /* @@ -489,6 +499,8 @@ in6_selectif(dstsock, opts, mopts, ro, retifp) if (rt && rt->rt_ifa && rt->rt_ifa->ifa_ifp) *retifp = rt->rt_ifa->ifa_ifp; + if (rt && rt == sro.ro_rt) + RTFREE(rt); return (0); } @@ -623,6 +635,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone) sa6 = (struct sockaddr_in6 *)&ro->ro_dst; *sa6 = *dstsock; sa6->sin6_scope_id = 0; + if (clone) { rtalloc((struct route *)ro); } else { @@ -695,7 +708,7 @@ in6_selectroute(dstsock, opts, mopts, ro, retifp, retrt, clone) * 2. (If the outgoing interface is detected) the current * hop limit of the interface specified by router advertisement. * 3. The system default hoplimit. -*/ + */ int in6_selecthlim(in6p, ifp) struct in6pcb *in6p; @@ -705,8 +718,24 @@ in6_selecthlim(in6p, ifp) return (in6p->in6p_hops); else if (ifp) return (ND_IFINFO(ifp)->chlim); - else - return (ip6_defhlim); + else if (in6p && !IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr)) { + struct route_in6 ro6; + struct ifnet *lifp; + + bzero(&ro6, sizeof(ro6)); + ro6.ro_dst.sin6_family = AF_INET6; + ro6.ro_dst.sin6_len = sizeof(struct sockaddr_in6); + ro6.ro_dst.sin6_addr = in6p->in6p_faddr; + rtalloc((struct route *)&ro6); + if (ro6.ro_rt) { + lifp = ro6.ro_rt->rt_ifp; + RTFREE(ro6.ro_rt); + if (lifp) + return (ND_IFINFO(lifp)->chlim); + } else + return (ip6_defhlim); + } + return (ip6_defhlim); } /* diff --git a/sys/netinet6/ip6_output.c b/sys/netinet6/ip6_output.c index b95b197..3072851 100644 --- a/sys/netinet6/ip6_output.c +++ b/sys/netinet6/ip6_output.c @@ -96,6 +96,7 @@ #include <netinet/icmp6.h> #include <netinet6/ip6_var.h> #include <netinet/in_pcb.h> +#include <netinet/tcp_var.h> #include <netinet6/nd6.h> #ifdef IPSEC @@ -661,7 +662,7 @@ skip_ipsec2:; /* XXX rt not locked */ ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; + ro->ro_rt->rt_rmx.rmx_pksent++; if (ro->ro_rt->rt_flags & 
RTF_GATEWAY) dst = (struct sockaddr_in6 *)ro->ro_rt->rt_gateway; m->m_flags &= ~(M_BCAST | M_MCAST); /* just in case */ @@ -757,7 +758,7 @@ skip_ipsec2:; } ia = ifatoia6(ro->ro_rt->rt_ifa); ifp = ro->ro_rt->rt_ifp; - ro->ro_rt->rt_use++; + ro->ro_rt->rt_rmx.rmx_pksent++; RT_UNLOCK(ro->ro_rt); } @@ -1387,11 +1388,20 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp) } if (ro_pmtu->ro_rt) { u_int32_t ifmtu; + struct in_conninfo inc; + + bzero(&inc, sizeof(inc)); + inc.inc_flags = 1; /* IPv6 */ + inc.inc6_faddr = *dst; if (ifp == NULL) ifp = ro_pmtu->ro_rt->rt_ifp; ifmtu = IN6_LINKMTU(ifp); - mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; + mtu = tcp_hc_getmtu(&inc); + if (mtu) + mtu = min(mtu, ro_pmtu->ro_rt->rt_rmx.rmx_mtu); + else + mtu = ro_pmtu->ro_rt->rt_rmx.rmx_mtu; if (mtu == 0) mtu = ifmtu; else if (mtu < IPV6_MMTU) { @@ -1415,8 +1425,7 @@ ip6_getpmtu(ro_pmtu, ro, ifp, dst, mtup, alwaysfragp) * field isn't locked). */ mtu = ifmtu; - if (!(ro_pmtu->ro_rt->rt_rmx.rmx_locks & RTV_MTU)) - ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; + ro_pmtu->ro_rt->rt_rmx.rmx_mtu = mtu; } } else if (ifp) { mtu = IN6_LINKMTU(ifp); @@ -1993,7 +2002,9 @@ do { \ { u_long pmtu = 0; struct ip6_mtuinfo mtuinfo; - struct route_in6 *ro = (struct route_in6 *)&in6p->in6p_route; + struct route_in6 sro; + + bzero(&sro, sizeof(sro)); if (!(so->so_state & SS_ISCONNECTED)) return (ENOTCONN); @@ -2002,8 +2013,10 @@ do { \ * routing, or optional information to specify * the outgoing interface. 
*/ - error = ip6_getpmtu(ro, NULL, NULL, + error = ip6_getpmtu(&sro, NULL, NULL, &in6p->in6p_faddr, &pmtu, NULL); + if (sro.ro_rt) + RTFREE(sro.ro_rt); if (error) break; if (pmtu > IPV6_MAXPACKET) diff --git a/sys/netinet6/udp6_output.c b/sys/netinet6/udp6_output.c index 36a7fba..d905e84 100644 --- a/sys/netinet6/udp6_output.c +++ b/sys/netinet6/udp6_output.c @@ -203,8 +203,7 @@ udp6_output(in6p, m, addr6, control, td) if (!IN6_IS_ADDR_V4MAPPED(faddr)) { laddr = in6_selectsrc(sin6, in6p->in6p_outputopts, - in6p->in6p_moptions, - &in6p->in6p_route, + in6p->in6p_moptions, NULL, &in6p->in6p_laddr, &error); } else laddr = &in6p->in6p_laddr; /* XXX */ @@ -277,9 +276,7 @@ udp6_output(in6p, m, addr6, control, td) ip6->ip6_plen = htons((u_short)plen); #endif ip6->ip6_nxt = IPPROTO_UDP; - ip6->ip6_hlim = in6_selecthlim(in6p, - in6p->in6p_route.ro_rt ? - in6p->in6p_route.ro_rt->rt_ifp : NULL); + ip6->ip6_hlim = in6_selecthlim(in6p, NULL); ip6->ip6_src = *laddr; ip6->ip6_dst = *faddr; @@ -297,7 +294,7 @@ udp6_output(in6p, m, addr6, control, td) goto release; } #endif /* IPSEC */ - error = ip6_output(m, in6p->in6p_outputopts, &in6p->in6p_route, + error = ip6_output(m, in6p->in6p_outputopts, NULL, flags, in6p->in6p_moptions, NULL, in6p); break; case AF_INET: |