From a8bc02dcb257f24a8246bb1c31abe58bf12ebd04 Mon Sep 17 00:00:00 2001 From: jlemon Date: Wed, 19 Feb 2003 22:32:43 +0000 Subject: Add a TCP TIMEWAIT state which uses less space than a fullblown TCP control block. Allow the socket and tcpcb structures to be freed earlier than inpcb. Update code to understand an inp w/o a socket. Reviewed by: hsu, silby, jayanth Sponsored by: DARPA, NAI Labs --- sys/netinet/tcp_subr.c | 316 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 268 insertions(+), 48 deletions(-) (limited to 'sys/netinet/tcp_subr.c') diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index 3c43acb..dc7730f 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -179,6 +179,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, static void tcp_cleartaocache(void); static struct inpcb *tcp_notify(struct inpcb *, int); +static void tcp_discardcb(struct tcpcb *); /* * Target size of TCP PCB hash tables. Must be a power of two. @@ -191,26 +192,23 @@ static struct inpcb *tcp_notify(struct inpcb *, int); #endif /* - * This is the actual shape of what we allocate using the zone - * allocator. Doing it this way allows us to protect both structures - * using the same generation count, and also eliminates the overhead - * of allocating tcpcbs separately. By hiding the structure here, - * we avoid changing most of the rest of the code (although it needs - * to be changed, eventually, for greater efficiency). + * XXX + * Callouts should be moved into struct tcp directly. They are currently + * separate becuase the tcpcb structure is exported to userland for sysctl + * parsing purposes, which do not know about callouts. */ -#define ALIGNMENT 32 -#define ALIGNM1 (ALIGNMENT - 1) -struct inp_tp { - union { - struct inpcb inp; - char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1]; - } inp_tp_u; +struct tcpcb_mem { struct tcpcb tcb; - struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl; - struct callout inp_tp_delack; + struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep; + struct callout tcpcb_mem_2msl, tcpcb_mem_delack; +}; +struct tcptw_mem { + struct tcptw tw; + struct callout tcptw_mem_2msl; }; -#undef ALIGNMENT -#undef ALIGNM1 + +static uma_zone_t tcpcb_zone; +static uma_zone_t tcptw_zone; /* * Tcp initialization @@ -244,7 +242,7 @@ tcp_init() tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask); tcbinfo.porthashbase = hashinit(hashsize, M_PCB, &tcbinfo.porthashmask); - tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp), + tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); uma_zone_set_max(tcbinfo.ipi_zone, maxsockets); #ifdef INET6 @@ -257,6 +255,15 @@ tcp_init() if (max_linkhdr + TCP_MINPROTOHDR > MHLEN) panic("tcp_init"); #undef TCP_MINPROTOHDR + /* + * These have to be type stable for the benefit of the timers. + */ + tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcpcb_zone, maxsockets); + tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw_mem), + NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE); + uma_zone_set_max(tcptw_zone, maxsockets); syncache_init(); } @@ -552,16 +559,17 @@ struct tcpcb * tcp_newtcpcb(inp) struct inpcb *inp; { - struct inp_tp *it; - register struct tcpcb *tp; + struct tcpcb_mem *tm; + struct tcpcb *tp; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - it = (struct inp_tp *)inp; - tp = &it->tcb; - bzero((char *) tp, sizeof(struct tcpcb)); - LIST_INIT(&tp->t_segq); + tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO); + if (tm == NULL) + return (NULL); + tp = &tm->tcb; + /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */ tp->t_maxseg = tp->t_maxopd = #ifdef INET6 isipv6 ? tcp_v6mssdflt : @@ -569,11 +577,11 @@ tcp_newtcpcb(inp) tcp_mssdflt; /* Set up our timeouts. */ - callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0); - callout_init(tp->tt_persist = &it->inp_tp_persist, 0); - callout_init(tp->tt_keep = &it->inp_tp_keep, 0); - callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0); - callout_init(tp->tt_delack = &it->inp_tp_delack, 0); + callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0); + callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0); + callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0); + callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0); + callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0); if (tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); @@ -628,23 +636,17 @@ tcp_drop(tp, errno) return (tcp_close(tp)); } -/* - * Close a TCP control block: - * discard all space held by the tcp - * discard internet protocol block - * wake up any sleepers - */ -struct tcpcb * -tcp_close(tp) - register struct tcpcb *tp; +static void +tcp_discardcb(tp) + struct tcpcb *tp; { - register struct tseg_qent *q; + struct tseg_qent *q; struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; #ifdef INET6 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0; #endif /* INET6 */ - register struct rtentry *rt; + struct rtentry *rt; int dosavessthresh; /* @@ -762,20 +764,37 @@ tcp_close(tp) } no_valid_rt: /* free the reassembly queue, if any */ - while((q = LIST_FIRST(&tp->t_segq)) != NULL) { + while ((q = LIST_FIRST(&tp->t_segq)) != NULL) { LIST_REMOVE(q, tqe_q); m_freem(q->tqe_m); FREE(q, M_TSEGQ); } inp->inp_ppcb = NULL; tp->t_inpcb = NULL; + uma_zfree(tcpcb_zone, tp); soisdisconnected(so); +} + +/* + * Close a TCP control block: + * discard all space held by the tcp + * discard internet protocol block + * wake up any sleepers + */ +struct tcpcb * +tcp_close(tp) + struct tcpcb *tp; +{ + struct inpcb *inp = tp->t_inpcb; + struct socket *so = inp->inp_socket; + + tcp_discardcb(tp); #ifdef INET6 if (INP_CHECK_SOCKAF(so, AF_INET6)) in6_pcbdetach(inp); else -#endif /* INET6 */ - in_pcbdetach(inp); +#endif + in_pcbdetach(inp); tcpstat.tcps_closed++; return ((struct tcpcb *)0); } @@ -799,6 +818,8 @@ tcp_drain() */ INP_INFO_RLOCK(&tcbinfo); LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) { + if (inpb->inp_vflag & INP_TIMEWAIT) + continue; INP_LOCK(inpb); if ((tcpb = intotcpcb(inpb))) { while ((te = LIST_FIRST(&tcpb->t_segq)) @@ -908,7 +929,9 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) inp = LIST_NEXT(inp, inp_list)) { INP_LOCK(inp); if (inp->inp_gencnt <= gencnt && - cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0) + (((inp->inp_vflag & INP_TIMEWAIT) && + cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred) == 0) || + cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)) inp_list[i++] = inp; INP_UNLOCK(inp); } @@ -926,12 +949,19 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS) /* XXX should avoid extra copy */ bcopy(inp, &xt.xt_inp, sizeof *inp); inp_ppcb = inp->inp_ppcb; - if (inp_ppcb != NULL) - bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); - else + if (inp_ppcb == NULL) + bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + else if (inp->inp_vflag & INP_TIMEWAIT) { bzero((char *) &xt.xt_tp, sizeof xt.xt_tp); + xt.xt_tp.t_state = TCPS_TIME_WAIT; + } else + bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp); if (inp->inp_socket) sotoxsocket(inp->inp_socket, &xt.xt_socket); + else { + bzero(&xt.xt_socket, sizeof xt.xt_socket); + xt.xt_socket.xso_protocol = IPPROTO_TCP; + } xt.xt_inp.inp_gencnt = inp->inp_gencnt; error = SYSCTL_OUT(req, &xt, sizeof xt); } @@ -1487,7 +1517,7 @@ ipsec_hdrsiz_tcp(tp) struct ip *ip; #ifdef INET6 struct ip6_hdr *ip6; -#endif /* INET6 */ +#endif struct tcphdr *th; if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) @@ -1559,6 +1589,196 @@ tcp_cleartaocache() } /* + * Move a TCP connection into TIME_WAIT state. + * tcbinfo is unlocked. + * inp is locked, and is unlocked before returning. + */ +void +tcp_twstart(tp) + struct tcpcb *tp; +{ + struct tcptw_mem *tm; + struct tcptw *tw; + struct inpcb *inp; + int tw_time, acknow; + struct socket *so; + + tm = uma_zalloc(tcptw_zone, M_NOWAIT); + if (tm == NULL) + /* EEEK! -- preserve old structure or just kill everything? */ + /* must obtain tcbinfo lock in order to drop the structure. */ + panic("uma_zalloc(tcptw)"); + tw = &tm->tw; + inp = tp->t_inpcb; + tw->tw_inpcb = inp; + + /* + * Recover last window size sent. + */ + tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale; + + /* + * Set t_recent if timestamps are used on the connection. + */ + if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == + (TF_REQ_TSTMP|TF_RCVD_TSTMP)) + tw->t_recent = tp->ts_recent; + else + tw->t_recent = 0; + + tw->snd_nxt = tp->snd_nxt; + tw->rcv_nxt = tp->rcv_nxt; + tw->cc_recv = tp->cc_recv; + tw->cc_send = tp->cc_send; + tw->t_starttime = tp->t_starttime; + callout_init(tw->tt_2msl = &tm->tcptw_mem_2msl, 0); + +/* XXX + * If this code will + * be used for fin-wait-2 state also, then we may need + * a ts_recent from the last segment. + */ + /* Shorten TIME_WAIT [RFC-1644, p.28] */ + if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) { + tw_time = tp->t_rxtcur * TCPTV_TWTRUNC; + /* For T/TCP client, force ACK now. */ + acknow = 1; + } else { + tw_time = 2 * tcp_msl; + acknow = tp->t_flags & TF_ACKNOW; + } + tcp_discardcb(tp); + so = inp->inp_socket; + so->so_pcb = NULL; + tw->tw_cred = crhold(so->so_cred); + tw->tw_so_options = so->so_options; + sotryfree(so); + inp->inp_socket = NULL; + inp->inp_ppcb = (caddr_t)tw; + inp->inp_vflag |= INP_TIMEWAIT; + callout_reset(tw->tt_2msl, tw_time, tcp_timer_2msl_tw, tw); + if (acknow) + tcp_twrespond(tw, TH_ACK); + INP_UNLOCK(inp); +} + +void +tcp_twclose(tw) + struct tcptw *tw; +{ + struct inpcb *inp; + + inp = tw->tw_inpcb; + tw->tw_inpcb = NULL; + callout_stop(tw->tt_2msl); + inp->inp_ppcb = NULL; + uma_zfree(tcptw_zone, tw); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) + in6_pcbdetach(inp); + else +#endif + in_pcbdetach(inp); + tcpstat.tcps_closed++; +} + +int +tcp_twrespond(struct tcptw *tw, int flags) +{ + struct inpcb *inp = tw->tw_inpcb; + struct tcphdr *th; + struct mbuf *m; + struct ip *ip = NULL; + u_int8_t *optp; + u_int hdrlen, optlen; + int error; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + int isipv6 = inp->inp_inc.inc_isipv6; +#else + const int isipv6 = 0; +#endif + + m = m_gethdr(M_NOWAIT, MT_HEADER); + if (m == NULL) + return (ENOBUFS); + m->m_data += max_linkhdr; + + if (isipv6) { + hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + tcpip_fillheaders(inp, ip6, th); + } else { + hdrlen = sizeof(struct tcpiphdr); + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(inp, ip, th); + } + optp = (u_int8_t *)(th + 1); + + /* + * Send a timestamp and echo-reply if both our side and our peer + * have sent timestamps in our SYN's and this is not a RST. + */ + if (tw->t_recent && flags == TH_ACK) { + u_int32_t *lp = (u_int32_t *)optp; + + /* Form timestamp option as shown in appendix A of RFC 1323. */ + *lp++ = htonl(TCPOPT_TSTAMP_HDR); + *lp++ = htonl(ticks); + *lp = htonl(tw->t_recent); + optp += TCPOLEN_TSTAMP_APPA; + } + + /* + * Send `CC-family' options if needed, and it's not a RST. + */ + if (tw->cc_recv != 0 && flags == TH_ACK) { + u_int32_t *lp = (u_int32_t *)optp; + + *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC)); + *lp = htonl(tw->cc_send); + optp += TCPOLEN_CC_APPA; + } + optlen = optp - (u_int8_t *)(th + 1); + + m->m_len = hdrlen + optlen; + m->m_pkthdr.len = m->m_len; + + KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); + + th->th_seq = htonl(tw->snd_nxt); + th->th_ack = htonl(tw->rcv_nxt); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + th->th_flags = flags; + th->th_win = htons(tw->last_win); + + if (isipv6) { + th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), + sizeof(struct tcphdr) + optlen); + ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ? + inp->in6p_route.ro_rt->rt_ifp : NULL); + error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route, + (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); + } else { + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + ip->ip_len = m->m_pkthdr.len; + error = ip_output(m, inp->inp_options, &inp->inp_route, + (tw->tw_so_options & SO_DONTROUTE), NULL, inp); + } + if (flags & TH_ACK) + tcpstat.tcps_sndacks++; + else + tcpstat.tcps_sndctrl++; + tcpstat.tcps_sndtotal++; + return (error); +} + +/* * TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING * * This code attempts to calculate the bandwidth-delay product as a -- cgit v1.1