summaryrefslogtreecommitdiffstats
path: root/sys/netinet/tcp_subr.c
diff options
context:
space:
mode:
authorjlemon <jlemon@FreeBSD.org>2003-02-19 22:32:43 +0000
committerjlemon <jlemon@FreeBSD.org>2003-02-19 22:32:43 +0000
commita8bc02dcb257f24a8246bb1c31abe58bf12ebd04 (patch)
tree6317b858b0555a797efb2e5b5bd4c2eebbdc0d10 /sys/netinet/tcp_subr.c
parent79a1ebfa6f53d733dc4775c2031ff3d16e53b75c (diff)
downloadFreeBSD-src-a8bc02dcb257f24a8246bb1c31abe58bf12ebd04.zip
FreeBSD-src-a8bc02dcb257f24a8246bb1c31abe58bf12ebd04.tar.gz
Add a TCP TIMEWAIT state which uses less space than a fullblown TCP
control block. Allow the socket and tcpcb structures to be freed earlier than inpcb. Update code to understand an inp w/o a socket. Reviewed by: hsu, silby, jayanth Sponsored by: DARPA, NAI Labs
Diffstat (limited to 'sys/netinet/tcp_subr.c')
-rw-r--r--sys/netinet/tcp_subr.c316
1 files changed, 268 insertions, 48 deletions
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index 3c43acb..dc7730f 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -179,6 +179,7 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW,
static void tcp_cleartaocache(void);
static struct inpcb *tcp_notify(struct inpcb *, int);
+static void tcp_discardcb(struct tcpcb *);
/*
* Target size of TCP PCB hash tables. Must be a power of two.
@@ -191,26 +192,23 @@ static struct inpcb *tcp_notify(struct inpcb *, int);
#endif
/*
- * This is the actual shape of what we allocate using the zone
- * allocator. Doing it this way allows us to protect both structures
- * using the same generation count, and also eliminates the overhead
- * of allocating tcpcbs separately. By hiding the structure here,
- * we avoid changing most of the rest of the code (although it needs
- * to be changed, eventually, for greater efficiency).
+ * XXX
+ * Callouts should be moved into struct tcp directly. They are currently
+ * separate becuase the tcpcb structure is exported to userland for sysctl
+ * parsing purposes, which do not know about callouts.
*/
-#define ALIGNMENT 32
-#define ALIGNM1 (ALIGNMENT - 1)
-struct inp_tp {
- union {
- struct inpcb inp;
- char align[(sizeof(struct inpcb) + ALIGNM1) & ~ALIGNM1];
- } inp_tp_u;
+struct tcpcb_mem {
struct tcpcb tcb;
- struct callout inp_tp_rexmt, inp_tp_persist, inp_tp_keep, inp_tp_2msl;
- struct callout inp_tp_delack;
+ struct callout tcpcb_mem_rexmt, tcpcb_mem_persist, tcpcb_mem_keep;
+ struct callout tcpcb_mem_2msl, tcpcb_mem_delack;
+};
+struct tcptw_mem {
+ struct tcptw tw;
+ struct callout tcptw_mem_2msl;
};
-#undef ALIGNMENT
-#undef ALIGNM1
+
+static uma_zone_t tcpcb_zone;
+static uma_zone_t tcptw_zone;
/*
* Tcp initialization
@@ -244,7 +242,7 @@ tcp_init()
tcbinfo.hashbase = hashinit(hashsize, M_PCB, &tcbinfo.hashmask);
tcbinfo.porthashbase = hashinit(hashsize, M_PCB,
&tcbinfo.porthashmask);
- tcbinfo.ipi_zone = uma_zcreate("tcpcb", sizeof(struct inp_tp),
+ tcbinfo.ipi_zone = uma_zcreate("inpcb", sizeof(struct inpcb),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
uma_zone_set_max(tcbinfo.ipi_zone, maxsockets);
#ifdef INET6
@@ -257,6 +255,15 @@ tcp_init()
if (max_linkhdr + TCP_MINPROTOHDR > MHLEN)
panic("tcp_init");
#undef TCP_MINPROTOHDR
+ /*
+ * These have to be type stable for the benefit of the timers.
+ */
+ tcpcb_zone = uma_zcreate("tcpcb", sizeof(struct tcpcb_mem),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcpcb_zone, maxsockets);
+ tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw_mem),
+ NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+ uma_zone_set_max(tcptw_zone, maxsockets);
syncache_init();
}
@@ -552,16 +559,17 @@ struct tcpcb *
tcp_newtcpcb(inp)
struct inpcb *inp;
{
- struct inp_tp *it;
- register struct tcpcb *tp;
+ struct tcpcb_mem *tm;
+ struct tcpcb *tp;
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- it = (struct inp_tp *)inp;
- tp = &it->tcb;
- bzero((char *) tp, sizeof(struct tcpcb));
- LIST_INIT(&tp->t_segq);
+ tm = uma_zalloc(tcpcb_zone, M_NOWAIT | M_ZERO);
+ if (tm == NULL)
+ return (NULL);
+ tp = &tm->tcb;
+ /* LIST_INIT(&tp->t_segq); */ /* XXX covered by M_ZERO */
tp->t_maxseg = tp->t_maxopd =
#ifdef INET6
isipv6 ? tcp_v6mssdflt :
@@ -569,11 +577,11 @@ tcp_newtcpcb(inp)
tcp_mssdflt;
/* Set up our timeouts. */
- callout_init(tp->tt_rexmt = &it->inp_tp_rexmt, 0);
- callout_init(tp->tt_persist = &it->inp_tp_persist, 0);
- callout_init(tp->tt_keep = &it->inp_tp_keep, 0);
- callout_init(tp->tt_2msl = &it->inp_tp_2msl, 0);
- callout_init(tp->tt_delack = &it->inp_tp_delack, 0);
+ callout_init(tp->tt_rexmt = &tm->tcpcb_mem_rexmt, 0);
+ callout_init(tp->tt_persist = &tm->tcpcb_mem_persist, 0);
+ callout_init(tp->tt_keep = &tm->tcpcb_mem_keep, 0);
+ callout_init(tp->tt_2msl = &tm->tcpcb_mem_2msl, 0);
+ callout_init(tp->tt_delack = &tm->tcpcb_mem_delack, 0);
if (tcp_do_rfc1323)
tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
@@ -628,23 +636,17 @@ tcp_drop(tp, errno)
return (tcp_close(tp));
}
-/*
- * Close a TCP control block:
- * discard all space held by the tcp
- * discard internet protocol block
- * wake up any sleepers
- */
-struct tcpcb *
-tcp_close(tp)
- register struct tcpcb *tp;
+static void
+tcp_discardcb(tp)
+ struct tcpcb *tp;
{
- register struct tseg_qent *q;
+ struct tseg_qent *q;
struct inpcb *inp = tp->t_inpcb;
struct socket *so = inp->inp_socket;
#ifdef INET6
int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
#endif /* INET6 */
- register struct rtentry *rt;
+ struct rtentry *rt;
int dosavessthresh;
/*
@@ -762,20 +764,37 @@ tcp_close(tp)
}
no_valid_rt:
/* free the reassembly queue, if any */
- while((q = LIST_FIRST(&tp->t_segq)) != NULL) {
+ while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
LIST_REMOVE(q, tqe_q);
m_freem(q->tqe_m);
FREE(q, M_TSEGQ);
}
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
+ uma_zfree(tcpcb_zone, tp);
soisdisconnected(so);
+}
+
+/*
+ * Close a TCP control block:
+ * discard all space held by the tcp
+ * discard internet protocol block
+ * wake up any sleepers
+ */
+struct tcpcb *
+tcp_close(tp)
+ struct tcpcb *tp;
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+
+ tcp_discardcb(tp);
#ifdef INET6
if (INP_CHECK_SOCKAF(so, AF_INET6))
in6_pcbdetach(inp);
else
-#endif /* INET6 */
- in_pcbdetach(inp);
+#endif
+ in_pcbdetach(inp);
tcpstat.tcps_closed++;
return ((struct tcpcb *)0);
}
@@ -799,6 +818,8 @@ tcp_drain()
*/
INP_INFO_RLOCK(&tcbinfo);
LIST_FOREACH(inpb, tcbinfo.listhead, inp_list) {
+ if (inpb->inp_vflag & INP_TIMEWAIT)
+ continue;
INP_LOCK(inpb);
if ((tcpb = intotcpcb(inpb))) {
while ((te = LIST_FIRST(&tcpb->t_segq))
@@ -908,7 +929,9 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
inp = LIST_NEXT(inp, inp_list)) {
INP_LOCK(inp);
if (inp->inp_gencnt <= gencnt &&
- cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0)
+ (((inp->inp_vflag & INP_TIMEWAIT) &&
+ cr_cansee(req->td->td_ucred, intotw(inp)->tw_cred) == 0) ||
+ cr_canseesocket(req->td->td_ucred, inp->inp_socket) == 0))
inp_list[i++] = inp;
INP_UNLOCK(inp);
}
@@ -926,12 +949,19 @@ tcp_pcblist(SYSCTL_HANDLER_ARGS)
/* XXX should avoid extra copy */
bcopy(inp, &xt.xt_inp, sizeof *inp);
inp_ppcb = inp->inp_ppcb;
- if (inp_ppcb != NULL)
- bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
- else
+ if (inp_ppcb == NULL)
+ bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
+ else if (inp->inp_vflag & INP_TIMEWAIT) {
bzero((char *) &xt.xt_tp, sizeof xt.xt_tp);
+ xt.xt_tp.t_state = TCPS_TIME_WAIT;
+ } else
+ bcopy(inp_ppcb, &xt.xt_tp, sizeof xt.xt_tp);
if (inp->inp_socket)
sotoxsocket(inp->inp_socket, &xt.xt_socket);
+ else {
+ bzero(&xt.xt_socket, sizeof xt.xt_socket);
+ xt.xt_socket.xso_protocol = IPPROTO_TCP;
+ }
xt.xt_inp.inp_gencnt = inp->inp_gencnt;
error = SYSCTL_OUT(req, &xt, sizeof xt);
}
@@ -1487,7 +1517,7 @@ ipsec_hdrsiz_tcp(tp)
struct ip *ip;
#ifdef INET6
struct ip6_hdr *ip6;
-#endif /* INET6 */
+#endif
struct tcphdr *th;
if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
@@ -1559,6 +1589,196 @@ tcp_cleartaocache()
}
/*
+ * Move a TCP connection into TIME_WAIT state.
+ * tcbinfo is unlocked.
+ * inp is locked, and is unlocked before returning.
+ */
+void
+tcp_twstart(tp)
+ struct tcpcb *tp;
+{
+ struct tcptw_mem *tm;
+ struct tcptw *tw;
+ struct inpcb *inp;
+ int tw_time, acknow;
+ struct socket *so;
+
+ tm = uma_zalloc(tcptw_zone, M_NOWAIT);
+ if (tm == NULL)
+ /* EEEK! -- preserve old structure or just kill everything? */
+ /* must obtain tcbinfo lock in order to drop the structure. */
+ panic("uma_zalloc(tcptw)");
+ tw = &tm->tw;
+ inp = tp->t_inpcb;
+ tw->tw_inpcb = inp;
+
+ /*
+ * Recover last window size sent.
+ */
+ tw->last_win = (tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale;
+
+ /*
+ * Set t_recent if timestamps are used on the connection.
+ */
+ if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
+ (TF_REQ_TSTMP|TF_RCVD_TSTMP))
+ tw->t_recent = tp->ts_recent;
+ else
+ tw->t_recent = 0;
+
+ tw->snd_nxt = tp->snd_nxt;
+ tw->rcv_nxt = tp->rcv_nxt;
+ tw->cc_recv = tp->cc_recv;
+ tw->cc_send = tp->cc_send;
+ tw->t_starttime = tp->t_starttime;
+ callout_init(tw->tt_2msl = &tm->tcptw_mem_2msl, 0);
+
+/* XXX
+ * If this code will
+ * be used for fin-wait-2 state also, then we may need
+ * a ts_recent from the last segment.
+ */
+ /* Shorten TIME_WAIT [RFC-1644, p.28] */
+ if (tp->cc_recv != 0 && (ticks - tp->t_starttime) < tcp_msl) {
+ tw_time = tp->t_rxtcur * TCPTV_TWTRUNC;
+ /* For T/TCP client, force ACK now. */
+ acknow = 1;
+ } else {
+ tw_time = 2 * tcp_msl;
+ acknow = tp->t_flags & TF_ACKNOW;
+ }
+ tcp_discardcb(tp);
+ so = inp->inp_socket;
+ so->so_pcb = NULL;
+ tw->tw_cred = crhold(so->so_cred);
+ tw->tw_so_options = so->so_options;
+ sotryfree(so);
+ inp->inp_socket = NULL;
+ inp->inp_ppcb = (caddr_t)tw;
+ inp->inp_vflag |= INP_TIMEWAIT;
+ callout_reset(tw->tt_2msl, tw_time, tcp_timer_2msl_tw, tw);
+ if (acknow)
+ tcp_twrespond(tw, TH_ACK);
+ INP_UNLOCK(inp);
+}
+
+void
+tcp_twclose(tw)
+ struct tcptw *tw;
+{
+ struct inpcb *inp;
+
+ inp = tw->tw_inpcb;
+ tw->tw_inpcb = NULL;
+ callout_stop(tw->tt_2msl);
+ inp->inp_ppcb = NULL;
+ uma_zfree(tcptw_zone, tw);
+#ifdef INET6
+ if (inp->inp_vflag & INP_IPV6PROTO)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ tcpstat.tcps_closed++;
+}
+
+int
+tcp_twrespond(struct tcptw *tw, int flags)
+{
+ struct inpcb *inp = tw->tw_inpcb;
+ struct tcphdr *th;
+ struct mbuf *m;
+ struct ip *ip = NULL;
+ u_int8_t *optp;
+ u_int hdrlen, optlen;
+ int error;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL;
+ int isipv6 = inp->inp_inc.inc_isipv6;
+#else
+ const int isipv6 = 0;
+#endif
+
+ m = m_gethdr(M_NOWAIT, MT_HEADER);
+ if (m == NULL)
+ return (ENOBUFS);
+ m->m_data += max_linkhdr;
+
+ if (isipv6) {
+ hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
+ ip6 = mtod(m, struct ip6_hdr *);
+ th = (struct tcphdr *)(ip6 + 1);
+ tcpip_fillheaders(inp, ip6, th);
+ } else {
+ hdrlen = sizeof(struct tcpiphdr);
+ ip = mtod(m, struct ip *);
+ th = (struct tcphdr *)(ip + 1);
+ tcpip_fillheaders(inp, ip, th);
+ }
+ optp = (u_int8_t *)(th + 1);
+
+ /*
+ * Send a timestamp and echo-reply if both our side and our peer
+ * have sent timestamps in our SYN's and this is not a RST.
+ */
+ if (tw->t_recent && flags == TH_ACK) {
+ u_int32_t *lp = (u_int32_t *)optp;
+
+ /* Form timestamp option as shown in appendix A of RFC 1323. */
+ *lp++ = htonl(TCPOPT_TSTAMP_HDR);
+ *lp++ = htonl(ticks);
+ *lp = htonl(tw->t_recent);
+ optp += TCPOLEN_TSTAMP_APPA;
+ }
+
+ /*
+ * Send `CC-family' options if needed, and it's not a RST.
+ */
+ if (tw->cc_recv != 0 && flags == TH_ACK) {
+ u_int32_t *lp = (u_int32_t *)optp;
+
+ *lp++ = htonl(TCPOPT_CC_HDR(TCPOPT_CC));
+ *lp = htonl(tw->cc_send);
+ optp += TCPOLEN_CC_APPA;
+ }
+ optlen = optp - (u_int8_t *)(th + 1);
+
+ m->m_len = hdrlen + optlen;
+ m->m_pkthdr.len = m->m_len;
+
+ KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small"));
+
+ th->th_seq = htonl(tw->snd_nxt);
+ th->th_ack = htonl(tw->rcv_nxt);
+ th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
+ th->th_flags = flags;
+ th->th_win = htons(tw->last_win);
+
+ if (isipv6) {
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
+ sizeof(struct tcphdr) + optlen);
+ ip6->ip6_hlim = in6_selecthlim(inp, inp->in6p_route.ro_rt ?
+ inp->in6p_route.ro_rt->rt_ifp : NULL);
+ error = ip6_output(m, inp->in6p_outputopts, &inp->in6p_route,
+ (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp);
+ } else {
+ th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
+ htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP));
+ m->m_pkthdr.csum_flags = CSUM_TCP;
+ m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
+ ip->ip_len = m->m_pkthdr.len;
+ error = ip_output(m, inp->inp_options, &inp->inp_route,
+ (tw->tw_so_options & SO_DONTROUTE), NULL, inp);
+ }
+ if (flags & TH_ACK)
+ tcpstat.tcps_sndacks++;
+ else
+ tcpstat.tcps_sndctrl++;
+ tcpstat.tcps_sndtotal++;
+ return (error);
+}
+
+/*
* TCP BANDWIDTH DELAY PRODUCT WINDOW LIMITING
*
* This code attempts to calculate the bandwidth-delay product as a
OpenPOWER on IntegriCloud