summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--UPDATING7
-rw-r--r--sys/conf/files2
-rw-r--r--sys/netinet/cc.h161
-rw-r--r--sys/netinet/cc/cc.c340
-rw-r--r--sys/netinet/cc/cc_module.h70
-rw-r--r--sys/netinet/cc/cc_newreno.c231
-rw-r--r--sys/netinet/tcp_input.c484
-rw-r--r--sys/netinet/tcp_output.c24
-rw-r--r--sys/netinet/tcp_sack.c2
-rw-r--r--sys/netinet/tcp_subr.c33
-rw-r--r--sys/netinet/tcp_timer.c49
-rw-r--r--sys/netinet/tcp_usrreq.c62
-rw-r--r--sys/netinet/tcp_var.h30
-rw-r--r--sys/sys/param.h2
14 files changed, 1209 insertions, 288 deletions
diff --git a/UPDATING b/UPDATING
index aa8e590..5009b09 100644
--- a/UPDATING
+++ b/UPDATING
@@ -22,6 +22,13 @@ NOTE TO PEOPLE WHO THINK THAT FreeBSD 9.x IS SLOW:
machines to maximize performance. (To disable malloc debugging, run
ln -s aj /etc/malloc.conf.)
+20101111:
+ The TCP stack has received a significant update to add support for
+ modularised congestion control and generally improve the clarity of
+ congestion control decisions. Bump __FreeBSD_version to 900025. User
+ space tools that rely on the size of struct tcpcb in tcp_var.h (e.g.
+ sockstat) need to be recompiled.
+
20101002:
The man(1) utility has been replaced by a new version that no longer
uses /etc/manpath.config. Please consult man.conf(5) for how to
diff --git a/sys/conf/files b/sys/conf/files
index ce2eb82..c859ec8 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -2598,6 +2598,8 @@ netinet/ip_mroute.c optional mrouting inet | mrouting inet6
netinet/ip_options.c optional inet
netinet/ip_output.c optional inet
netinet/raw_ip.c optional inet
+netinet/cc/cc.c optional inet
+netinet/cc/cc_newreno.c optional inet
netinet/sctp_asconf.c optional inet sctp
netinet/sctp_auth.c optional inet sctp
netinet/sctp_bsd_addr.c optional inet sctp
diff --git a/sys/netinet/cc.h b/sys/netinet/cc.h
new file mode 100644
index 0000000..6f24f11
--- /dev/null
+++ b/sys/netinet/cc.h
@@ -0,0 +1,161 @@
+/*-
+ * Copyright (c) 2007-2008
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University's
+ * Centre for Advanced Internet Architectures, Melbourne, Australia, which was
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley. More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#ifndef _NETINET_CC_H_
+#define _NETINET_CC_H_
+
+/* XXX: TCP_CA_NAME_MAX define lives in tcp.h for compat reasons. */
+#include <netinet/tcp.h>
+
+/* Global CC vars. */
+extern STAILQ_HEAD(cc_head, cc_algo) cc_list;
+extern const int tcprexmtthresh;
+extern struct cc_algo newreno_cc_algo;
+
+/* Define the new net.inet.tcp.cc sysctl tree. */
+SYSCTL_DECL(_net_inet_tcp_cc);
+
+/* CC housekeeping functions. */
+void cc_init(void);
+int cc_register_algo(struct cc_algo *add_cc);
+int cc_deregister_algo(struct cc_algo *remove_cc);
+
+/*
+ * Wrapper around transport structs that contain same-named congestion
+ * control variables. Allows algos to be shared amongst multiple CC aware
+ * transports.
+ */
+struct cc_var {
+ void *cc_data; /* Per-connection private CC algorithm data. */
+ int bytes_this_ack; /* # bytes acked by the current ACK. */
+ tcp_seq curack; /* Most recent ACK. */
+ uint32_t flags; /* CCF_* flags for this cc_var (defined below). */
+ int type; /* Indicates which ptr is valid in ccvc. */
+ /* Pointer back to the transport's control block; see "type". */
+ union ccv_container {
+ struct tcpcb *tcp;
+ struct sctp_nets *sctp;
+ } ccvc;
+};
+
+/* cc_var flags. */
+#define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */
+#define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */
+
+/* ACK types passed to the ack_received() hook. */
+#define CC_ACK 0x0001 /* Regular in sequence ACK. */
+#define CC_DUPACK 0x0002 /* Duplicate ACK. */
+#define CC_PARTIALACK 0x0004 /* Not yet. */
+#define CC_SACK 0x0008 /* Not yet. */
+
+/*
+ * Congestion signal types passed to the cong_signal() hook. The highest order 8
+ * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own
+ * congestion signal types.
+ */
+#define CC_ECN 0x000001/* ECN marked packet received. */
+#define CC_RTO 0x000002/* RTO fired. */
+#define CC_RTO_ERR 0x000004/* RTO fired in error. */
+#define CC_NDUPACK 0x000008/* Threshold of dupack's reached. */
+
+/*
+ * Structure to hold data and function pointers that together represent a
+ * congestion control algorithm.
+ *
+ * An algorithm need not provide every hook: newreno leaves several unset, and
+ * the callers visible alongside this header check mod_init, mod_destroy,
+ * cb_destroy and ack_received against NULL before calling them.
+ */
+struct cc_algo {
+ /* Algorithm name; used to look the algorithm up and to detect duplicates. */
+ char name[TCP_CA_NAME_MAX];
+
+ /* Init global module state on kldload. */
+ int (*mod_init)(void);
+
+ /* Cleanup global module state on kldunload. */
+ int (*mod_destroy)(void);
+
+ /* Init CC state for a new control block. */
+ int (*cb_init)(struct cc_var *ccv);
+
+ /* Cleanup CC state for a terminating control block. */
+ void (*cb_destroy)(struct cc_var *ccv);
+
+ /* Init variables for a newly established connection. */
+ void (*conn_init)(struct cc_var *ccv);
+
+ /* Called on receipt of an ack; type is one of the CC_*ACK* ack types. */
+ void (*ack_received)(struct cc_var *ccv, uint16_t type);
+
+ /* Called on detection of a congestion signal; type is a signal type. */
+ void (*cong_signal)(struct cc_var *ccv, uint32_t type);
+
+ /* Called after exiting congestion recovery. */
+ void (*post_recovery)(struct cc_var *ccv);
+
+ /* Called when data transfer resumes after an idle period. */
+ void (*after_idle)(struct cc_var *ccv);
+
+ /* Linkage for the global cc_list of registered algorithms. */
+ STAILQ_ENTRY (cc_algo) entries;
+};
+
+/* Macro to obtain the CC algo's struct ptr. */
+#define CC_ALGO(tp) ((tp)->cc_algo)
+
+/* Macro to obtain the CC algo's data ptr. */
+#define CC_DATA(tp) ((tp)->ccv->cc_data)
+
+/* Macro to obtain the system default CC algo's struct ptr. */
+#define CC_DEFAULT() STAILQ_FIRST(&cc_list)
+
+extern struct rwlock cc_list_lock;
+#define CC_LIST_LOCK_INIT() rw_init(&cc_list_lock, "cc_list")
+#define CC_LIST_LOCK_DESTROY() rw_destroy(&cc_list_lock)
+#define CC_LIST_RLOCK() rw_rlock(&cc_list_lock)
+#define CC_LIST_RUNLOCK() rw_runlock(&cc_list_lock)
+#define CC_LIST_WLOCK() rw_wlock(&cc_list_lock)
+#define CC_LIST_WUNLOCK() rw_wunlock(&cc_list_lock)
+#define CC_LIST_WLOCK_ASSERT() rw_assert(&cc_list_lock, RA_WLOCKED)
+
+#endif /* _NETINET_CC_H_ */
diff --git a/sys/netinet/cc/cc.c b/sys/netinet/cc/cc.c
new file mode 100644
index 0000000..4643ca4
--- /dev/null
+++ b/sys/netinet/cc/cc.c
@@ -0,0 +1,340 @@
+/*-
+ * Copyright (c) 2007-2008
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart and James Healy,
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University's
+ * Centre for Advanced Internet Architectures, Melbourne, Australia, which was
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley. More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/rwlock.h>
+#include <sys/sbuf.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc_module.h>
+
+/*
+ * List of available cc algorithms on the current system. First element
+ * is used as the system default CC algorithm.
+ */
+struct cc_head cc_list = STAILQ_HEAD_INITIALIZER(cc_list);
+
+/* Protects the cc_list STAILQ. */
+struct rwlock cc_list_lock;
+
+/*
+ * Make new_default the system default CC algorithm. The default is simply
+ * whichever algorithm sits at the head of the cc_list STAILQ, so this moves
+ * new_default to the front. The cc_list write lock must be held.
+ */
+static void
+cc_set_default(struct cc_algo *new_default)
+{
+	CC_LIST_WLOCK_ASSERT();
+
+	/* Already at the head of the list: nothing to do. */
+	if (CC_DEFAULT() == new_default)
+		return;
+
+	/* Unlink the algorithm from its current position and re-add it first. */
+	STAILQ_REMOVE(&cc_list, new_default, cc_algo, entries);
+	STAILQ_INSERT_HEAD(&cc_list, new_default, entries);
+}
+
+/*
+ * Sysctl handler to show and change the default CC algorithm.
+ *
+ * A read reports the name of the current default (the head of cc_list). A
+ * write makes the registered algorithm whose name matches the supplied
+ * string the new default.
+ *
+ * Returns 0 on success, ESRCH if no registered algorithm has the requested
+ * name, or whatever error sysctl_handle_string() reports.
+ */
+static int
+cc_default_algo(SYSCTL_HANDLER_ARGS)
+{
+	char default_cc[TCP_CA_NAME_MAX];
+	struct cc_algo *funcs;
+	int err;
+
+	/* Snapshot the current default's name under the list lock. */
+	CC_LIST_RLOCK();
+	strlcpy(default_cc, CC_DEFAULT()->name, sizeof(default_cc));
+	CC_LIST_RUNLOCK();
+
+	/*
+	 * Let sysctl_handle_string() copy the current name out and, for a
+	 * write, copy the requested name into our local buffer. req->newptr
+	 * is the caller's buffer and is deliberately never dereferenced here:
+	 * it is not guaranteed to be NUL terminated and may not be a kernel
+	 * address at all.
+	 */
+	err = sysctl_handle_string(oidp, default_cc, sizeof(default_cc), req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+
+	/* Find algo with specified name and set it to default. */
+	err = ESRCH;
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (strncmp(default_cc, funcs->name, TCP_CA_NAME_MAX) == 0) {
+			cc_set_default(funcs);
+			err = 0;
+			/*
+			 * cc_set_default() may have reordered the list, so
+			 * stop iterating; names are unique anyway.
+			 */
+			break;
+		}
+	}
+	CC_LIST_WUNLOCK();
+
+	return (err);
+}
+
+/*
+ * Sysctl handler to display the list of available CC algorithms as a
+ * comma separated string.
+ *
+ * Returns 0 on success, ENOMEM if the string buffer cannot be created,
+ * EOVERFLOW if the list outgrew the buffer while it was being formatted, or
+ * whatever error sysctl_handle_string() reports.
+ */
+static int
+cc_list_available(SYSCTL_HANDLER_ARGS)
+{
+	struct cc_algo *algo;
+	struct sbuf *s;
+	int err, first, nalgos;
+
+	err = nalgos = 0;
+	first = 1;
+
+	/*
+	 * Size the buffer up front so that it never has to grow while the
+	 * list lock is held.
+	 * NOTE(review): auto-extending an sbuf allocates memory and is
+	 * believed to be able to sleep, which is not permitted while holding
+	 * an rwlock -- confirm against sbuf(9) and rwlock(9).
+	 */
+	CC_LIST_RLOCK();
+	STAILQ_FOREACH(algo, &cc_list, entries)
+		nalgos++;
+	CC_LIST_RUNLOCK();
+
+	/* Room for every name plus its ", " separator, and the final NUL. */
+	s = sbuf_new(NULL, NULL, nalgos * (TCP_CA_NAME_MAX + 2) + 1,
+	    SBUF_FIXEDLEN);
+	if (s == NULL)
+		return (ENOMEM);
+
+	/*
+	 * The list may have grown since it was counted. In that unlikely case
+	 * sbuf_printf() reports an overflow and the request fails cleanly.
+	 */
+	CC_LIST_RLOCK();
+	STAILQ_FOREACH(algo, &cc_list, entries) {
+		if (sbuf_printf(s, first ? "%s" : ", %s", algo->name) != 0) {
+			/* Hand back a real errno, not sbuf_printf()'s value. */
+			err = EOVERFLOW;
+			break;
+		}
+		first = 0;
+	}
+	CC_LIST_RUNLOCK();
+
+	if (err == 0) {
+		sbuf_finish(s);
+		err = sysctl_handle_string(oidp, sbuf_data(s), 1, req);
+	}
+
+	sbuf_delete(s);
+	return (err);
+}
+
+/*
+ * Initialise CC subsystem on system boot: set up the lock protecting cc_list
+ * and reset the list of registered algorithms to empty.
+ */
+void
+cc_init(void)
+{
+	CC_LIST_LOCK_INIT();
+	STAILQ_INIT(&cc_list);
+}
+
+/*
+ * Remove remove_cc from the list of available CC algorithms and switch every
+ * active TCP control block that was using it back to newreno.
+ *
+ * Returns 0 on success, EPERM when asked to remove newreno, or ENOENT when
+ * remove_cc is not currently registered.
+ */
+int
+cc_deregister_algo(struct cc_algo *remove_cc)
+{
+	struct cc_algo *funcs, *tmpfuncs;
+	struct tcpcb *tp;
+	struct inpcb *inp;
+	int err;
+
+	/* Never allow newreno to be deregistered. */
+	if (remove_cc == &newreno_cc_algo)
+		return (EPERM);
+
+	err = ENOENT;
+
+	/* Remove algo from cc_list so that new connections can't use it. */
+	CC_LIST_WLOCK();
+	STAILQ_FOREACH_SAFE(funcs, &cc_list, entries, tmpfuncs) {
+		if (funcs != remove_cc)
+			continue;
+
+		/*
+		 * If we're removing the current system default, reset the
+		 * default to newreno first.
+		 */
+		if (strncmp(CC_DEFAULT()->name, remove_cc->name,
+		    TCP_CA_NAME_MAX) == 0)
+			cc_set_default(&newreno_cc_algo);
+
+		STAILQ_REMOVE(&cc_list, funcs, cc_algo, entries);
+		err = 0;
+		break;
+	}
+	CC_LIST_WUNLOCK();
+
+	if (err != 0)
+		return (err);
+
+	/*
+	 * Check all active control blocks and change any that are using this
+	 * algorithm back to newreno. If the algorithm that was in use requires
+	 * cleanup code to be run, call it.
+	 *
+	 * New connections already part way through being initialised with the
+	 * CC algo we're removing will not race with this code because the
+	 * INP_INFO_WLOCK is held during initialisation. We therefore don't
+	 * enter the loop below until the connection list has stabilised.
+	 */
+	INP_INFO_RLOCK(&V_tcbinfo);
+	LIST_FOREACH(inp, &V_tcb, inp_list) {
+		/*
+		 * By holding INP_WLOCK here, we are assured that the
+		 * connection is not currently executing inside the CC
+		 * module's functions i.e. it is safe to make the switch back
+		 * to newreno.
+		 */
+		INP_WLOCK(inp);
+		/* Important to skip tcptw structs. */
+		if (!(inp->inp_flags & INP_TIMEWAIT) &&
+		    (tp = intotcpcb(inp)) != NULL &&
+		    CC_ALGO(tp) == remove_cc) {
+			/* Newreno does not require any init. */
+			CC_ALGO(tp) = &newreno_cc_algo;
+			if (remove_cc->cb_destroy != NULL)
+				remove_cc->cb_destroy(tp->ccv);
+		}
+		INP_WUNLOCK(inp);
+	}
+	INP_INFO_RUNLOCK(&V_tcbinfo);
+
+	return (0);
+}
+
+/*
+ * Append add_cc to the list of available CC algorithms.
+ *
+ * Returns 0 on success, or EEXIST if the same struct, or another algorithm
+ * with the same name, is already registered.
+ */
+int
+cc_register_algo(struct cc_algo *add_cc)
+{
+	struct cc_algo *funcs;
+	int err = 0;
+
+	CC_LIST_WLOCK();
+
+	/* Refuse a second registration of the same struct or the same name. */
+	STAILQ_FOREACH(funcs, &cc_list, entries) {
+		if (funcs == add_cc ||
+		    strncmp(funcs->name, add_cc->name, TCP_CA_NAME_MAX) == 0) {
+			err = EEXIST;
+			break;
+		}
+	}
+
+	if (err == 0)
+		STAILQ_INSERT_TAIL(&cc_list, add_cc, entries);
+
+	CC_LIST_WUNLOCK();
+
+	return (err);
+}
+
+/*
+ * Handles kld related events for a CC algorithm module; data points at the
+ * module's struct cc_algo. Returns 0 on success, non-zero on failure.
+ */
+int
+cc_modevent(module_t mod, int event_type, void *data)
+{
+	struct cc_algo *algo = (struct cc_algo *)data;
+	int err;
+
+	switch (event_type) {
+	case MOD_LOAD:
+		/* Run the optional module init, then publish the algorithm. */
+		err = (algo->mod_init != NULL) ? algo->mod_init() : 0;
+		if (err == 0)
+			err = cc_register_algo(algo);
+		break;
+
+	case MOD_QUIESCE:
+	case MOD_SHUTDOWN:
+	case MOD_UNLOAD:
+		err = cc_deregister_algo(algo);
+		if (err == 0) {
+			if (algo->mod_destroy != NULL)
+				algo->mod_destroy();
+		} else if (err == ENOENT) {
+			/* Not registered (any more) is not treated as an error. */
+			err = 0;
+		}
+		break;
+
+	default:
+		err = EINVAL;
+		break;
+	}
+
+	return (err);
+}
+
+/* Declare sysctl tree and populate it. */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, cc, CTLFLAG_RW, NULL,
+ "congestion control related settings");
+
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, algorithm, CTLTYPE_STRING|CTLFLAG_RW,
+ NULL, 0, cc_default_algo, "A", "default congestion control algorithm");
+
+SYSCTL_PROC(_net_inet_tcp_cc, OID_AUTO, available, CTLTYPE_STRING|CTLFLAG_RD,
+ NULL, 0, cc_list_available, "A",
+ "list available congestion control algorithms");
diff --git a/sys/netinet/cc/cc_module.h b/sys/netinet/cc/cc_module.h
new file mode 100644
index 0000000..f3fe752
--- /dev/null
+++ b/sys/netinet/cc/cc_module.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * All rights reserved.
+ *
+ * This software was developed by Lawrence Stewart while studying at the Centre
+ * for Advanced Internet Architectures, Swinburne University, made possible in
+ * part by a grant from the Cisco University Research Program Fund at Community
+ * Foundation Silicon Valley.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * This software was first released in 2009 by Lawrence Stewart as part of the
+ * NewTCP research project at Swinburne University's Centre for Advanced
+ * Internet Architectures, Melbourne, Australia, which was made possible in part
+ * by a grant from the Cisco University Research Program Fund at Community
+ * Foundation Silicon Valley. More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#ifndef _NETINET_CC_MODULE_H_
+#define _NETINET_CC_MODULE_H_
+
+/*
+ * Allows a CC algorithm to manipulate a commonly named CC variable regardless
+ * of the transport protocol and associated C struct.
+ * XXXLAS: Out of action until the work to support SCTP is done.
+ *
+#define CCV(ccv, what) \
+(*( \
+ (ccv)->type == IPPROTO_TCP ? &(ccv)->ccvc.tcp->what : \
+ &(ccv)->ccvc.sctp->what \
+))
+ */
+#define CCV(ccv, what) (ccv)->ccvc.tcp->what
+
+#define DECLARE_CC_MODULE(ccname, ccalgo) \
+ static moduledata_t cc_##ccname = { \
+ .name = #ccname, \
+ .evhand = cc_modevent, \
+ .priv = ccalgo \
+ }; \
+ DECLARE_MODULE(ccname, cc_##ccname, \
+ SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY)
+
+int cc_modevent(module_t mod, int type, void *data);
+
+#endif /* _NETINET_CC_MODULE_H_ */
diff --git a/sys/netinet/cc/cc_newreno.c b/sys/netinet/cc/cc_newreno.c
new file mode 100644
index 0000000..e383510
--- /dev/null
+++ b/sys/netinet/cc/cc_newreno.c
@@ -0,0 +1,231 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ * The Regents of the University of California.
+ * Copyright (c) 2007-2008,2010
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart, James Healy and
+ * David Hayes, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This software was first released in 2007 by James Healy and Lawrence Stewart
+ * whilst working on the NewTCP research project at Swinburne University's
+ * Centre for Advanced Internet Architectures, Melbourne, Australia, which was
+ * made possible in part by a grant from the Cisco University Research Program
+ * Fund at Community Foundation Silicon Valley. More details are available at:
+ * http://caia.swin.edu.au/urp/newtcp/
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_pcb.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_var.h>
+
+#include <netinet/cc/cc_module.h>
+
+void newreno_ack_received(struct cc_var *ccv, uint16_t type);
+void newreno_cong_signal(struct cc_var *ccv, uint32_t type);
+void newreno_post_recovery(struct cc_var *ccv);
+void newreno_after_idle(struct cc_var *ccv);
+
+struct cc_algo newreno_cc_algo = {
+ .name = "newreno",
+ .ack_received = newreno_ack_received,
+ .cong_signal = newreno_cong_signal,
+ .post_recovery = newreno_post_recovery,
+ .after_idle = newreno_after_idle
+};
+
+/*
+ * Open the congestion window on receipt of a regular in-sequence ACK:
+ *   cwnd <= ssthresh (slow start): grow by up to maxseg (or more with ABC)
+ *                                  per ACK
+ *   cwnd >  ssthresh (cong avoid): grow by ~maxseg per RTT
+ * Nothing happens for other ACK types, while in recovery, or when the
+ * connection is not currently cwnd limited.
+ */
+void
+newreno_ack_received(struct cc_var *ccv, uint16_t type)
+{
+	u_int cw, incr;
+
+	if (type != CC_ACK || IN_RECOVERY(CCV(ccv, t_flags)) ||
+	    !(ccv->flags & CCF_CWND_LIMITED))
+		return;
+
+	cw = CCV(ccv, snd_cwnd);
+	incr = CCV(ccv, t_maxseg);
+
+	if (cw <= CCV(ccv, snd_ssthresh)) {
+		/*
+		 * Slow start. Without ABC (RFC 5681) cwnd grows by maxseg per
+		 * ACK, which is what incr already holds.
+		 */
+		if (V_tcp_do_rfc3465) {
+			/*
+			 * Slow start with ABC (RFC 3465): grow by the amount
+			 * of data ACKed, capped at (abc_l_var * maxseg) bytes
+			 * per ACK. The larger cap must not be used when slow
+			 * starting after an RTO. On RTO, snd_nxt = snd_una,
+			 * so the snd_nxt == snd_max check is sufficient to
+			 * handle this.
+			 *
+			 * XXXLAS: Find a way to signal SS after RTO that
+			 * doesn't rely on tcpcb vars.
+			 */
+			if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max))
+				incr = min(ccv->bytes_this_ack,
+				    V_tcp_abc_l_var * CCV(ccv, t_maxseg));
+			else
+				incr = min(ccv->bytes_this_ack,
+				    CCV(ccv, t_maxseg));
+		}
+	} else if (V_tcp_do_rfc3465) {
+		/*
+		 * Congestion avoidance with ABC (RFC 3465): grow by maxseg
+		 * only once a full cwnd worth of data has been ACKed, which
+		 * is signalled via CCF_ABC_SENTAWND.
+		 */
+		if (ccv->flags & CCF_ABC_SENTAWND)
+			ccv->flags &= ~CCF_ABC_SENTAWND;
+		else
+			incr = 0;
+	} else {
+		/*
+		 * Congestion avoidance without ABC (RFC 5681): grow by
+		 * maxseg^2 / cwnd per ACK, i.e. approximately maxseg per RTT.
+		 * The increment is fixed at a minimum of 1 byte so that cwnd
+		 * can keep growing once cwnd > maxseg^2.
+		 */
+		incr = max((incr * incr / cw), 1);
+	}
+
+	/* ABC is on by default, so incr equals 0 frequently. */
+	if (incr > 0)
+		CCV(ccv, snd_cwnd) = min(cw + incr,
+		    TCP_MAXWIN << CCV(ccv, snd_scale));
+}
+
+/*
+ * React to a congestion signal. The candidate window is half of cwnd rounded
+ * down to whole segments, with a floor of two segments. Signal types other
+ * than CC_NDUPACK and CC_ECN are ignored.
+ */
+void
+newreno_cong_signal(struct cc_var *ccv, uint32_t type)
+{
+	u_int win;
+
+	win = max(CCV(ccv, snd_cwnd) / 2 / CCV(ccv, t_maxseg), 2) *
+	    CCV(ccv, t_maxseg);
+
+	if (type == CC_NDUPACK) {
+		/* Nothing to do if fast recovery is already in progress. */
+		if (IN_FASTRECOVERY(CCV(ccv, t_flags)))
+			return;
+		/* Only lower ssthresh if ECN has not already done so. */
+		if (!IN_CONGRECOVERY(CCV(ccv, t_flags)))
+			CCV(ccv, snd_ssthresh) = win;
+		ENTER_RECOVERY(CCV(ccv, t_flags));
+	} else if (type == CC_ECN) {
+		/* React at most once per congestion recovery episode. */
+		if (IN_CONGRECOVERY(CCV(ccv, t_flags)))
+			return;
+		CCV(ccv, snd_ssthresh) = win;
+		CCV(ccv, snd_cwnd) = win;
+		ENTER_CONGRECOVERY(CCV(ccv, t_flags));
+	}
+}
+
+/*
+ * Set cwnd on the way out of fast recovery. Does nothing unless the
+ * connection is currently in fast recovery.
+ */
+void
+newreno_post_recovery(struct cc_var *ccv)
+{
+	if (!IN_FASTRECOVERY(CCV(ccv, t_flags)))
+		return;
+
+	/*
+	 * Fast recovery will conclude after returning from this function.
+	 * Window inflation should have left us with approximately
+	 * snd_ssthresh outstanding data. But in case we would be inclined to
+	 * send a burst, better to do it via the slow start mechanism.
+	 *
+	 * XXXLAS: Find a way to do this without needing curack
+	 */
+	if (SEQ_GT(ccv->curack + CCV(ccv, snd_ssthresh), CCV(ccv, snd_max))) {
+		/* Less than ssthresh outstanding: outstanding data + 1 MSS. */
+		CCV(ccv, snd_cwnd) = CCV(ccv, snd_max) -
+		    ccv->curack + CCV(ccv, t_maxseg);
+	} else {
+		CCV(ccv, snd_cwnd) = CCV(ccv, snd_ssthresh);
+	}
+}
+
+/*
+ * Reset cwnd when a connection that has been idle for a while has more data
+ * ready to be sent.
+ */
+void
+newreno_after_idle(struct cc_var *ccv)
+{
+	/*
+	 * We have been idle for "a while" and no acks are expected to clock
+	 * out any data we send -- slow start to get ack "clock" running
+	 * again.
+	 */
+	if (!V_tcp_do_rfc3390) {
+		/* Restart window of two segments. */
+		CCV(ccv, snd_cwnd) = CCV(ccv, t_maxseg) * 2;
+		return;
+	}
+
+	/* RFC 3390 enabled: min(4 * MSS, max(2 * MSS, 4380 bytes)). */
+	CCV(ccv, snd_cwnd) = min(4 * CCV(ccv, t_maxseg),
+	    max(2 * CCV(ccv, t_maxseg), 4380));
+}
+
+
+DECLARE_CC_MODULE(newreno, &newreno_cc_algo);
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 22a2ea4..8fb9a52 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,6 +1,20 @@
/*-
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
* The Regents of the University of California. All rights reserved.
+ * Copyright (c) 2007-2008,2010
+ * Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University, by Lawrence Stewart, James Healy and
+ * David Hayes, made possible in part by a grant from the Cisco University
+ * Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -61,6 +75,7 @@ __FBSDID("$FreeBSD$");
#define TCPSTATES /* for logging */
+#include <netinet/cc.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
@@ -75,7 +90,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/nd6.h>
-#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
@@ -96,7 +110,7 @@ __FBSDID("$FreeBSD$");
#include <security/mac/mac_framework.h>
-static const int tcprexmtthresh = 3;
+const int tcprexmtthresh = 3;
VNET_DEFINE(struct tcpstat, tcpstat);
SYSCTL_VNET_STRUCT(_net_inet_tcp, TCPCTL_STATS, stats, CTLFLAG_RW,
@@ -132,19 +146,16 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3042, CTLFLAG_RW,
"Enable RFC 3042 (Limited Transmit)");
VNET_DEFINE(int, tcp_do_rfc3390) = 1;
-#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3390, CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3390), 0,
"Enable RFC 3390 (Increasing TCP's Initial Congestion Window)");
VNET_DEFINE(int, tcp_do_rfc3465) = 1;
-#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, rfc3465, CTLFLAG_RW,
&VNET_NAME(tcp_do_rfc3465), 0,
"Enable RFC 3465 (Appropriate Byte Counting)");
VNET_DEFINE(int, tcp_abc_l_var) = 2;
-#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, abc_l_var, CTLFLAG_RW,
&VNET_NAME(tcp_abc_l_var), 2,
"Cap the max cwnd increment during slow-start to this number of segments");
@@ -203,8 +214,10 @@ static void tcp_pulloutofband(struct socket *,
struct tcphdr *, struct mbuf *, int);
static void tcp_xmit_timer(struct tcpcb *, int);
static void tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline
- tcp_congestion_exp(struct tcpcb *);
+static void inline cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+ uint16_t type);
+static void inline cc_conn_init(struct tcpcb *tp);
+static void inline cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
/*
* Kernel module interface for updating tcpstat. The argument is an index
@@ -220,20 +233,188 @@ kmod_tcpstat_inc(int statnum)
(*((u_long *)&V_tcpstat + statnum))++;
}
+/*
+ * CC wrapper hook functions
+ */
static void inline
-tcp_congestion_exp(struct tcpcb *tp)
+cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
{
- u_int win;
-
- win = min(tp->snd_wnd, tp->snd_cwnd) /
- 2 / tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_ssthresh = win * tp->t_maxseg;
- ENTER_FASTRECOVERY(tp);
- tp->snd_recover = tp->snd_max;
- if (tp->t_flags & TF_ECN_PERMIT)
- tp->t_flags |= TF_ECN_SND_CWR;
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
+ if (tp->snd_cwnd == min(tp->snd_cwnd, tp->snd_wnd))
+ tp->ccv->flags |= CCF_CWND_LIMITED;
+ else
+ tp->ccv->flags &= ~CCF_CWND_LIMITED;
+
+ if (type == CC_ACK) {
+ if (tp->snd_cwnd > tp->snd_ssthresh) {
+ tp->t_bytes_acked += min(tp->ccv->bytes_this_ack,
+ V_tcp_abc_l_var * tp->t_maxseg);
+ if (tp->t_bytes_acked >= tp->snd_cwnd) {
+ tp->t_bytes_acked -= tp->snd_cwnd;
+ tp->ccv->flags |= CCF_ABC_SENTAWND;
+ }
+ } else {
+ tp->ccv->flags &= ~CCF_ABC_SENTAWND;
+ tp->t_bytes_acked = 0;
+ }
+ }
+
+ if (CC_ALGO(tp)->ack_received != NULL) {
+ /* XXXLAS: Find a way to live without this */
+ tp->ccv->curack = th->th_ack;
+ CC_ALGO(tp)->ack_received(tp->ccv, type);
+ }
+}
+
+static void inline
+cc_conn_init(struct tcpcb *tp)
+{
+ struct hc_metrics_lite metrics;
+ struct inpcb *inp = tp->t_inpcb;
+ int rtt;
+#ifdef INET6
+ int isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
+#endif
+
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ tcp_hc_get(&inp->inp_inc, &metrics);
+
+ if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
+ tp->t_srtt = rtt;
+ tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
+ TCPSTAT_INC(tcps_usedrtt);
+ if (metrics.rmx_rttvar) {
+ tp->t_rttvar = metrics.rmx_rttvar;
+ TCPSTAT_INC(tcps_usedrttvar);
+ } else {
+ /* default variation is +- 1 rtt */
+ tp->t_rttvar =
+ tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
+ }
+ TCPT_RANGESET(tp->t_rxtcur,
+ ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
+ tp->t_rttmin, TCPTV_REXMTMAX);
+ }
+ if (metrics.rmx_ssthresh) {
+ /*
+ * There's some sort of gateway or interface
+ * buffer limit on the path. Use this to set
+ * the slow start threshold, but set the
+ * threshold to no less than 2*mss.
+ */
+ tp->snd_ssthresh = max(2 * tp->t_maxseg, metrics.rmx_ssthresh);
+ TCPSTAT_INC(tcps_usedssthresh);
+ }
+
+ /*
+ * Set the slow-start flight size depending on whether this
+ * is a local network or not.
+ *
+ * Extend this so we cache the cwnd too and retrieve it here.
+ * Make cwnd even bigger than RFC3390 suggests but only if we
+ * have previous experience with the remote host. Be careful
+ * not to make cwnd bigger than remote receive window or our own
+ * send socket buffer. Maybe put some additional upper bound
+ * on the retrieved cwnd. Should do incremental updates to
+ * hostcache when cwnd collapses so next connection doesn't
+ * overload the path again.
+ *
+ * XXXAO: Initializing the CWND from the hostcache is broken
+ * and in its current form not RFC conformant. It is disabled
+ * until fixed or removed entirely.
+ *
+ * RFC3390 says only do this if SYN or SYN/ACK didn't get lost.
+ * We currently check only in syncache_socket for that.
+ */
+/* #define TCP_METRICS_CWND */
+#ifdef TCP_METRICS_CWND
+ if (metrics.rmx_cwnd)
+ tp->snd_cwnd = max(tp->t_maxseg, min(metrics.rmx_cwnd / 2,
+ min(tp->snd_wnd, so->so_snd.sb_hiwat)));
+ else
+#endif
+ if (V_tcp_do_rfc3390)
+ tp->snd_cwnd = min(4 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 4380));
+#ifdef INET6
+ else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
+ (!isipv6 && in_localaddr(inp->inp_faddr)))
+#else
+ else if (in_localaddr(inp->inp_faddr))
+#endif
+ tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz_local;
+ else
+ tp->snd_cwnd = tp->t_maxseg * V_ss_fltsz;
+
+ if (CC_ALGO(tp)->conn_init != NULL)
+ CC_ALGO(tp)->conn_init(tp->ccv);
+}
+
+void inline
+cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type)
+{
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ switch(type) {
+ case CC_NDUPACK:
+ if (!IN_FASTRECOVERY(tp->t_flags)) {
+ tp->snd_recover = tp->snd_max;
+ if (tp->t_flags & TF_ECN_PERMIT)
+ tp->t_flags |= TF_ECN_SND_CWR;
+ }
+ break;
+ case CC_ECN:
+ if (!IN_CONGRECOVERY(tp->t_flags)) {
+ TCPSTAT_INC(tcps_ecn_rcwnd);
+ tp->snd_recover = tp->snd_max;
+ if (tp->t_flags & TF_ECN_PERMIT)
+ tp->t_flags |= TF_ECN_SND_CWR;
+ }
+ break;
+ case CC_RTO:
+ tp->t_dupacks = 0;
+ tp->t_bytes_acked = 0;
+ EXIT_RECOVERY(tp->t_flags);
+ tp->snd_cwnd = tp->t_maxseg;
+ break;
+ case CC_RTO_ERR:
+ TCPSTAT_INC(tcps_sndrexmitbad);
+ /* RTO was unnecessary, so reset everything. */
+ tp->snd_cwnd = tp->snd_cwnd_prev;
+ tp->snd_ssthresh = tp->snd_ssthresh_prev;
+ tp->snd_recover = tp->snd_recover_prev;
+ if (tp->t_flags & TF_WASFRECOVERY)
+ ENTER_FASTRECOVERY(tp->t_flags);
+ if (tp->t_flags & TF_WASCRECOVERY)
+ ENTER_CONGRECOVERY(tp->t_flags);
+ tp->snd_nxt = tp->snd_max;
+ tp->t_badrxtwin = 0;
+ break;
+ }
+
+ if (CC_ALGO(tp)->cong_signal != NULL) {
+ if (th != NULL)
+ tp->ccv->curack = th->th_ack;
+ CC_ALGO(tp)->cong_signal(tp->ccv, type);
+ }
+}
+
+static void inline
+cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
+{
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ /* XXXLAS: KASSERT that we're in recovery? */
+
+ if (CC_ALGO(tp)->post_recovery != NULL) {
+ tp->ccv->curack = th->th_ack;
+ CC_ALGO(tp)->post_recovery(tp->ccv);
+ }
+ /* XXXLAS: EXIT_RECOVERY ? */
+ tp->t_bytes_acked = 0;
}
/* Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint. */
@@ -1157,14 +1338,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
TCPSTAT_INC(tcps_ecn_ect1);
break;
}
- /*
- * Congestion experienced.
- * Ignore if we are already trying to recover.
- */
- if ((thflags & TH_ECE) &&
- SEQ_LEQ(th->th_ack, tp->snd_recover)) {
- TCPSTAT_INC(tcps_ecn_rcwnd);
- tcp_congestion_exp(tp);
+ /* Congestion experienced. */
+ if (thflags & TH_ECE) {
+ cc_cong_signal(tp, th, CC_ECN);
}
}
@@ -1259,15 +1435,9 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
if (tlen == 0) {
if (SEQ_GT(th->th_ack, tp->snd_una) &&
SEQ_LEQ(th->th_ack, tp->snd_max) &&
- tp->snd_cwnd >= tp->snd_wnd &&
- ((!V_tcp_do_newreno &&
- !(tp->t_flags & TF_SACK_PERMIT) &&
- tp->t_dupacks < tcprexmtthresh) ||
- ((V_tcp_do_newreno ||
- (tp->t_flags & TF_SACK_PERMIT)) &&
- !IN_FASTRECOVERY(tp) &&
- (to.to_flags & TOF_SACK) == 0 &&
- TAILQ_EMPTY(&tp->snd_holes)))) {
+ !IN_RECOVERY(tp->t_flags) &&
+ (to.to_flags & TOF_SACK) == 0 &&
+ TAILQ_EMPTY(&tp->snd_holes)) {
/*
* This is a pure ack for outstanding data.
*/
@@ -1287,15 +1457,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
*/
if (tp->t_rxtshift == 1 &&
(int)(ticks - tp->t_badrxtwin) < 0) {
- TCPSTAT_INC(tcps_sndrexmitbad);
- tp->snd_cwnd = tp->snd_cwnd_prev;
- tp->snd_ssthresh =
- tp->snd_ssthresh_prev;
- tp->snd_recover = tp->snd_recover_prev;
- if (tp->t_flags & TF_WASFRECOVERY)
- ENTER_FASTRECOVERY(tp);
- tp->snd_nxt = tp->snd_max;
- tp->t_badrxtwin = 0;
+ cc_cong_signal(tp, th, CC_RTO_ERR);
}
/*
@@ -1321,13 +1483,22 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tcp_xmit_timer(tp,
ticks - tp->t_rtttime);
}
- acked = th->th_ack - tp->snd_una;
+ acked = BYTES_THIS_ACK(tp, th);
TCPSTAT_INC(tcps_rcvackpack);
TCPSTAT_ADD(tcps_rcvackbyte, acked);
sbdrop(&so->so_snd, acked);
if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
+
+ /*
+ * Let the congestion control algorithm update
+ * congestion control related information. This
+ * typically means increasing the congestion
+ * window.
+ */
+ cc_ack_received(tp, th, CC_ACK);
+
tp->snd_una = th->th_ack;
/*
* Pull snd_wl2 up to prevent seq wrap relative
@@ -1587,6 +1758,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
thflags &= ~TH_SYN;
} else {
tp->t_state = TCPS_ESTABLISHED;
+ cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
}
} else {
@@ -1990,6 +2162,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->t_flags &= ~TF_NEEDFIN;
} else {
tp->t_state = TCPS_ESTABLISHED;
+ cc_conn_init(tp);
tcp_timer_activate(tp, TT_KEEP, tcp_keepidle);
}
/*
@@ -2058,11 +2231,10 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
th->th_ack != tp->snd_una)
tp->t_dupacks = 0;
else if (++tp->t_dupacks > tcprexmtthresh ||
- ((V_tcp_do_newreno ||
- (tp->t_flags & TF_SACK_PERMIT)) &&
- IN_FASTRECOVERY(tp))) {
+ IN_FASTRECOVERY(tp->t_flags)) {
+ cc_ack_received(tp, th, CC_DUPACK);
if ((tp->t_flags & TF_SACK_PERMIT) &&
- IN_FASTRECOVERY(tp)) {
+ IN_FASTRECOVERY(tp->t_flags)) {
int awnd;
/*
@@ -2093,19 +2265,20 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* recovery.
*/
if (tp->t_flags & TF_SACK_PERMIT) {
- if (IN_FASTRECOVERY(tp)) {
+ if (IN_FASTRECOVERY(tp->t_flags)) {
tp->t_dupacks = 0;
break;
}
- } else if (V_tcp_do_newreno ||
- V_tcp_do_ecn) {
+ } else {
if (SEQ_LEQ(th->th_ack,
tp->snd_recover)) {
tp->t_dupacks = 0;
break;
}
}
- tcp_congestion_exp(tp);
+ /* Congestion signal before ack. */
+ cc_cong_signal(tp, th, CC_NDUPACK);
+ cc_ack_received(tp, th, CC_DUPACK);
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
if (tp->t_flags & TF_SACK_PERMIT) {
@@ -2129,6 +2302,7 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
tp->snd_nxt = onxt;
goto drop;
} else if (V_tcp_do_rfc3042) {
+ cc_ack_received(tp, th, CC_DUPACK);
u_long oldcwnd = tp->snd_cwnd;
tcp_seq oldsndmax = tp->snd_max;
u_int sent;
@@ -2170,37 +2344,14 @@ tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
* If the congestion window was inflated to account
* for the other side's cached packets, retract it.
*/
- if (V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) {
- if (IN_FASTRECOVERY(tp)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
- if (tp->t_flags & TF_SACK_PERMIT)
- tcp_sack_partialack(tp, th);
- else
- tcp_newreno_partial_ack(tp, th);
- } else {
- /*
- * Out of fast recovery.
- * Window inflation should have left us
- * with approximately snd_ssthresh
- * outstanding data.
- * But in case we would be inclined to
- * send a burst, better to do it via
- * the slow start mechanism.
- */
- if (SEQ_GT(th->th_ack +
- tp->snd_ssthresh,
- tp->snd_max))
- tp->snd_cwnd = tp->snd_max -
- th->th_ack +
- tp->t_maxseg;
- else
- tp->snd_cwnd = tp->snd_ssthresh;
- }
- }
- } else {
- if (tp->t_dupacks >= tcprexmtthresh &&
- tp->snd_cwnd > tp->snd_ssthresh)
- tp->snd_cwnd = tp->snd_ssthresh;
+ if (IN_FASTRECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (tp->t_flags & TF_SACK_PERMIT)
+ tcp_sack_partialack(tp, th);
+ else
+ tcp_newreno_partial_ack(tp, th);
+ } else
+ cc_post_recovery(tp, th);
}
tp->t_dupacks = 0;
/*
@@ -2231,7 +2382,7 @@ process_ACK:
("tcp_input: process_ACK ti_locked %d", ti_locked));
INP_WLOCK_ASSERT(tp->t_inpcb);
- acked = th->th_ack - tp->snd_una;
+ acked = BYTES_THIS_ACK(tp, th);
TCPSTAT_INC(tcps_rcvackpack);
TCPSTAT_ADD(tcps_rcvackbyte, acked);
@@ -2242,16 +2393,8 @@ process_ACK:
* original cwnd and ssthresh, and proceed to transmit where
* we left off.
*/
- if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) {
- TCPSTAT_INC(tcps_sndrexmitbad);
- tp->snd_cwnd = tp->snd_cwnd_prev;
- tp->snd_ssthresh = tp->snd_ssthresh_prev;
- tp->snd_recover = tp->snd_recover_prev;
- if (tp->t_flags & TF_WASFRECOVERY)
- ENTER_FASTRECOVERY(tp);
- tp->snd_nxt = tp->snd_max;
- tp->t_badrxtwin = 0; /* XXX probably not required */
- }
+ if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0)
+ cc_cong_signal(tp, th, CC_RTO_ERR);
/*
* If we have a timestamp reply, update smoothed
@@ -2298,61 +2441,12 @@ process_ACK:
goto step6;
/*
- * When new data is acked, open the congestion window.
- * Method depends on which congestion control state we're
- * in (slow start or cong avoid) and if ABC (RFC 3465) is
- * enabled.
- *
- * slow start: cwnd <= ssthresh
- * cong avoid: cwnd > ssthresh
- *
- * slow start and ABC (RFC 3465):
- * Grow cwnd exponentially by the amount of data
- * ACKed capping the max increment per ACK to
- * (abc_l_var * maxseg) bytes.
- *
- * slow start without ABC (RFC 2581):
- * Grow cwnd exponentially by maxseg per ACK.
- *
- * cong avoid and ABC (RFC 3465):
- * Grow cwnd linearly by maxseg per RTT for each
- * cwnd worth of ACKed data.
- *
- * cong avoid without ABC (RFC 2581):
- * Grow cwnd linearly by approximately maxseg per RTT using
- * maxseg^2 / cwnd per ACK as the increment.
- * If cwnd > maxseg^2, fix the cwnd increment at 1 byte to
- * avoid capping cwnd.
+ * Let the congestion control algorithm update congestion
+ * control related information. This typically means increasing
+ * the congestion window.
*/
- if ((!V_tcp_do_newreno && !(tp->t_flags & TF_SACK_PERMIT)) ||
- !IN_FASTRECOVERY(tp)) {
- u_int cw = tp->snd_cwnd;
- u_int incr = tp->t_maxseg;
- /* In congestion avoidance? */
- if (cw > tp->snd_ssthresh) {
- if (V_tcp_do_rfc3465) {
- tp->t_bytes_acked += acked;
- if (tp->t_bytes_acked >= tp->snd_cwnd)
- tp->t_bytes_acked -= cw;
- else
- incr = 0;
- }
- else
- incr = max((incr * incr / cw), 1);
- /*
- * In slow-start with ABC enabled and no RTO in sight?
- * (Must not use abc_l_var > 1 if slow starting after an
- * RTO. On RTO, snd_nxt = snd_una, so the snd_nxt ==
- * snd_max check is sufficient to handle this).
- */
- } else if (V_tcp_do_rfc3465 &&
- tp->snd_nxt == tp->snd_max)
- incr = min(acked,
- V_tcp_abc_l_var * tp->t_maxseg);
- /* ABC is on by default, so (incr == 0) frequently. */
- if (incr > 0)
- tp->snd_cwnd = min(cw+incr, TCP_MAXWIN<<tp->snd_scale);
- }
+ cc_ack_received(tp, th, CC_ACK);
+
SOCKBUF_LOCK(&so->so_snd);
if (acked > so->so_snd.sb_cc) {
tp->snd_wnd -= so->so_snd.sb_cc;
@@ -2366,16 +2460,14 @@ process_ACK:
/* NB: sowwakeup_locked() does an implicit unlock. */
sowwakeup_locked(so);
/* Detect una wraparound. */
- if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
- !IN_FASTRECOVERY(tp) &&
+ if (!IN_RECOVERY(tp->t_flags) &&
SEQ_GT(tp->snd_una, tp->snd_recover) &&
SEQ_LEQ(th->th_ack, tp->snd_recover))
tp->snd_recover = th->th_ack - 1;
- if ((V_tcp_do_newreno || (tp->t_flags & TF_SACK_PERMIT)) &&
- IN_FASTRECOVERY(tp) &&
+ /* XXXLAS: Can this be moved up into cc_post_recovery? */
+ if (IN_RECOVERY(tp->t_flags) &&
SEQ_GEQ(th->th_ack, tp->snd_recover)) {
- EXIT_FASTRECOVERY(tp);
- tp->t_bytes_acked = 0;
+ EXIT_RECOVERY(tp->t_flags);
}
tp->snd_una = th->th_ack;
if (tp->t_flags & TF_SACK_PERMIT) {
@@ -3240,24 +3332,19 @@ tcp_mss_update(struct tcpcb *tp, int offer,
void
tcp_mss(struct tcpcb *tp, int offer)
{
- int rtt, mss;
+ int mss;
u_long bufsize;
struct inpcb *inp;
struct socket *so;
struct hc_metrics_lite metrics;
int mtuflags = 0;
-#ifdef INET6
- int isipv6;
-#endif
+
KASSERT(tp != NULL, ("%s: tp == NULL", __func__));
tcp_mss_update(tp, offer, &metrics, &mtuflags);
mss = tp->t_maxseg;
inp = tp->t_inpcb;
-#ifdef INET6
- isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
-#endif
/*
* If there's a pipesize, change the socket buffer to that size,
@@ -3297,71 +3384,6 @@ tcp_mss(struct tcpcb *tp, int offer)
(void)sbreserve_locked(&so->so_rcv, bufsize, so, NULL);
}
SOCKBUF_UNLOCK(&so->so_rcv);
- /*
- * While we're here, check the others too.
- */
- if (tp->t_srtt == 0 && (rtt = metrics.rmx_rtt)) {
- tp->t_srtt = rtt;
- tp->t_rttbest = tp->t_srtt + TCP_RTT_SCALE;
- TCPSTAT_INC(tcps_usedrtt);
- if (metrics.rmx_rttvar) {
- tp->t_rttvar = metrics.rmx_rttvar;
- TCPSTAT_INC(tcps_usedrttvar);
- } else {
- /* default variation is +- 1 rtt */
- tp->t_rttvar =
- tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
- }
- TCPT_RANGESET(tp->t_rxtcur,
- ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
- tp->t_rttmin, TCPTV_REXMTMAX);
- }
- if (metrics.rmx_ssthresh) {
- /*
- * There's some sort of gateway or interface
- * buffer limit on the path. Use this to set
- * the slow start threshhold, but set the
- * threshold to no less than 2*mss.
- */
- tp->snd_ssthresh = max(2 * mss, metrics.rmx_ssthresh);
- TCPSTAT_INC(tcps_usedssthresh);
- }
-
- /*
- * Set the slow-start flight size depending on whether this
- * is a local network or not.
- *
- * Extend this so we cache the cwnd too and retrieve it here.
- * Make cwnd even bigger than RFC3390 suggests but only if we
- * have previous experience with the remote host. Be careful
- * not make cwnd bigger than remote receive window or our own
- * send socket buffer. Maybe put some additional upper bound
- * on the retrieved cwnd. Should do incremental updates to
- * hostcache when cwnd collapses so next connection doesn't
- * overloads the path again.
- *
- * RFC3390 says only do this if SYN or SYN/ACK didn't got lost.
- * We currently check only in syncache_socket for that.
- */
-#define TCP_METRICS_CWND
-#ifdef TCP_METRICS_CWND
- if (metrics.rmx_cwnd)
- tp->snd_cwnd = max(mss,
- min(metrics.rmx_cwnd / 2,
- min(tp->snd_wnd, so->so_snd.sb_hiwat)));
- else
-#endif
- if (V_tcp_do_rfc3390)
- tp->snd_cwnd = min(4 * mss, max(2 * mss, 4380));
-#ifdef INET6
- else if ((isipv6 && in6_localaddr(&inp->in6p_faddr)) ||
- (!isipv6 && in_localaddr(inp->inp_faddr)))
-#else
- else if (in_localaddr(inp->inp_faddr))
-#endif
- tp->snd_cwnd = mss * V_ss_fltsz_local;
- else
- tp->snd_cwnd = mss * V_ss_fltsz;
/* Check the interface for TSO capabilities. */
if (mtuflags & CSUM_TSO)
@@ -3425,7 +3447,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
* Set snd_cwnd to one segment beyond acknowledged offset.
* (tp->snd_una has not yet been updated when this function is called.)
*/
- tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
+ tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
tp->t_flags |= TF_ACKNOW;
(void) tcp_output(tp);
tp->snd_cwnd = ocwnd;
@@ -3435,8 +3457,8 @@ tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
* Partial window deflation. Relies on fact that tp->snd_una
* not updated yet.
*/
- if (tp->snd_cwnd > th->th_ack - tp->snd_una)
- tp->snd_cwnd -= th->th_ack - tp->snd_una;
+ if (tp->snd_cwnd > BYTES_THIS_ACK(tp, th))
+ tp->snd_cwnd -= BYTES_THIS_ACK(tp, th);
else
tp->snd_cwnd = 0;
tp->snd_cwnd += tp->t_maxseg;
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index b5bc3d9..7db0adb 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -53,6 +53,7 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
+#include <netinet/cc.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
@@ -64,7 +65,6 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#endif
-#include <netinet/tcp.h>
#define TCPOUTFLAGS
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
@@ -102,11 +102,6 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, local_slowstart_flightsize,
CTLFLAG_RW, &VNET_NAME(ss_fltsz_local), 1,
"Slow start flight size for local networks");
-VNET_DEFINE(int, tcp_do_newreno) = 1;
-SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, newreno, CTLFLAG_RW,
- &VNET_NAME(tcp_do_newreno), 0,
- "Enable NewReno Algorithms");
-
VNET_DEFINE(int, tcp_do_tso) = 1;
#define V_tcp_do_tso VNET(tcp_do_tso)
SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, tso, CTLFLAG_RW,
@@ -131,6 +126,19 @@ SYSCTL_VNET_INT(_net_inet_tcp, OID_AUTO, sendbuf_max, CTLFLAG_RW,
&VNET_NAME(tcp_autosndbuf_max), 0,
"Max size of automatic send buffer");
+static void inline cc_after_idle(struct tcpcb *tp);
+
+/*
+ * CC wrapper hook functions
+ */
+static void inline
+cc_after_idle(struct tcpcb *tp)
+{
+ INP_WLOCK_ASSERT(tp->t_inpcb);
+
+ if (CC_ALGO(tp)->after_idle != NULL)
+ CC_ALGO(tp)->after_idle(tp->ccv);
+}
/*
* Tcp output routine: figure out what should be sent and send it.
@@ -241,7 +249,7 @@ again:
sack_bytes_rxmt = 0;
len = 0;
p = NULL;
- if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp) &&
+ if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) &&
(p = tcp_sack_output(tp, &sack_bytes_rxmt))) {
long cwin;
@@ -1315,7 +1323,7 @@ out:
* on the transmitter effectively destroys the TCP window, forcing
* it to four packets (1.5Kx4 = 6K window).
*/
- if (sendalot && (!V_tcp_do_newreno || --maxburst))
+ if (sendalot && --maxburst)
goto again;
#endif
if (sendalot)
diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c
index 737c2b2..47d44ec 100644
--- a/sys/netinet/tcp_sack.c
+++ b/sys/netinet/tcp_sack.c
@@ -576,7 +576,7 @@ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
tcp_timer_activate(tp, TT_REXMT, 0);
tp->t_rtttime = 0;
/* Send one or 2 segments based on how much new data was acked. */
- if (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2)
+ if ((BYTES_THIS_ACK(tp, th) / tp->t_maxseg) > 2)
num_segs = 2;
tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
(tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_maxseg);
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index dc4395d..8596e23 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$");
#include <net/if.h>
#include <net/vnet.h>
+#include <netinet/cc.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
@@ -80,7 +81,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/nd6.h>
#endif
#include <netinet/ip_icmp.h>
-#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
@@ -238,6 +238,7 @@ static char * tcp_log_addr(struct in_conninfo *inc, struct tcphdr *th,
struct tcpcb_mem {
struct tcpcb tcb;
struct tcp_timer tt;
+ struct cc_var ccv;
};
static VNET_DEFINE(uma_zone_t, tcpcb_zone);
@@ -277,6 +278,8 @@ tcp_init(void)
{
int hashsize;
+ cc_init();
+
hashsize = TCBHASHSIZE;
TUNABLE_INT_FETCH("net.inet.tcp.tcbhashsize", &hashsize);
if (!powerof2(hashsize)) {
@@ -640,6 +643,26 @@ tcp_newtcpcb(struct inpcb *inp)
if (tm == NULL)
return (NULL);
tp = &tm->tcb;
+
+ /* Initialise cc_var struct for this tcpcb. */
+ tp->ccv = &tm->ccv;
+ tp->ccv->type = IPPROTO_TCP;
+ tp->ccv->ccvc.tcp = tp;
+
+ /*
+ * Use the current system default CC algorithm.
+ */
+ CC_LIST_RLOCK();
+ KASSERT(!STAILQ_EMPTY(&cc_list), ("cc_list is empty!"));
+ CC_ALGO(tp) = CC_DEFAULT();
+ CC_LIST_RUNLOCK();
+
+ if (CC_ALGO(tp)->cb_init != NULL)
+ if (CC_ALGO(tp)->cb_init(tp->ccv) > 0) {
+ uma_zfree(V_tcpcb_zone, tm);
+ return (NULL);
+ }
+
#ifdef VIMAGE
tp->t_vnet = inp->inp_vnet;
#endif
@@ -805,6 +828,12 @@ tcp_discardcb(struct tcpcb *tp)
tcp_offload_detach(tp);
tcp_free_sackholes(tp);
+
+ /* Allow the CC algorithm to clean up after itself. */
+ if (CC_ALGO(tp)->cb_destroy != NULL)
+ CC_ALGO(tp)->cb_destroy(tp->ccv);
+
+ CC_ALGO(tp) = NULL;
inp->inp_ppcb = NULL;
tp->t_inpcb = NULL;
uma_zfree(V_tcpcb_zone, tp);
@@ -1572,7 +1601,7 @@ tcp_mtudisc(struct inpcb *inp, int errno)
tcp_free_sackholes(tp);
tp->snd_recover = tp->snd_max;
if (tp->t_flags & TF_SACK_PERMIT)
- EXIT_FASTRECOVERY(tp);
+ EXIT_FASTRECOVERY(tp->t_flags);
tcp_output_send(tp);
return (inp);
}
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 65c6a3e..2748e64 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
+#include <netinet/cc.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_systm.h>
@@ -58,7 +59,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/in6_pcb.h>
#endif
#include <netinet/ip_var.h>
-#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
@@ -515,10 +515,14 @@ tcp_timer_rexmt(void * xtp)
tp->snd_cwnd_prev = tp->snd_cwnd;
tp->snd_ssthresh_prev = tp->snd_ssthresh;
tp->snd_recover_prev = tp->snd_recover;
- if (IN_FASTRECOVERY(tp))
- tp->t_flags |= TF_WASFRECOVERY;
+ if (IN_FASTRECOVERY(tp->t_flags))
+ tp->t_flags |= TF_WASFRECOVERY;
else
- tp->t_flags &= ~TF_WASFRECOVERY;
+ tp->t_flags &= ~TF_WASFRECOVERY;
+ if (IN_CONGRECOVERY(tp->t_flags))
+ tp->t_flags |= TF_WASCRECOVERY;
+ else
+ tp->t_flags &= ~TF_WASCRECOVERY;
tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
}
TCPSTAT_INC(tcps_rexmttimeo);
@@ -562,40 +566,9 @@ tcp_timer_rexmt(void * xtp)
* If timing a segment in this window, stop the timer.
*/
tp->t_rtttime = 0;
- /*
- * Close the congestion window down to one segment
- * (we'll open it by one segment for each ack we get).
- * Since we probably have a window's worth of unacked
- * data accumulated, this "slow start" keeps us from
- * dumping all that data as back-to-back packets (which
- * might overwhelm an intermediate gateway).
- *
- * There are two phases to the opening: Initially we
- * open by one mss on each ack. This makes the window
- * size increase exponentially with time. If the
- * window is larger than the path can handle, this
- * exponential growth results in dropped packet(s)
- * almost immediately. To get more time between
- * drops but still "push" the network to take advantage
- * of improving conditions, we switch from exponential
- * to linear window opening at some threshhold size.
- * For a threshhold, we use half the current window
- * size, truncated to a multiple of the mss.
- *
- * (the minimum cwnd that will give us exponential
- * growth is 2 mss. We don't allow the threshhold
- * to go below this.)
- */
- {
- u_int win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg;
- if (win < 2)
- win = 2;
- tp->snd_cwnd = tp->t_maxseg;
- tp->snd_ssthresh = win * tp->t_maxseg;
- tp->t_dupacks = 0;
- }
- EXIT_FASTRECOVERY(tp);
- tp->t_bytes_acked = 0;
+
+ cc_cong_signal(tp, 0, CC_RTO);
+
(void) tcp_output(tp);
out:
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index f35890b..a28ddef 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -62,6 +62,7 @@ __FBSDID("$FreeBSD$");
#include <net/route.h>
#include <net/vnet.h>
+#include <netinet/cc.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#ifdef INET6
@@ -77,7 +78,6 @@ __FBSDID("$FreeBSD$");
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
#endif
-#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
@@ -1242,6 +1242,8 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
struct inpcb *inp;
struct tcpcb *tp;
struct tcp_info ti;
+ char buf[TCP_CA_NAME_MAX];
+ struct cc_algo *algo;
error = 0;
inp = sotoinpcb(so);
@@ -1351,6 +1353,54 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
error = EINVAL;
break;
+ case TCP_CONGESTION:
+ INP_WUNLOCK(inp);
+ bzero(buf, sizeof(buf));
+ error = sooptcopyin(sopt, &buf, sizeof(buf), 1);
+ if (error)
+ break;
+ INP_WLOCK_RECHECK(inp);
+ /*
+ * Return EINVAL if we can't find the requested cc algo.
+ */
+ error = EINVAL;
+ CC_LIST_RLOCK();
+ STAILQ_FOREACH(algo, &cc_list, entries) {
+ if (strncmp(buf, algo->name, TCP_CA_NAME_MAX)
+ == 0) {
+ /* We've found the requested algo. */
+ error = 0;
+ /*
+ * We hold a write lock over the tcb
+ * so it's safe to do these things
+ * without ordering concerns.
+ */
+ if (CC_ALGO(tp)->cb_destroy != NULL)
+ CC_ALGO(tp)->cb_destroy(tp->ccv);
+ CC_ALGO(tp) = algo;
+ /*
+ * If something goes pear shaped
+ * initialising the new algo,
+ * fall back to newreno (which
+ * does not require initialisation).
+ */
+ if (algo->cb_init != NULL)
+ if (algo->cb_init(tp->ccv) > 0) {
+ CC_ALGO(tp) = &newreno_cc_algo;
+ /*
+ * The only reason init
+ * should fail is
+ * because of malloc.
+ */
+ error = ENOMEM;
+ }
+ break; /* Break the STAILQ_FOREACH. */
+ }
+ }
+ CC_LIST_RUNLOCK();
+ INP_WUNLOCK(inp);
+ break;
+
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -1394,6 +1444,12 @@ tcp_ctloutput(struct socket *so, struct sockopt *sopt)
INP_WUNLOCK(inp);
error = sooptcopyout(sopt, &ti, sizeof ti);
break;
+ case TCP_CONGESTION:
+ bzero(buf, sizeof(buf));
+ strlcpy(buf, CC_ALGO(tp)->name, TCP_CA_NAME_MAX);
+ INP_WUNLOCK(inp);
+ error = sooptcopyout(sopt, buf, TCP_CA_NAME_MAX);
+ break;
default:
INP_WUNLOCK(inp);
error = ENOPROTOOPT;
@@ -1707,6 +1763,10 @@ db_print_tflags(u_int t_flags)
db_printf("%sTF_FASTRECOVERY", comma ? ", " : "");
comma = 1;
}
+ if (t_flags & TF_CONGRECOVERY) {
+ db_printf("%sTF_CONGRECOVERY", comma ? ", " : "");
+ comma = 1;
+ }
if (t_flags & TF_WASFRECOVERY) {
db_printf("%sTF_WASFRECOVERY", comma ? ", " : "");
comma = 1;
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 0b28681..442c736 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -195,9 +195,11 @@ struct tcpcb {
struct toe_usrreqs *t_tu; /* offload operations vector */
void *t_toe; /* TOE pcb pointer */
int t_bytes_acked; /* # bytes acked during current RTT */
+ struct cc_algo *cc_algo; /* congestion control algorithm */
+ struct cc_var *ccv;
int t_ispare; /* explicit pad for 64bit alignment */
- void *t_pspare2[6]; /* 2 CC / 4 TBD */
+ void *t_pspare2[4]; /* 4 TBD */
uint64_t _pad[12]; /* 7 UTO, 5 TBD (1-2 CC/RTT?) */
};
@@ -230,10 +232,22 @@ struct tcpcb {
#define TF_ECN_PERMIT 0x4000000 /* connection ECN-ready */
#define TF_ECN_SND_CWR 0x8000000 /* ECN CWR in queue */
#define TF_ECN_SND_ECE 0x10000000 /* ECN ECE in queue */
+#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
+#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
-#define IN_FASTRECOVERY(tp) (tp->t_flags & TF_FASTRECOVERY)
-#define ENTER_FASTRECOVERY(tp) tp->t_flags |= TF_FASTRECOVERY
-#define EXIT_FASTRECOVERY(tp) tp->t_flags &= ~TF_FASTRECOVERY
+#define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY)
+#define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY
+#define EXIT_FASTRECOVERY(t_flags) t_flags &= ~TF_FASTRECOVERY
+
+#define IN_CONGRECOVERY(t_flags) (t_flags & TF_CONGRECOVERY)
+#define ENTER_CONGRECOVERY(t_flags) t_flags |= TF_CONGRECOVERY
+#define EXIT_CONGRECOVERY(t_flags) t_flags &= ~TF_CONGRECOVERY
+
+#define IN_RECOVERY(t_flags) (t_flags & (TF_CONGRECOVERY | TF_FASTRECOVERY))
+#define ENTER_RECOVERY(t_flags) t_flags |= (TF_CONGRECOVERY | TF_FASTRECOVERY)
+#define EXIT_RECOVERY(t_flags) t_flags &= ~(TF_CONGRECOVERY | TF_FASTRECOVERY)
+
+#define BYTES_THIS_ACK(tp, th) (th->th_ack - tp->snd_una)
/*
* Flags for the t_oobflags field.
@@ -562,10 +576,11 @@ VNET_DECLARE(int, tcp_mssdflt); /* XXX */
VNET_DECLARE(int, tcp_minmss);
VNET_DECLARE(int, tcp_delack_enabled);
VNET_DECLARE(int, tcp_do_rfc3390);
-VNET_DECLARE(int, tcp_do_newreno);
VNET_DECLARE(int, path_mtu_discovery);
VNET_DECLARE(int, ss_fltsz);
VNET_DECLARE(int, ss_fltsz_local);
+VNET_DECLARE(int, tcp_do_rfc3465);
+VNET_DECLARE(int, tcp_abc_l_var);
#define V_tcb VNET(tcb)
#define V_tcbinfo VNET(tcbinfo)
#define V_tcpstat VNET(tcpstat)
@@ -573,10 +588,11 @@ VNET_DECLARE(int, ss_fltsz_local);
#define V_tcp_minmss VNET(tcp_minmss)
#define V_tcp_delack_enabled VNET(tcp_delack_enabled)
#define V_tcp_do_rfc3390 VNET(tcp_do_rfc3390)
-#define V_tcp_do_newreno VNET(tcp_do_newreno)
#define V_path_mtu_discovery VNET(path_mtu_discovery)
#define V_ss_fltsz VNET(ss_fltsz)
#define V_ss_fltsz_local VNET(ss_fltsz_local)
+#define V_tcp_do_rfc3465 VNET(tcp_do_rfc3465)
+#define V_tcp_abc_l_var VNET(tcp_abc_l_var)
VNET_DECLARE(int, tcp_do_sack); /* SACK enabled/disabled */
VNET_DECLARE(int, tcp_sc_rst_sock_fail); /* RST on sock alloc failure */
@@ -678,6 +694,8 @@ void tcp_free_sackholes(struct tcpcb *tp);
int tcp_newreno(struct tcpcb *, struct tcphdr *);
u_long tcp_seq_subtract(u_long, u_long );
+void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+
#endif /* _KERNEL */
#endif /* _NETINET_TCP_VAR_H_ */
diff --git a/sys/sys/param.h b/sys/sys/param.h
index a64c77b..acd1f51 100644
--- a/sys/sys/param.h
+++ b/sys/sys/param.h
@@ -58,7 +58,7 @@
* in the range 5 to 9.
*/
#undef __FreeBSD_version
-#define __FreeBSD_version 900024 /* Master, propagated to newvers */
+#define __FreeBSD_version 900025 /* Master, propagated to newvers */
#ifndef LOCORE
#include <sys/types.h>
OpenPOWER on IntegriCloud