-rw-r--r-- sys/conf/NOTES | 2
-rw-r--r-- sys/dev/cxgb/common/cxgb_t3_cpl.h | 12
-rw-r--r-- sys/dev/cxgb/cxgb_adapter.h | 85
-rw-r--r-- sys/dev/cxgb/cxgb_config.h | 1
-rw-r--r-- sys/dev/cxgb/cxgb_l2t.c | 26
-rw-r--r-- sys/dev/cxgb/cxgb_l2t.h | 2
-rw-r--r-- sys/dev/cxgb/cxgb_main.c | 301
-rw-r--r-- sys/dev/cxgb/cxgb_offload.c | 72
-rw-r--r-- sys/dev/cxgb/cxgb_offload.h | 2
-rw-r--r-- sys/dev/cxgb/cxgb_osdep.h | 134
-rw-r--r-- sys/dev/cxgb/cxgb_sge.c | 1035
-rw-r--r-- sys/dev/cxgb/sys/cxgb_support.c | 6
-rw-r--r-- sys/dev/cxgb/sys/mvec.h | 3
-rw-r--r-- sys/dev/cxgb/t3cdev.h | 2
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c | 15
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c | 15
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_listen.c | 2
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tcp.h | 44
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c | 694
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c | 1362
-rw-r--r-- sys/dev/cxgb/ulp/tom/cxgb_tom.c | 78
-rw-r--r-- sys/modules/cxgb/Makefile | 2
-rw-r--r-- sys/modules/cxgb/cxgb/Makefile | 6
-rw-r--r-- sys/modules/cxgb/tom/Makefile | 8
24 files changed, 3309 insertions, 600 deletions
diff --git a/sys/conf/NOTES b/sys/conf/NOTES
index 4c88ca3..56c2885 100644
--- a/sys/conf/NOTES
+++ b/sys/conf/NOTES
@@ -1878,7 +1878,7 @@ device xe
device bce # Broadcom BCM5706/BCM5708 Gigabit Ethernet
device bfe # Broadcom BCM440x 10/100 Ethernet
device bge # Broadcom BCM570xx Gigabit Ethernet
-#device cxgb # Chelsio T3 10 Gigabit Ethernet
+device cxgb # Chelsio T3 10 Gigabit Ethernet
device dc # DEC/Intel 21143 and various workalikes
device fxp # Intel EtherExpress PRO/100B (82557, 82558)
hint.fxp.0.prefer_iomap="0"
diff --git a/sys/dev/cxgb/common/cxgb_t3_cpl.h b/sys/dev/cxgb/common/cxgb_t3_cpl.h
index e1b4030..672823c 100644
--- a/sys/dev/cxgb/common/cxgb_t3_cpl.h
+++ b/sys/dev/cxgb/common/cxgb_t3_cpl.h
@@ -1131,6 +1131,18 @@ struct cpl_tx_pkt_lso {
__be32 lso_info;
};
+struct cpl_tx_pkt_batch_entry {
+ __be32 cntrl;
+ __be32 len;
+ __be64 addr;
+};
+
+struct cpl_tx_pkt_batch {
+ WR_HDR;
+ struct cpl_tx_pkt_batch_entry pkt_entry[7];
+};
+
+
/* cpl_tx_pkt*.cntrl fields */
#define S_TXPKT_VLAN 0
#define M_TXPKT_VLAN 0xFFFF
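
The cpl_tx_pkt_batch WR introduced above lets the driver hand the SGE up to seven tunneled packets in one work request, one bus address per entry instead of a full SGL per packet. A minimal fill-loop sketch, assuming txd, cntrl, count (<= 7), per-packet lengths len[], and a prior busdma mapping into segs[] (hypothetical names standing in for the consuming code):

    struct cpl_tx_pkt_batch *cpl = (struct cpl_tx_pkt_batch *)txd;
    int i;

    for (i = 0; i < count; i++) {
        struct cpl_tx_pkt_batch_entry *cbe = &cpl->pkt_entry[i];

        /* per-packet control word: opcode plus interface/VLAN bits */
        cbe->cntrl = htonl(V_TXPKT_OPCODE(CPL_TX_PKT) | cntrl);
        /* length with the top bit set, as in the cpl_tx_pkt path */
        cbe->len   = htonl(len[i] | 0x80000000);
        /* each packet is described by exactly one DMA segment */
        cbe->addr  = htobe64(segs[i].ds_addr);
    }

The t3_encap() changes in cxgb_sge.c below populate the WR along these lines.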
diff --git a/sys/dev/cxgb/cxgb_adapter.h b/sys/dev/cxgb/cxgb_adapter.h
index 23db259..542668e 100644
--- a/sys/dev/cxgb/cxgb_adapter.h
+++ b/sys/dev/cxgb/cxgb_adapter.h
@@ -31,7 +31,6 @@ $FreeBSD$
***************************************************************************/
-
#ifndef _CXGB_ADAPTER_H_
#define _CXGB_ADAPTER_H_
@@ -42,6 +41,7 @@ $FreeBSD$
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
+#include <sys/condvar.h>
#include <net/ethernet.h>
#include <net/if.h>
@@ -49,6 +49,7 @@ $FreeBSD$
#include <machine/bus.h>
#include <machine/resource.h>
+
#include <sys/bus_dma.h>
#include <dev/pci/pcireg.h>
#include <dev/pci/pcivar.h>
@@ -56,8 +57,8 @@ $FreeBSD$
#ifdef CONFIG_DEFINED
#include <cxgb_osdep.h>
#include <t3cdev.h>
-#include <sys/mbufq.h>
#include <ulp/toecore/cxgb_toedev.h>
+#include <sys/mbufq.h>
#else
#include <dev/cxgb/cxgb_osdep.h>
#include <dev/cxgb/t3cdev.h>
@@ -128,10 +129,12 @@ struct port_info {
struct task timer_reclaim_task;
struct cdev *port_cdev;
-#define PORT_NAME_LEN 32
+#define PORT_LOCK_NAME_LEN 32
#define TASKQ_NAME_LEN 32
- char lockbuf[PORT_NAME_LEN];
+#define PORT_NAME_LEN 32
+ char lockbuf[PORT_LOCK_NAME_LEN];
char taskqbuf[TASKQ_NAME_LEN];
+ char namebuf[PORT_NAME_LEN];
};
enum { /* adapter flags */
@@ -143,19 +146,14 @@ enum { /* adapter flags */
TPS_UPTODATE = (1 << 5),
};
-
#define FL_Q_SIZE 4096
-#define JUMBO_Q_SIZE 512
+#define JUMBO_Q_SIZE 1024
#define RSPQ_Q_SIZE 1024
#define TX_ETH_Q_SIZE 1024
-
-
-/*
- * Types of Tx queues in each queue set. Order here matters, do not change.
- * XXX TOE is not implemented yet, so the extra queues are just placeholders.
- */
-enum { TXQ_ETH, TXQ_OFLD, TXQ_CTRL };
+enum { TXQ_ETH = 0,
+ TXQ_OFLD = 1,
+ TXQ_CTRL = 2, };
/* careful, the following are set on priv_flags and must not collide with
@@ -275,7 +273,22 @@ struct sge_txq {
bus_dmamap_t desc_map;
bus_dma_tag_t entry_tag;
struct mbuf_head sendq;
+ /*
+ * cleanq should really be an buf_ring to avoid extra
+ * mbuf touches
+ */
+ struct mbuf_head cleanq;
+ struct buf_ring txq_mr;
+ struct mbuf *immpkt;
+ uint32_t txq_drops;
+ uint32_t txq_skipped;
+ uint32_t txq_coalesced;
+ uint32_t txq_enqueued;
+ unsigned long txq_frees;
struct mtx lock;
+ struct sg_ent txq_sgl[TX_MAX_SEGS / 2 + 1];
+ bus_dma_segment_t txq_segs[TX_MAX_SEGS];
+ struct mbuf *txq_m_vec[TX_WR_COUNT_MAX];
#define TXQ_NAME_LEN 32
char lockbuf[TXQ_NAME_LEN];
};
@@ -294,6 +307,10 @@ enum {
#define SGE_PSTAT_MAX (SGE_PSTATS_LRO_X_STREAMS+1)
+#define QS_EXITING 0x1
+#define QS_RUNNING 0x2
+#define QS_BOUND 0x4
+
struct sge_qset {
struct sge_rspq rspq;
struct sge_fl fl[SGE_RXQ_PER_SET];
@@ -303,6 +320,12 @@ struct sge_qset {
uint64_t port_stats[SGE_PSTAT_MAX];
struct port_info *port;
int idx; /* qset # */
+ int qs_cpuid;
+ int qs_flags;
+ struct cv qs_cv;
+ struct mtx qs_mtx;
+#define QS_NAME_LEN 32
+ char namebuf[QS_NAME_LEN];
};
struct sge {
@@ -344,7 +367,15 @@ struct adapter {
void *msix_intr_tag[SGE_QSETS];
uint8_t rxpkt_map[8]; /* maps RX_PKT interface values to port ids */
uint8_t rrss_map[SGE_QSETS]; /* revers RSS map table */
-
+ uint16_t rspq_map[RSS_TABLE_SIZE]; /* maps 7-bit cookie to qidx */
+ union {
+ uint8_t fill[SGE_QSETS];
+ uint64_t coalesce;
+ } u;
+
+#define tunq_fill u.fill
+#define tunq_coalesce u.coalesce
+
struct filter_info *filters;
/* Tasks */
@@ -474,7 +505,7 @@ t3_get_next_mcaddr(struct t3_rx_mode *rm)
uint8_t *macaddr = NULL;
if (rm->idx == 0)
- macaddr = rm->port->hw_addr;
+ macaddr = (uint8_t *)rm->port->hw_addr;
rm->idx++;
return (macaddr);
@@ -515,18 +546,21 @@ void t3_sge_stop(adapter_t *);
void t3b_intr(void *data);
void t3_intr_msi(void *data);
void t3_intr_msix(void *data);
-int t3_encap(struct port_info *, struct mbuf **, int *free);
+int t3_encap(struct sge_qset *, struct mbuf **, int);
int t3_sge_init_adapter(adapter_t *);
int t3_sge_init_port(struct port_info *);
void t3_sge_deinit_sw(adapter_t *);
+void t3_free_tx_desc(struct sge_txq *q, int n);
+void t3_free_tx_desc_all(struct sge_txq *q);
void t3_rx_eth_lro(adapter_t *adap, struct sge_rspq *rq, struct mbuf *m,
int ethpad, uint32_t rss_hash, uint32_t rss_csum, int lro);
void t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad);
void t3_lro_flush(adapter_t *adap, struct sge_qset *qs, struct lro_state *state);
-void t3_add_sysctls(adapter_t *sc);
+void t3_add_attach_sysctls(adapter_t *sc);
+void t3_add_configured_sysctls(adapter_t *sc);
int t3_get_desc(const struct sge_qset *qs, unsigned int qnum, unsigned int idx,
unsigned char *data);
void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p);
@@ -535,7 +569,7 @@ void t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p);
*/
#define desc_reclaimable(q) ((int)((q)->processed - (q)->cleaned - TX_MAX_DESC))
-#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field)))
+#define container_of(p, stype, field) ((stype *)(((uint8_t *)(p)) - offsetof(stype, field)))
static __inline struct sge_qset *
fl_to_qset(struct sge_fl *q, int qidx)
@@ -569,5 +603,20 @@ static inline int offload_running(adapter_t *adapter)
return isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
}
+#ifdef IFNET_MULTIQUEUE
+int cxgb_pcpu_enqueue_packet(struct ifnet *ifp, struct mbuf *m);
+int cxgb_pcpu_start(struct ifnet *ifp, struct mbuf *m);
+int32_t cxgb_pcpu_get_cookie(struct ifnet *ifp, struct in6_addr *lip, uint16_t lport,
+ struct in6_addr *rip, uint16_t rport, int ipv6);
+void cxgb_pcpu_shutdown_threads(struct adapter *sc);
+void cxgb_pcpu_startup_threads(struct adapter *sc);
+#endif
+
+int process_responses(adapter_t *adap, struct sge_qset *qs, int budget);
+int cxgb_tx_common(struct ifnet *ifp, struct sge_qset *qs, uint32_t txmax);
+void t3_free_qset(adapter_t *sc, struct sge_qset *q);
+int cxgb_dequeue_packet(struct ifnet *, struct sge_txq *, struct mbuf **);
+void cxgb_start(struct ifnet *ifp);
+void refill_fl_service(adapter_t *adap, struct sge_fl *fl);
#endif
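
Most of the new per-qset fields (qs_cpuid, qs_flags, qs_cv, qs_mtx) exist for the IFNET_MULTIQUEUE service threads started and stopped by cxgb_pcpu_startup_threads()/cxgb_pcpu_shutdown_threads(); the thread bodies themselves are not in this diff. A hypothetical skeleton of the lifecycle the QS_EXITING/QS_RUNNING/QS_BOUND flags imply (a sketch only, not the committed implementation):

    static void
    cxgb_qs_loop(void *arg)
    {
        struct sge_qset *qs = arg;

        mtx_lock(&qs->qs_mtx);
        qs->qs_flags |= QS_RUNNING;
        while ((qs->qs_flags & QS_EXITING) == 0) {
            if ((qs->qs_flags & QS_BOUND) == 0) {
                /* pin the thread to its queue set's CPU */
                sched_bind(curthread, qs->qs_cpuid);
                qs->qs_flags |= QS_BOUND;
            }
            mtx_unlock(&qs->qs_mtx);
            process_responses(qs->port->adapter, qs, 256 /* budget */);
            mtx_lock(&qs->qs_mtx);
            cv_timedwait(&qs->qs_cv, &qs->qs_mtx, hz / 1000);
        }
        qs->qs_flags &= ~QS_RUNNING;
        cv_broadcast(&qs->qs_cv);    /* unblock the shutdown path */
        mtx_unlock(&qs->qs_mtx);
    }

The shutdown side would set QS_EXITING under qs_mtx and cv_wait() until QS_RUNNING clears.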
diff --git a/sys/dev/cxgb/cxgb_config.h b/sys/dev/cxgb/cxgb_config.h
index a12753f..6b072c3 100644
--- a/sys/dev/cxgb/cxgb_config.h
+++ b/sys/dev/cxgb/cxgb_config.h
@@ -34,7 +34,6 @@ $FreeBSD$
#ifndef CONFIG_DEFINED
#define CONFIG_CHELSIO_T3_CORE
-#define DISABLE_MBUF_IOVEC
#endif
#endif
diff --git a/sys/dev/cxgb/cxgb_l2t.c b/sys/dev/cxgb/cxgb_l2t.c
index 0bb0695..f3e02f2 100644
--- a/sys/dev/cxgb/cxgb_l2t.c
+++ b/sys/dev/cxgb/cxgb_l2t.c
@@ -115,7 +115,7 @@ neigh_replace(struct l2t_entry *e, struct rtentry *rt)
*/
static int
setup_l2e_send_pending(struct t3cdev *dev, struct mbuf *m,
- struct l2t_entry *e)
+ struct l2t_entry *e)
{
struct cpl_l2t_write_req *req;
@@ -183,7 +183,7 @@ t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m, struct l2t_entry *e)
again:
switch (e->state) {
case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
- arpresolve(rt->rt_ifp, rt, NULL, (struct sockaddr *)&sin, e->dmac);
+ arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac);
mtx_lock(&e->lock);
if (e->state == L2T_STATE_STALE)
e->state = L2T_STATE_VALID;
@@ -208,8 +208,8 @@ again:
* A better way would be to use a work request to retry L2T
* entries when there's no memory.
*/
- printf("doing arpresolve on 0x%x \n", e->addr);
- if (arpresolve(rt->rt_ifp, rt, NULL, (struct sockaddr *)&sin, e->dmac) == 0) {
+ printf("doing arpresolve2 on 0x%x \n", e->addr);
+ if (arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac) == 0) {
printf("mac=%x:%x:%x:%x:%x:%x\n",
e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]);
@@ -223,7 +223,7 @@ again:
m_freem(m);
mtx_unlock(&e->lock);
} else
- printf("arpresolve returned non-zero\n");
+ printf("arpresolve2 returned non-zero\n");
}
return 0;
}
@@ -245,7 +245,7 @@ t3_l2t_send_event(struct t3cdev *dev, struct l2t_entry *e)
again:
switch (e->state) {
case L2T_STATE_STALE: /* entry is stale, kick off revalidation */
- arpresolve(rt->rt_ifp, rt, m0, (struct sockaddr *)&sin, e->dmac);
+ arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac);
mtx_lock(&e->lock);
if (e->state == L2T_STATE_STALE) {
e->state = L2T_STATE_VALID;
@@ -262,8 +262,6 @@ again:
}
mtx_unlock(&e->lock);
- if ((m0 = m_gethdr(M_NOWAIT, MT_DATA)) == NULL)
- return;
/*
* Only the first packet added to the arpq should kick off
* resolution. However, because the alloc_skb below can fail,
@@ -272,7 +270,7 @@ again:
* A better way would be to use a work request to retry L2T
* entries when there's no memory.
*/
- arpresolve(rt->rt_ifp, rt, m0, (struct sockaddr *)&sin, e->dmac);
+ arpresolve2(rt->rt_ifp, rt, (struct sockaddr *)&sin, e->dmac);
}
return;
@@ -459,7 +457,8 @@ handle_failed_resolution(struct t3cdev *dev, struct mbuf *arpq)
}
void
-t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
+t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh,
+ uint8_t *enaddr, struct sockaddr *sa)
{
struct l2t_entry *e;
struct mbuf *arpq = NULL;
@@ -468,8 +467,6 @@ t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
int ifidx = neigh->rt_ifp->if_index;
int hash = arp_hash(addr, ifidx, d);
struct llinfo_arp *la;
- u_char edst[ETHER_ADDR_LEN];
-
printf("t3_l2t_update called with arp info\n");
@@ -485,10 +482,11 @@ t3_l2t_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
found:
printf("found 0x%08x\n", addr);
- arpresolve(neigh->rt_ifp, neigh, NULL, sa, edst);
rw_runlock(&d->lock);
- memcpy(e->dmac, edst, ETHER_ADDR_LEN);
+ memcpy(e->dmac, enaddr, ETHER_ADDR_LEN);
+ printf("mac=%x:%x:%x:%x:%x:%x\n",
+ e->dmac[0], e->dmac[1], e->dmac[2], e->dmac[3], e->dmac[4], e->dmac[5]);
if (atomic_load_acq_int(&e->refcnt)) {
if (neigh != e->neigh)
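
The recurring change in this file is dropping the mbuf argument from the resolver: arpresolve2() is used as a variant of arpresolve() that resolves an address without queuing a packet on the ARP hold queue, which is all the L2T code needs since it only wants the MAC written into e->dmac. Likewise, t3_l2t_update() now receives the already-resolved link-layer address (enaddr) from the routing event and memcpy()s it into the entry instead of calling the resolver again under the table's read lock.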
diff --git a/sys/dev/cxgb/cxgb_l2t.h b/sys/dev/cxgb/cxgb_l2t.h
index 9b4effd..a5d469b 100644
--- a/sys/dev/cxgb/cxgb_l2t.h
+++ b/sys/dev/cxgb/cxgb_l2t.h
@@ -118,7 +118,7 @@ static __inline void set_arp_failure_handler(struct mbuf *m,
#define L2DATA(dev) ((dev)->l2opt)
void t3_l2e_free(struct l2t_data *d, struct l2t_entry *e);
-void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, struct sockaddr *sa);
+void t3_l2t_update(struct t3cdev *dev, struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa);
struct l2t_entry *t3_l2t_get(struct t3cdev *dev, struct rtentry *neigh,
struct ifnet *ifp, struct sockaddr *sa);
int t3_l2t_send_slow(struct t3cdev *dev, struct mbuf *m,
diff --git a/sys/dev/cxgb/cxgb_main.c b/sys/dev/cxgb/cxgb_main.c
index ef77dd5..92e5f2f 100644
--- a/sys/dev/cxgb/cxgb_main.c
+++ b/sys/dev/cxgb/cxgb_main.c
@@ -44,14 +44,15 @@ __FBSDID("$FreeBSD$");
#include <sys/ioccom.h>
#include <sys/mbuf.h>
#include <sys/linker.h>
-#include <sys/syslog.h>
#include <sys/firmware.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
+#include <sys/syslog.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
+#include <sys/proc.h>
#include <net/bpf.h>
#include <net/ethernet.h>
@@ -73,23 +74,18 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include <dev/pci/pci_private.h>
-
-#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
-
#ifdef CONFIG_DEFINED
#include <cxgb_include.h>
-#include <sys/mvec.h>
#else
#include <dev/cxgb/cxgb_include.h>
-#include <dev/cxgb/sys/mvec.h>
#endif
#ifdef PRIV_SUPPORTED
#include <sys/priv.h>
#endif
+#include <machine/intr_machdep.h>
+
static int cxgb_setup_msix(adapter_t *, int);
static void cxgb_teardown_msix(adapter_t *);
static void cxgb_init(void *);
@@ -97,8 +93,6 @@ static void cxgb_init_locked(struct port_info *);
static void cxgb_stop_locked(struct port_info *);
static void cxgb_set_rxmode(struct port_info *);
static int cxgb_ioctl(struct ifnet *, unsigned long, caddr_t);
-static void cxgb_start(struct ifnet *);
-static void cxgb_start_proc(void *, int ncount);
static int cxgb_media_change(struct ifnet *);
static void cxgb_media_status(struct ifnet *, struct ifmediareq *);
static int setup_sge_qsets(adapter_t *);
@@ -109,6 +103,10 @@ static void cxgb_down_locked(struct adapter *sc);
static void cxgb_tick(void *);
static void setup_rss(adapter_t *sc);
+#ifndef IFNET_MULTIQUEUE
+static void cxgb_start_proc(void *, int ncount);
+#endif
+
/* Attachment glue for the PCI controller end of the device. Each port of
* the device is attached separately, as defined later.
*/
@@ -122,11 +120,7 @@ static void cxgb_get_regs(adapter_t *sc, struct ifconf_regs *regs, uint8_t *buf)
static int cxgb_get_regs_len(void);
static int offload_open(struct port_info *pi);
static void touch_bars(device_t dev);
-
-#ifdef notyet
static int offload_close(struct t3cdev *tdev);
-#endif
-
static device_method_t cxgb_controller_methods[] = {
DEVMETHOD(device_probe, cxgb_controller_probe),
@@ -188,7 +182,6 @@ DRIVER_MODULE(cxgb, cxgbc, cxgb_port_driver, cxgb_port_devclass, 0, 0);
#define SGE_MSIX_COUNT (SGE_QSETS + 1)
-extern int collapse_mbufs;
/*
* The driver uses the best interrupt scheme available on a platform in the
* order MSI-X, MSI, legacy pin interrupts. This parameter determines which
@@ -218,11 +211,15 @@ SYSCTL_UINT(_hw_cxgb, OID_AUTO, ofld_disable, CTLFLAG_RDTUN, &ofld_disable, 0,
* The driver uses an auto-queue algorithm by default.
* To disable it and force a single queue-set per port, use singleq = 1.
*/
-static int singleq = 1;
+static int singleq = 0;
TUNABLE_INT("hw.cxgb.singleq", &singleq);
SYSCTL_UINT(_hw_cxgb, OID_AUTO, singleq, CTLFLAG_RDTUN, &singleq, 0,
"use a single queue-set per port");
+#ifndef IFNET_MULTIQUEUE
+int cxgb_txq_buf_ring_size = 0;
+#endif
+
enum {
MAX_TXQ_ENTRIES = 16384,
MAX_CTRL_TXQ_ENTRIES = 1024,
@@ -281,10 +278,24 @@ struct cxgb_ident {
{0, 0, 0, NULL}
};
-
static int set_eeprom(struct port_info *pi, const uint8_t *data, int len, int offset);
-static inline char
+static __inline void
+check_pkt_coalesce(struct sge_qset *qs)
+{
+ struct adapter *sc;
+ struct sge_txq *txq;
+
+ txq = &qs->txq[TXQ_ETH];
+ sc = qs->port->adapter;
+
+ if (sc->tunq_fill[qs->idx] && (txq->in_use < (txq->size - (txq->size>>2))))
+ sc->tunq_fill[qs->idx] = 0;
+ else if (!sc->tunq_fill[qs->idx] && (txq->in_use > (txq->size - (txq->size>>2))))
+ sc->tunq_fill[qs->idx] = 1;
+}
+
+static __inline char
t3rev2char(struct adapter *adapter)
{
char rev = 'z';
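
check_pkt_coalesce(), added above, is a single watermark rather than true hysteresis: both comparisons use the same threshold, txq->size - (txq->size >> 2), i.e. three quarters of the ring. With the default TX_ETH_Q_SIZE of 1024 that is 1024 - 256 = 768 descriptors, so tunq_fill[] for the queue set flips to 1 once more than 768 descriptors are in use and back to 0 once fewer are; the transmit path can consult that bit to decide when batching packets into a single WR is worth doing.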
@@ -582,6 +593,7 @@ cxgb_controller_attach(device_t dev)
pi->tx_chan = i >= ai->nports0;
pi->txpkt_intf = pi->tx_chan ? 2 * (i - ai->nports0) + 1 : 2 * i;
sc->rxpkt_map[pi->txpkt_intf] = i;
+ sc->port[i].tx_chan = i >= ai->nports0;
sc->portdev[i] = child;
device_set_softc(child, pi);
}
@@ -611,7 +623,7 @@ cxgb_controller_attach(device_t dev)
G_FW_VERSION_MAJOR(vers), G_FW_VERSION_MINOR(vers),
G_FW_VERSION_MICRO(vers));
- t3_add_sysctls(sc);
+ t3_add_attach_sysctls(sc);
out:
if (error)
cxgb_free(sc);
@@ -636,10 +648,14 @@ cxgb_free(struct adapter *sc)
{
int i;
+
+#ifdef IFNET_MULTIQUEUE
+ cxgb_pcpu_shutdown_threads(sc);
+#endif
ADAPTER_LOCK(sc);
- /*
- * drops the lock
- */
+/*
+ * drops the lock
+ */
cxgb_down_locked(sc);
#ifdef MSI_SUPPORTED
@@ -664,7 +680,7 @@ cxgb_free(struct adapter *sc)
* Wait for last callout
*/
- tsleep(&sc, 0, "cxgb unload", 3*hz);
+ DELAY(hz*100);
for (i = 0; i < (sc)->params.nports; ++i) {
if (sc->portdev[i] != NULL)
@@ -674,15 +690,17 @@ cxgb_free(struct adapter *sc)
bus_generic_detach(sc->dev);
if (sc->tq != NULL)
taskqueue_free(sc->tq);
-#ifdef notyet
if (is_offload(sc)) {
cxgb_adapter_unofld(sc);
if (isset(&sc->open_device_map, OFFLOAD_DEVMAP_BIT))
offload_close(&sc->tdev);
- }
-#endif
-
+ else
+ printf("cxgb_free: DEVMAP_BIT not set\n");
+ } else
+ printf("not offloading set\n");
+#ifndef IFNET_MULTIQUEUE
t3_free_sge_resources(sc);
+#endif
free(sc->filters, M_DEVBUF);
t3_sge_free(sc);
@@ -696,8 +714,6 @@ cxgb_free(struct adapter *sc)
MTX_DESTROY(&sc->sge.reg_lock);
MTX_DESTROY(&sc->elmer_lock);
ADAPTER_LOCK_DEINIT(sc);
-
- return;
}
/**
@@ -803,7 +819,7 @@ cxgb_setup_msix(adapter_t *sc, int msix_count)
printf("setting up interrupt for port=%d\n",
qs->port->port_id);
if (bus_setup_intr(sc->dev, sc->msix_irq_res[k],
- INTR_MPSAFE|INTR_TYPE_NET,
+ INTR_MPSAFE|INTR_TYPE_NET,
#ifdef INTR_FILTERS
NULL,
#endif
@@ -812,10 +828,17 @@ cxgb_setup_msix(adapter_t *sc, int msix_count)
"interrupt for message %d\n", rid);
return (EINVAL);
}
+#ifdef IFNET_MULTIQUEUE
+ if (singleq == 0) {
+ int vector = rman_get_start(sc->msix_irq_res[k]);
+ if (bootverbose)
+ device_printf(sc->dev, "binding vector=%d to cpu=%d\n", vector, k % mp_ncpus);
+ intr_bind(vector, k % mp_ncpus);
+ }
+#endif
}
}
-
return (0);
}
@@ -892,6 +915,12 @@ cxgb_port_attach(device_t dev)
ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
ifp->if_ioctl = cxgb_ioctl;
ifp->if_start = cxgb_start;
+
+#ifdef IFNET_MULTIQUEUE
+ ifp->if_flags |= IFF_MULTIQ;
+ ifp->if_mq_start = cxgb_pcpu_start;
+#endif
+
ifp->if_timer = 0; /* Disable ifnet watchdog */
ifp->if_watchdog = NULL;
@@ -965,7 +994,7 @@ cxgb_port_attach(device_t dev)
p->tq = taskqueue_create_fast(p->taskqbuf, M_NOWAIT,
taskqueue_thread_enqueue, &p->tq);
#endif
-
+#ifndef IFNET_MULTIQUEUE
if (p->tq == NULL) {
device_printf(dev, "failed to allocate port task queue\n");
return (ENOMEM);
@@ -974,7 +1003,7 @@ cxgb_port_attach(device_t dev)
device_get_nameunit(dev));
TASK_INIT(&p->start_task, 0, cxgb_start_proc, ifp);
-
+#endif
t3_sge_init_port(p);
return (0);
@@ -999,6 +1028,9 @@ cxgb_port_detach(device_t dev)
}
ether_ifdetach(p->ifp);
+ printf("waiting for callout to stop ...");
+ DELAY(1000000);
+ printf("done\n");
/*
* the lock may be acquired in ifdetach
*/
@@ -1247,9 +1279,7 @@ offload_tx(struct t3cdev *tdev, struct mbuf *m)
{
int ret;
- critical_enter();
ret = t3_offload_tx(tdev, m);
- critical_exit();
return (ret);
}
@@ -1264,6 +1294,8 @@ write_smt_entry(struct adapter *adapter, int idx)
return (ENOMEM);
req = mtod(m, struct cpl_smt_write_req *);
+ m->m_pkthdr.len = m->m_len = sizeof(struct cpl_smt_write_req);
+
req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SMT_WRITE_REQ, idx));
req->mtu_idx = NMTUS - 1; /* should be 0 but there's a T3 bug */
@@ -1325,6 +1357,10 @@ bind_qsets(adapter_t *sc)
{
int i, j;
+#ifdef IFNET_MULTIQUEUE
+ cxgb_pcpu_startup_threads(sc);
+#endif
+
for (i = 0; i < (sc)->params.nports; ++i) {
const struct port_info *pi = adap2pinfo(sc, i);
@@ -1473,6 +1509,7 @@ cxgb_up(struct adapter *sc)
goto out;
setup_rss(sc);
+ t3_add_configured_sysctls(sc);
sc->flags |= FULL_INIT_DONE;
}
@@ -1545,6 +1582,8 @@ cxgb_down_locked(struct adapter *sc)
cxgb_teardown_msix(sc);
ADAPTER_UNLOCK(sc);
+ callout_stop(&sc->cxgb_tick_ch);
+ callout_stop(&sc->sge_timer_ch);
callout_drain(&sc->cxgb_tick_ch);
callout_drain(&sc->sge_timer_ch);
@@ -1553,26 +1592,28 @@ cxgb_down_locked(struct adapter *sc)
for (i = 0; i < sc->params.nports; i++)
taskqueue_drain(sc->tq, &sc->port[i].timer_reclaim_task);
}
-#ifdef notyet
-
- if (sc->port[i].tq != NULL)
-#endif
-
}
static int
offload_open(struct port_info *pi)
{
struct adapter *adapter = pi->adapter;
- struct t3cdev *tdev = TOEDEV(pi->ifp);
+ struct t3cdev *tdev = &adapter->tdev;
+#ifdef notyet
+ T3CDEV(pi->ifp);
+#endif
int adap_up = adapter->open_device_map & PORT_MASK;
int err = 0;
+ printf("device_map=0x%x\n", adapter->open_device_map);
if (atomic_cmpset_int(&adapter->open_device_map,
- (adapter->open_device_map & ~OFFLOAD_DEVMAP_BIT),
- (adapter->open_device_map | OFFLOAD_DEVMAP_BIT)) == 0)
+ (adapter->open_device_map & ~(1<<OFFLOAD_DEVMAP_BIT)),
+ (adapter->open_device_map | (1<<OFFLOAD_DEVMAP_BIT))) == 0)
return (0);
+
+ if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
+ printf("offload_open: DEVMAP_BIT did not get set 0x%x\n", adapter->open_device_map);
ADAPTER_LOCK(pi->adapter);
if (!adap_up)
err = cxgb_up(adapter);
@@ -1581,7 +1622,7 @@ offload_open(struct port_info *pi)
return (err);
t3_tp_set_offload_mode(adapter, 1);
- tdev->lldev = adapter->port[0].ifp;
+ tdev->lldev = pi->ifp;
err = cxgb_offload_activate(adapter);
if (err)
goto out;
@@ -1605,15 +1646,18 @@ out:
}
return (err);
}
-#ifdef notyet
+
static int
-offload_close(struct t3cev *tdev)
+offload_close(struct t3cdev *tdev)
{
struct adapter *adapter = tdev2adap(tdev);
- if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT))
+ if (!isset(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT)) {
+ printf("offload_close: DEVMAP_BIT not set\n");
+
return (0);
-
+ }
+
/* Call back all registered clients */
cxgb_remove_clients(tdev);
tdev->lldev = NULL;
@@ -1621,13 +1665,15 @@ offload_close(struct t3cev *tdev)
t3_tp_set_offload_mode(adapter, 0);
clrbit(&adapter->open_device_map, OFFLOAD_DEVMAP_BIT);
+ ADAPTER_LOCK(adapter);
if (!adapter->open_device_map)
- cxgb_down(adapter);
-
+ cxgb_down_locked(adapter);
+ else
+ ADAPTER_UNLOCK(adapter);
cxgb_offload_deactivate(adapter);
return (0);
}
-#endif
+
static void
cxgb_init(void *arg)
@@ -1667,6 +1713,8 @@ cxgb_init_locked(struct port_info *p)
if (err)
log(LOG_WARNING,
"Could not initialize offload capabilities\n");
+ else
+ printf("offload opened\n");
}
cxgb_link_start(p);
t3_link_changed(sc, p->port_id);
@@ -1675,8 +1723,7 @@ cxgb_init_locked(struct port_info *p)
device_printf(sc->dev, "enabling interrupts on port=%d\n", p->port_id);
t3_port_intr_enable(sc, p->port_id);
- callout_reset(&sc->cxgb_tick_ch, sc->params.stats_update_period * hz,
- cxgb_tick, sc);
+ callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc);
ifp->if_drv_flags |= IFF_DRV_RUNNING;
ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
@@ -1703,7 +1750,6 @@ cxgb_stop_locked(struct port_info *p)
ADAPTER_LOCK_ASSERT_NOTOWNED(p->adapter);
ifp = p->ifp;
-
t3_port_intr_disable(p->adapter, p->port_id);
ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
p->phy.ops->power_down(&p->phy, 1);
@@ -1712,7 +1758,6 @@ cxgb_stop_locked(struct port_info *p)
ADAPTER_LOCK(p->adapter);
clrbit(&p->adapter->open_device_map, p->port_id);
-
if (p->adapter->open_device_map == 0) {
cxgb_down_locked(p->adapter);
} else
@@ -1786,8 +1831,7 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data)
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
adapter_t *sc = p->adapter;
- callout_reset(&sc->cxgb_tick_ch,
- sc->params.stats_update_period * hz,
+ callout_reset(&sc->cxgb_tick_ch, hz,
cxgb_tick, sc);
}
PORT_UNLOCK(p);
@@ -1838,77 +1882,92 @@ cxgb_ioctl(struct ifnet *ifp, unsigned long command, caddr_t data)
return (error);
}
-static int
-cxgb_start_tx(struct ifnet *ifp, uint32_t txmax)
+int
+cxgb_tx_common(struct ifnet *ifp, struct sge_qset *qs, uint32_t txmax)
{
- struct sge_qset *qs;
struct sge_txq *txq;
- struct port_info *p = ifp->if_softc;
- struct mbuf *m = NULL;
- int err, in_use_init, free;
-
- if (!p->link_config.link_ok)
- return (ENXIO);
-
- if (IFQ_DRV_IS_EMPTY(&ifp->if_snd))
- return (ENOBUFS);
+ int err, in_use_init, count;
+ struct mbuf **m_vec;
- qs = &p->adapter->sge.qs[p->first_qset];
txq = &qs->txq[TXQ_ETH];
- err = 0;
-
- if (txq->flags & TXQ_TRANSMITTING)
- return (EINPROGRESS);
-
- mtx_lock(&txq->lock);
- txq->flags |= TXQ_TRANSMITTING;
+ m_vec = txq->txq_m_vec;
in_use_init = txq->in_use;
+ err = 0;
while ((txq->in_use - in_use_init < txmax) &&
(txq->size > txq->in_use + TX_MAX_DESC)) {
- free = 0;
- IFQ_DRV_DEQUEUE(&ifp->if_snd, m);
- if (m == NULL)
+ check_pkt_coalesce(qs);
+ count = cxgb_dequeue_packet(ifp, txq, m_vec);
+ if (count == 0)
break;
- /*
- * Convert chain to M_IOVEC
- */
- KASSERT((m->m_flags & M_IOVEC) == 0, ("IOVEC set too early"));
-#ifdef notyet
- m0 = m;
- if (collapse_mbufs && m->m_pkthdr.len > MCLBYTES &&
- m_collapse(m, TX_MAX_SEGS, &m0) == EFBIG) {
- if ((m0 = m_defrag(m, M_NOWAIT)) != NULL) {
- m = m0;
- m_collapse(m, TX_MAX_SEGS, &m0);
- } else
- break;
- }
- m = m0;
-#endif
- if ((err = t3_encap(p, &m, &free)) != 0)
+ ETHER_BPF_MTAP(ifp, m_vec[0]);
+
+ if ((err = t3_encap(qs, m_vec, count)) != 0)
break;
- BPF_MTAP(ifp, m);
- if (free)
- m_freem(m);
+ txq->txq_enqueued += count;
}
- txq->flags &= ~TXQ_TRANSMITTING;
- mtx_unlock(&txq->lock);
-
+#ifndef IFNET_MULTIQUEUE
if (__predict_false(err)) {
if (err == ENOMEM) {
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
IFQ_LOCK(&ifp->if_snd);
- IFQ_DRV_PREPEND(&ifp->if_snd, m);
+ IFQ_DRV_PREPEND(&ifp->if_snd, m_vec[0]);
IFQ_UNLOCK(&ifp->if_snd);
}
}
- if (err == 0 && m == NULL)
+ if (err == 0 && m_vec[0] == NULL) {
err = ENOBUFS;
+ }
else if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC) &&
(ifp->if_drv_flags & IFF_DRV_OACTIVE) == 0) {
ifp->if_drv_flags |= IFF_DRV_OACTIVE;
err = ENOSPC;
}
+#else
+ if ((err == 0) && (txq->size <= txq->in_use + TX_MAX_DESC)) {
+ err = ENOSPC;
+ setbit(&qs->txq_stopped, TXQ_ETH);
+ }
+ if (err == ENOMEM) {
+ int i;
+ /*
+ * Sub-optimal :-/
+ */
+ for (i = 0; i < count; i++)
+ m_freem(m_vec[i]);
+ }
+#endif
+ return (err);
+}
+
+#ifndef IFNET_MULTIQUEUE
+static int
+cxgb_start_tx(struct ifnet *ifp, uint32_t txmax)
+{
+ struct sge_qset *qs;
+ struct sge_txq *txq;
+ struct port_info *p = ifp->if_softc;
+ int err;
+
+ if (!p->link_config.link_ok)
+ return (ENXIO);
+
+ if (IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ return (ENOBUFS);
+ }
+
+ qs = &p->adapter->sge.qs[p->first_qset];
+ txq = &qs->txq[TXQ_ETH];
+ err = 0;
+
+ if (txq->flags & TXQ_TRANSMITTING)
+ return (EINPROGRESS);
+
+ mtx_lock(&txq->lock);
+ txq->flags |= TXQ_TRANSMITTING;
+ cxgb_tx_common(ifp, qs, txmax);
+ txq->flags &= ~TXQ_TRANSMITTING;
+ mtx_unlock(&txq->lock);
+
return (err);
}
@@ -1932,7 +1991,15 @@ cxgb_start_proc(void *arg, int ncount)
} while (error == 0);
}
-static void
+int
+cxgb_dequeue_packet(struct ifnet *ifp, struct sge_txq *unused, struct mbuf **m_vec)
+{
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m_vec[0]);
+ return (m_vec[0] ? 1 : 0);
+}
+
+void
cxgb_start(struct ifnet *ifp)
{
struct port_info *pi = ifp->if_softc;
@@ -1952,7 +2019,7 @@ cxgb_start(struct ifnet *ifp)
if (err == 0)
taskqueue_enqueue(pi->tq, &pi->start_task);
}
-
+#endif
static int
cxgb_media_change(struct ifnet *ifp)
@@ -2078,12 +2145,26 @@ static void
cxgb_tick(void *arg)
{
adapter_t *sc = (adapter_t *)arg;
+ int i, running = 0;
+
+ for_each_port(sc, i) {
+
+ struct port_info *p = &sc->port[i];
+ struct ifnet *ifp = p->ifp;
+ PORT_LOCK(p);
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING))
+ running = 1;
+ PORT_UNLOCK(p);
+ }
+
+ if (running == 0)
+ return;
+
taskqueue_enqueue(sc->tq, &sc->tick_task);
if (sc->open_device_map != 0)
- callout_reset(&sc->cxgb_tick_ch, sc->params.stats_update_period * hz,
- cxgb_tick, sc);
+ callout_reset(&sc->cxgb_tick_ch, hz, cxgb_tick, sc);
}
static void
@@ -2478,7 +2559,7 @@ cxgb_extension_ioctl(struct cdev *dev, unsigned long cmd, caddr_t data,
* Read 256 bytes at a time as len can be large and we don't
* want to use huge intermediate buffers.
*/
- useraddr = (uint8_t *)(t + 1); /* advance to start of buffer */
+ useraddr = (uint8_t *)t->buf;
while (t->len) {
unsigned int chunk = min(t->len, sizeof(buf));
diff --git a/sys/dev/cxgb/cxgb_offload.c b/sys/dev/cxgb/cxgb_offload.c
index d0b9b32..3ce1a11 100644
--- a/sys/dev/cxgb/cxgb_offload.c
+++ b/sys/dev/cxgb/cxgb_offload.c
@@ -108,9 +108,12 @@ cxgb_register_client(struct cxgb_client *client)
printf("client->add set\n");
TAILQ_FOREACH(tdev, &ofld_dev_list, entry) {
- if (offload_activated(tdev))
+ if (offload_activated(tdev)) {
+ printf("calling add=%p on %p\n",
+ client->add, tdev);
+
client->add(tdev);
- else
+ } else
printf("%p not activated\n", tdev);
}
@@ -477,7 +480,8 @@ rx_offload_blackhole(struct t3cdev *dev, struct mbuf **m, int n)
}
static void
-dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa)
+dummy_neigh_update(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr,
+ struct sockaddr *sa)
{
}
@@ -895,17 +899,32 @@ do_term(struct t3cdev *dev, struct mbuf *m)
}
static void
-cxgb_route_event(void *unused, int event, struct rtentry *rt0,
+cxgb_arp_update_event(void *unused, struct rtentry *rt0,
+ uint8_t *enaddr, struct sockaddr *sa)
+{
+
+ if (TOEDEV(rt0->rt_ifp) == NULL)
+ return;
+
+ RT_ADDREF(rt0);
+ RT_UNLOCK(rt0);
+ cxgb_neigh_update(rt0, enaddr, sa);
+ RT_LOCK(rt0);
+ RT_REMREF(rt0);
+}
+
+
+static void
+cxgb_redirect_event(void *unused, int event, struct rtentry *rt0,
struct rtentry *rt1, struct sockaddr *sa)
{
- struct toedev *tdev0, *tdev1 = NULL;
+ struct toedev *tdev0, *tdev1;
/*
* ignore events on non-offloaded interfaces
*/
tdev0 = TOEDEV(rt0->rt_ifp);
- if (rt1)
- tdev1 = TOEDEV(rt1->rt_ifp);
+ tdev1 = TOEDEV(rt1->rt_ifp);
if (tdev0 == NULL && tdev1 == NULL)
return;
/*
@@ -914,34 +933,16 @@ cxgb_route_event(void *unused, int event, struct rtentry *rt0,
*/
RT_ADDREF(rt0);
RT_UNLOCK(rt0);
- if (rt1) {
- RT_ADDREF(rt1);
- RT_UNLOCK(rt1);
- }
-
- switch (event) {
- case RTEVENT_ARP_UPDATE: {
- cxgb_neigh_update(rt0, sa);
- break;
- }
- case RTEVENT_REDIRECT_UPDATE: {
- cxgb_redirect(rt0, rt1, sa);
- cxgb_neigh_update(rt1, sa);
+ RT_ADDREF(rt1);
+ RT_UNLOCK(rt1);
- break;
- }
- case RTEVENT_PMTU_UPDATE:
- default:
- break;
- }
+ cxgb_redirect(rt0, rt1, sa);
+ cxgb_neigh_update(rt1, NULL, sa);
RT_LOCK(rt0);
RT_REMREF(rt0);
- if (rt1) {
- RT_LOCK(rt1);
- RT_REMREF(rt1);
- }
-
+ RT_LOCK(rt1);
+ RT_REMREF(rt1);
}
/*
@@ -1048,14 +1049,14 @@ cxgb_ofld_recv(struct t3cdev *dev, struct mbuf **m, int n)
}
void
-cxgb_neigh_update(struct rtentry *rt, struct sockaddr *sa)
+cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa)
{
if (is_offloading(rt->rt_ifp)) {
struct t3cdev *tdev = T3CDEV(rt->rt_ifp);
PANIC_IF(!tdev);
- t3_l2t_update(tdev, rt, sa);
+ t3_l2t_update(tdev, rt, enaddr, sa);
}
}
@@ -1425,7 +1426,10 @@ cxgb_offload_init(void)
t3_register_cpl_handler(CPL_RX_DDP_COMPLETE, do_hwtid_rpl);
t3_register_cpl_handler(CPL_ISCSI_HDR, do_hwtid_rpl);
- EVENTHANDLER_REGISTER(route_event, cxgb_route_event, NULL, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(route_arp_update_event, cxgb_arp_update_event,
+ NULL, EVENTHANDLER_PRI_ANY);
+ EVENTHANDLER_REGISTER(route_redirect_event, cxgb_redirect_event,
+ NULL, EVENTHANDLER_PRI_ANY);
#if 0
if (offload_proc_init())
diff --git a/sys/dev/cxgb/cxgb_offload.h b/sys/dev/cxgb/cxgb_offload.h
index 59afe6b..8c84d07 100644
--- a/sys/dev/cxgb/cxgb_offload.h
+++ b/sys/dev/cxgb/cxgb_offload.h
@@ -253,7 +253,7 @@ static inline struct toe_tid_entry *lookup_atid(const struct tid_info *t,
void *cxgb_alloc_mem(unsigned long size);
void cxgb_free_mem(void *addr);
-void cxgb_neigh_update(struct rtentry *rt, struct sockaddr *sa);
+void cxgb_neigh_update(struct rtentry *rt, uint8_t *enaddr, struct sockaddr *sa);
void cxgb_redirect(struct rtentry *old, struct rtentry *new, struct sockaddr *sa);
int process_rx(struct t3cdev *dev, struct mbuf **m, int n);
int attach_t3cdev(struct t3cdev *dev);
diff --git a/sys/dev/cxgb/cxgb_osdep.h b/sys/dev/cxgb/cxgb_osdep.h
index cf5a8b6..7f75779 100644
--- a/sys/dev/cxgb/cxgb_osdep.h
+++ b/sys/dev/cxgb/cxgb_osdep.h
@@ -36,6 +36,9 @@ $FreeBSD$
#include <sys/endian.h>
#include <sys/bus.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
#include <dev/mii/mii.h>
#ifdef CONFIG_DEFINED
@@ -52,18 +55,17 @@ $FreeBSD$
typedef struct adapter adapter_t;
struct sge_rspq;
+
struct t3_mbuf_hdr {
struct mbuf *mh_head;
struct mbuf *mh_tail;
};
-
#define PANIC_IF(exp) do { \
if (exp) \
panic("BUG: %s", #exp); \
} while (0)
-
#define m_get_priority(m) ((uintptr_t)(m)->m_pkthdr.rcvif)
#define m_set_priority(m, pri) ((m)->m_pkthdr.rcvif = (struct ifnet *)((uintptr_t)pri))
#define m_set_sgl(m, sgl) ((m)->m_pkthdr.header = (sgl))
@@ -113,6 +115,7 @@ struct t3_mbuf_hdr {
#define CXGB_TX_CLEANUP_THRESHOLD 32
+
#ifdef DEBUG_PRINT
#define DPRINTF printf
#else
@@ -121,19 +124,25 @@ struct t3_mbuf_hdr {
#define TX_MAX_SIZE (1 << 16) /* 64KB */
#define TX_MAX_SEGS 36 /* maximum supported by card */
+
#define TX_MAX_DESC 4 /* max descriptors per packet */
+
#define TX_START_MIN_DESC (TX_MAX_DESC << 2)
-#if 0
-#define TX_START_MAX_DESC (TX_ETH_Q_SIZE >> 2) /* maximum number of descriptors */
-#endif
+
#define TX_START_MAX_DESC (TX_MAX_DESC << 3) /* maximum number of descriptors
 * used per call to start */
#define TX_CLEAN_MAX_DESC (TX_MAX_DESC << 4) /* maximum tx descriptors
* to clean per iteration */
+#define TX_WR_SIZE_MAX 11*1024 /* the maximum total size of packets aggregated into a single
+ * TX WR
+ */
+#define TX_WR_COUNT_MAX 7 /* the maximum total number of packets that can be
+ * aggregated into a single TX WR
+ */
#if defined(__i386__) || defined(__amd64__)
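
These two constants bound the batching that cpl_tx_pkt_batch enables: at most 7 packets per WR, matching the 7-element pkt_entry[] array, and at most 11KB of aggregate payload. Each entry is 16 bytes (two 64-bit flits), so a full batch WR is 8 + 7 * 16 = 120 bytes, which fits in a single TX descriptor assuming the T3's 16-flit (128-byte) descriptor size.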
@@ -142,7 +151,7 @@ struct t3_mbuf_hdr {
#define wmb() __asm volatile("sfence" ::: "memory")
#define smp_mb() mb()
-#define L1_CACHE_BYTES 64
+#define L1_CACHE_BYTES 128
static __inline
void prefetch(void *x)
{
@@ -167,6 +176,107 @@ extern void kdb_backtrace(void);
#define prefetch(x)
#define L1_CACHE_BYTES 32
#endif
+
+struct buf_ring {
+ caddr_t *br_ring;
+ volatile uint32_t br_cons;
+ volatile uint32_t br_prod;
+ int br_size;
+ struct mtx br_lock;
+};
+
+struct buf_ring *buf_ring_alloc(int count, int flags);
+void buf_ring_free(struct buf_ring *);
+
+static __inline int
+buf_ring_count(struct buf_ring *mr)
+{
+ int size = mr->br_size;
+ int mask = size - 1;
+
+ return ((size + mr->br_prod - mr->br_cons) & mask);
+}
+
+static __inline int
+buf_ring_empty(struct buf_ring *mr)
+{
+ return (mr->br_cons == mr->br_prod);
+}
+
+/*
+ * The producer and consumer are independently locked
+ * this relies on the consumer providing his own serialization
+ *
+ */
+static __inline void *
+buf_ring_dequeue(struct buf_ring *mr)
+{
+ int prod, cons, mask;
+ caddr_t *ring, m;
+
+ ring = (caddr_t *)mr->br_ring;
+ mask = mr->br_size - 1;
+ cons = mr->br_cons;
+ prod = mr->br_prod;
+ m = NULL;
+ if (cons != prod) {
+ m = ring[cons];
+ mr->br_cons = (cons + 1) & mask;
+ mb();
+ }
+ return (m);
+}
+
+
+static __inline int
+__buf_ring_enqueue(struct buf_ring *mr, void *m)
+{
+
+ int prod, cons, mask, err;
+
+ cons = mr->br_cons;
+ prod = mr->br_prod;
+ mask = mr->br_size - 1;
+ if (((prod + 1) & mask) != cons) {
+ mr->br_ring[prod] = m;
+ mb();
+ mr->br_prod = (prod + 1) & mask;
+ err = 0;
+ } else
+ err = ENOBUFS;
+
+ return (err);
+}
+
+static __inline int
+buf_ring_enqueue(struct buf_ring *mr, void *m)
+{
+ int err;
+
+ mtx_lock(&mr->br_lock);
+ err = __buf_ring_enqueue(mr, m);
+ mtx_unlock(&mr->br_lock);
+
+ return (err);
+}
+
+static __inline void *
+buf_ring_peek(struct buf_ring *mr)
+{
+ int prod, cons, mask;
+ caddr_t *ring, m;
+
+ ring = (caddr_t *)mr->br_ring;
+ mask = mr->br_size - 1;
+ cons = mr->br_cons;
+ prod = mr->br_prod;
+ m = NULL;
+ if (cons != prod)
+ m = ring[cons];
+
+ return (m);
+}
+
#define DBG_RX (1 << 0)
static const int debug_flags = DBG_RX;
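
The buf_ring added above is a fixed-size ring whose index arithmetic masks with br_size - 1, so br_size must be a power of two and the ring holds at most br_size - 1 entries. Producers serialize on br_lock; buf_ring_dequeue() takes no lock and relies on a single consumer, the per-queue-set transmit thread in this design. A usage sketch under those assumptions (txq_mr and txq_drops are the sge_txq additions from cxgb_adapter.h):

    struct mbuf *m;

    /* producer side, callable from any context */
    if (buf_ring_enqueue(&txq->txq_mr, m) == ENOBUFS) {
        txq->txq_drops++;        /* ring full: count and drop */
        m_freem(m);
    }

    /* consumer side, single-threaded by convention (no lock taken) */
    while ((m = buf_ring_dequeue(&txq->txq_mr)) != NULL) {
        /* hand m to the encapsulation path, e.g. t3_encap() */
    }

buf_ring_peek() lets the consumer inspect the head without consuming it, e.g. to check that enough TX descriptors are free before committing to a dequeue.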
@@ -189,15 +299,12 @@ static const int debug_flags = DBG_RX;
#define t3_os_sleep(x) DELAY((x) * 1000)
-#define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | bit), ((*(p)) & ~bit))
-
+#define test_and_clear_bit(bit, p) atomic_cmpset_int((p), ((*(p)) | (1<<bit)), ((*(p)) & ~(1<<bit)))
#define max_t(type, a, b) (type)max((a), (b))
#define net_device ifnet
#define cpu_to_be32 htobe32
-
-
/* Standard PHY definitions */
#define BMCR_LOOPBACK BMCR_LOOP
#define BMCR_ISOLATE BMCR_ISO
@@ -247,13 +354,13 @@ static const int debug_flags = DBG_RX;
#define swab32(x) bswap32(x)
#define simple_strtoul strtoul
-/* More types and endian definitions */
+
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
-
-typedef uint8_t __u8;
+
+typedef uint8_t __u8;
typedef uint16_t __u16;
typedef uint32_t __u32;
typedef uint8_t __be8;
@@ -261,6 +368,7 @@ typedef uint16_t __be16;
typedef uint32_t __be32;
typedef uint64_t __be64;
+
#if BYTE_ORDER == BIG_ENDIAN
#define __BIG_ENDIAN_BITFIELD
#elif BYTE_ORDER == LITTLE_ENDIAN
diff --git a/sys/dev/cxgb/cxgb_sge.c b/sys/dev/cxgb/cxgb_sge.c
index e41148a..a079679 100644
--- a/sys/dev/cxgb/cxgb_sge.c
+++ b/sys/dev/cxgb/cxgb_sge.c
@@ -42,13 +42,13 @@ __FBSDID("$FreeBSD$");
#include <sys/rman.h>
#include <sys/queue.h>
#include <sys/sysctl.h>
-#include <sys/syslog.h>
#include <sys/taskqueue.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/systm.h>
+#include <sys/syslog.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
@@ -59,8 +59,7 @@ __FBSDID("$FreeBSD$");
#include <dev/pci/pcivar.h>
#include <vm/vm.h>
-#include <vm/vm_page.h>
-#include <vm/vm_map.h>
+#include <vm/pmap.h>
#ifdef CONFIG_DEFINED
#include <cxgb_include.h>
@@ -70,14 +69,15 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/sys/mvec.h>
#endif
-uint32_t collapse_free = 0;
-uint32_t mb_free_vec_free = 0;
int txq_fills = 0;
-int collapse_mbufs = 0;
static int bogus_imm = 0;
#ifndef DISABLE_MBUF_IOVEC
static int recycle_enable = 1;
#endif
+extern int cxgb_txq_buf_ring_size;
+int cxgb_cached_allocations;
+int cxgb_cached;
+int cxgb_ext_freed;
#define USE_GTS 0
@@ -134,15 +134,17 @@ struct rsp_desc { /* response queue descriptor */
#define RSPQ_SOP_EOP G_RSPD_SOP_EOP(F_RSPD_SOP|F_RSPD_EOP)
struct tx_sw_desc { /* SW state per Tx descriptor */
- struct mbuf *m;
+ struct mbuf_iovec mi;
bus_dmamap_t map;
int flags;
};
struct rx_sw_desc { /* SW state per Rx descriptor */
- void *cl;
- bus_dmamap_t map;
- int flags;
+ caddr_t rxsd_cl;
+ uint32_t *rxsd_ref;
+ caddr_t data;
+ bus_dmamap_t map;
+ int flags;
};
struct txq_state {
@@ -186,11 +188,9 @@ static uint8_t flit_desc_map[] = {
static int lro_default = 0;
int cxgb_debug = 0;
-static void t3_free_qset(adapter_t *sc, struct sge_qset *q);
static void sge_timer_cb(void *arg);
static void sge_timer_reclaim(void *arg, int ncount);
static void sge_txq_reclaim_handler(void *arg, int ncount);
-static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
/**
* reclaim_completed_tx - reclaims completed Tx descriptors
@@ -202,19 +202,17 @@ static int free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec);
* queue's lock held.
*/
static __inline int
-reclaim_completed_tx(struct sge_txq *q, int nbufs, struct mbuf **mvec)
+reclaim_completed_tx(struct sge_txq *q)
{
- int reclaimed, reclaim = desc_reclaimable(q);
- int n = 0;
+ int reclaim = desc_reclaimable(q);
mtx_assert(&q->lock, MA_OWNED);
if (reclaim > 0) {
- n = free_tx_desc(q, min(reclaim, nbufs), mvec);
- reclaimed = min(reclaim, nbufs);
- q->cleaned += reclaimed;
- q->in_use -= reclaimed;
+ t3_free_tx_desc(q, reclaim);
+ q->cleaned += reclaim;
+ q->in_use -= reclaim;
}
- return (n);
+ return (reclaim);
}
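
reclaim_completed_tx() now frees completed descriptors in place via t3_free_tx_desc() rather than copying up to nbufs mbuf pointers out for the caller to free. The reclaim count still comes from the existing macro, desc_reclaimable(q) = processed - cleaned - TX_MAX_DESC: completions reported by the hardware, minus descriptors already cleaned, minus a TX_MAX_DESC (4) safety margin. For example, with q->processed = 100 and q->cleaned = 60, the call reclaims 100 - 60 - 4 = 36 descriptors and advances q->cleaned accordingly.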
/**
@@ -298,38 +296,14 @@ sgl_len(unsigned int n)
static __inline int
get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *mh)
{
- struct mbuf *m;
- int len;
- uint32_t flags = ntohl(resp->flags);
- uint8_t sopeop = G_RSPD_SOP_EOP(flags);
-
- /*
- * would be a firmware bug
- */
- if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP)
return (0);
- m = m_gethdr(M_NOWAIT, MT_DATA);
- len = G_RSPD_LEN(ntohl(resp->len_cq));
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
+ len = IMMED_PKT_SIZE;
if (m) {
- MH_ALIGN(m, IMMED_PKT_SIZE);
memcpy(m->m_data, resp->imm_data, IMMED_PKT_SIZE);
- m->m_len = len;
-
- switch (sopeop) {
- case RSPQ_SOP_EOP:
- mh->mh_head = mh->mh_tail = m;
- m->m_pkthdr.len = len;
- m->m_flags |= M_PKTHDR;
- break;
- case RSPQ_EOP:
- m->m_flags &= ~M_PKTHDR;
- mh->mh_head->m_pkthdr.len += len;
- mh->mh_tail->m_next = m;
- mh->mh_tail = m;
- break;
- }
+ m->m_pkthdr.len = m->m_len = len;
}
return (m != NULL);
}
@@ -338,35 +312,11 @@ get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct t3_mbuf_hdr *m
static int
get_imm_packet(adapter_t *sc, const struct rsp_desc *resp, struct mbuf *m, void *cl, uint32_t flags)
{
- int len, error;
- uint8_t sopeop = G_RSPD_SOP_EOP(flags);
-
- /*
- * would be a firmware bug
- */
- len = G_RSPD_LEN(ntohl(resp->len_cq));
- if (sopeop == RSPQ_NSOP_NEOP || sopeop == RSPQ_SOP) {
- if (cxgb_debug)
- device_printf(sc->dev, "unexpected value sopeop=%d flags=0x%x len=%din get_imm_packet\n", sopeop, flags, len);
- bogus_imm++;
- return (EINVAL);
- }
- error = 0;
- switch (sopeop) {
- case RSPQ_SOP_EOP:
- m->m_len = m->m_pkthdr.len = len;
- memcpy(mtod(m, uint8_t *), resp->imm_data, len);
- break;
- case RSPQ_EOP:
- memcpy(cl, resp->imm_data, len);
- m_iovappend(m, cl, MSIZE, len, 0);
- break;
- default:
- bogus_imm++;
- error = EINVAL;
- }
- return (error);
+ m->m_len = m->m_pkthdr.len = IMMED_PKT_SIZE;
+ memcpy(mtod(m, uint8_t *), resp->imm_data, IMMED_PKT_SIZE);
+ return (0);
+
}
#endif
@@ -413,11 +363,15 @@ t3_sge_prep(adapter_t *adap, struct sge_params *p)
q->polling = adap->params.rev > 0;
- if (adap->params.nports > 2)
+ if (adap->params.nports > 2) {
q->coalesce_nsecs = 50000;
- else
+ } else {
+#ifdef INVARIANTS
+ q->coalesce_nsecs = 20000;
+#else
q->coalesce_nsecs = 5000;
-
+#endif
+ }
q->rspq_size = RSPQ_Q_SIZE;
q->fl_size = FL_Q_SIZE;
q->jumbo_size = JUMBO_Q_SIZE;
@@ -509,6 +463,7 @@ t3_update_qset_coalesce(struct sge_qset *qs, const struct qset_params *p)
qs->rspq.polling = 0 /* p->polling */;
}
+#if !defined(__i386__) && !defined(__amd64__)
static void
refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
{
@@ -519,7 +474,7 @@ refill_fl_cb(void *arg, bus_dma_segment_t *segs, int nseg, int error)
cb_arg->nseg = nseg;
}
-
+#endif
/**
* refill_fl - refill an SGE free-buffer list
* @sc: the controller softc
@@ -535,7 +490,7 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
struct rx_sw_desc *sd = &q->sdesc[q->pidx];
struct rx_desc *d = &q->desc[q->pidx];
struct refill_fl_cb_arg cb_arg;
- void *cl;
+ caddr_t cl;
int err;
cb_arg.error = 0;
@@ -543,10 +498,11 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
/*
* We only allocate a cluster, mbuf allocation happens after rx
*/
- if ((cl = m_cljget(NULL, M_DONTWAIT, q->buf_size)) == NULL) {
+ if ((cl = cxgb_cache_get(q->zone)) == NULL) {
log(LOG_WARNING, "Failed to allocate cluster\n");
goto done;
}
+
if ((sd->flags & RX_SW_DESC_MAP_CREATED) == 0) {
if ((err = bus_dmamap_create(q->entry_tag, 0, &sd->map))) {
log(LOG_WARNING, "bus_dmamap_create failed %d\n", err);
@@ -555,7 +511,9 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
}
sd->flags |= RX_SW_DESC_MAP_CREATED;
}
- err = bus_dmamap_load(q->entry_tag, sd->map, cl, q->buf_size,
+#if !defined(__i386__) && !defined(__amd64__)
+ err = bus_dmamap_load(q->entry_tag, sd->map,
+ cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t), q->buf_size,
refill_fl_cb, &cb_arg, 0);
if (err != 0 || cb_arg.error) {
@@ -565,9 +523,14 @@ refill_fl(adapter_t *sc, struct sge_fl *q, int n)
*/
return;
}
-
+#else
+ cb_arg.seg.ds_addr = pmap_kextract((vm_offset_t)(cl + sizeof(struct m_hdr) +
+ sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t)));
+#endif
sd->flags |= RX_SW_DESC_INUSE;
- sd->cl = cl;
+ sd->rxsd_cl = cl;
+ sd->rxsd_ref = (uint32_t *)(cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_));
+ sd->data = cl + sizeof(struct m_hdr) + sizeof(struct pkthdr) + sizeof(struct m_ext_) + sizeof(uint32_t);
d->addr_lo = htobe32(cb_arg.seg.ds_addr & 0xffffffff);
d->addr_hi = htobe32(((uint64_t)cb_arg.seg.ds_addr >>32) & 0xffffffff);
d->len_gen = htobe32(V_FLD_GEN1(q->gen));
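
The hard-coded offsets above imply that a cluster returned by cxgb_cache_get() carries its own mbuf header storage and reference count ahead of the payload, so receive can later construct an mbuf in place without a separate allocation. The assumed layout, sketched:

    /*
     *  cl --> +-----------------+
     *         | struct m_hdr    | \
     *         | struct pkthdr   |  } header storage for an in-place mbuf
     *         | struct m_ext_   | /
     *         +-----------------+
     *         | uint32_t refcnt |  <- sd->rxsd_ref
     *         +-----------------+
     *         | payload ...     |  <- sd->data, the address handed to DMA
     *         +-----------------+
     */

On i386/amd64 the payload's bus address is taken directly with pmap_kextract(); other platforms go through bus_dmamap_load() with refill_fl_cb() to fill in the segment.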
@@ -609,9 +572,9 @@ free_rx_bufs(adapter_t *sc, struct sge_fl *q)
if (d->flags & RX_SW_DESC_INUSE) {
bus_dmamap_unload(q->entry_tag, d->map);
bus_dmamap_destroy(q->entry_tag, d->map);
- uma_zfree(q->zone, d->cl);
+ uma_zfree(q->zone, d->rxsd_cl);
}
- d->cl = NULL;
+ d->rxsd_cl = NULL;
if (++cidx == q->size)
cidx = 0;
}
@@ -623,6 +586,19 @@ __refill_fl(adapter_t *adap, struct sge_fl *fl)
refill_fl(adap, fl, min(16U, fl->size - fl->credits));
}
+static __inline void
+__refill_fl_lt(adapter_t *adap, struct sge_fl *fl, int max)
+{
+ if ((fl->size - fl->credits) < max)
+ refill_fl(adap, fl, min(max, fl->size - fl->credits));
+}
+
+void
+refill_fl_service(adapter_t *adap, struct sge_fl *fl)
+{
+ __refill_fl_lt(adap, fl, 512);
+}
+
#ifndef DISABLE_MBUF_IOVEC
/**
* recycle_rx_buf - recycle a receive buffer
@@ -753,12 +729,13 @@ static void
sge_timer_cb(void *arg)
{
adapter_t *sc = arg;
- struct port_info *p;
+#ifndef IFNET_MULTIQUEUE
+ struct port_info *pi;
struct sge_qset *qs;
struct sge_txq *txq;
int i, j;
int reclaim_eth, reclaim_ofl, refill_rx;
-
+
for (i = 0; i < sc->params.nports; i++)
for (j = 0; j < sc->port[i].nqsets; j++) {
qs = &sc->sge.qs[i + j];
@@ -768,11 +745,12 @@ sge_timer_cb(void *arg)
refill_rx = ((qs->fl[0].credits < qs->fl[0].size) ||
(qs->fl[1].credits < qs->fl[1].size));
if (reclaim_eth || reclaim_ofl || refill_rx) {
- p = &sc->port[i];
- taskqueue_enqueue(p->tq, &p->timer_reclaim_task);
+ pi = &sc->port[i];
+ taskqueue_enqueue(pi->tq, &pi->timer_reclaim_task);
break;
}
}
+#endif
if (sc->params.nports > 2) {
int i;
@@ -799,13 +777,15 @@ t3_sge_init_adapter(adapter_t *sc)
callout_init(&sc->sge_timer_ch, CALLOUT_MPSAFE);
callout_reset(&sc->sge_timer_ch, TX_RECLAIM_PERIOD, sge_timer_cb, sc);
TASK_INIT(&sc->slow_intr_task, 0, sge_slow_intr_handler, sc);
+ mi_init();
+ cxgb_cache_init();
return (0);
}
int
-t3_sge_init_port(struct port_info *p)
+t3_sge_init_port(struct port_info *pi)
{
- TASK_INIT(&p->timer_reclaim_task, 0, sge_timer_reclaim, p);
+ TASK_INIT(&pi->timer_reclaim_task, 0, sge_timer_reclaim, pi);
return (0);
}
@@ -820,6 +800,8 @@ t3_sge_deinit_sw(adapter_t *sc)
for (i = 0; i < sc->params.nports; i++)
if (sc->port[i].tq != NULL)
taskqueue_drain(sc->port[i].tq, &sc->port[i].timer_reclaim_task);
+
+ mi_deinit();
}
/**
@@ -843,29 +825,22 @@ refill_rspq(adapter_t *sc, const struct sge_rspq *q, u_int credits)
static __inline void
sge_txq_reclaim_(struct sge_txq *txq)
{
- int reclaimable, i, n;
- struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
- struct port_info *p;
+ int reclaimable, n;
+ struct port_info *pi;
- p = txq->port;
+ pi = txq->port;
reclaim_more:
n = 0;
reclaimable = desc_reclaimable(txq);
if (reclaimable > 0 && mtx_trylock(&txq->lock)) {
- n = reclaim_completed_tx(txq, TX_CLEAN_MAX_DESC, m_vec);
+ n = reclaim_completed_tx(txq);
mtx_unlock(&txq->lock);
}
- if (n == 0)
- return;
-
- for (i = 0; i < n; i++) {
- m_freem(m_vec[i]);
- }
- if (p && p->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
+ if (pi && pi->ifp->if_drv_flags & IFF_DRV_OACTIVE &&
txq->size - txq->in_use >= TX_START_MAX_DESC) {
txq_fills++;
- p->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
- taskqueue_enqueue(p->tq, &p->start_task);
+ pi->ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ taskqueue_enqueue(pi->tq, &pi->start_task);
}
if (n)
@@ -883,13 +858,16 @@ sge_txq_reclaim_handler(void *arg, int ncount)
static void
sge_timer_reclaim(void *arg, int ncount)
{
- struct port_info *p = arg;
- int i, nqsets = p->nqsets;
- adapter_t *sc = p->adapter;
+ struct port_info *pi = arg;
+ int i, nqsets = pi->nqsets;
+ adapter_t *sc = pi->adapter;
struct sge_qset *qs;
struct sge_txq *txq;
struct mtx *lock;
+#ifdef IFNET_MULTIQUEUE
+ panic("%s should not be called with multiqueue support\n", __FUNCTION__);
+#endif
for (i = 0; i < nqsets; i++) {
qs = &sc->sge.qs[i];
txq = &qs->txq[TXQ_ETH];
@@ -942,6 +920,10 @@ init_qset_cntxt(struct sge_qset *qs, u_int id)
qs->txq[TXQ_OFLD].cntxt_id = FW_OFLD_SGEEC_START + id;
qs->txq[TXQ_CTRL].cntxt_id = FW_CTRL_SGEEC_START + id;
qs->txq[TXQ_CTRL].token = FW_CTRL_TID_START + id;
+
+ mbufq_init(&qs->txq[TXQ_ETH].sendq);
+ mbufq_init(&qs->txq[TXQ_OFLD].sendq);
+ mbufq_init(&qs->txq[TXQ_CTRL].sendq);
}
@@ -985,7 +967,7 @@ calc_tx_descs(const struct mbuf *m, int nsegs)
flits = sgl_len(nsegs) + 2;
#ifdef TSO_SUPPORTED
- if (m->m_pkthdr.csum_flags & (CSUM_TSO))
+ if (m->m_pkthdr.csum_flags & CSUM_TSO)
flits++;
#endif
return flits_to_desc(flits);
@@ -993,28 +975,27 @@ calc_tx_descs(const struct mbuf *m, int nsegs)
static unsigned int
busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
- struct tx_sw_desc *stx, bus_dma_segment_t *segs, int *nsegs)
+ struct tx_sw_desc *txsd, bus_dma_segment_t *segs, int *nsegs)
{
struct mbuf *m0;
- int err, pktlen;
+ int err, pktlen, pass = 0;
+retry:
+ err = 0;
m0 = *m;
pktlen = m0->m_pkthdr.len;
+#if defined(__i386__) || defined(__amd64__)
+ if (busdma_map_sg_collapse(m, segs, nsegs) == 0) {
+ goto done;
+ } else
+#endif
+ err = bus_dmamap_load_mbuf_sg(txq->entry_tag, txsd->map, m0, segs, nsegs, 0);
- err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
-#ifdef DEBUG
- if (err) {
- int n = 0;
- struct mbuf *mtmp = m0;
- while(mtmp) {
- n++;
- mtmp = mtmp->m_next;
- }
- printf("map_mbufs: bus_dmamap_load_mbuf_sg failed with %d - pkthdr.len==%d nmbufs=%d\n",
- err, m0->m_pkthdr.len, n);
+ if (err == 0) {
+ goto done;
}
-#endif
- if (err == EFBIG) {
+ if (err == EFBIG && pass == 0) {
+ pass = 1;
/* Too many segments, try to defrag */
m0 = m_defrag(m0, M_DONTWAIT);
if (m0 == NULL) {
@@ -1023,23 +1004,21 @@ busdma_map_mbufs(struct mbuf **m, struct sge_txq *txq,
return (ENOBUFS);
}
*m = m0;
- err = bus_dmamap_load_mbuf_sg(txq->entry_tag, stx->map, m0, segs, nsegs, 0);
- }
-
- if (err == ENOMEM) {
+ goto retry;
+ } else if (err == ENOMEM) {
return (err);
- }
-
- if (err) {
+ } if (err) {
if (cxgb_debug)
printf("map failure err=%d pktlen=%d\n", err, pktlen);
m_freem(m0);
*m = NULL;
return (err);
}
-
- bus_dmamap_sync(txq->entry_tag, stx->map, BUS_DMASYNC_PREWRITE);
- stx->flags |= TX_SW_DESC_MAPPED;
+done:
+#if !defined(__i386__) && !defined(__amd64__)
+ bus_dmamap_sync(txq->entry_tag, txsd->map, BUS_DMASYNC_PREWRITE);
+#endif
+ txsd->flags |= TX_SW_DESC_MAPPED;
return (0);
}
@@ -1059,12 +1038,18 @@ make_sgl(struct sg_ent *sgp, bus_dma_segment_t *segs, int nsegs)
{
int i, idx;
- for (idx = 0, i = 0; i < nsegs; i++, idx ^= 1) {
+ for (idx = 0, i = 0; i < nsegs; i++) {
+ /*
+ * firmware doesn't like empty segments
+ */
+ if (segs[i].ds_len == 0)
+ continue;
if (i && idx == 0)
++sgp;
-
+
sgp->len[idx] = htobe32(segs[i].ds_len);
sgp->addr[idx] = htobe64(segs[i].ds_addr);
+ idx ^= 1;
}
if (idx)
@@ -1112,6 +1097,20 @@ wr_gen2(struct tx_desc *d, unsigned int gen)
#endif
}
+#if 0
+static int print_wr = 0;
+static __inline void
+do_print_wr(struct tx_desc *d, int flits)
+{
+ int i = 0;
+
+ if (print_wr)
+ while (flits--) {
+ printf("flit[%d]: 0x%016lx\n", i, d->flit[i]);
+ i++;
+ }
+}
+#endif
/**
@@ -1131,7 +1130,6 @@ wr_gen2(struct tx_desc *d, unsigned int gen)
* and we just need to write the WR header. Otherwise we distribute the
* SGL across the number of descriptors it spans.
*/
-
static void
write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs,
const struct sge_txq *txq, const struct sg_ent *sgl, unsigned int flits,
@@ -1149,6 +1147,7 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs
V_WR_GEN(txqs->gen)) | wr_lo;
/* XXX gen? */
wr_gen2(txd, txqs->gen);
+
} else {
unsigned int ogen = txqs->gen;
const uint64_t *fp = (const uint64_t *)sgl;
@@ -1183,7 +1182,7 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs
* is freed all clusters will be freed
* with it
*/
- txsd->m = NULL;
+ txsd->mi.mi_base = NULL;
wrp = (struct work_request_hdr *)txd;
wrp->wr_hi = htonl(V_WR_DATATYPE(1) |
V_WR_SGLSFLT(1)) | wr_hi;
@@ -1200,80 +1199,151 @@ write_wr_hdr_sgl(unsigned int ndesc, struct tx_desc *txd, struct txq_state *txqs
}
}
-
/* sizeof(*eh) + sizeof(*vhdr) + sizeof(*ip) + sizeof(*tcp) */
#define TCPPKTHDRSIZE (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN + 20 + 20)
+#ifdef VLAN_SUPPORTED
+#define GET_VTAG(cntrl, m) \
+do { \
+ if ((m)->m_flags & M_VLANTAG) \
+ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((m)->m_pkthdr.ether_vtag); \
+} while (0)
+
+#define GET_VTAG_MI(cntrl, mi) \
+do { \
+ if ((mi)->mi_flags & M_VLANTAG) \
+ cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN((mi)->mi_ether_vtag); \
+} while (0)
+#else
+#define GET_VTAG(cntrl, m)
+#define GET_VTAG_MI(cntrl, m)
+#endif
+
int
-t3_encap(struct port_info *p, struct mbuf **m, int *free)
+t3_encap(struct sge_qset *qs, struct mbuf **m, int count)
{
adapter_t *sc;
struct mbuf *m0;
- struct sge_qset *qs;
struct sge_txq *txq;
- struct tx_sw_desc *stx;
struct txq_state txqs;
+ struct port_info *pi;
unsigned int ndesc, flits, cntrl, mlen;
int err, nsegs, tso_info = 0;
struct work_request_hdr *wrp;
struct tx_sw_desc *txsd;
- struct sg_ent *sgp, sgl[TX_MAX_SEGS / 2 + 1];
- bus_dma_segment_t segs[TX_MAX_SEGS];
+ struct sg_ent *sgp, *sgl;
+ bus_dma_segment_t *segs;
uint32_t wr_hi, wr_lo, sgl_flits;
struct tx_desc *txd;
- struct cpl_tx_pkt *cpl;
-
- m0 = *m;
- sc = p->adapter;
-
- DPRINTF("t3_encap port_id=%d qsidx=%d ", p->port_id, p->first_qset);
-
- /* port_id=1 qsid=1 txpkt_intf=2 tx_chan=0 */
-
- qs = &sc->sge.qs[p->first_qset];
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *mi;
+
+ DPRINTF("t3_encap cpu=%d ", curcpu);
+ pi = qs->port;
+ sc = pi->adapter;
txq = &qs->txq[TXQ_ETH];
- stx = &txq->sdesc[txq->pidx];
+ txsd = &txq->sdesc[txq->pidx];
txd = &txq->desc[txq->pidx];
- cpl = (struct cpl_tx_pkt *)txd;
- mlen = m0->m_pkthdr.len;
- cpl->len = htonl(mlen | 0x80000000);
-
- DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", mlen, p->txpkt_intf, p->tx_chan);
- /*
- * XXX handle checksum, TSO, and VLAN here
- *
- */
- cntrl = V_TXPKT_INTF(p->txpkt_intf);
+ sgl = txq->txq_sgl;
+ segs = txq->txq_segs;
+ m0 = *m;
+ DPRINTF("t3_encap port_id=%d qsidx=%d ", pi->port_id, pi->first_qset);
+ DPRINTF("mlen=%d txpkt_intf=%d tx_chan=%d\n", m[0]->m_pkthdr.len, pi->txpkt_intf, pi->tx_chan);
- /*
- * XXX need to add VLAN support for 6.x
- */
+ cntrl = V_TXPKT_INTF(pi->txpkt_intf);
+/*
+ * XXX need to add VLAN support for 6.x
+ */
#ifdef VLAN_SUPPORTED
- if (m0->m_flags & M_VLANTAG)
- cntrl |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
if (m0->m_pkthdr.csum_flags & (CSUM_TSO))
tso_info = V_LSO_MSS(m0->m_pkthdr.tso_segsz);
-#endif
- if (tso_info) {
- int eth_type;
- struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *) cpl;
+#endif
+
+ if (count > 1) {
+ if ((err = busdma_map_sg_vec(m, &m0, segs, count)))
+ return (err);
+ nsegs = count;
+ } else if ((err = busdma_map_sg_collapse(&m0, segs, &nsegs))) {
+ if (cxgb_debug)
+ printf("failed ... err=%d\n", err);
+ return (err);
+ }
+ KASSERT(m0->m_pkthdr.len, ("empty packet nsegs=%d count=%d", nsegs, count));
+
+ if (m0->m_type == MT_DATA)
+ DPRINTF("mbuf type=%d tags:%d head=%p", m0->m_type, !SLIST_EMPTY(&m0->m_pkthdr.tags),
+ SLIST_FIRST(&m0->m_pkthdr.tags));
+
+ mi_collapse_mbuf(&txsd->mi, m0);
+ mi = &txsd->mi;
+
+ if (count > 1) {
+ struct cpl_tx_pkt_batch *cpl_batch = (struct cpl_tx_pkt_batch *)txd;
+ int i, fidx;
+ struct mbuf_iovec *batchmi;
+
+ mv = mtomv(m0);
+ batchmi = mv->mv_vec;
+
+ wrp = (struct work_request_hdr *)txd;
+
+ flits = count*2 + 1;
+ txq_prod(txq, 1, &txqs);
+
+ for (fidx = 1, i = 0; i < count; i++, batchmi++, fidx += 2) {
+ struct cpl_tx_pkt_batch_entry *cbe = &cpl_batch->pkt_entry[i];
+
+ cntrl = V_TXPKT_INTF(pi->txpkt_intf);
+ GET_VTAG_MI(cntrl, batchmi);
+ cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
+ cbe->cntrl = htonl(cntrl);
+ cbe->len = htonl(batchmi->mi_len | 0x80000000);
+ cbe->addr = htobe64(segs[i].ds_addr);
+ txd->flit[fidx] |= htobe64(1 << 24);
+ }
+
+ wrp->wr_hi = htonl(F_WR_SOP | F_WR_EOP | V_WR_DATATYPE(1) |
+ V_WR_SGLSFLT(flits)) | htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
+ wmb();
+ wrp->wr_lo = htonl(V_WR_LEN(flits) |
+ V_WR_GEN(txqs.gen)) | htonl(V_WR_TID(txq->token));
+ /* XXX gen? */
+ wr_gen2(txd, txqs.gen);
+ check_ring_tx_db(sc, txq);
+
+ return (0);
+ } else if (tso_info) {
+ int undersized, eth_type;
+ struct cpl_tx_pkt_lso *hdr = (struct cpl_tx_pkt_lso *)txd;
struct ip *ip;
struct tcphdr *tcp;
- char *pkthdr, tmp[TCPPKTHDRSIZE]; /* is this too large for the stack? */
+ char *pkthdr, tmp[TCPPKTHDRSIZE];
+ struct mbuf_vec *mv;
+ struct mbuf_iovec *tmpmi;
+
+ mv = mtomv(m0);
+ tmpmi = mv->mv_vec;
txd->flit[2] = 0;
+ GET_VTAG_MI(cntrl, mi);
cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT_LSO);
hdr->cntrl = htonl(cntrl);
-
- if (__predict_false(m0->m_len < TCPPKTHDRSIZE)) {
- pkthdr = &tmp[0];
- m_copydata(m0, 0, TCPPKTHDRSIZE, pkthdr);
- } else {
- pkthdr = mtod(m0, char *);
- }
+ mlen = m0->m_pkthdr.len;
+ hdr->len = htonl(mlen | 0x80000000);
+
+ DPRINTF("tso buf len=%d\n", mlen);
+ undersized = (((tmpmi->mi_len < TCPPKTHDRSIZE) &&
+ (m0->m_flags & M_VLANTAG)) ||
+ (tmpmi->mi_len < TCPPKTHDRSIZE - ETHER_VLAN_ENCAP_LEN));
+ if (__predict_false(undersized)) {
+ pkthdr = tmp;
+ dump_mi(mi);
+ panic("discontig packet - fixxorz");
+ } else
+ pkthdr = m0->m_data;
if (__predict_false(m0->m_flags & M_VLANTAG)) {
eth_type = CPL_ETH_II_VLAN;
@@ -1292,19 +1362,33 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free)
hdr->lso_info = htonl(tso_info);
flits = 3;
} else {
+ struct cpl_tx_pkt *cpl = (struct cpl_tx_pkt *)txd;
+
+ GET_VTAG(cntrl, m0);
cntrl |= V_TXPKT_OPCODE(CPL_TX_PKT);
cpl->cntrl = htonl(cntrl);
-
+ mlen = m0->m_pkthdr.len;
+ cpl->len = htonl(mlen | 0x80000000);
+
if (mlen <= WR_LEN - sizeof(*cpl)) {
txq_prod(txq, 1, &txqs);
- txq->sdesc[txqs.pidx].m = NULL;
- if (m0->m_len == m0->m_pkthdr.len)
- memcpy(&txd->flit[2], mtod(m0, uint8_t *), mlen);
- else
+			DPRINTF("mlen==%d max=%zu\n", mlen, (WR_LEN - sizeof(*cpl)));
+ if (mi->mi_type != MT_IOVEC &&
+ mi->mi_type != MT_CLIOVEC)
+ memcpy(&txd->flit[2], mi->mi_data, mlen);
+ else {
+ /*
+ * XXX mbuf_iovec
+ */
+#if 0
m_copydata(m0, 0, mlen, (caddr_t)&txd->flit[2]);
+#endif
+ printf("bailing on m_copydata\n");
+ }
+ m_freem_iovec(&txsd->mi);
+ txsd->mi.mi_base = NULL;
- *free = 1;
flits = (mlen + 7) / 8 + 2;
cpl->wr.wr_hi = htonl(V_WR_BCNTLFLT(mlen & 7) |
V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) |
@@ -1315,17 +1399,23 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free)
wr_gen2(txd, txqs.gen);
check_ring_tx_db(sc, txq);
+ DPRINTF("pio buf\n");
return (0);
}
+ DPRINTF("regular buf\n");
flits = 2;
}
-
wrp = (struct work_request_hdr *)txd;
-
- if ((err = busdma_map_mbufs(m, txq, stx, segs, &nsegs)) != 0) {
+
+#ifdef nomore
+ /*
+ * XXX need to move into one of the helper routines above
+ *
+ */
+ if ((err = busdma_map_mbufs(m, txq, txsd, segs, &nsegs)) != 0)
return (err);
- }
m0 = *m;
+#endif
ndesc = calc_tx_descs(m0, nsegs);
sgp = (ndesc == 1) ? (struct sg_ent *)&txd->flit[flits] : sgl;
@@ -1335,15 +1425,16 @@ t3_encap(struct port_info *p, struct mbuf **m, int *free)
DPRINTF("make_sgl success nsegs==%d ndesc==%d\n", nsegs, ndesc);
txq_prod(txq, ndesc, &txqs);
- txsd = &txq->sdesc[txqs.pidx];
wr_hi = htonl(V_WR_OP(FW_WROPCODE_TUNNEL_TX_PKT) | txqs.compl);
wr_lo = htonl(V_WR_TID(txq->token));
- txsd->m = m0;
- m_set_priority(m0, txqs.pidx);
-
write_wr_hdr_sgl(ndesc, txd, &txqs, txq, sgl, flits, sgl_flits, wr_hi, wr_lo);
- check_ring_tx_db(p->adapter, txq);
+ check_ring_tx_db(pi->adapter, txq);
+ if ((m0->m_type == MT_DATA) && ((m0->m_flags & (M_EXT|M_NOFREE)) == M_EXT)) {
+		m0->m_flags &= ~M_EXT;
+ m_free(m0);
+ }
+
return (0);
}
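/*
 * Annotation on the count > 1 path above: the geometry follows directly
 * from struct cpl_tx_pkt_batch (WR_HDR plus pkt_entry[7], each entry two
 * 64-bit flits), which is where "flits = count*2 + 1" comes from:
 *
 *	flit 0:        work request header (wr_hi | wr_lo)
 *	flit 2*i + 1:  entry i: cntrl (32 bits) | len (32 bits)
 *	flit 2*i + 2:  entry i: 64-bit bus address from segs[i]
 *
 * A full batch of seven packets therefore occupies 15 flits in one work
 * request, and each packet is mapped to exactly one DMA segment
 * (busdma_map_sg_vec() above yields nsegs == count).
 */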
@@ -1367,6 +1458,11 @@ write_imm(struct tx_desc *d, struct mbuf *m,
struct work_request_hdr *from = mtod(m, struct work_request_hdr *);
struct work_request_hdr *to = (struct work_request_hdr *)d;
+ if (len > WR_LEN)
+		panic("len too big %d", len);
+ if (len < sizeof(*from))
+ panic("len too small %d", len);
+
memcpy(&to[1], &from[1], len - sizeof(*from));
to->wr_hi = from->wr_hi | htonl(F_WR_SOP | F_WR_EOP |
V_WR_BCNTLFLT(len & 7));
@@ -1374,7 +1470,14 @@ write_imm(struct tx_desc *d, struct mbuf *m,
to->wr_lo = from->wr_lo | htonl(V_WR_GEN(gen) |
V_WR_LEN((len + 7) / 8));
wr_gen2(d, gen);
- m_freem(m);
+
+ /*
+	 * This check is a hack; we should really fix the logic so
+	 * that this can't happen.
+ */
+ if (m->m_type != MT_DONTFREE)
+ m_freem(m);
+
}
/**
@@ -1413,6 +1516,8 @@ addq_exit: mbufq_tail(&q->sendq, m);
struct sge_qset *qs = txq_to_qset(q, qid);
+ printf("stopping q\n");
+
setbit(&qs->txq_stopped, qid);
smp_mb();
@@ -1472,7 +1577,7 @@ ctrl_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
m_freem(m);
return 0;
}
-
+
wrp->wr_hi |= htonl(F_WR_SOP | F_WR_EOP);
wrp->wr_lo = htonl(V_WR_TID(q->token));
@@ -1483,13 +1588,14 @@ again: reclaim_completed_tx_imm(q);
if (__predict_false(ret)) {
if (ret == 1) {
mtx_unlock(&q->lock);
- return (-1);
+ log(LOG_ERR, "no desc available\n");
+
+ return (ENOSPC);
}
goto again;
}
-
write_imm(&q->desc[q->pidx], m, m->m_len, q->gen);
-
+
q->in_use++;
if (++q->pidx >= q->size) {
q->pidx = 0;
@@ -1517,6 +1623,8 @@ restart_ctrlq(void *data, int npending)
struct sge_txq *q = &qs->txq[TXQ_CTRL];
adapter_t *adap = qs->port->adapter;
+ log(LOG_WARNING, "Restart_ctrlq in_use=%d\n", q->in_use);
+
mtx_lock(&q->lock);
again: reclaim_completed_tx_imm(q);
@@ -1555,6 +1663,7 @@ t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
return ctrl_xmit(adap, &adap->sge.qs[0].txq[TXQ_CTRL], m);
}
+
/**
* free_qset - free the resources of an SGE queue set
* @sc: the controller owning the queue set
@@ -1564,11 +1673,18 @@ t3_mgmt_tx(struct adapter *adap, struct mbuf *m)
* as HW contexts, packet buffers, and descriptor rings. Traffic to the
* queue set must be quiesced prior to calling this.
*/
-static void
+void
t3_free_qset(adapter_t *sc, struct sge_qset *q)
{
int i;
-
+
+ t3_free_tx_desc_all(&q->txq[TXQ_ETH]);
+
+ for (i = 0; i < SGE_TXQ_PER_SET; i++)
+ if (q->txq[i].txq_mr.br_ring != NULL) {
+ free(q->txq[i].txq_mr.br_ring, M_DEVBUF);
+ mtx_destroy(&q->txq[i].txq_mr.br_lock);
+ }
for (i = 0; i < SGE_RXQ_PER_SET; ++i) {
if (q->fl[i].desc) {
mtx_lock(&sc->sge.reg_lock);
@@ -1629,10 +1745,13 @@ void
t3_free_sge_resources(adapter_t *sc)
{
int i, nqsets;
-
+
+#ifdef IFNET_MULTIQUEUE
+ panic("%s should not be called when IFNET_MULTIQUEUE is defined", __FUNCTION__);
+#endif
for (nqsets = i = 0; i < (sc)->params.nports; i++)
nqsets += sc->port[i].nqsets;
-
+
for (i = 0; i < nqsets; ++i)
t3_free_qset(sc, &sc->sge.qs[i]);
}
@@ -1686,52 +1805,76 @@ t3_sge_stop(adapter_t *sc)
/**
- * free_tx_desc - reclaims Tx descriptors and their buffers
+ * t3_free_tx_desc - reclaims Tx descriptors and their buffers
* @adapter: the adapter
* @q: the Tx queue to reclaim descriptors from
- * @n: the number of descriptors to reclaim
+ * @reclaimable: the number of descriptors to reclaim
*
* Reclaims Tx descriptors from an SGE Tx queue and frees the associated
* Tx buffers. Called with the Tx queue lock held.
*/
-int
-free_tx_desc(struct sge_txq *q, int n, struct mbuf **m_vec)
+void
+t3_free_tx_desc(struct sge_txq *q, int reclaimable)
{
- struct tx_sw_desc *d;
- unsigned int cidx = q->cidx;
- int nbufs = 0;
+ struct tx_sw_desc *txsd;
+ unsigned int cidx;
#ifdef T3_TRACE
T3_TRACE2(sc->tb[q->cntxt_id & 7],
- "reclaiming %u Tx descriptors at cidx %u", n, cidx);
+ "reclaiming %u Tx descriptors at cidx %u", reclaimable, cidx);
#endif
- d = &q->sdesc[cidx];
-
- while (n-- > 0) {
- DPRINTF("cidx=%d d=%p\n", cidx, d);
- if (d->m) {
- if (d->flags & TX_SW_DESC_MAPPED) {
- bus_dmamap_unload(q->entry_tag, d->map);
- bus_dmamap_destroy(q->entry_tag, d->map);
- d->flags &= ~TX_SW_DESC_MAPPED;
- }
- if (m_get_priority(d->m) == cidx) {
- m_vec[nbufs] = d->m;
- d->m = NULL;
- nbufs++;
- } else {
- printf("pri=%d cidx=%d\n", (int)m_get_priority(d->m), cidx);
+ cidx = q->cidx;
+ txsd = &q->sdesc[cidx];
+ DPRINTF("reclaiming %d WR\n", reclaimable);
+ while (reclaimable--) {
+ DPRINTF("cidx=%d d=%p\n", cidx, txsd);
+ if (txsd->mi.mi_base != NULL) {
+ if (txsd->flags & TX_SW_DESC_MAPPED) {
+ bus_dmamap_unload(q->entry_tag, txsd->map);
+ txsd->flags &= ~TX_SW_DESC_MAPPED;
}
- }
- ++d;
+ m_freem_iovec(&txsd->mi);
+ txsd->mi.mi_base = NULL;
+
+#if defined(DIAGNOSTIC) && 0
+ if (m_get_priority(txsd->m[0]) != cidx)
+ printf("pri=%d cidx=%d\n", (int)m_get_priority(txsd->m[0]), cidx);
+#endif
+
+ } else
+ q->txq_skipped++;
+
+ ++txsd;
if (++cidx == q->size) {
cidx = 0;
- d = q->sdesc;
+ txsd = q->sdesc;
}
}
q->cidx = cidx;
- return (nbufs);
+}
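/*
 * Illustrative sketch, not from the driver source: callers in this patch
 * now invoke reclaim_completed_tx(q) with no descriptor count, so that
 * helper presumably derives the count itself and feeds it to
 * t3_free_tx_desc() above, along these lines (desc_reclaimable() is the
 * processed-minus-cleaned accessor referenced elsewhere in this file):
 */
static __inline int
reclaim_completed_tx_sketch(struct sge_txq *q)
{
	int reclaim = desc_reclaimable(q);

	if (reclaim > 0) {
		/* free the mbuf iovecs and unmap DMA for reclaimed WRs */
		t3_free_tx_desc(q, reclaim);
		q->cleaned += reclaim;
		q->in_use -= reclaim;
	}
	return (reclaim);
}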
+
+void
+t3_free_tx_desc_all(struct sge_txq *q)
+{
+ int i;
+ struct tx_sw_desc *txsd;
+
+ for (i = 0; i < q->size; i++) {
+ txsd = &q->sdesc[i];
+ if (txsd->mi.mi_base != NULL) {
+ if (txsd->flags & TX_SW_DESC_MAPPED) {
+ bus_dmamap_unload(q->entry_tag, txsd->map);
+ txsd->flags &= ~TX_SW_DESC_MAPPED;
+ }
+ m_freem_iovec(&txsd->mi);
+ bzero(&txsd->mi, sizeof(txsd->mi));
+ }
+ }
}
/**
@@ -1782,31 +1925,31 @@ write_ofld_wr(adapter_t *adap, struct mbuf *m,
struct tx_desc *d = &q->desc[pidx];
struct txq_state txqs;
- if (immediate(m)) {
- q->sdesc[pidx].m = NULL;
+ if (immediate(m) && segs == NULL) {
write_imm(d, m, m->m_len, gen);
return;
}
/* Only TX_DATA builds SGLs */
-
from = mtod(m, struct work_request_hdr *);
- memcpy(&d->flit[1], &from[1],
- (uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *) - sizeof(*from));
+ memcpy(&d->flit[1], &from[1], m->m_len - sizeof(*from));
- flits = ((uint8_t *)m->m_pkthdr.header - mtod(m, uint8_t *)) / 8;
+ flits = m->m_len / 8;
sgp = (ndesc == 1) ? (struct sg_ent *)&d->flit[flits] : sgl;
make_sgl(sgp, segs, nsegs);
sgl_flits = sgl_len(nsegs);
- txqs.gen = q->gen;
- txqs.pidx = q->pidx;
- txqs.compl = (q->unacked & 8) << (S_WR_COMPL - 3);
+ txqs.gen = gen;
+ txqs.pidx = pidx;
+ txqs.compl = 0;
+
write_wr_hdr_sgl(ndesc, d, &txqs, q, sgl, flits, sgl_flits,
from->wr_hi, from->wr_lo);
}
+
+
/**
* calc_tx_descs_ofld - calculate # of Tx descriptors for an offload packet
* @m: the packet
@@ -1845,25 +1988,27 @@ ofld_xmit(adapter_t *adap, struct sge_txq *q, struct mbuf *m)
int ret, nsegs;
unsigned int ndesc;
unsigned int pidx, gen;
- struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
- bus_dma_segment_t segs[TX_MAX_SEGS];
- int i, cleaned;
- struct tx_sw_desc *stx = &q->sdesc[q->pidx];
+ bus_dma_segment_t segs[TX_MAX_SEGS], *vsegs;
+ struct tx_sw_desc *stx;
- mtx_lock(&q->lock);
- if ((ret = busdma_map_mbufs(&m, q, stx, segs, &nsegs)) != 0) {
- mtx_unlock(&q->lock);
- return (ret);
- }
+ nsegs = m_get_sgllen(m);
+ vsegs = m_get_sgl(m);
ndesc = calc_tx_descs_ofld(m, nsegs);
-again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
+ busdma_map_sgl(vsegs, segs, nsegs);
+ stx = &q->sdesc[q->pidx];
+ KASSERT(stx->mi.mi_base == NULL, ("mi_base set"));
+
+ mtx_lock(&q->lock);
+again: reclaim_completed_tx(q);
ret = check_desc_avail(adap, q, m, ndesc, TXQ_OFLD);
if (__predict_false(ret)) {
if (ret == 1) {
+ printf("no ofld desc avail\n");
+
m_set_priority(m, ndesc); /* save for restart */
mtx_unlock(&q->lock);
- return EINTR;
+ return (EINTR);
}
goto again;
}
@@ -1886,10 +2031,7 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
write_ofld_wr(adap, m, q, pidx, gen, ndesc, segs, nsegs);
check_ring_tx_db(adap, q);
-
- for (i = 0; i < cleaned; i++) {
- m_freem(m_vec[i]);
- }
+
return (0);
}
@@ -1902,18 +2044,16 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
static void
restart_offloadq(void *data, int npending)
{
-
struct mbuf *m;
struct sge_qset *qs = data;
struct sge_txq *q = &qs->txq[TXQ_OFLD];
adapter_t *adap = qs->port->adapter;
- struct mbuf *m_vec[TX_CLEAN_MAX_DESC];
bus_dma_segment_t segs[TX_MAX_SEGS];
- int nsegs, i, cleaned;
struct tx_sw_desc *stx = &q->sdesc[q->pidx];
+ int nsegs, cleaned;
mtx_lock(&q->lock);
-again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
+again: cleaned = reclaim_completed_tx(q);
while ((m = mbufq_peek(&q->sendq)) != NULL) {
unsigned int gen, pidx;
@@ -1953,10 +2093,12 @@ again: cleaned = reclaim_completed_tx(q, TX_CLEAN_MAX_DESC, m_vec);
#endif
t3_write_reg(adap, A_SG_KDOORBELL,
F_SELEGRCNTX | V_EGRCNTX(q->cntxt_id));
+#if 0
for (i = 0; i < cleaned; i++) {
- m_freem(m_vec[i]);
+ m_freem_vec(m_vec[i]);
}
+#endif
}
/**
@@ -2000,7 +2142,7 @@ t3_offload_tx(struct t3cdev *tdev, struct mbuf *m)
adapter_t *adap = tdev2adap(tdev);
struct sge_qset *qs = &adap->sge.qs[queue_set(m)];
- if (__predict_false(is_ctrl_pkt(m)))
+ if (__predict_false(is_ctrl_pkt(m)))
return ctrl_xmit(adap, &qs->txq[TXQ_CTRL], m);
return ofld_xmit(adap, &qs->txq[TXQ_OFLD], m);
@@ -2031,9 +2173,9 @@ rx_offload(struct t3cdev *tdev, struct sge_rspq *rq,
struct mbuf *m, struct mbuf *rx_gather[],
unsigned int gather_idx)
{
+
rq->offload_pkts++;
m->m_pkthdr.header = mtod(m, void *);
-
rx_gather[gather_idx++] = m;
if (gather_idx == RX_BUNDLE_SIZE) {
cxgb_ofld_recv(tdev, rx_gather, RX_BUNDLE_SIZE);
@@ -2048,16 +2190,24 @@ restart_tx(struct sge_qset *qs)
{
struct adapter *sc = qs->port->adapter;
+
if (isset(&qs->txq_stopped, TXQ_OFLD) &&
should_restart_tx(&qs->txq[TXQ_OFLD]) &&
test_and_clear_bit(TXQ_OFLD, &qs->txq_stopped)) {
qs->txq[TXQ_OFLD].restarts++;
+ printf("restarting TXQ_OFLD\n");
taskqueue_enqueue(sc->tq, &qs->txq[TXQ_OFLD].qresume_task);
}
+ printf("stopped=0x%x restart=%d processed=%d cleaned=%d in_use=%d\n",
+ qs->txq_stopped, should_restart_tx(&qs->txq[TXQ_CTRL]),
+ qs->txq[TXQ_CTRL].processed, qs->txq[TXQ_CTRL].cleaned,
+ qs->txq[TXQ_CTRL].in_use);
+
if (isset(&qs->txq_stopped, TXQ_CTRL) &&
should_restart_tx(&qs->txq[TXQ_CTRL]) &&
test_and_clear_bit(TXQ_CTRL, &qs->txq_stopped)) {
qs->txq[TXQ_CTRL].restarts++;
+ printf("restarting TXQ_CTRL\n");
taskqueue_enqueue(sc->tq, &qs->txq[TXQ_CTRL].qresume_task);
}
}
@@ -2084,6 +2234,17 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
struct sge_qset *q = &sc->sge.qs[id];
int i, ret = 0;
+ for (i = 0; i < SGE_TXQ_PER_SET; i++) {
+ if ((q->txq[i].txq_mr.br_ring = malloc(cxgb_txq_buf_ring_size*sizeof(struct mbuf *),
+ M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
+ device_printf(sc->dev, "failed to allocate mbuf ring\n");
+ goto err;
+ }
+ q->txq[i].txq_mr.br_prod = q->txq[i].txq_mr.br_cons = 0;
+ q->txq[i].txq_mr.br_size = cxgb_txq_buf_ring_size;
+ mtx_init(&q->txq[i].txq_mr.br_lock, "txq mbuf ring", NULL, MTX_DEF);
+ }
+
init_qset_cntxt(q, id);
if ((ret = alloc_ring(sc, p->fl_size, sizeof(struct rx_desc),
@@ -2155,13 +2316,18 @@ t3_sge_alloc_qset(adapter_t *sc, u_int id, int nports, int irq_vec_idx,
q->txq[TXQ_ETH].stop_thres = nports *
flits_to_desc(sgl_len(TX_MAX_SEGS + 1) + 3);
- q->fl[0].buf_size = MCLBYTES;
+	q->fl[0].buf_size = (MCLBYTES - sizeof(uint32_t) -
+	    sizeof(struct m_hdr) - sizeof(struct pkthdr) -
+	    sizeof(struct m_ext_));
q->fl[0].zone = zone_clust;
q->fl[0].type = EXT_CLUSTER;
- q->fl[1].buf_size = MJUMPAGESIZE;
- q->fl[1].zone = zone_jumbop;
- q->fl[1].type = EXT_JUMBOP;
-
+#if __FreeBSD_version > 800000
+	q->fl[1].buf_size = MJUM16BYTES - sizeof(uint32_t) -
+	    sizeof(struct m_hdr) - sizeof(struct pkthdr) -
+	    sizeof(struct m_ext_);
+ q->fl[1].zone = zone_jumbo16;
+ q->fl[1].type = EXT_JUMBO16;
+#else
+	q->fl[1].buf_size = MJUMPAGESIZE - sizeof(uint32_t) -
+	    sizeof(struct m_hdr) - sizeof(struct pkthdr) -
+	    sizeof(struct m_ext_);
+ q->fl[1].zone = zone_jumbop;
+ q->fl[1].type = EXT_JUMBOP;
+#endif
q->lro.enabled = lro_default;
mtx_lock(&sc->sge.reg_lock);
@@ -2269,11 +2435,15 @@ t3_rx_eth(struct adapter *adap, struct sge_rspq *rq, struct mbuf *m, int ethpad)
m->m_pkthdr.rcvif = ifp;
m->m_pkthdr.header = mtod(m, uint8_t *) + sizeof(*cpl) + ethpad;
+#ifndef DISABLE_MBUF_IOVEC
m_explode(m);
+#endif
/*
* adjust after conversion to mbuf chain
*/
- m_adj(m, sizeof(*cpl) + ethpad);
+ m->m_pkthdr.len -= (sizeof(*cpl) + ethpad);
+ m->m_len -= (sizeof(*cpl) + ethpad);
+ m->m_data += (sizeof(*cpl) + ethpad);
(*ifp->if_input)(ifp, m);
}
@@ -2307,17 +2477,24 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
uint32_t len = G_RSPD_LEN(len_cq);
uint32_t flags = ntohl(r->flags);
uint8_t sopeop = G_RSPD_SOP_EOP(flags);
+ uint32_t *ref;
int ret = 0;
- prefetch(sd->cl);
+ prefetch(sd->rxsd_cl);
fl->credits--;
bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
bus_dmamap_unload(fl->entry_tag, sd->map);
- m_cljset(m, sd->cl, fl->type);
+ ref = sd->rxsd_ref;
+ m_cljset(m, sd->rxsd_cl, fl->type, sd->rxsd_ref);
+ *ref = 1;
m->m_len = len;
-
+ /*
+ * bump past the refcnt address
+ */
+ m->m_data = sd->data;
+
switch(sopeop) {
case RSPQ_SOP_EOP:
DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
@@ -2363,9 +2540,48 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
}
#else
+static void
+ext_free_handler(void *cl, void * arg)
+{
+ uintptr_t type = (uintptr_t)arg;
+ uma_zone_t zone;
+ struct mbuf *m;
+
+ m = cl;
+ zone = m_getzonefromtype(type);
+ m->m_ext.ext_type = (int)type;
+ cxgb_ext_freed++;
+ cxgb_cache_put(zone, cl);
+}
+
+static void
+init_cluster_mbuf(caddr_t cl, int flags, int type, uma_zone_t zone)
+{
+ struct mbuf *m;
+ int header_size;
+
+	header_size = sizeof(struct m_hdr) + sizeof(struct pkthdr) +
+	    sizeof(struct m_ext_) + sizeof(uint32_t);
+
+ bzero(cl, header_size);
+ m = (struct mbuf *)cl;
+
+ SLIST_INIT(&m->m_pkthdr.tags);
+ m->m_type = MT_DATA;
+ m->m_flags = flags | M_NOFREE | M_EXT;
+ m->m_data = cl + header_size;
+ m->m_ext.ext_buf = cl;
+ m->m_ext.ref_cnt = (uint32_t *)(cl + header_size - sizeof(uint32_t));
+ m->m_ext.ext_size = m_getsizefromtype(type);
+ m->m_ext.ext_free = ext_free_handler;
+ m->m_ext.ext_args = (void *)(uintptr_t)type;
+ m->m_ext.ext_type = EXT_EXTREF;
+ *(m->m_ext.ref_cnt) = 1;
+ DPRINTF("data=%p ref_cnt=%p\n", m->m_data, m->m_ext.ref_cnt);
+}
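/*
 * Annotation: layout of a receive cluster as initialized above.  The
 * mbuf header, packet header, external-storage header, and the 32-bit
 * refcount are carved out of the front of the cluster itself, which is
 * why the fl[].buf_size settings in t3_sge_alloc_qset() subtract the
 * same four sizes:
 *
 *	cl -> +---------------------+
 *	      | struct m_hdr        |
 *	      | struct pkthdr       |  embedded mbuf (M_NOFREE | M_EXT)
 *	      | struct m_ext_       |
 *	      +---------------------+
 *	      | uint32_t refcount   |  m_ext.ref_cnt points here
 *	      +---------------------+
 *	      | packet data ...     |  m_data points here
 *	      +---------------------+
 */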
+
static int
get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
- struct mbuf *m, struct rsp_desc *r)
+ struct mbuf **m, struct rsp_desc *r)
{
unsigned int len_cq = ntohl(r->len_cq);
@@ -2376,45 +2592,61 @@ get_packet(adapter_t *adap, unsigned int drop_thres, struct sge_qset *qs,
uint8_t sopeop = G_RSPD_SOP_EOP(flags);
void *cl;
int ret = 0;
-
- prefetch(sd->cl);
-
+ struct mbuf *m0;
+#if 0
+ if ((sd + 1 )->rxsd_cl)
+ prefetch((sd + 1)->rxsd_cl);
+ if ((sd + 2)->rxsd_cl)
+ prefetch((sd + 2)->rxsd_cl);
+#endif
+ DPRINTF("rx cpu=%d\n", curcpu);
fl->credits--;
bus_dmamap_sync(fl->entry_tag, sd->map, BUS_DMASYNC_POSTREAD);
if (recycle_enable && len <= SGE_RX_COPY_THRES && sopeop == RSPQ_SOP_EOP) {
- cl = mtod(m, void *);
- memcpy(cl, sd->cl, len);
+ if ((m0 = m_gethdr(M_DONTWAIT, MT_DATA)) == NULL)
+ goto skip_recycle;
+ cl = mtod(m0, void *);
+ memcpy(cl, sd->data, len);
recycle_rx_buf(adap, fl, fl->cidx);
+ *m = m0;
} else {
- cl = sd->cl;
+ skip_recycle:
bus_dmamap_unload(fl->entry_tag, sd->map);
+ cl = sd->rxsd_cl;
+ *m = m0 = (struct mbuf *)cl;
}
+
switch(sopeop) {
case RSPQ_SOP_EOP:
DBG(DBG_RX, ("get_packet: SOP-EOP m %p\n", m));
- if (cl == sd->cl)
- m_cljset(m, cl, fl->type);
- m->m_len = m->m_pkthdr.len = len;
+ if (cl == sd->rxsd_cl)
+ init_cluster_mbuf(cl, M_PKTHDR, fl->type, fl->zone);
+ m0->m_len = m0->m_pkthdr.len = len;
ret = 1;
goto done;
break;
case RSPQ_NSOP_NEOP:
DBG(DBG_RX, ("get_packet: NO_SOP-NO_EOP m %p\n", m));
+ panic("chaining unsupported");
ret = 0;
break;
case RSPQ_SOP:
DBG(DBG_RX, ("get_packet: SOP m %p\n", m));
- m_iovinit(m);
+ panic("chaining unsupported");
+ m_iovinit(m0);
ret = 0;
break;
case RSPQ_EOP:
DBG(DBG_RX, ("get_packet: EOP m %p\n", m));
+ panic("chaining unsupported");
ret = 1;
break;
}
- m_iovappend(m, cl, fl->buf_size, len, 0);
-
+ panic("append not supported");
+#if 0
+ m_iovappend(m0, cl, fl->buf_size, len, sizeof(uint32_t), sd->rxsd_ref);
+#endif
done:
if (++fl->cidx == fl->size)
fl->cidx = 0;
@@ -2443,9 +2675,11 @@ handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
credits = G_RSPD_TXQ0_CR(flags);
if (credits) {
qs->txq[TXQ_ETH].processed += credits;
+#ifndef IFNET_MULTIQUEUE
if (desc_reclaimable(&qs->txq[TXQ_ETH]) > TX_START_MAX_DESC)
taskqueue_enqueue(qs->port->adapter->tq,
&qs->port->timer_reclaim_task);
+#endif
}
credits = G_RSPD_TXQ2_CR(flags);
@@ -2459,6 +2693,7 @@ handle_rsp_cntrl_info(struct sge_qset *qs, uint32_t flags)
credits = G_RSPD_TXQ1_CR(flags);
if (credits)
qs->txq[TXQ_OFLD].processed += credits;
+
}
static void
@@ -2483,7 +2718,7 @@ check_ring_db(adapter_t *adap, struct sge_qset *qs,
* on this queue. If the system is under memory shortage use a fairly
* long delay to help recovery.
*/
-static int
+int
process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
{
struct sge_rspq *rspq = &qs->rspq;
@@ -2506,7 +2741,7 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
int eth, eop = 0, ethpad = 0;
uint32_t flags = ntohl(r->flags);
uint32_t rss_csum = *(const uint32_t *)r;
- uint32_t rss_hash = r->rss_hdr.rss_hash_val;
+ uint32_t rss_hash = be32toh(r->rss_hdr.rss_hash_val);
eth = (r->rss_hdr.opcode == CPL_RX_PKT);
@@ -2517,8 +2752,7 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
} else if (flags & F_RSPD_IMM_DATA_VALID) {
#ifdef DISABLE_MBUF_IOVEC
- if (cxgb_debug)
- printf("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
+ DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
if(get_imm_packet(adap, r, &rspq->rspq_mh) == 0) {
rspq->next_holdoff = NOMEM_INTR_DELAY;
@@ -2529,10 +2763,11 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
}
#else
struct mbuf *m = NULL;
-
+
+ DPRINTF("IMM DATA VALID opcode=0x%x rspq->cidx=%d\n", r->rss_hdr.opcode, rspq->cidx);
if (rspq->rspq_mbuf == NULL)
rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
- else
+ else
m = m_gethdr(M_DONTWAIT, MT_DATA);
/*
@@ -2543,82 +2778,79 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
budget_left--;
break;
}
- if (get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags))
- goto skip;
+ get_imm_packet(adap, r, rspq->rspq_mbuf, m, flags);
+
eop = 1;
-#endif
rspq->imm_data++;
+#endif
} else if (r->len_cq) {
int drop_thresh = eth ? SGE_RX_DROP_THRES : 0;
#ifdef DISABLE_MBUF_IOVEC
struct mbuf *m;
- m = m_gethdr(M_NOWAIT, MT_DATA);
+ m = m_gethdr(M_DONTWAIT, MT_DATA);
if (m == NULL) {
log(LOG_WARNING, "failed to get mbuf for packet\n");
break;
+ } else {
+ m->m_next = m->m_nextpkt = NULL;
}
-
+
eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mh, r, m);
#else
- if (rspq->rspq_mbuf == NULL)
- rspq->rspq_mbuf = m_gethdr(M_DONTWAIT, MT_DATA);
- if (rspq->rspq_mbuf == NULL) {
- log(LOG_WARNING, "failed to get mbuf for packet\n");
- break;
- }
- eop = get_packet(adap, drop_thresh, qs, rspq->rspq_mbuf, r);
+ eop = get_packet(adap, drop_thresh, qs, &rspq->rspq_mbuf, r);
+#ifdef IFNET_MULTIQUEUE
+ rspq->rspq_mbuf->m_pkthdr.rss_hash = rss_hash;
+#endif
#endif
ethpad = 2;
} else {
DPRINTF("pure response\n");
rspq->pure_rsps++;
}
-
if (flags & RSPD_CTRL_MASK) {
sleeping |= flags & RSPD_GTS_MASK;
handle_rsp_cntrl_info(qs, flags);
}
-#ifndef DISABLE_MBUF_IOVEC
- skip:
-#endif
+
r++;
if (__predict_false(++rspq->cidx == rspq->size)) {
rspq->cidx = 0;
rspq->gen ^= 1;
r = rspq->desc;
}
-
prefetch(r);
if (++rspq->credits >= (rspq->size / 4)) {
refill_rspq(adap, rspq, rspq->credits);
rspq->credits = 0;
}
-
- if (eop) {
- prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
- prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
+ DPRINTF("eth=%d eop=%d flags=0x%x\n", eth, eop, flags);
- if (eth) {
- t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
- rss_hash, rss_csum, lro);
+ if (!eth && eop) {
+ rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
+ /*
+ * XXX size mismatch
+ */
+ m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
+
+ ngathered = rx_offload(&adap->tdev, rspq,
+ rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
+ rspq->rspq_mh.mh_head = NULL;
+ DPRINTF("received offload packet\n");
+
+ } else if (eth && eop) {
+ prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *));
+ prefetch(mtod(rspq->rspq_mh.mh_head, uint8_t *) + L1_CACHE_BYTES);
+ t3_rx_eth_lro(adap, rspq, rspq->rspq_mh.mh_head, ethpad,
+ rss_hash, rss_csum, lro);
+ DPRINTF("received tunnel packet\n");
rspq->rspq_mh.mh_head = NULL;
- } else {
- rspq->rspq_mh.mh_head->m_pkthdr.csum_data = rss_csum;
- /*
- * XXX size mismatch
- */
- m_set_priority(rspq->rspq_mh.mh_head, rss_hash);
-
- ngathered = rx_offload(&adap->tdev, rspq,
- rspq->rspq_mh.mh_head, offload_mbufs, ngathered);
- }
- __refill_fl(adap, &qs->fl[0]);
- __refill_fl(adap, &qs->fl[1]);
}
+ __refill_fl_lt(adap, &qs->fl[0], 32);
+ __refill_fl_lt(adap, &qs->fl[1], 32);
--budget_left;
}
@@ -2629,9 +2861,14 @@ process_responses(adapter_t *adap, struct sge_qset *qs, int budget)
check_ring_db(adap, qs, sleeping);
smp_mb(); /* commit Tx queue processed updates */
- if (__predict_false(qs->txq_stopped != 0))
+ if (__predict_false(qs->txq_stopped != 0)) {
+ printf("restarting tx on %p\n", qs);
+
restart_tx(qs);
-
+ }
+
+ __refill_fl_lt(adap, &qs->fl[0], 512);
+ __refill_fl_lt(adap, &qs->fl[1], 512);
budget -= budget_left;
return (budget);
}
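/*
 * Annotation: the return value is the number of responses actually
 * consumed (the requested budget minus whatever was left); the
 * process_responses_gts() wrapper presumably uses that count to report
 * whether the interrupt did any work -- see the unhandled_irqs
 * accounting in t3_intr_msix() below.
 */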
@@ -2718,10 +2955,11 @@ t3_intr_msix(void *data)
adapter_t *adap = qs->port->adapter;
struct sge_rspq *rspq = &qs->rspq;
- mtx_lock(&rspq->lock);
- if (process_responses_gts(adap, rspq) == 0)
- rspq->unhandled_irqs++;
- mtx_unlock(&rspq->lock);
+ if (mtx_trylock(&rspq->lock)) {
+ if (process_responses_gts(adap, rspq) == 0)
+ rspq->unhandled_irqs++;
+ mtx_unlock(&rspq->lock);
+ }
}
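/*
 * Annotation on the mtx_trylock() change above: if another context holds
 * the response-queue lock it is still draining the ring, so the MSI-X
 * handler can return immediately rather than block in interrupt context;
 * the lock holder will pick up any newly arrived responses.  The cost is
 * that such races are no longer counted in unhandled_irqs.
 */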
/*
@@ -2765,7 +3003,10 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
struct sge_qset *qs;
int i, j, err, nqsets = 0;
struct mtx *lock;
-
+
+ if ((sc->flags & FULL_INIT_DONE) == 0)
+ return (ENXIO);
+
coalesce_nsecs = qsp->coalesce_nsecs;
err = sysctl_handle_int(oidp, &coalesce_nsecs, arg2, req);
@@ -2801,11 +3042,11 @@ t3_set_coalesce_nsecs(SYSCTL_HANDLER_ARGS)
void
-t3_add_sysctls(adapter_t *sc)
+t3_add_attach_sysctls(adapter_t *sc)
{
struct sysctl_ctx_list *ctx;
struct sysctl_oid_list *children;
-
+
ctx = device_get_sysctl_ctx(sc->dev);
children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
@@ -2821,28 +3062,13 @@ t3_add_sysctls(adapter_t *sc)
0, t3_lro_enable,
"I", "enable large receive offload");
- SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
- "intr_coal",
- CTLTYPE_INT|CTLFLAG_RW, sc,
- 0, t3_set_coalesce_nsecs,
- "I", "interrupt coalescing timer (ns)");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"enable_debug",
CTLFLAG_RW, &cxgb_debug,
0, "enable verbose debugging output");
-
- SYSCTL_ADD_INT(ctx, children, OID_AUTO,
- "collapse_free",
- CTLFLAG_RD, &collapse_free,
- 0, "frees during collapse");
- SYSCTL_ADD_INT(ctx, children, OID_AUTO,
- "mb_free_vec_free",
- CTLFLAG_RD, &mb_free_vec_free,
- 0, "frees during mb_free_vec");
- SYSCTL_ADD_INT(ctx, children, OID_AUTO,
- "collapse_mbufs",
- CTLFLAG_RW, &collapse_mbufs,
- 0, "collapse mbuf chains into iovecs");
+ SYSCTL_ADD_ULONG(ctx, children, OID_AUTO, "tunq_coalesce",
+ CTLFLAG_RD, &sc->tunq_coalesce,
+ "#tunneled packets freed");
SYSCTL_ADD_INT(ctx, children, OID_AUTO,
"txq_overrun",
CTLFLAG_RD, &txq_fills,
@@ -2851,8 +3077,103 @@ t3_add_sysctls(adapter_t *sc)
"bogus_imm",
CTLFLAG_RD, &bogus_imm,
0, "#times a bogus immediate response was seen");
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "cache_alloc",
+ CTLFLAG_RD, &cxgb_cached_allocations,
+ 0, "#times a cluster was allocated from cache");
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "cached",
+ CTLFLAG_RD, &cxgb_cached,
+ 0, "#times a cluster was cached");
+ SYSCTL_ADD_INT(ctx, children, OID_AUTO,
+ "ext_freed",
+ CTLFLAG_RD, &cxgb_ext_freed,
+ 0, "#times a cluster was freed through ext_free");
+
}
+void
+t3_add_configured_sysctls(adapter_t *sc)
+{
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid_list *children;
+ int i, j;
+
+ ctx = device_get_sysctl_ctx(sc->dev);
+ children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->dev));
+
+ SYSCTL_ADD_PROC(ctx, children, OID_AUTO,
+ "intr_coal",
+ CTLTYPE_INT|CTLFLAG_RW, sc,
+ 0, t3_set_coalesce_nsecs,
+ "I", "interrupt coalescing timer (ns)");
+
+ for (i = 0; i < sc->params.nports; i++) {
+ struct port_info *pi = &sc->port[i];
+ struct sysctl_oid *poid;
+ struct sysctl_oid_list *poidlist;
+
+ snprintf(pi->namebuf, PORT_NAME_LEN, "port%d", i);
+ poid = SYSCTL_ADD_NODE(ctx, children, OID_AUTO,
+ pi->namebuf, CTLFLAG_RD, NULL, "port statistics");
+ poidlist = SYSCTL_CHILDREN(poid);
+ SYSCTL_ADD_INT(ctx, poidlist, OID_AUTO,
+ "nqsets", CTLFLAG_RD, &pi->nqsets,
+ 0, "#queue sets");
+
+ for (j = 0; j < pi->nqsets; j++) {
+ struct sge_qset *qs = &sc->sge.qs[pi->first_qset + j];
+ struct sysctl_oid *qspoid;
+ struct sysctl_oid_list *qspoidlist;
+ struct sge_txq *txq = &qs->txq[TXQ_ETH];
+
+ snprintf(qs->namebuf, QS_NAME_LEN, "qs%d", j);
+
+ qspoid = SYSCTL_ADD_NODE(ctx, poidlist, OID_AUTO,
+ qs->namebuf, CTLFLAG_RD, NULL, "qset statistics");
+ qspoidlist = SYSCTL_CHILDREN(qspoid);
+
+ SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "dropped",
+ CTLFLAG_RD, &qs->txq[TXQ_ETH].txq_drops,
+ 0, "#tunneled packets dropped");
+ SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "sendqlen",
+ CTLFLAG_RD, &qs->txq[TXQ_ETH].sendq.qlen,
+ 0, "#tunneled packets waiting to be sent");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_pidx",
+ CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_prod,
+ 0, "#tunneled packets queue producer index");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "queue_cidx",
+ CTLFLAG_RD, (uint32_t *)(uintptr_t)&qs->txq[TXQ_ETH].txq_mr.br_cons,
+ 0, "#tunneled packets queue consumer index");
+ SYSCTL_ADD_INT(ctx, qspoidlist, OID_AUTO, "processed",
+ CTLFLAG_RD, &qs->txq[TXQ_ETH].processed,
+ 0, "#tunneled packets processed by the card");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "cleaned",
+ CTLFLAG_RD, &txq->cleaned,
+ 0, "#tunneled packets cleaned");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "in_use",
+ CTLFLAG_RD, &txq->in_use,
+ 0, "#tunneled packet slots in use");
+ SYSCTL_ADD_ULONG(ctx, qspoidlist, OID_AUTO, "frees",
+ CTLFLAG_RD, &txq->txq_frees,
+ "#tunneled packets freed");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "skipped",
+ CTLFLAG_RD, &txq->txq_skipped,
+ 0, "#tunneled packet descriptors skipped");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "coalesced",
+ CTLFLAG_RD, &txq->txq_coalesced,
+ 0, "#tunneled packets coalesced");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "enqueued",
+ CTLFLAG_RD, &txq->txq_enqueued,
+ 0, "#tunneled packets enqueued to hardware");
+ SYSCTL_ADD_UINT(ctx, qspoidlist, OID_AUTO, "stopped_flags",
+ CTLFLAG_RD, &qs->txq_stopped,
+ 0, "tx queues stopped");
+
+ }
+ }
+}
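/*
 * Annotation: the tree built above hangs the per-port and per-queue-set
 * counters off the adapter device's sysctl node.  Assuming the adapter
 * attaches as unit 0 under the name "cxgb" (the controller may appear as
 * "cxgbc" on some versions), the OIDs would look like:
 *
 *	dev.cxgb.0.intr_coal
 *	dev.cxgb.0.port0.nqsets
 *	dev.cxgb.0.port0.qs0.dropped
 *	dev.cxgb.0.port0.qs0.queue_pidx
 *	dev.cxgb.0.port0.qs0.in_use
 */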
+
/**
* t3_get_desc - dump an SGE descriptor for debugging purposes
* @qs: the queue set
diff --git a/sys/dev/cxgb/sys/cxgb_support.c b/sys/dev/cxgb/sys/cxgb_support.c
index 7a28556..176206c 100644
--- a/sys/dev/cxgb/sys/cxgb_support.c
+++ b/sys/dev/cxgb/sys/cxgb_support.c
@@ -126,11 +126,11 @@ cxgb_cache_pcpu_init(struct cxgb_cache_pcpu *ccp)
if ((err = buf_stack_init(&ccp->ccp_cluster_free, (FL_Q_SIZE >> 1))))
return (err);
- if (jumbo_phys_contig)
+#if __FreeBSD_version > 800000
ccp->ccp_jumbo_zone = zone_jumbo16;
- else
+#else
ccp->ccp_jumbo_zone = zone_jumbop;
-
+#endif
return (0);
}
diff --git a/sys/dev/cxgb/sys/mvec.h b/sys/dev/cxgb/sys/mvec.h
index 2ef7ecd..04b6449 100644
--- a/sys/dev/cxgb/sys/mvec.h
+++ b/sys/dev/cxgb/sys/mvec.h
@@ -63,6 +63,9 @@ struct m_ext_ {
int ext_type; /* type of external storage */
};
+#define MT_IOVEC 9
+#define MT_CLIOVEC 10
+
#define EXT_IOVEC 8
#define EXT_CLIOVEC 9
#define EXT_JMPIOVEC 10
diff --git a/sys/dev/cxgb/t3cdev.h b/sys/dev/cxgb/t3cdev.h
index 8223f98..67db552 100644
--- a/sys/dev/cxgb/t3cdev.h
+++ b/sys/dev/cxgb/t3cdev.h
@@ -50,7 +50,7 @@ struct t3cdev {
int (*send)(struct t3cdev *dev, struct mbuf *m);
int (*recv)(struct t3cdev *dev, struct mbuf **m, int n);
int (*ctl)(struct t3cdev *dev, unsigned int req, void *data);
- void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, struct sockaddr *sa);
+ void (*arp_update)(struct t3cdev *dev, struct rtentry *neigh, uint8_t *enaddr, struct sockaddr *sa);
void *priv; /* driver private data */
void *l2opt; /* optional layer 2 data */
void *l3opt; /* optional layer 3 data */
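/*
 * Annotation: the new enaddr argument lets arp_update callers hand the
 * resolved Ethernet address to the t3cdev directly, instead of leaving
 * the backend to dig it out of the rtentry; presumably it is consumed by
 * the L2 table update path that services this hook.
 */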
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
index 0c796b5..4b17f8e 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_io.c
@@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/ip.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_syncache.h>
#include <net/route.h>
@@ -82,6 +82,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
@@ -559,7 +560,7 @@ cxgb_toe_disconnect(struct tcpcb *tp)
}
static int
-cxgb_toe_abort(struct tcpcb *tp)
+cxgb_toe_reset(struct tcpcb *tp)
{
struct toepcb *toep = tp->t_toe;
@@ -620,7 +621,7 @@ cxgb_toe_detach(struct tcpcb *tp)
static struct toe_usrreqs cxgb_toe_usrreqs = {
.tu_disconnect = cxgb_toe_disconnect,
- .tu_abort = cxgb_toe_abort,
+ .tu_reset = cxgb_toe_reset,
.tu_send = cxgb_toe_send,
.tu_rcvd = cxgb_toe_rcvd,
.tu_detach = cxgb_toe_detach,
@@ -1145,7 +1146,7 @@ fail_act_open(struct toepcb *toep, int errno)
t3_release_offload_resources(toep);
if (tp) {
INP_LOCK_ASSERT(tp->t_inpcb);
- tcp_drop(tp, errno);
+ cxgb_tcp_drop(tp, errno);
}
#ifdef notyet
@@ -1957,7 +1958,7 @@ process_close_con_rpl(struct socket *so, struct mbuf *m)
wakeup(&so->so_timeo);
} else if ((so->so_options & SO_LINGER) && so->so_linger == 0 &&
(toep->tp_flags & TP_ABORT_SHUTDOWN) == 0) {
- tp = tcp_drop(tp, 0);
+ tp = cxgb_tcp_drop(tp, 0);
}
break;
@@ -2483,7 +2484,7 @@ handle_syncache_event(int event, void *arg)
struct toepcb *toep = arg;
switch (event) {
- case SC_ENTRY_PRESENT:
+ case TOE_SC_ENTRY_PRESENT:
/*
* entry already exists - free toepcb
* and l2t
@@ -2491,7 +2492,7 @@ handle_syncache_event(int event, void *arg)
printf("syncache entry present\n");
toepcb_release(toep);
break;
- case SC_DROP:
+ case TOE_SC_DROP:
/*
* The syncache has given up on this entry
* either it timed out, or it was evicted
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
index 8cb42e1..e411ab4 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_cpl_socket.c
@@ -62,7 +62,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <net/route.h>
#include <dev/cxgb/t3cdev.h>
@@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_tom.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
static int (*pru_sosend)(struct socket *so, struct sockaddr *addr,
struct uio *uio, struct mbuf *top, struct mbuf *control,
@@ -99,9 +100,6 @@ static int vm_fault_hold_user_pages(vm_offset_t addr, int len, vm_page_t *mp,
int *count, int flags);
#endif
static void vm_fault_unhold_pages(vm_page_t *m, int count);
-
-
-
#define TMP_IOV_MAX 16
void
@@ -112,6 +110,15 @@ t3_init_socket_ops(void)
prp = pffindtype(AF_INET, SOCK_STREAM);
pru_sosend = prp->pr_usrreqs->pru_sosend;
pru_soreceive = prp->pr_usrreqs->pru_soreceive;
+ tcp_usrreqs.pru_connect = cxgb_tcp_usrreqs.pru_connect;
+ tcp_usrreqs.pru_abort = cxgb_tcp_usrreqs.pru_abort;
+ tcp_usrreqs.pru_listen = cxgb_tcp_usrreqs.pru_listen;
+ tcp_usrreqs.pru_send = cxgb_tcp_usrreqs.pru_send;
+ tcp_usrreqs.pru_disconnect = cxgb_tcp_usrreqs.pru_disconnect;
+ tcp_usrreqs.pru_close = cxgb_tcp_usrreqs.pru_close;
+ tcp_usrreqs.pru_shutdown = cxgb_tcp_usrreqs.pru_shutdown;
+ tcp_usrreqs.pru_rcvd = cxgb_tcp_usrreqs.pru_rcvd;
}
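/*
 * Annotation: this splices the cxgb TOE's pr_usrreqs handlers over the
 * stock TCP entry points at module load, so every TCP socket on the
 * system enters through the cxgb_tcp_usrreqs functions, which are
 * presumably copies of the native implementations extended to divert
 * offloadable connections to the T3.
 */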
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_listen.c b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
index e785790..a88b26e 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_listen.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_listen.c
@@ -57,7 +57,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/tcp_var.h>
#include <netinet/tcp_fsm.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <net/route.h>
#include <dev/cxgb/t3cdev.h>
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp.h b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h
new file mode 100644
index 0000000..feb2916
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp.h
@@ -0,0 +1,44 @@
+
+/*-
+ * Copyright (c) 2007, Chelsio Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Neither the name of the Chelsio Corporation nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef CXGB_TCP_H_
+#define CXGB_TCP_H_
+
+struct tcpcb *cxgb_tcp_drop(struct tcpcb *tp, int errno);
+void cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip);
+struct tcpcb *cxgb_tcp_close(struct tcpcb *tp);
+
+extern struct pr_usrreqs cxgb_tcp_usrreqs;
+#ifdef INET6
+extern struct pr_usrreqs cxgb_tcp6_usrreqs;
+#endif
+
+#include <sys/sysctl.h>
+SYSCTL_DECL(_net_inet_tcp_cxgb);
+#endif /* CXGB_TCP_H_ */
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
new file mode 100644
index 0000000..2eca099
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_subr.c
@@ -0,0 +1,694 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_mac.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/random.h>
+
+#include <vm/uma.h>
+
+#include <net/route.h>
+#include <net/if.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#include <netinet6/scope6_var.h>
+#include <netinet6/nd6.h>
+#endif
+#include <netinet/ip_icmp.h>
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcp_syncache.h>
+#include <netinet/tcp_offload.h>
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+#include <netinet6/ip6protosw.h>
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/xform.h>
+#ifdef INET6
+#include <netipsec/ipsec6.h>
+#endif
+#include <netipsec/key.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+#include <sys/md5.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+
+
+SYSCTL_NODE(_net_inet_tcp, 0, cxgb, CTLFLAG_RW, 0, "Chelsio TOE");
+
+static int tcp_log_debug = 0;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, log_debug, CTLFLAG_RW,
+ &tcp_log_debug, 0, "Log errors caused by incoming TCP segments");
+
+static int tcp_tcbhashsize = 0;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, tcbhashsize, CTLFLAG_RDTUN,
+ &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
+
+static int do_tcpdrain = 1;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, do_tcpdrain, CTLFLAG_RW,
+ &do_tcpdrain, 0,
+ "Enable tcp_drain routine for extra help when low on mbufs");
+
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, pcbcount, CTLFLAG_RD,
+ &tcbinfo.ipi_count, 0, "Number of active PCBs");
+
+static int icmp_may_rst = 1;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, icmp_may_rst, CTLFLAG_RW,
+ &icmp_may_rst, 0,
+ "Certain ICMP unreachable messages may abort connections in SYN_SENT");
+
+static int tcp_isn_reseed_interval = 0;
+SYSCTL_INT(_net_inet_tcp_cxgb, OID_AUTO, isn_reseed_interval, CTLFLAG_RW,
+ &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
+
+/*
+ * TCP bandwidth limiting sysctls. Note that the default lower bound of
+ * 1024 exists only for debugging. A good production default would be
+ * something like 6100.
+ */
+SYSCTL_NODE(_net_inet_tcp, OID_AUTO, inflight, CTLFLAG_RW, 0,
+ "TCP inflight data limiting");
+
+static int tcp_inflight_enable = 1;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, enable, CTLFLAG_RW,
+ &tcp_inflight_enable, 0, "Enable automatic TCP inflight data limiting");
+
+static int tcp_inflight_debug = 0;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, debug, CTLFLAG_RW,
+ &tcp_inflight_debug, 0, "Debug TCP inflight calculations");
+
+static int tcp_inflight_rttthresh;
+SYSCTL_PROC(_net_inet_tcp_inflight, OID_AUTO, rttthresh, CTLTYPE_INT|CTLFLAG_RW,
+ &tcp_inflight_rttthresh, 0, sysctl_msec_to_ticks, "I",
+ "RTT threshold below which inflight will deactivate itself");
+
+static int tcp_inflight_min = 6144;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, min, CTLFLAG_RW,
+ &tcp_inflight_min, 0, "Lower-bound for TCP inflight window");
+
+static int tcp_inflight_max = TCP_MAXWIN << TCP_MAX_WINSHIFT;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, max, CTLFLAG_RW,
+ &tcp_inflight_max, 0, "Upper-bound for TCP inflight window");
+
+static int tcp_inflight_stab = 20;
+SYSCTL_INT(_net_inet_tcp_inflight, OID_AUTO, stab, CTLFLAG_RW,
+ &tcp_inflight_stab, 0, "Inflight Algorithm Stabilization 20 = 2 packets");
+
+uma_zone_t sack_hole_zone;
+
+static struct inpcb *tcp_notify(struct inpcb *, int);
+static struct inpcb *cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno);
+
+/*
+ * Target size of TCP PCB hash tables. Must be a power of two.
+ *
+ * Note that this can be overridden by the kernel environment
+ * variable net.inet.tcp.tcbhashsize
+ */
+#ifndef TCBHASHSIZE
+#define TCBHASHSIZE 512
+#endif
+
+/*
+ * XXX
+ * Callouts should be moved into struct tcp directly.  They are currently
+ * separate because the tcpcb structure is exported to userland for sysctl
+ * parsing purposes, and userland knows nothing about callouts.
+ */
+struct tcpcb_mem {
+ struct tcpcb tcb;
+ struct tcp_timer tt;
+};
+
+MALLOC_DEFINE(M_TCPLOG, "tcplog", "TCP address and flags print buffers");
+
+/*
+ * Drop a TCP connection, reporting the specified error.
+ * If the connection is synchronized, then send a RST to the peer.
+ */
+struct tcpcb *
+cxgb_tcp_drop(struct tcpcb *tp, int errno)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ if (TCPS_HAVERCVDSYN(tp->t_state)) {
+ tp->t_state = TCPS_CLOSED;
+ (void) tcp_gen_reset(tp);
+ tcpstat.tcps_drops++;
+ } else
+ tcpstat.tcps_conndrops++;
+ if (errno == ETIMEDOUT && tp->t_softerror)
+ errno = tp->t_softerror;
+ so->so_error = errno;
+ return (cxgb_tcp_close(tp));
+}
+
+/*
+ * Attempt to close a TCP control block, marking it as dropped, and freeing
+ * the socket if we hold the only reference.
+ */
+struct tcpcb *
+cxgb_tcp_close(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (tp->t_state == TCPS_LISTEN)
+ tcp_gen_listen_close(tp);
+ in_pcbdrop(inp);
+ tcpstat.tcps_closed++;
+ KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL"));
+ so = inp->inp_socket;
+ soisdisconnected(so);
+ if (inp->inp_vflag & INP_SOCKREF) {
+ KASSERT(so->so_state & SS_PROTOREF,
+ ("tcp_close: !SS_PROTOREF"));
+ inp->inp_vflag &= ~INP_SOCKREF;
+ INP_UNLOCK(inp);
+ ACCEPT_LOCK();
+ SOCK_LOCK(so);
+ so->so_state &= ~SS_PROTOREF;
+ sofree(so);
+ return (NULL);
+ }
+ return (tp);
+}
+
+/*
+ * Notify a tcp user of an asynchronous error: store the error as a
+ * soft error.  Do not wake up the user, since there is currently no
+ * mechanism for reporting soft errors (yet - a kqueue filter may be
+ * added).
+ */
+static struct inpcb *
+tcp_notify(struct inpcb *inp, int error)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if ((inp->inp_vflag & INP_TIMEWAIT) ||
+ (inp->inp_vflag & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ KASSERT(tp != NULL, ("tcp_notify: tp == NULL"));
+
+ /*
+ * Ignore some errors if we are hooked up.
+ * If connection hasn't completed, has retransmitted several times,
+ * and receives a second error, give up now. This is better
+ * than waiting a long time to establish a connection that
+ * can never complete.
+ */
+ if (tp->t_state == TCPS_ESTABLISHED &&
+ (error == EHOSTUNREACH || error == ENETUNREACH ||
+ error == EHOSTDOWN)) {
+ return (inp);
+ } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
+ tp->t_softerror) {
+ tp = cxgb_tcp_drop(tp, error);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
+ } else {
+ tp->t_softerror = error;
+ return (inp);
+ }
+#if 0
+ wakeup( &so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+#endif
+}
+
+void
+cxgb_tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip)
+{
+ struct ip *ip = vip;
+ struct tcphdr *th;
+ struct in_addr faddr;
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct icmp *icp;
+ struct in_conninfo inc;
+ tcp_seq icmp_tcp_seq;
+ int mtu;
+
+ faddr = ((struct sockaddr_in *)sa)->sin_addr;
+ if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
+ return;
+
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
+ cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) && ip)
+ notify = cxgb_tcp_drop_syn_sent;
+ /*
+ * Redirects don't need to be handled up here.
+ */
+ else if (PRC_IS_REDIRECT(cmd))
+ return;
+ /*
+	 * Source quench is deprecated.
+ */
+ else if (cmd == PRC_QUENCH)
+ return;
+ /*
+ * Hostdead is ugly because it goes linearly through all PCBs.
+ * XXX: We never get this from ICMP, otherwise it makes an
+ * excellent DoS attack on machines with many connections.
+ */
+ else if (cmd == PRC_HOSTDEAD)
+ ip = NULL;
+ else if ((unsigned)cmd >= PRC_NCMDS || inetctlerrmap[cmd] == 0)
+ return;
+ if (ip != NULL) {
+ icp = (struct icmp *)((caddr_t)ip
+ - offsetof(struct icmp, icmp_ip));
+ th = (struct tcphdr *)((caddr_t)ip
+ + (ip->ip_hl << 2));
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
+ ip->ip_src, th->th_sport, 0, NULL);
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if (!(inp->inp_vflag & INP_TIMEWAIT) &&
+ !(inp->inp_vflag & INP_DROPPED) &&
+ !(inp->inp_socket == NULL)) {
+ icmp_tcp_seq = htonl(th->th_seq);
+ tp = intotcpcb(inp);
+ if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
+ SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
+ if (cmd == PRC_MSGSIZE) {
+ /*
+ * MTU discovery:
+ * If we got a needfrag set the MTU
+ * in the route to the suggested new
+ * value (if given) and then notify.
+ */
+ bzero(&inc, sizeof(inc));
+ inc.inc_flags = 0; /* IPv4 */
+ inc.inc_faddr = faddr;
+
+ mtu = ntohs(icp->icmp_nextmtu);
+ /*
+ * If no alternative MTU was
+ * proposed, try the next smaller
+ * one. ip->ip_len has already
+ * been swapped in icmp_input().
+ */
+ if (!mtu)
+ mtu = ip_next_mtu(ip->ip_len,
+ 1);
+ if (mtu < max(296, (tcp_minmss)
+ + sizeof(struct tcpiphdr)))
+ mtu = 0;
+ if (!mtu)
+ mtu = tcp_mssdflt
+ + sizeof(struct tcpiphdr);
+ /*
+					 * Only cache the MTU if it
+					 * is smaller than the interface
+					 * or route MTU.  tcp_mtudisc()
+					 * will do the right thing by itself.
+ */
+ if (mtu <= tcp_maxmtu(&inc, NULL))
+ tcp_hc_updatemtu(&inc, mtu);
+ }
+
+ inp = (*notify)(inp, inetctlerrmap[cmd]);
+ }
+ }
+ if (inp != NULL)
+ INP_UNLOCK(inp);
+ } else {
+ inc.inc_fport = th->th_dport;
+ inc.inc_lport = th->th_sport;
+ inc.inc_faddr = faddr;
+ inc.inc_laddr = ip->ip_src;
+#ifdef INET6
+ inc.inc_isipv6 = 0;
+#endif
+ syncache_unreach(&inc, th);
+ }
+ INP_INFO_WUNLOCK(&tcbinfo);
+ } else
+ in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
+}
+
+#ifdef INET6
+void
+tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d)
+{
+ struct tcphdr th;
+ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify;
+ struct ip6_hdr *ip6;
+ struct mbuf *m;
+ struct ip6ctlparam *ip6cp = NULL;
+ const struct sockaddr_in6 *sa6_src = NULL;
+ int off;
+ struct tcp_portonly {
+ u_int16_t th_sport;
+ u_int16_t th_dport;
+ } *thp;
+
+ if (sa->sa_family != AF_INET6 ||
+ sa->sa_len != sizeof(struct sockaddr_in6))
+ return;
+
+ if (cmd == PRC_MSGSIZE)
+ notify = tcp_mtudisc;
+ else if (!PRC_IS_REDIRECT(cmd) &&
+ ((unsigned)cmd >= PRC_NCMDS || inet6ctlerrmap[cmd] == 0))
+ return;
+	/* Source quench is deprecated. */
+ else if (cmd == PRC_QUENCH)
+ return;
+
+ /* if the parameter is from icmp6, decode it. */
+ if (d != NULL) {
+ ip6cp = (struct ip6ctlparam *)d;
+ m = ip6cp->ip6c_m;
+ ip6 = ip6cp->ip6c_ip6;
+ off = ip6cp->ip6c_off;
+ sa6_src = ip6cp->ip6c_src;
+ } else {
+ m = NULL;
+ ip6 = NULL;
+ off = 0; /* fool gcc */
+ sa6_src = &sa6_any;
+ }
+
+ if (ip6 != NULL) {
+ struct in_conninfo inc;
+ /*
+		 * XXX: We assume that when IPV6 is non-NULL,
+ * M and OFF are valid.
+ */
+
+ /* check if we can safely examine src and dst ports */
+ if (m->m_pkthdr.len < off + sizeof(*thp))
+ return;
+
+ bzero(&th, sizeof(th));
+ m_copydata(m, off, sizeof(*thp), (caddr_t)&th);
+
+ in6_pcbnotify(&tcbinfo, sa, th.th_dport,
+ (struct sockaddr *)ip6cp->ip6c_src,
+ th.th_sport, cmd, NULL, notify);
+
+ inc.inc_fport = th.th_dport;
+ inc.inc_lport = th.th_sport;
+ inc.inc6_faddr = ((struct sockaddr_in6 *)sa)->sin6_addr;
+ inc.inc6_laddr = ip6cp->ip6c_src->sin6_addr;
+ inc.inc_isipv6 = 1;
+ INP_INFO_WLOCK(&tcbinfo);
+ syncache_unreach(&inc, &th);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ } else
+ in6_pcbnotify(&tcbinfo, sa, 0, (const struct sockaddr *)sa6_src,
+ 0, cmd, NULL, notify);
+}
+#endif /* INET6 */
+
+
+/*
+ * Following is where TCP initial sequence number generation occurs.
+ *
+ * There are two places where we must use initial sequence numbers:
+ * 1. In SYN-ACK packets.
+ * 2. In SYN packets.
+ *
+ * All ISNs for SYN-ACK packets are generated by the syncache. See
+ * tcp_syncache.c for details.
+ *
+ * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
+ * depends on this property. In addition, these ISNs should be
+ * unguessable so as to prevent connection hijacking. To satisfy
+ * the requirements of this situation, the algorithm outlined in
+ * RFC 1948 is used, with only small modifications.
+ *
+ * Implementation details:
+ *
+ * Time is based off the system timer, and is corrected so that it
+ * increases by one megabyte per second. This allows for proper
+ * recycling on high speed LANs while still leaving over an hour
+ * before rollover.
+ *
+ * As reading the *exact* system time is too expensive to be done
+ * whenever setting up a TCP connection, we increment the time
+ * offset in two ways. First, a small random positive increment
+ * is added to isn_offset for each connection that is set up.
+ * Second, the function tcp_isn_tick fires once per clock tick
+ * and increments isn_offset as necessary so that sequence numbers
+ * are incremented at approximately ISN_BYTES_PER_SECOND. The
+ * random positive increments serve only to ensure that the same
+ * exact sequence number is never sent out twice (as could otherwise
+ * happen when a port is recycled in less than the system tick
+ * interval.)
+ *
+ * net.inet.tcp.isn_reseed_interval controls the number of seconds
+ * between seeding of isn_secret. This is normally set to zero,
+ * as reseeding should not be necessary.
+ *
+ * Locking of the global variables isn_secret, isn_last_reseed, isn_offset,
+ * isn_offset_old, and isn_ctx is performed using the TCP pcbinfo lock. In
+ * general, this means holding an exclusive (write) lock.
+ */
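/*
 * Illustrative sketch, not from this file: the scheme described above is
 * the stock tcp_new_isn() algorithm, roughly (names abbreviated):
 *
 *	if (reseed_interval_expired)
 *		read_random(&isn_secret, sizeof(isn_secret));
 *	new_isn = MD5(laddr, faddr, lport, fport, isn_secret);
 *	isn_offset += ISN_STATIC_INCREMENT +
 *	    (arc4random() & ISN_RANDOM_INCREMENT);
 *	new_isn += isn_offset;
 *
 * tcp_isn_tick() then advances isn_offset so that the sequence space
 * grows at about ISN_BYTES_PER_SECOND even when no connections are
 * being created.
 */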
+
+#define ISN_BYTES_PER_SECOND 1048576
+#define ISN_STATIC_INCREMENT 4096
+#define ISN_RANDOM_INCREMENT (4096 - 1)
+
+
+/*
+ * When a specific ICMP unreachable message is received and the
+ * connection state is SYN-SENT, drop the connection. This behavior
+ * is controlled by the icmp_may_rst sysctl.
+ */
+static struct inpcb *
+cxgb_tcp_drop_syn_sent(struct inpcb *inp, int errno)
+{
+ struct tcpcb *tp;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if ((inp->inp_vflag & INP_TIMEWAIT) ||
+ (inp->inp_vflag & INP_DROPPED))
+ return (inp);
+
+ tp = intotcpcb(inp);
+ if (tp->t_state != TCPS_SYN_SENT)
+ return (inp);
+
+ tp = cxgb_tcp_drop(tp, errno);
+ if (tp != NULL)
+ return (inp);
+ else
+ return (NULL);
+}
+
+static int
+cxgb_sysctl_drop(SYSCTL_HANDLER_ARGS)
+{
+ /* addrs[0] is a foreign socket, addrs[1] is a local one. */
+ struct sockaddr_storage addrs[2];
+ struct inpcb *inp;
+ struct tcpcb *tp;
+ struct tcptw *tw;
+ struct sockaddr_in *fin, *lin;
+#ifdef INET6
+ struct sockaddr_in6 *fin6, *lin6;
+ struct in6_addr f6, l6;
+#endif
+ int error;
+
+ inp = NULL;
+ fin = lin = NULL;
+#ifdef INET6
+ fin6 = lin6 = NULL;
+#endif
+ error = 0;
+
+ if (req->oldptr != NULL || req->oldlen != 0)
+ return (EINVAL);
+ if (req->newptr == NULL)
+ return (EPERM);
+ if (req->newlen < sizeof(addrs))
+ return (ENOMEM);
+ error = SYSCTL_IN(req, &addrs, sizeof(addrs));
+ if (error)
+ return (error);
+
+ switch (addrs[0].ss_family) {
+#ifdef INET6
+ case AF_INET6:
+ fin6 = (struct sockaddr_in6 *)&addrs[0];
+ lin6 = (struct sockaddr_in6 *)&addrs[1];
+ if (fin6->sin6_len != sizeof(struct sockaddr_in6) ||
+ lin6->sin6_len != sizeof(struct sockaddr_in6))
+ return (EINVAL);
+ if (IN6_IS_ADDR_V4MAPPED(&fin6->sin6_addr)) {
+ if (!IN6_IS_ADDR_V4MAPPED(&lin6->sin6_addr))
+ return (EINVAL);
+ in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[0]);
+ in6_sin6_2_sin_in_sock((struct sockaddr *)&addrs[1]);
+ fin = (struct sockaddr_in *)&addrs[0];
+ lin = (struct sockaddr_in *)&addrs[1];
+ break;
+ }
+ error = sa6_embedscope(fin6, ip6_use_defzone);
+ if (error)
+ return (error);
+ error = sa6_embedscope(lin6, ip6_use_defzone);
+ if (error)
+ return (error);
+ break;
+#endif
+ case AF_INET:
+ fin = (struct sockaddr_in *)&addrs[0];
+ lin = (struct sockaddr_in *)&addrs[1];
+ if (fin->sin_len != sizeof(struct sockaddr_in) ||
+ lin->sin_len != sizeof(struct sockaddr_in))
+ return (EINVAL);
+ break;
+ default:
+ return (EINVAL);
+ }
+ INP_INFO_WLOCK(&tcbinfo);
+ switch (addrs[0].ss_family) {
+#ifdef INET6
+ case AF_INET6:
+ inp = in6_pcblookup_hash(&tcbinfo, &fin6->sin6_addr,
+ fin6->sin6_port, &lin6->sin6_addr, lin6->sin6_port,
+ 0, NULL);
+ break;
+#endif
+ case AF_INET:
+ inp = in_pcblookup_hash(&tcbinfo, fin->sin_addr, fin->sin_port,
+ lin->sin_addr, lin->sin_port, 0, NULL);
+ break;
+ }
+ if (inp != NULL) {
+ INP_LOCK(inp);
+ if (inp->inp_vflag & INP_TIMEWAIT) {
+ /*
+ * XXXRW: There currently exists a state where an
+ * inpcb is present, but its timewait state has been
+ * discarded. For now, don't allow dropping of this
+ * type of inpcb.
+ */
+ tw = intotw(inp);
+ if (tw != NULL)
+ tcp_twclose(tw, 0);
+ else
+ INP_UNLOCK(inp);
+ } else if (!(inp->inp_vflag & INP_DROPPED) &&
+ !(inp->inp_socket->so_options & SO_ACCEPTCONN)) {
+ tp = intotcpcb(inp);
+ tp = cxgb_tcp_drop(tp, ECONNABORTED);
+ if (tp != NULL)
+ INP_UNLOCK(inp);
+ } else
+ INP_UNLOCK(inp);
+ } else
+ error = ESRCH;
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_cxgb, TCPCTL_DROP, drop,
+ CTLTYPE_STRUCT|CTLFLAG_WR|CTLFLAG_SKIP, NULL,
+ 0, cxgb_sysctl_drop, "", "Drop TCP connection");
+
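The handler above is write-only: it expects the new value to be a pair of sockaddrs, foreign endpoint first, and rejects any read. Below is a hedged userland sketch of driving it, modeled on how tcpdrop(8) uses the stock equivalent; the "net.inet.tcp.cxgb.drop" OID string is an assumption about where the _net_inet_tcp_cxgb node attaches.

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/sysctl.h>
    #include <netinet/in.h>
    #include <string.h>

    /* Ask the kernel to drop the connection with these endpoints. */
    static int
    drop_connection(const struct sockaddr_in *foreign,
        const struct sockaddr_in *local)
    {
    	struct sockaddr_storage addrs[2];

    	memset(addrs, 0, sizeof(addrs));
    	memcpy(&addrs[0], foreign, sizeof(*foreign));	/* addrs[0]: foreign */
    	memcpy(&addrs[1], local, sizeof(*local));	/* addrs[1]: local */
    	/* oldp must be NULL: the handler returns EINVAL on reads. */
    	return (sysctlbyname("net.inet.tcp.cxgb.drop", NULL, NULL,
    	    addrs, sizeof(addrs)));
    }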
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c
new file mode 100644
index 0000000..bd940b2
--- /dev/null
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tcp_usrreq.c
@@ -0,0 +1,1362 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California.
+ * Copyright (c) 2006-2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * From: @(#)tcp_usrreq.c 8.2 (Berkeley) 1/3/94
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+#include <sys/mbuf.h>
+#ifdef INET6
+#include <sys/domain.h>
+#endif /* INET6 */
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/protosw.h>
+#include <sys/proc.h>
+#include <sys/jail.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
+#include <netinet/in_pcb.h>
+#ifdef INET6
+#include <netinet6/in6_pcb.h>
+#endif
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+#ifdef INET6
+#include <netinet6/ip6_var.h>
+#include <netinet6/scope6_var.h>
+#endif
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif
+#include <netinet/tcp_offload.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+
+
+/*
+ * TCP protocol interface to socket abstraction.
+ */
+static int tcp_attach(struct socket *);
+static int tcp_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#ifdef INET6
+static int tcp6_connect(struct tcpcb *, struct sockaddr *,
+ struct thread *td);
+#endif /* INET6 */
+static void tcp_disconnect(struct tcpcb *);
+static void tcp_usrclosed(struct tcpcb *);
+
+#ifdef TCPDEBUG
+#define TCPDEBUG0 int ostate = 0
+#define TCPDEBUG1() ostate = tp ? tp->t_state : 0
+#define TCPDEBUG2(req) if (tp && (so->so_options & SO_DEBUG)) \
+ tcp_trace(TA_USER, ostate, tp, 0, 0, req)
+#else
+#define TCPDEBUG0
+#define TCPDEBUG1()
+#define TCPDEBUG2(req)
+#endif
+
+/*
+ * TCP attaches to socket via pru_attach(), reserving space,
+ * and an internet control block.
+ */
+static int
+tcp_usr_attach(struct socket *so, int proto, struct thread *td)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp == NULL, ("tcp_usr_attach: inp != NULL"));
+ TCPDEBUG1();
+
+ error = tcp_attach(so);
+ if (error)
+ goto out;
+
+ if ((so->so_options & SO_LINGER) && so->so_linger == 0)
+ so->so_linger = TCP_LINGERTIME;
+
+ inp = sotoinpcb(so);
+ tp = intotcpcb(inp);
+out:
+ TCPDEBUG2(PRU_ATTACH);
+ return error;
+}
+
+/*
+ * tcp_detach is called when the socket layer loses its final reference
+ * to the socket, be it a file descriptor reference, a reference from TCP,
+ * etc. At this point, there is only one case in which we will keep around
+ * inpcb state: time wait.
+ *
+ * This function can probably be re-absorbed back into tcp_usr_detach() now
+ * that there is a single detach path.
+ */
+static void
+tcp_detach(struct socket *so, struct inpcb *inp)
+{
+ struct tcpcb *tp;
+#ifdef INET6
+ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ KASSERT(so->so_pcb == inp, ("tcp_detach: so_pcb != inp"));
+ KASSERT(inp->inp_socket == so, ("tcp_detach: inp_socket != so"));
+
+ tp = intotcpcb(inp);
+
+ if (inp->inp_vflag & INP_TIMEWAIT) {
+ /*
+ * There are two cases to handle: one in which the time wait
+ * state is being discarded (INP_DROPPED), and one in which
+ * this connection will remain in timewait. In the former,
+ * it is time to discard all state (except tcptw, which has
+ * already been discarded by the timewait close code, which
+ * should be further up the call stack somewhere). In the
+ * latter case, we detach from the socket, but leave the pcb
+ * present until timewait ends.
+ *
+ * XXXRW: Would it be cleaner to free the tcptw here?
+ */
+ if (inp->inp_vflag & INP_DROPPED) {
+ KASSERT(tp == NULL, ("tcp_detach: INP_TIMEWAIT && "
+ "INP_DROPPED && tp != NULL"));
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ } else {
+#ifdef INET6
+ if (isipv6)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ INP_UNLOCK(inp);
+ }
+ } else {
+ /*
+ * If the connection is not in timewait, we consider two
+ * conditions: one in which no further processing is
+ * necessary (dropped || embryonic), and one in which TCP is
+ * not yet done, but no longer requires the socket, so the
+ * pcb will persist for the time being.
+ *
+ * XXXRW: Does the second case still occur?
+ */
+ if (inp->inp_vflag & INP_DROPPED ||
+ tp->t_state < TCPS_SYN_SENT) {
+ tcp_discardcb(tp);
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ } else {
+#ifdef INET6
+ if (isipv6)
+ in6_pcbdetach(inp);
+ else
+#endif
+ in_pcbdetach(inp);
+ }
+ }
+}
+
+/*
+ * pru_detach() detaches the TCP protocol from the socket.
+ * If the protocol state is non-embryonic, then can't
+ * do this directly: have to initiate a pru_disconnect(),
+ * which may finish later; embryonic TCB's can just
+ * be discarded here.
+ */
+static void
+tcp_usr_detach(struct socket *so)
+{
+ struct inpcb *inp;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL"));
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_detach: inp_socket == NULL"));
+ tcp_detach(so, inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * Give the socket an address.
+ */
+static int
+tcp_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sinp;
+
+ sinp = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sinp))
+ return (EINVAL);
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ if (sinp->sin_family == AF_INET &&
+ IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_bind: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ error = in_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in6 *sin6p;
+
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (nam->sa_len != sizeof (*sin6p))
+ return (EINVAL);
+ /*
+ * Must check for multicast addresses and disallow binding
+ * to them.
+ */
+ if (sin6p->sin6_family == AF_INET6 &&
+ IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
+ return (EAFNOSUPPORT);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_bind: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) {
+ if (IN6_IS_ADDR_UNSPECIFIED(&sin6p->sin6_addr))
+ inp->inp_vflag |= INP_IPV4;
+ else if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ error = in_pcbbind(inp, (struct sockaddr *)&sin,
+ td->td_ucred);
+ goto out;
+ }
+ }
+ error = in6_pcbbind(inp, nam, td->td_ucred);
+out:
+ TCPDEBUG2(PRU_BIND);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Prepare to accept connections.
+ */
+static int
+tcp_usr_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_listen: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0 && inp->inp_lport == 0)
+ error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error == 0) {
+ tp->t_state = TCPS_LISTEN;
+ solisten_proto(so, backlog);
+ tcp_gen_listen_open(tp);
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_listen(struct socket *so, int backlog, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_listen: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ SOCK_LOCK(so);
+ error = solisten_proto_check(so);
+ if (error == 0 && inp->inp_lport == 0) {
+ inp->inp_vflag &= ~INP_IPV4;
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0)
+ inp->inp_vflag |= INP_IPV4;
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ }
+ if (error == 0) {
+ tp->t_state = TCPS_LISTEN;
+ solisten_proto(so, backlog);
+ }
+ SOCK_UNLOCK(so);
+
+out:
+ TCPDEBUG2(PRU_LISTEN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate connection to peer.
+ * Create a template for use in transmissions on this connection.
+ * Enter SYN_SENT state, and mark socket as connecting.
+ * Start keep-alive timer, and seed output sequence space.
+ * Send initial segment on connection.
+ */
+static int
+tcp_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in *sinp;
+
+ sinp = (struct sockaddr_in *)nam;
+ if (nam->sa_len != sizeof (*sinp))
+ return (EINVAL);
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ if (sinp->sin_family == AF_INET
+ && IN_MULTICAST(ntohl(sinp->sin_addr.s_addr)))
+ return (EAFNOSUPPORT);
+ if (jailed(td->td_ucred))
+ prison_remote_ip(td->td_ucred, 0, &sinp->sin_addr.s_addr);
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_connect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if ((error = tcp_connect(tp, nam, td)) != 0)
+ goto out;
+ printf("calling tcp_gen_connect\n");
+
+ error = tcp_gen_connect(so, nam);
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+#ifdef INET6
+static int
+tcp6_usr_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ struct sockaddr_in6 *sin6p;
+
+ TCPDEBUG0;
+
+ sin6p = (struct sockaddr_in6 *)nam;
+ if (nam->sa_len != sizeof (*sin6p))
+ return (EINVAL);
+ /*
+ * Must disallow TCP ``connections'' to multicast addresses.
+ */
+ if (sin6p->sin6_family == AF_INET6
+ && IN6_IS_ADDR_MULTICAST(&sin6p->sin6_addr))
+ return (EAFNOSUPPORT);
+
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_connect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = EINVAL;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (IN6_IS_ADDR_V4MAPPED(&sin6p->sin6_addr)) {
+ struct sockaddr_in sin;
+
+ if ((inp->inp_flags & IN6P_IPV6_V6ONLY) != 0) {
+ error = EINVAL;
+ goto out;
+ }
+
+ in6_sin6_2_sin(&sin, sin6p);
+ inp->inp_vflag |= INP_IPV4;
+ inp->inp_vflag &= ~INP_IPV6;
+ if ((error = tcp_connect(tp, (struct sockaddr *)&sin, td)) != 0)
+ goto out;
+ error = tcp_gen_connect(so, nam);
+ goto out;
+ }
+ inp->inp_vflag &= ~INP_IPV4;
+ inp->inp_vflag |= INP_IPV6;
+ inp->inp_inc.inc_isipv6 = 1;
+ if ((error = tcp6_connect(tp, nam, td)) != 0)
+ goto out;
+ error = tcp_gen_connect(so, nam);
+
+out:
+ TCPDEBUG2(PRU_CONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+#endif /* INET6 */
+
+/*
+ * Initiate disconnect from peer.
+ * If connection never passed embryonic stage, just drop;
+ * else if don't need to let data drain, then can just drop anyways,
+ * else have to begin TCP shutdown process: mark socket disconnecting,
+ * drain unread data, state switch to reflect user close, and
+ * send segment (e.g. FIN) to peer. Socket will be really disconnected
+ * when peer sends FIN and acks ours.
+ *
+ * SHOULD IMPLEMENT LATER PRU_CONNECT VIA REALLOC TCPCB.
+ */
+static int
+tcp_usr_disconnect(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error = 0;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+out:
+ TCPDEBUG2(PRU_DISCONNECT);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+/*
+ * Accept a connection. Essentially all the work is
+ * done at higher levels; just return the address
+ * of the peer, storing through addr.
+ */
+static int
+tcp_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ int error = 0;
+ struct inpcb *inp = NULL;
+ struct tcpcb *tp = NULL;
+ struct in_addr addr;
+ in_port_t port = 0;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_accept: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in_getpeeraddr and COMMON_END here, so that we can
+ * copy the data of interest and defer the malloc until after we
+ * release the lock.
+ */
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_UNLOCK(inp);
+ if (error == 0)
+ *nam = in_sockaddr(port, &addr);
+ return error;
+}
+
+#ifdef INET6
+static int
+tcp6_usr_accept(struct socket *so, struct sockaddr **nam)
+{
+ struct inpcb *inp = NULL;
+ int error = 0;
+ struct tcpcb *tp = NULL;
+ struct in_addr addr;
+ struct in6_addr addr6;
+ in_port_t port = 0;
+ int v4 = 0;
+ TCPDEBUG0;
+
+ if (so->so_state & SS_ISDISCONNECTED)
+ return (ECONNABORTED);
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNABORTED;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+
+ /*
+ * We inline in6_mapped_peeraddr and COMMON_END here, so that we can
+ * copy the data of interest and defer the malloc until after we
+ * release the lock.
+ */
+ if (inp->inp_vflag & INP_IPV4) {
+ v4 = 1;
+ port = inp->inp_fport;
+ addr = inp->inp_faddr;
+ } else {
+ port = inp->inp_fport;
+ addr6 = inp->in6p_faddr;
+ }
+
+out:
+ TCPDEBUG2(PRU_ACCEPT);
+ INP_UNLOCK(inp);
+ if (error == 0) {
+ if (v4)
+ *nam = in6_v4mapsin6_sockaddr(port, &addr);
+ else
+ *nam = in6_sockaddr(port, &addr6);
+ }
+ return error;
+}
+#endif /* INET6 */
+
+/*
+ * Mark the connection as being incapable of further output.
+ */
+static int
+tcp_usr_shutdown(struct socket *so)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ INP_INFO_WLOCK(&tcbinfo);
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ socantsendmore(so);
+ tcp_usrclosed(tp);
+ error = tcp_gen_disconnect(tp);
+
+out:
+ TCPDEBUG2(PRU_SHUTDOWN);
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+
+ return (error);
+}
+
+/*
+ * After a receive, possibly send window update to peer.
+ */
+static int
+tcp_usr_rcvd(struct socket *so, int flags)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int error = 0;
+
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvd: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_gen_rcvd(tp);
+
+out:
+ TCPDEBUG2(PRU_RCVD);
+ INP_UNLOCK(inp);
+ return (error);
+}
+
+/*
+ * Do a send by putting data in output queue and updating urgent
+ * marker if URG set. Possibly send more data. Unlike the other
+ * pru_*() routines, the mbuf chains are our responsibility. We
+ * must either enqueue them or free them. The other pru_* routines
+ * generally are caller-frees.
+ */
+static int
+tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
+ struct sockaddr *nam, struct mbuf *control, struct thread *td)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ int headlocked = 0;
+#ifdef INET6
+ int isipv6;
+#endif
+ TCPDEBUG0;
+
+ /*
+ * We require the pcbinfo lock in two cases:
+ *
+ * (1) An implied connect is taking place, which can result in
+ * binding IPs and ports and hence modification of the pcb hash
+ * chains.
+ *
+ * (2) PRUS_EOF is set, resulting in explicit close on the send.
+ */
+ if ((nam != NULL) || (flags & PRUS_EOF)) {
+ INP_INFO_WLOCK(&tcbinfo);
+ headlocked = 1;
+ }
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ if (control)
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = ECONNRESET;
+ goto out;
+ }
+#ifdef INET6
+ isipv6 = nam && nam->sa_family == AF_INET6;
+#endif /* INET6 */
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if (control) {
+ /* TCP doesn't do control messages (rights, creds, etc) */
+ if (control->m_len) {
+ m_freem(control);
+ if (m)
+ m_freem(m);
+ error = EINVAL;
+ goto out;
+ }
+ m_freem(control); /* empty control, just free it */
+ }
+ if (!(flags & PRUS_OOB)) {
+ sbappendstream(&so->so_snd, m);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ }
+ if (flags & PRUS_EOF) {
+ /*
+ * Close the send side of the connection after
+ * the data is sent.
+ */
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ socantsendmore(so);
+ tcp_usrclosed(tp);
+ }
+ if (headlocked) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ }
+ if (tp != NULL) {
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags |= TF_MORETOCOME;
+ error = tcp_gen_send(tp);
+ if (flags & PRUS_MORETOCOME)
+ tp->t_flags &= ~TF_MORETOCOME;
+ }
+ } else {
+ /*
+ * XXXRW: PRUS_EOF not implemented with PRUS_OOB?
+ */
+ SOCKBUF_LOCK(&so->so_snd);
+ if (sbspace(&so->so_snd) < -512) {
+ SOCKBUF_UNLOCK(&so->so_snd);
+ m_freem(m);
+ error = ENOBUFS;
+ goto out;
+ }
+ /*
+ * According to RFC961 (Assigned Protocols),
+ * the urgent pointer points to the last octet
+ * of urgent data. We continue, however,
+ * to consider it to indicate the first octet
+ * of data past the urgent section.
+ * Otherwise, snd_up should be one lower.
+ */
+ sbappendstream_locked(&so->so_snd, m);
+ SOCKBUF_UNLOCK(&so->so_snd);
+ if (nam && tp->t_state < TCPS_SYN_SENT) {
+ /*
+ * Do implied connect if not yet connected,
+ * initialize window to default value, and
+ * initialize maxseg/maxopd using peer's cached
+ * MSS.
+ */
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+#ifdef INET6
+ if (isipv6)
+ error = tcp6_connect(tp, nam, td);
+ else
+#endif /* INET6 */
+ error = tcp_connect(tp, nam, td);
+ if (error)
+ goto out;
+ tp->snd_wnd = TTCP_CLIENT_SND_WND;
+ tcp_mss(tp, -1);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ } else if (nam) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ headlocked = 0;
+ }
+ tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
+ tp->t_flags |= TF_FORCEDATA;
+ error = tcp_gen_send(tp);
+ tp->t_flags &= ~TF_FORCEDATA;
+ }
+out:
+ TCPDEBUG2((flags & PRUS_OOB) ? PRU_SENDOOB :
+ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND));
+ INP_UNLOCK(inp);
+ if (headlocked)
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+}
+
+/*
+ * Abort the TCP. Drop the connection abruptly.
+ */
+static void
+tcp_usr_abort(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL"));
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_abort: inp_socket == NULL"));
+
+ /*
+ * If we still have full TCP state, and we're not dropped, drop.
+ */
+ if (!(inp->inp_vflag & INP_TIMEWAIT) &&
+ !(inp->inp_vflag & INP_DROPPED)) {
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ cxgb_tcp_drop(tp, ECONNABORTED);
+ TCPDEBUG2(PRU_ABORT);
+ }
+ if (!(inp->inp_vflag & INP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_vflag |= INP_SOCKREF;
+ }
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * TCP socket is closed. Start friendly disconnect.
+ */
+static void
+tcp_usr_close(struct socket *so)
+{
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+ TCPDEBUG0;
+
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL"));
+
+ INP_INFO_WLOCK(&tcbinfo);
+ INP_LOCK(inp);
+ KASSERT(inp->inp_socket != NULL,
+ ("tcp_usr_close: inp_socket == NULL"));
+
+ /*
+ * If we still have full TCP state, and we're not dropped, initiate
+ * a disconnect.
+ */
+ if (!(inp->inp_vflag & INP_TIMEWAIT) &&
+ !(inp->inp_vflag & INP_DROPPED)) {
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ tcp_disconnect(tp);
+ TCPDEBUG2(PRU_CLOSE);
+ }
+ if (!(inp->inp_vflag & INP_DROPPED)) {
+ SOCK_LOCK(so);
+ so->so_state |= SS_PROTOREF;
+ SOCK_UNLOCK(so);
+ inp->inp_vflag |= INP_SOCKREF;
+ }
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+}
+
+/*
+ * Receive out-of-band data.
+ */
+static int
+tcp_usr_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ int error = 0;
+ struct inpcb *inp;
+ struct tcpcb *tp = NULL;
+
+ TCPDEBUG0;
+ inp = sotoinpcb(so);
+ KASSERT(inp != NULL, ("tcp_usr_rcvoob: inp == NULL"));
+ INP_LOCK(inp);
+ if (inp->inp_vflag & (INP_TIMEWAIT | INP_DROPPED)) {
+ error = ECONNRESET;
+ goto out;
+ }
+ tp = intotcpcb(inp);
+ TCPDEBUG1();
+ if ((so->so_oobmark == 0 &&
+ (so->so_rcv.sb_state & SBS_RCVATMARK) == 0) ||
+ so->so_options & SO_OOBINLINE ||
+ tp->t_oobflags & TCPOOB_HADDATA) {
+ error = EINVAL;
+ goto out;
+ }
+ if ((tp->t_oobflags & TCPOOB_HAVEDATA) == 0) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ m->m_len = 1;
+ *mtod(m, caddr_t) = tp->t_iobc;
+ if ((flags & MSG_PEEK) == 0)
+ tp->t_oobflags ^= (TCPOOB_HAVEDATA | TCPOOB_HADDATA);
+
+out:
+ TCPDEBUG2(PRU_RCVOOB);
+ INP_UNLOCK(inp);
+ return (error);
+}
+
+struct pr_usrreqs cxgb_tcp_usrreqs = {
+ .pru_abort = tcp_usr_abort,
+ .pru_accept = tcp_usr_accept,
+ .pru_attach = tcp_usr_attach,
+ .pru_bind = tcp_usr_bind,
+ .pru_connect = tcp_usr_connect,
+ .pru_control = in_control,
+ .pru_detach = tcp_usr_detach,
+ .pru_disconnect = tcp_usr_disconnect,
+ .pru_listen = tcp_usr_listen,
+ .pru_peeraddr = in_getpeeraddr,
+ .pru_rcvd = tcp_usr_rcvd,
+ .pru_rcvoob = tcp_usr_rcvoob,
+ .pru_send = tcp_usr_send,
+ .pru_shutdown = tcp_usr_shutdown,
+ .pru_sockaddr = in_getsockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = tcp_usr_close,
+};
+
+#ifdef INET6
+struct pr_usrreqs cxgb_tcp6_usrreqs = {
+ .pru_abort = tcp_usr_abort,
+ .pru_accept = tcp6_usr_accept,
+ .pru_attach = tcp_usr_attach,
+ .pru_bind = tcp6_usr_bind,
+ .pru_connect = tcp6_usr_connect,
+ .pru_control = in6_control,
+ .pru_detach = tcp_usr_detach,
+ .pru_disconnect = tcp_usr_disconnect,
+ .pru_listen = tcp6_usr_listen,
+ .pru_peeraddr = in6_mapped_peeraddr,
+ .pru_rcvd = tcp_usr_rcvd,
+ .pru_rcvoob = tcp_usr_rcvoob,
+ .pru_send = tcp_usr_send,
+ .pru_shutdown = tcp_usr_shutdown,
+ .pru_sockaddr = in6_mapped_sockaddr,
+ .pru_sosetlabel = in_pcbsosetlabel,
+ .pru_close = tcp_usr_close,
+};
+#endif /* INET6 */
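These tables substitute the offload-aware entry points wherever the socket layer dispatches through pr_usrreqs; the actual registration lives in cxgb_tcp_subr.c rather than this file. Schematically the wiring amounts to something like the sketch below, with the flag values assumed from the stock TCP protosw.

    /* Sketch only; the real definition is in cxgb_tcp_subr.c. */
    struct protosw cxgb_tcp_protosw = {
    	.pr_type =	SOCK_STREAM,
    	.pr_protocol =	IPPROTO_TCP,
    	.pr_flags =	PR_CONNREQUIRED | PR_IMPLOPCL | PR_WANTRCVD,
    	.pr_usrreqs =	&cxgb_tcp_usrreqs,
    };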
+
+/*
+ * Common subroutine to open a TCP connection to remote host specified
+ * by struct sockaddr_in in mbuf *nam. Call in_pcbbind to assign a local
+ * port number if needed. Call in_pcbconnect_setup to do the routing and
+ * to choose a local host address (interface). If there is an existing
+ * incarnation of the same connection in TIME-WAIT state and if the remote
+ * host was sending CC options and if the connection duration was < MSL, then
+ * truncate the previous TIME-WAIT state and proceed.
+ * Initialize connection parameters and enter SYN-SENT state.
+ */
+static int
+tcp_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct in_addr laddr;
+ u_short lport;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (inp->inp_lport == 0) {
+ error = in_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ */
+ laddr = inp->inp_laddr;
+ lport = inp->inp_lport;
+ error = in_pcbconnect_setup(inp, nam, &laddr.s_addr, &lport,
+ &inp->inp_faddr.s_addr, &inp->inp_fport, &oinp, td->td_ucred);
+ if (error && oinp == NULL)
+ return error;
+ if (oinp)
+ return EADDRINUSE;
+ inp->inp_laddr = laddr;
+ in_pcbrehash(inp);
+
+ /*
+ * Compute window scaling to request:
+ * Scale to fit into sweet spot. See tcp_syncache.c.
+ * XXX: This should move to tcp_output().
+ */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < sb_max)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ tcpstat.tcps_connattempt++;
+ tp->t_state = TCPS_SYN_SENT;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
+ tcp_sendseqinit(tp);
+
+ return 0;
+}
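To make the window-scale loop concrete: assuming sb_max holds its traditional default of 262144 bytes, the loop stops at request_r_scale = 3, because 65535 << 2 = 262140 is still below sb_max while 65535 << 3 = 524280 is not. The same computation as a standalone program:

    #include <stdio.h>

    #define TCP_MAXWIN		65535
    #define TCP_MAX_WINSHIFT	14

    int
    main(void)
    {
    	unsigned long sb_max = 262144;	/* assumed kern.ipc.maxsockbuf */
    	int scale = 0;

    	while (scale < TCP_MAX_WINSHIFT &&
    	    ((unsigned long)TCP_MAXWIN << scale) < sb_max)
    		scale++;
    	printf("request_r_scale = %d\n", scale);	/* prints 3 here */
    	return (0);
    }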
+
+#ifdef INET6
+static int
+tcp6_connect(struct tcpcb *tp, struct sockaddr *nam, struct thread *td)
+{
+ struct inpcb *inp = tp->t_inpcb, *oinp;
+ struct socket *so = inp->inp_socket;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)nam;
+ struct in6_addr *addr6;
+ int error;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ if (inp->inp_lport == 0) {
+ error = in6_pcbbind(inp, (struct sockaddr *)0, td->td_ucred);
+ if (error)
+ return error;
+ }
+
+ /*
+ * Cannot simply call in_pcbconnect, because there might be an
+ * earlier incarnation of this same connection still in
+ * TIME_WAIT state, creating an ADDRINUSE error.
+ * in6_pcbladdr() also handles scope zone IDs.
+ */
+ error = in6_pcbladdr(inp, nam, &addr6);
+ if (error)
+ return error;
+ oinp = in6_pcblookup_hash(inp->inp_pcbinfo,
+ &sin6->sin6_addr, sin6->sin6_port,
+ IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
+ ? addr6
+ : &inp->in6p_laddr,
+ inp->inp_lport, 0, NULL);
+ if (oinp)
+ return EADDRINUSE;
+ if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr))
+ inp->in6p_laddr = *addr6;
+ inp->in6p_faddr = sin6->sin6_addr;
+ inp->inp_fport = sin6->sin6_port;
+ /* update flowinfo - draft-itojun-ipv6-flowlabel-api-00 */
+ inp->in6p_flowinfo &= ~IPV6_FLOWLABEL_MASK;
+ if (inp->in6p_flags & IN6P_AUTOFLOWLABEL)
+ inp->in6p_flowinfo |=
+ (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
+ in_pcbrehash(inp);
+
+ /* Compute window scaling to request. */
+ while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
+ (TCP_MAXWIN << tp->request_r_scale) < so->so_rcv.sb_hiwat)
+ tp->request_r_scale++;
+
+ soisconnecting(so);
+ tcpstat.tcps_connattempt++;
+ tp->t_state = TCPS_SYN_SENT;
+ tcp_timer_activate(tp, TT_KEEP, tcp_keepinit);
+ tp->iss = tcp_new_isn(tp);
+ tp->t_bw_rtseq = tp->iss;
+ tcp_sendseqinit(tp);
+
+ return 0;
+}
+#endif /* INET6 */
+
+/*
+ * tcp_sendspace and tcp_recvspace are the default send and receive window
+ * sizes, respectively. These are obsolescent (this information should
+ * be set by the route).
+ */
+u_long tcp_sendspace = 1024*32;
+SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_SENDSPACE, sendspace, CTLFLAG_RW,
+ &tcp_sendspace , 0, "Maximum outgoing TCP datagram size");
+u_long tcp_recvspace = 1024*64;
+SYSCTL_ULONG(_net_inet_tcp_cxgb, TCPCTL_RECVSPACE, recvspace, CTLFLAG_RW,
+ &tcp_recvspace , 0, "Maximum incoming TCP datagram size");
+
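Both knobs are CTLFLAG_RW, so they can be retuned on a running system (presumably as net.inet.tcp.cxgb.sendspace and net.inet.tcp.cxgb.recvspace, assuming the node attaches under net.inet.tcp). Note that tcp_attach() below samples them in soreserve() at socket creation, so a change only affects sockets opened afterwards.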
+/*
+ * Attach TCP protocol to socket, allocating
+ * internet protocol control block, tcp control block,
+ * buffer space, and entering LISTEN state if it is to accept connections.
+ */
+static int
+tcp_attach(struct socket *so)
+{
+ struct tcpcb *tp;
+ struct inpcb *inp;
+ int error;
+#ifdef INET6
+ int isipv6 = INP_CHECK_SOCKAF(so, AF_INET6) != 0;
+#endif
+
+ if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+ error = soreserve(so, tcp_sendspace, tcp_recvspace);
+ if (error)
+ return (error);
+ }
+ so->so_rcv.sb_flags |= SB_AUTOSIZE;
+ so->so_snd.sb_flags |= SB_AUTOSIZE;
+ INP_INFO_WLOCK(&tcbinfo);
+ error = in_pcballoc(so, &tcbinfo);
+ if (error) {
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (error);
+ }
+ inp = sotoinpcb(so);
+#ifdef INET6
+ if (isipv6) {
+ inp->inp_vflag |= INP_IPV6;
+ inp->in6p_hops = -1; /* use kernel default */
+ }
+ else
+#endif
+ inp->inp_vflag |= INP_IPV4;
+ tp = tcp_newtcpcb(inp);
+ if (tp == NULL) {
+#ifdef INET6
+ if (isipv6) {
+ in6_pcbdetach(inp);
+ in6_pcbfree(inp);
+ } else {
+#endif
+ in_pcbdetach(inp);
+ in_pcbfree(inp);
+#ifdef INET6
+ }
+#endif
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (ENOBUFS);
+ }
+ tp->t_state = TCPS_CLOSED;
+ INP_UNLOCK(inp);
+ INP_INFO_WUNLOCK(&tcbinfo);
+ return (0);
+}
+
+/*
+ * Initiate (or continue) disconnect.
+ * If embryonic state, just send reset (once).
+ * If in ``let data drain'' option and linger null, just drop.
+ * Otherwise (hard), mark socket disconnecting and drop
+ * current input data; switch states based on user close, and
+ * send segment to peer (with FIN).
+ */
+static void
+tcp_disconnect(struct tcpcb *tp)
+{
+ struct inpcb *inp = tp->t_inpcb;
+ struct socket *so = inp->inp_socket;
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(inp);
+
+ /*
+ * Neither tcp_close() nor tcp_drop() should return NULL, as the
+ * socket is still open.
+ */
+ if (tp->t_state < TCPS_ESTABLISHED) {
+ tp = cxgb_tcp_close(tp);
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_close() returned NULL"));
+ } else if ((so->so_options & SO_LINGER) && so->so_linger == 0) {
+ tp = cxgb_tcp_drop(tp, 0);
+ KASSERT(tp != NULL,
+ ("tcp_disconnect: tcp_drop() returned NULL"));
+ } else {
+ soisdisconnecting(so);
+ sbflush(&so->so_rcv);
+ tcp_usrclosed(tp);
+ if (!(inp->inp_vflag & INP_DROPPED))
+ tcp_gen_disconnect(tp);
+ }
+}
+
+/*
+ * User issued close, and wish to trail through shutdown states:
+ * if never received SYN, just forget it. If got a SYN from peer,
+ * but haven't sent FIN, then go to FIN_WAIT_1 state to send peer a FIN.
+ * If already got a FIN from peer, then almost done; go to LAST_ACK
+ * state. In all other cases, have already sent FIN to peer (e.g.
+ * after PRU_SHUTDOWN), and just have to play tedious game waiting
+ * for peer to send FIN or not respond to keep-alives, etc.
+ * We can let the user exit from the close as soon as the FIN is acked.
+ */
+static void
+tcp_usrclosed(struct tcpcb *tp)
+{
+
+ INP_INFO_WLOCK_ASSERT(&tcbinfo);
+ INP_LOCK_ASSERT(tp->t_inpcb);
+
+ switch (tp->t_state) {
+ case TCPS_LISTEN:
+ tcp_gen_listen_close(tp);
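+ /* FALLTHROUGH */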
+ case TCPS_CLOSED:
+ tp->t_state = TCPS_CLOSED;
+ tp = cxgb_tcp_close(tp);
+ /*
+ * tcp_close() should never return NULL here as the socket is
+ * still open.
+ */
+ KASSERT(tp != NULL,
+ ("tcp_usrclosed: tcp_close() returned NULL"));
+ break;
+
+ case TCPS_SYN_SENT:
+ case TCPS_SYN_RECEIVED:
+ tp->t_flags |= TF_NEEDFIN;
+ break;
+
+ case TCPS_ESTABLISHED:
+ tp->t_state = TCPS_FIN_WAIT_1;
+ break;
+
+ case TCPS_CLOSE_WAIT:
+ tp->t_state = TCPS_LAST_ACK;
+ break;
+ }
+ if (tp->t_state >= TCPS_FIN_WAIT_2) {
+ soisdisconnected(tp->t_inpcb->inp_socket);
+ /* Prevent the connection hanging in FIN_WAIT_2 forever. */
+ if (tp->t_state == TCPS_FIN_WAIT_2) {
+ int timeout;
+
+ timeout = (tcp_fast_finwait2_recycle) ?
+ tcp_finwait2_timeout : tcp_maxidle;
+ tcp_timer_activate(tp, TT_2MSL, timeout);
+ }
+ }
+}
diff --git a/sys/dev/cxgb/ulp/tom/cxgb_tom.c b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
index 2dc6150..b5b87b7 100644
--- a/sys/dev/cxgb/ulp/tom/cxgb_tom.c
+++ b/sys/dev/cxgb/ulp/tom/cxgb_tom.c
@@ -60,7 +60,7 @@ __FBSDID("$FreeBSD$");
#include <netinet/in_pcb.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
-#include <netinet/tcp_ofld.h>
+#include <netinet/tcp_offload.h>
#include <netinet/tcp_fsm.h>
#include <net/route.h>
@@ -77,6 +77,8 @@ __FBSDID("$FreeBSD$");
#include <dev/cxgb/ulp/tom/cxgb_defs.h>
#include <dev/cxgb/ulp/tom/cxgb_t3_ddp.h>
#include <dev/cxgb/ulp/tom/cxgb_toepcb.h>
+#include <dev/cxgb/ulp/tom/cxgb_tcp.h>
+
static int activated = 1;
TUNABLE_INT("hw.t3toe.activated", &activated);
@@ -177,6 +179,8 @@ toepcb_release(struct toepcb *toep)
static void
t3cdev_add(struct tom_data *t)
{
+ printf("t3cdev_add\n");
+
mtx_lock(&cxgb_list_lock);
TAILQ_INSERT_TAIL(&cxgb_list, t, entry);
mtx_unlock(&cxgb_list_lock);
@@ -187,7 +191,8 @@ t3cdev_add(struct tom_data *t)
* initialize its cpl_handlers
* and register it as a T3C client
*/
-static void t3c_tom_add(struct t3cdev *cdev)
+static void
+t3c_tom_add(struct t3cdev *cdev)
{
int i;
unsigned int wr_len;
@@ -195,9 +200,12 @@ static void t3c_tom_add(struct t3cdev *cdev)
struct toedev *tdev;
struct adap_ports *port_info;
+ printf("%s called\n", __FUNCTION__);
+
+
t = malloc(sizeof(*t), M_CXGB, M_NOWAIT|M_ZERO);
- if (!t)
+ if (t == NULL)
return;
if (cdev->ctl(cdev, GET_WR_LEN, &wr_len) < 0)
@@ -226,11 +234,15 @@ static void t3c_tom_add(struct t3cdev *cdev)
}
TOM_DATA(tdev) = t;
+ printf("nports=%d\n", port_info->nports);
for (i = 0; i < port_info->nports; i++) {
struct ifnet *ifp = port_info->lldevs[i];
TOEDEV(ifp) = tdev;
+
+ printf("enabling toe on %p\n", ifp);
- ifp->if_capabilities |= IFCAP_TOE;
+ ifp->if_capabilities |= IFCAP_TOE4;
+ ifp->if_capenable |= IFCAP_TOE4;
}
t->ports = port_info;
@@ -242,8 +254,10 @@ static void t3c_tom_add(struct t3cdev *cdev)
return;
out_free_all:
+ printf("out_free_all fail\n");
free(port_info, M_CXGB);
out_free_tom:
+ printf("out_free_tom fail\n");
free(t, M_CXGB);
return;
}
@@ -293,8 +307,8 @@ can_offload(struct toedev *dev, struct socket *so)
atomic_load_acq_int(&t->tids_in_use) + t->atids_in_use < tomd->conf.max_conn);
}
-
-static int tom_ctl(struct toedev *dev, unsigned int req, void *data)
+static int
+tom_ctl(struct toedev *dev, unsigned int req, void *data)
{
struct tom_data *t = TOM_DATA(dev);
struct t3cdev *cdev = t->cdev;
@@ -377,32 +391,33 @@ t3_toe_attach(struct toedev *dev, const struct offload_id *entry)
}
static void
-cxgb_toe_listen(void *unused, int event, struct tcpcb *tp)
+cxgb_toe_listen_start(void *unused, struct tcpcb *tp)
{
struct socket *so = tp->t_inpcb->inp_socket;
struct tom_data *p;
+
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_FOREACH(p, &cxgb_list, entry) {
+ t3_listen_start(&p->tdev, so, p->cdev);
+ }
+ mtx_unlock(&cxgb_list_lock);
+}
- switch (event) {
- case OFLD_LISTEN_OPEN:
- case OFLD_LISTEN_CLOSE:
- mtx_lock(&cxgb_list_lock);
- TAILQ_FOREACH(p, &cxgb_list, entry) {
- if (event == OFLD_LISTEN_OPEN)
- t3_listen_start(&p->tdev, so, p->cdev);
- else if (tp->t_state == TCPS_LISTEN) {
- printf("stopping listen on port=%d\n",
- ntohs(tp->t_inpcb->inp_lport));
-
- t3_listen_stop(&p->tdev, so, p->cdev);
- }
-
+static void
+cxgb_toe_listen_stop(void *unused, struct tcpcb *tp)
+{
+ struct socket *so = tp->t_inpcb->inp_socket;
+ struct tom_data *p;
+
+ mtx_lock(&cxgb_list_lock);
+ TAILQ_FOREACH(p, &cxgb_list, entry) {
+ if (tp->t_state == TCPS_LISTEN) {
+ printf("stopping listen on port=%d\n",
+ ntohs(tp->t_inpcb->inp_lport));
+ t3_listen_stop(&p->tdev, so, p->cdev);
}
- mtx_unlock(&cxgb_list_lock);
- break;
- default:
- log(LOG_ERR, "unrecognized listen event %d\n", event);
- break;
}
+ mtx_unlock(&cxgb_list_lock);
}
static void
@@ -416,7 +431,7 @@ cxgb_register_listeners(void)
tp = intotcpcb(inp);
if (tp->t_state == TCPS_LISTEN)
- cxgb_toe_listen(NULL, OFLD_LISTEN_OPEN, tp);
+ cxgb_toe_listen_start(NULL, tp);
}
INP_INFO_RUNLOCK(&tcbinfo);
}
@@ -450,12 +465,19 @@ t3_tom_init(void)
"Unable to register Chelsio T3 TCP offload module.\n");
return -1;
}
+ INP_INFO_WLOCK(&tcbinfo);
+
+ INP_INFO_WUNLOCK(&tcbinfo);
mtx_init(&cxgb_list_lock, "cxgb tom list", NULL, MTX_DEF);
- listen_tag = EVENTHANDLER_REGISTER(ofld_listen, cxgb_toe_listen, NULL, EVENTHANDLER_PRI_ANY);
+ listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_start,
+ cxgb_toe_listen_start, NULL, EVENTHANDLER_PRI_ANY);
+ listen_tag = EVENTHANDLER_REGISTER(tcp_offload_listen_stop,
+ cxgb_toe_listen_stop, NULL, EVENTHANDLER_PRI_ANY);
TAILQ_INIT(&cxgb_list);
/* Register to offloading devices */
+ printf("setting add to %p\n", t3c_tom_add);
t3c_tom_client.add = t3c_tom_add;
cxgb_register_client(&t3c_tom_client);
cxgb_register_listeners();
diff --git a/sys/modules/cxgb/Makefile b/sys/modules/cxgb/Makefile
index 120cc9b..ef633e7 100644
--- a/sys/modules/cxgb/Makefile
+++ b/sys/modules/cxgb/Makefile
@@ -1,7 +1,7 @@
# $FreeBSD$
SUBDIR= cxgb
SUBDIR+= toecore
-#SUBDIR+= tom
+SUBDIR+= tom
#SUBDIR+= iw_cxgb
.include <bsd.subdir.mk>
diff --git a/sys/modules/cxgb/cxgb/Makefile b/sys/modules/cxgb/cxgb/Makefile
index b8455f1..1f41ac2 100644
--- a/sys/modules/cxgb/cxgb/Makefile
+++ b/sys/modules/cxgb/cxgb/Makefile
@@ -8,11 +8,11 @@ SRCS= cxgb_mc5.c cxgb_vsc8211.c cxgb_ael1002.c cxgb_mv88e1xxx.c
SRCS+= cxgb_xgmac.c cxgb_vsc7323.c cxgb_t3_hw.c cxgb_main.c
SRCS+= cxgb_sge.c cxgb_lro.c cxgb_offload.c cxgb_l2t.c
SRCS+= device_if.h bus_if.h pci_if.h opt_zero.h opt_sched.h
-SRCS+= uipc_mvec.c
-#SRCS+= cxgb_multiq.c cxgb_support.c
+SRCS+= uipc_mvec.c cxgb_support.c
+#SRCS+= cxgb_multiq.c
CFLAGS+= -DCONFIG_CHELSIO_T3_CORE -g -DCONFIG_DEFINED -DDEFAULT_JUMBO -I${CXGB} -DSMP
-CFLAGS+= -DDISABLE_MBUF_IOVEC
+#CFLAGS+= -DDISABLE_MBUF_IOVEC
#CFLAGS+= -DIFNET_MULTIQUEUE
#CFLAGS+= -DINVARIANT_SUPPORT -DINVARIANTS
#CFLAGS+= -DWITNESS
diff --git a/sys/modules/cxgb/tom/Makefile b/sys/modules/cxgb/tom/Makefile
index ece891ce..ba02b91 100644
--- a/sys/modules/cxgb/tom/Makefile
+++ b/sys/modules/cxgb/tom/Makefile
@@ -4,5 +4,9 @@ TOM = ${.CURDIR}/../../../dev/cxgb/ulp/tom
KMOD= tom
SRCS= cxgb_tom.c cxgb_cpl_io.c cxgb_listen.c cxgb_tom_sysctl.c cxgb_cpl_socket.c
-SRCS+= device_if.h bus_if.h pci_if.h
-.include <bsd.kmod.mk> \ No newline at end of file
+SRCS+= cxgb_tcp_subr.c cxgb_tcp_usrreq.c
+SRCS+= opt_compat.h opt_inet.h opt_inet6.h opt_ipsec.h opt_mac.h opt_tcpdebug.h opt_ddb.h
+SRCS+= device_if.h bus_if.h pci_if.h
+
+#CFLAGS+= -DDEBUG_PRINT -DDEBUG
+.include <bsd.kmod.mk>
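With the tom subdirectory re-enabled and the new sources listed, the module builds along with the rest of the cxgb tree; at runtime it is expected to be loaded after the cxgb and toecore modules, since t3_tom_init() registers itself as a T3C client with the core driver and then walks the existing listeners.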