summaryrefslogtreecommitdiffstats
path: root/sys/dev/re
diff options
context:
space:
mode:
authoryongari <yongari@FreeBSD.org>2011-01-26 20:25:40 +0000
committeryongari <yongari@FreeBSD.org>2011-01-26 20:25:40 +0000
commit4f8ca18133344fa25bf06dfe587c6938f88386e6 (patch)
tree6e16f32f571377a15da3d85055e95cf7f0d4ea0f /sys/dev/re
parent1e124ec538abd89755a07370b3e298c5d4e275bc (diff)
downloadFreeBSD-src-4f8ca18133344fa25bf06dfe587c6938f88386e6.zip
FreeBSD-src-4f8ca18133344fa25bf06dfe587c6938f88386e6.tar.gz
Do not use interrupt taskqueue on controllers with MSI/MSI-X
capability. One of the reasons for using an interrupt taskqueue in re(4) was to reduce the number of TX/RX interrupts under load, because re(4) controllers have no good TX/RX interrupt moderation mechanism. Basic TX interrupt moderation is done by hardware for most controllers, but RX interrupt moderation through an undocumented register showed poor RX performance, so it was disabled in r215025. Using a taskqueue to handle RX interrupts greatly reduced the number of interrupts, but re(4) consumed all available CPU cycles to run the taskqueue under high TX/RX network load. This can happen even with the RTL810x fast ethernet controller, and I believe this is not acceptable for most systems. To mitigate the issue, use the one-shot timer register to moderate RX interrupts. The timer register provides a programmable one-shot timer and can be used to suppress interrupt generation. The timer runs at 125MHz on PCIe controllers, so the minimum time allowed for the timer is 8ns. The data sheet says the register is 32 bits wide, but experimentation shows only the lower 13 bits are valid, so the maximum time that can be programmed is 65.528us. This yields a theoretical maximum of about 15260 RX interrupts per second. Combined with TX completion interrupts, re(4) should generate fewer than 20k interrupts per second. This number is still slightly high compared to other intelligent ethernet controllers, but the system is very responsive even under high network load. Introduce the sysctl variable dev.re.%d.int_rx_mod, which controls the amount of time to delay RX interrupt processing, in units of us. A value of 0 completely disables RX interrupt moderation. To provide the old behavior for controllers that have MSI/MSI-X capability, introduce a new tunable, hw.re.intr_filter. If the tunable is set to a non-zero value, the driver will use the interrupt taskqueue. The default value of the tunable is 0. This tunable has no effect on controllers that have no MSI/MSI-X capability or if MSI/MSI-X is explicitly disabled by the administrator.
While I'm here, clean up interrupt setup/teardown since re(4) uses a single MSI/MSI-X message at this moment.
Diffstat (limited to 'sys/dev/re')
-rw-r--r--sys/dev/re/if_re.c207
1 files changed, 175 insertions, 32 deletions
diff --git a/sys/dev/re/if_re.c b/sys/dev/re/if_re.c
index 5708e98..e8ae822 100644
--- a/sys/dev/re/if_re.c
+++ b/sys/dev/re/if_re.c
@@ -157,6 +157,8 @@ MODULE_DEPEND(re, miibus, 1, 1, 1);
#include "miibus_if.h"
/* Tunables. */
+static int intr_filter = 0;
+TUNABLE_INT("hw.re.intr_filter", &intr_filter);
static int msi_disable = 0;
TUNABLE_INT("hw.re.msi_disable", &msi_disable);
static int msix_disable = 0;
@@ -253,6 +255,7 @@ static int re_poll (struct ifnet *, enum poll_cmd, int);
static int re_poll_locked (struct ifnet *, enum poll_cmd, int);
#endif
static int re_intr (void *);
+static void re_intr_msi (void *);
static void re_tick (void *);
static void re_int_task (void *, int);
static void re_start (struct ifnet *);
@@ -290,6 +293,8 @@ static int re_diag (struct rl_softc *);
static void re_add_sysctls (struct rl_softc *);
static int re_sysctl_stats (SYSCTL_HANDLER_ARGS);
+static int sysctl_int_range (SYSCTL_HANDLER_ARGS, int, int);
+static int sysctl_hw_re_int_mod (SYSCTL_HANDLER_ARGS);
static device_method_t re_methods[] = {
/* Device interface */
@@ -1574,19 +1579,19 @@ re_attach(device_t dev)
}
#endif
+#ifdef RE_TX_MODERATION
+ intr_filter = 1;
+#endif
/* Hook interrupt last to avoid having to lock softc */
- if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
+ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 &&
+ intr_filter == 0) {
+ error = bus_setup_intr(dev, sc->rl_irq[0],
+ INTR_TYPE_NET | INTR_MPSAFE, NULL, re_intr_msi, sc,
+ &sc->rl_intrhand[0]);
+ } else {
error = bus_setup_intr(dev, sc->rl_irq[0],
INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc,
&sc->rl_intrhand[0]);
- else {
- for (i = 0; i < RL_MSI_MESSAGES; i++) {
- error = bus_setup_intr(dev, sc->rl_irq[i],
- INTR_TYPE_NET | INTR_MPSAFE, re_intr, NULL, sc,
- &sc->rl_intrhand[i]);
- if (error != 0)
- break;
- }
}
if (error) {
device_printf(dev, "couldn't set up irq\n");
@@ -1657,31 +1662,22 @@ re_detach(device_t dev)
* stopped here.
*/
- for (i = 0; i < RL_MSI_MESSAGES; i++) {
- if (sc->rl_intrhand[i] != NULL) {
- bus_teardown_intr(dev, sc->rl_irq[i],
- sc->rl_intrhand[i]);
- sc->rl_intrhand[i] = NULL;
- }
+ if (sc->rl_intrhand[0] != NULL) {
+ bus_teardown_intr(dev, sc->rl_irq[0], sc->rl_intrhand[0]);
+ sc->rl_intrhand[0] = NULL;
}
if (ifp != NULL)
if_free(ifp);
- if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0) {
- if (sc->rl_irq[0] != NULL) {
- bus_release_resource(dev, SYS_RES_IRQ, 0,
- sc->rl_irq[0]);
- sc->rl_irq[0] = NULL;
- }
- } else {
- for (i = 0, rid = 1; i < RL_MSI_MESSAGES; i++, rid++) {
- if (sc->rl_irq[i] != NULL) {
- bus_release_resource(dev, SYS_RES_IRQ, rid,
- sc->rl_irq[i]);
- sc->rl_irq[i] = NULL;
- }
- }
- pci_release_msi(dev);
+ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
+ rid = 0;
+ else
+ rid = 1;
+ if (sc->rl_irq[0] != NULL) {
+ bus_release_resource(dev, SYS_RES_IRQ, rid, sc->rl_irq[0]);
+ sc->rl_irq[0] = NULL;
}
+ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0)
+ pci_release_msi(dev);
if (sc->rl_res_pba) {
rid = PCIR_BAR(4);
bus_release_resource(dev, SYS_RES_MEMORY, rid, sc->rl_res_pba);
@@ -1970,6 +1966,7 @@ re_rx_list_init(struct rl_softc *sc)
sc->rl_ldata.rl_rx_prodidx = 0;
sc->rl_head = sc->rl_tail = NULL;
+ sc->rl_int_rx_act = 0;
return (0);
}
@@ -1993,6 +1990,7 @@ re_jrx_list_init(struct rl_softc *sc)
sc->rl_ldata.rl_rx_prodidx = 0;
sc->rl_head = sc->rl_tail = NULL;
+ sc->rl_int_rx_act = 0;
return (0);
}
@@ -2478,6 +2476,87 @@ re_int_task(void *arg, int npending)
CSR_WRITE_2(sc, RL_IMR, RL_INTRS_CPLUS);
}
+static void
+re_intr_msi(void *xsc)
+{
+ struct rl_softc *sc;
+ struct ifnet *ifp;
+ uint16_t intrs, status;
+
+ sc = xsc;
+ RL_LOCK(sc);
+
+ ifp = sc->rl_ifp;
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ RL_UNLOCK(sc);
+ return;
+ }
+#endif
+ /* Disable interrupts. */
+ CSR_WRITE_2(sc, RL_IMR, 0);
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ RL_UNLOCK(sc);
+ return;
+ }
+
+ intrs = RL_INTRS_CPLUS;
+ status = CSR_READ_2(sc, RL_ISR);
+ CSR_WRITE_2(sc, RL_ISR, status);
+ if (sc->rl_int_rx_act > 0) {
+ intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW |
+ RL_ISR_RX_OVERRUN);
+ status &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR | RL_ISR_FIFO_OFLOW |
+ RL_ISR_RX_OVERRUN);
+ }
+
+ if (status & (RL_ISR_TIMEOUT_EXPIRED | RL_ISR_RX_OK | RL_ISR_RX_ERR |
+ RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) {
+ re_rxeof(sc, NULL);
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
+ if (sc->rl_int_rx_mod != 0 &&
+ (status & (RL_ISR_RX_OK | RL_ISR_RX_ERR |
+ RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN)) != 0) {
+ /* Rearm one-shot timer. */
+ CSR_WRITE_4(sc, RL_TIMERCNT, 1);
+ intrs &= ~(RL_ISR_RX_OK | RL_ISR_RX_ERR |
+ RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN);
+ sc->rl_int_rx_act = 1;
+ } else {
+ intrs |= RL_ISR_RX_OK | RL_ISR_RX_ERR |
+ RL_ISR_FIFO_OFLOW | RL_ISR_RX_OVERRUN;
+ sc->rl_int_rx_act = 0;
+ }
+ }
+ }
+
+ /*
+ * Some chips will ignore a second TX request issued
+ * while an existing transmission is in progress. If
+ * the transmitter goes idle but there are still
+ * packets waiting to be sent, we need to restart the
+ * channel here to flush them out. This only seems to
+ * be required with the PCIe devices.
+ */
+ if ((status & (RL_ISR_TX_OK | RL_ISR_TX_DESC_UNAVAIL)) &&
+ (sc->rl_flags & RL_FLAG_PCIE))
+ CSR_WRITE_1(sc, sc->rl_txstart, RL_TXSTART_START);
+ if (status & (RL_ISR_TX_OK | RL_ISR_TX_ERR | RL_ISR_TX_DESC_UNAVAIL))
+ re_txeof(sc);
+
+ if (status & RL_ISR_SYSTEM_ERR) {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ re_init_locked(sc);
+ }
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) != 0) {
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ re_start_locked(ifp);
+ CSR_WRITE_2(sc, RL_IMR, intrs);
+ }
+ RL_UNLOCK(sc);
+}
+
static int
re_encap(struct rl_softc *sc, struct mbuf **m_head)
{
@@ -3007,18 +3086,35 @@ re_init_locked(struct rl_softc *sc)
CSR_WRITE_1(sc, RL_COMMAND, RL_CMD_TX_ENB|RL_CMD_RX_ENB);
#endif
-#ifdef RE_TX_MODERATION
/*
* Initialize the timer interrupt register so that
* a timer interrupt will be generated once the timer
* reaches a certain number of ticks. The timer is
- * reloaded on each transmit. This gives us TX interrupt
+ * reloaded on each transmit.
+ */
+#ifdef RE_TX_MODERATION
+ /*
+ * Use timer interrupt register to moderate TX interrupt
* moderation, which dramatically improves TX frame rate.
*/
if (sc->rl_type == RL_8169)
CSR_WRITE_4(sc, RL_TIMERINT_8169, 0x800);
else
CSR_WRITE_4(sc, RL_TIMERINT, 0x400);
+#else
+ /*
+ * Use timer interrupt register to moderate RX interrupt
+ * moderation.
+ */
+ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) != 0 &&
+ intr_filter == 0) {
+ if (sc->rl_type == RL_8169)
+ CSR_WRITE_4(sc, RL_TIMERINT_8169,
+ RL_USECS(sc->rl_int_rx_mod));
+ } else {
+ if (sc->rl_type == RL_8169)
+ CSR_WRITE_4(sc, RL_TIMERINT_8169, RL_USECS(0));
+ }
#endif
/*
@@ -3535,6 +3631,7 @@ re_add_sysctls(struct rl_softc *sc)
{
struct sysctl_ctx_list *ctx;
struct sysctl_oid_list *children;
+ int error;
ctx = device_get_sysctl_ctx(sc->rl_dev);
children = SYSCTL_CHILDREN(device_get_sysctl_tree(sc->rl_dev));
@@ -3542,6 +3639,26 @@ re_add_sysctls(struct rl_softc *sc)
SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "stats",
CTLTYPE_INT | CTLFLAG_RW, sc, 0, re_sysctl_stats, "I",
"Statistics Information");
+ if ((sc->rl_flags & (RL_FLAG_MSI | RL_FLAG_MSIX)) == 0)
+ return;
+
+ SYSCTL_ADD_PROC(ctx, children, OID_AUTO, "int_rx_mod",
+ CTLTYPE_INT | CTLFLAG_RW, &sc->rl_int_rx_mod, 0,
+ sysctl_hw_re_int_mod, "I", "re RX interrupt moderation");
+ /* Pull in device tunables. */
+ sc->rl_int_rx_mod = RL_TIMER_DEFAULT;
+ error = resource_int_value(device_get_name(sc->rl_dev),
+ device_get_unit(sc->rl_dev), "int_rx_mod", &sc->rl_int_rx_mod);
+ if (error == 0) {
+ if (sc->rl_int_rx_mod < RL_TIMER_MIN ||
+ sc->rl_int_rx_mod > RL_TIMER_MAX) {
+ device_printf(sc->rl_dev, "int_rx_mod value out of "
+ "range; using default: %d\n",
+ RL_TIMER_DEFAULT);
+ sc->rl_int_rx_mod = RL_TIMER_DEFAULT;
+ }
+ }
+
}
static int
@@ -3619,3 +3736,29 @@ done:
return (error);
}
+
+static int
+sysctl_int_range(SYSCTL_HANDLER_ARGS, int low, int high)
+{
+ int error, value;
+
+ if (arg1 == NULL)
+ return (EINVAL);
+ value = *(int *)arg1;
+ error = sysctl_handle_int(oidp, &value, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+ if (value < low || value > high)
+ return (EINVAL);
+ *(int *)arg1 = value;
+
+ return (0);
+}
+
+static int
+sysctl_hw_re_int_mod(SYSCTL_HANDLER_ARGS)
+{
+
+ return (sysctl_int_range(oidp, arg1, arg2, req, RL_TIMER_MIN,
+ RL_TIMER_MAX));
+}
OpenPOWER on IntegriCloud