diff options
author | gallatin <gallatin@FreeBSD.org> | 2009-09-21 20:16:10 +0000 |
---|---|---|
committer | gallatin <gallatin@FreeBSD.org> | 2009-09-21 20:16:10 +0000 |
commit | aeed8258a908bdadfeeab09b58aad3a38180ba33 (patch) | |
tree | e94b242b117839bb51b1da08efd2e80ccebf84df /sys/dev/mxge | |
parent | 6e8dbac60f31aee3ef35a22c7f79518f6869a278 (diff) | |
download | FreeBSD-src-aeed8258a908bdadfeeab09b58aad3a38180ba33.zip FreeBSD-src-aeed8258a908bdadfeeab09b58aad3a38180ba33.tar.gz |
Improve mxge watchdog routine's ability to reliably reset a failed NIC:
- Mark the link as down, so that if the watchdog reset fails, link-watching
failover software can notice it.
- Don't send MXGEFW_CMD_ETHERNET_DOWN if the NIC has been reset; it is
not needed and will fail on a freshly reset NIC.
- Ensure the transmit routines aren't attempting to PIO write to doorbells
while the NIC is being reset.
- Download the correct f/w, rather than using the EEPROM f/w after reset.
- Export a count of the number of watchdog resets via sysctl.
- Zero all f/w stats at reset. This will lead to less confusing
diagnostic output when investigating NIC failures.
MFC after: 3 days
Diffstat (limited to 'sys/dev/mxge')
-rw-r--r-- | sys/dev/mxge/if_mxge.c | 99 |
1 files changed, 72 insertions, 27 deletions
diff --git a/sys/dev/mxge/if_mxge.c b/sys/dev/mxge/if_mxge.c index 8fdc440..6e8ca67 100644 --- a/sys/dev/mxge/if_mxge.c +++ b/sys/dev/mxge/if_mxge.c @@ -144,7 +144,7 @@ MODULE_DEPEND(mxge, zlib, 1, 1, 1); static int mxge_load_firmware(mxge_softc_t *sc, int adopt); static int mxge_send_cmd(mxge_softc_t *sc, uint32_t cmd, mxge_cmd_t *data); -static int mxge_close(mxge_softc_t *sc); +static int mxge_close(mxge_softc_t *sc, int down); static int mxge_open(mxge_softc_t *sc); static void mxge_tick(void *arg); @@ -1309,8 +1309,7 @@ mxge_reset(mxge_softc_t *sc, int interrupts_setup) ss->lro_queued = 0; ss->lro_flushed = 0; if (ss->fw_stats != NULL) { - ss->fw_stats->valid = 0; - ss->fw_stats->send_done_count = 0; + bzero(ss->fw_stats, sizeof *ss->fw_stats); } } sc->rdma_tags_available = 15; @@ -1421,7 +1420,7 @@ mxge_change_lro_locked(mxge_softc_t *sc, int lro_cnt) ifp->if_capenable |= IFCAP_LRO; sc->lro_cnt = lro_cnt; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - mxge_close(sc); + mxge_close(sc, 0); err = mxge_open(sc); } return err; @@ -1537,6 +1536,10 @@ mxge_add_sysctls(mxge_softc_t *sc) "read_write_dma_MBs", CTLFLAG_RD, &sc->read_write_dma, 0, "DMA concurrent Read/Write speed in MB/s"); + SYSCTL_ADD_INT(ctx, children, OID_AUTO, + "watchdog_resets", + CTLFLAG_RD, &sc->watchdog_resets, + 0, "Number of times NIC was reset"); /* performance related tunables */ @@ -3648,7 +3651,7 @@ abort: } static int -mxge_close(mxge_softc_t *sc) +mxge_close(mxge_softc_t *sc, int down) { mxge_cmd_t cmd; int err, old_down_cnt; @@ -3665,21 +3668,23 @@ mxge_close(mxge_softc_t *sc) } #endif sc->ifp->if_drv_flags &= ~IFF_DRV_RUNNING; - old_down_cnt = sc->down_cnt; - wmb(); - err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd); - if (err) { - device_printf(sc->dev, "Couldn't bring down link\n"); - } - if (old_down_cnt == sc->down_cnt) { - /* wait for down irq */ - DELAY(10 * sc->intr_coal_delay); - } - wmb(); - if (old_down_cnt == sc->down_cnt) { - device_printf(sc->dev, "never got down 
irq\n"); + if (!down) { + old_down_cnt = sc->down_cnt; + wmb(); + err = mxge_send_cmd(sc, MXGEFW_CMD_ETHERNET_DOWN, &cmd); + if (err) { + device_printf(sc->dev, + "Couldn't bring down link\n"); + } + if (old_down_cnt == sc->down_cnt) { + /* wait for down irq */ + DELAY(10 * sc->intr_coal_delay); + } + wmb(); + if (old_down_cnt == sc->down_cnt) { + device_printf(sc->dev, "never got down irq\n"); + } } - mxge_free_mbufs(sc); return 0; @@ -3732,8 +3737,9 @@ static int mxge_watchdog_reset(mxge_softc_t *sc, int slice) { struct pci_devinfo *dinfo; + struct mxge_slice_state *ss; mxge_tx_ring_t *tx; - int err; + int err, running, s, num_tx_slices = 1; uint32_t reboot; uint16_t cmd; @@ -3767,6 +3773,30 @@ mxge_watchdog_reset(mxge_softc_t *sc, int slice) reboot = mxge_read_reboot(sc); device_printf(sc->dev, "NIC rebooted, status = 0x%x\n", reboot); + running = sc->ifp->if_drv_flags & IFF_DRV_RUNNING; + if (running) { + + /* + * quiesce NIC so that TX routines will not try to + * xmit after restoration of BAR + */ + + /* Mark the link as down */ + if (sc->link_state) { + sc->link_state = 0; + if_link_state_change(sc->ifp, + LINK_STATE_DOWN); + } +#ifdef IFNET_BUF_RING + num_tx_slices = sc->num_slices; +#endif + /* grab all TX locks to ensure no tx */ + for (s = 0; s < num_tx_slices; s++) { + ss = &sc->ss[s]; + mtx_lock(&ss->tx.mtx); + } + mxge_close(sc, 1); + } /* restore PCI configuration space */ dinfo = device_get_ivars(sc->dev); pci_cfg_restore(sc->dev, dinfo); @@ -3774,10 +3804,22 @@ mxge_watchdog_reset(mxge_softc_t *sc, int slice) /* and redo any changes we made to our config space */ mxge_setup_cfg_space(sc); - if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) { - mxge_close(sc); - err = mxge_open(sc); + /* reload f/w */ + err = mxge_load_firmware(sc, 0); + if (err) { + device_printf(sc->dev, + "Unable to re-load f/w\n"); } + if (running) { + if (!err) + err = mxge_open(sc); + /* release all TX locks */ + for (s = 0; s < num_tx_slices; s++) { + ss = &sc->ss[s]; + 
mtx_unlock(&ss->tx.mtx); + } + } + sc->watchdog_resets++; } else { tx = &sc->ss[slice].tx; device_printf(sc->dev, @@ -3793,6 +3835,9 @@ mxge_watchdog_reset(mxge_softc_t *sc, int slice) be32toh(sc->ss->fw_stats->send_done_count)); device_printf(sc->dev, "not resetting\n"); } + if (err) + device_printf(sc->dev, "watchdog reset failed\n"); + return (err); } @@ -3908,11 +3953,11 @@ mxge_change_mtu(mxge_softc_t *sc, int mtu) old_mtu = ifp->if_mtu; ifp->if_mtu = mtu; if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - mxge_close(sc); + mxge_close(sc, 0); err = mxge_open(sc); if (err != 0) { ifp->if_mtu = old_mtu; - mxge_close(sc); + mxge_close(sc, 0); (void) mxge_open(sc); } } @@ -3970,7 +4015,7 @@ mxge_ioctl(struct ifnet *ifp, u_long command, caddr_t data) } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) { - mxge_close(sc); + mxge_close(sc, 0); } } mtx_unlock(&sc->driver_mtx); @@ -4700,7 +4745,7 @@ mxge_detach(device_t dev) mtx_lock(&sc->driver_mtx); sc->dying = 1; if (sc->ifp->if_drv_flags & IFF_DRV_RUNNING) - mxge_close(sc); + mxge_close(sc, 0); mtx_unlock(&sc->driver_mtx); ether_ifdetach(sc->ifp); callout_drain(&sc->co_hdl); |