Diffstat (limited to 'sys/dev/hyperv/netvsc/if_hn.c')
-rw-r--r--  sys/dev/hyperv/netvsc/if_hn.c | 1238
1 file changed, 986 insertions(+), 252 deletions(-)
diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c index 1731c97..8750ab0 100644 --- a/sys/dev/hyperv/netvsc/if_hn.c +++ b/sys/dev/hyperv/netvsc/if_hn.c @@ -57,6 +57,7 @@ __FBSDID("$FreeBSD$"); #include "opt_inet6.h" #include "opt_inet.h" +#include "opt_hn.h" #include <sys/param.h> #include <sys/bus.h> @@ -152,7 +153,11 @@ __FBSDID("$FreeBSD$"); sx_init(&(sc)->hn_lock, device_get_nameunit((sc)->hn_dev)) #define HN_LOCK_DESTROY(sc) sx_destroy(&(sc)->hn_lock) #define HN_LOCK_ASSERT(sc) sx_assert(&(sc)->hn_lock, SA_XLOCKED) -#define HN_LOCK(sc) sx_xlock(&(sc)->hn_lock) +#define HN_LOCK(sc) \ +do { \ + while (sx_try_xlock(&(sc)->hn_lock) == 0) \ + DELAY(1000); \ +} while (0) #define HN_UNLOCK(sc) sx_xunlock(&(sc)->hn_lock) #define HN_CSUM_IP_MASK (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP) @@ -162,10 +167,24 @@ __FBSDID("$FreeBSD$"); #define HN_CSUM_IP6_HWASSIST(sc) \ ((sc)->hn_tx_ring[0].hn_csum_assist & HN_CSUM_IP6_MASK) +#define HN_PKTSIZE_MIN(align) \ + roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN + \ + HN_RNDIS_PKT_LEN, (align)) +#define HN_PKTSIZE(m, align) \ + roundup2((m)->m_pkthdr.len + HN_RNDIS_PKT_LEN, (align)) + +#define HN_RING_IDX2CPU(sc, idx) (((sc)->hn_cpu + (idx)) % mp_ncpus) + struct hn_txdesc { #ifndef HN_USE_TXDESC_BUFRING SLIST_ENTRY(hn_txdesc) link; #endif + STAILQ_ENTRY(hn_txdesc) agg_link; + + /* Aggregated txdescs, in sending order. */ + STAILQ_HEAD(, hn_txdesc) agg_list; + + /* The oldest packet, if transmission aggregation happens. */ struct mbuf *m; struct hn_tx_ring *txr; int refs; @@ -183,6 +202,7 @@ struct hn_txdesc { #define HN_TXD_FLAG_ONLIST 0x0001 #define HN_TXD_FLAG_DMAMAP 0x0002 +#define HN_TXD_FLAG_ONAGG 0x0004 struct hn_rxinfo { uint32_t vlan_info; @@ -262,6 +282,11 @@ static int hn_rxfilter_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_key_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS); static int hn_rss_hash_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS); +static int hn_polling_sysctl(SYSCTL_HANDLER_ARGS); static void hn_stop(struct hn_softc *); static void hn_init_locked(struct hn_softc *); @@ -278,6 +303,7 @@ static int hn_synth_attach(struct hn_softc *, int); static void hn_synth_detach(struct hn_softc *); static int hn_synth_alloc_subchans(struct hn_softc *, int *); +static bool hn_synth_attachable(const struct hn_softc *); static void hn_suspend(struct hn_softc *); static void hn_suspend_data(struct hn_softc *); static void hn_suspend_mgmt(struct hn_softc *); @@ -285,7 +311,10 @@ static void hn_resume(struct hn_softc *); static void hn_resume_data(struct hn_softc *); static void hn_resume_mgmt(struct hn_softc *); static void hn_suspend_mgmt_taskfunc(void *, int); -static void hn_chan_drain(struct vmbus_channel *); +static void hn_chan_drain(struct hn_softc *, + struct vmbus_channel *); +static void hn_polling(struct hn_softc *, u_int); +static void hn_chan_polling(struct vmbus_channel *, u_int); static void hn_update_link_status(struct hn_softc *); static void hn_change_network(struct hn_softc *); @@ -297,9 +326,10 @@ static void hn_link_status(struct hn_softc *); static int hn_create_rx_data(struct hn_softc *, int); static void hn_destroy_rx_data(struct hn_softc *); static int hn_check_iplen(const struct mbuf *, int); -static int hn_set_rxfilter(struct hn_softc *); +static int 
hn_set_rxfilter(struct hn_softc *, uint32_t); +static int hn_rxfilter_config(struct hn_softc *); static int hn_rss_reconfig(struct hn_softc *); -static void hn_rss_ind_fixup(struct hn_softc *, int); +static void hn_rss_ind_fixup(struct hn_softc *); static int hn_rxpkt(struct hn_rx_ring *, const void *, int, const struct hn_rxinfo *); @@ -309,7 +339,9 @@ static int hn_create_tx_data(struct hn_softc *, int); static void hn_fixup_tx_data(struct hn_softc *); static void hn_destroy_tx_data(struct hn_softc *); static void hn_txdesc_dmamap_destroy(struct hn_txdesc *); -static int hn_encap(struct hn_tx_ring *, +static void hn_txdesc_gc(struct hn_tx_ring *, + struct hn_txdesc *); +static int hn_encap(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *, struct mbuf **); static int hn_txpkt(struct ifnet *, struct hn_tx_ring *, struct hn_txdesc *); @@ -318,6 +350,10 @@ static void hn_set_tso_maxsize(struct hn_softc *, int, int); static bool hn_tx_ring_pending(struct hn_tx_ring *); static void hn_tx_ring_qflush(struct hn_tx_ring *); static void hn_resume_tx(struct hn_softc *, int); +static void hn_set_txagg(struct hn_softc *); +static void *hn_try_txagg(struct ifnet *, + struct hn_tx_ring *, struct hn_txdesc *, + int); static int hn_get_txswq_depth(const struct hn_tx_ring *); static void hn_txpkt_done(struct hn_nvs_sendctx *, struct hn_softc *, struct vmbus_channel *, @@ -385,10 +421,18 @@ SYSCTL_INT(_hw_hn, OID_AUTO, lro_entry_count, CTLFLAG_RDTUN, #endif #endif -/* Use shared TX taskqueue */ -static int hn_share_tx_taskq = 0; -SYSCTL_INT(_hw_hn, OID_AUTO, share_tx_taskq, CTLFLAG_RDTUN, - &hn_share_tx_taskq, 0, "Enable shared TX taskqueue"); +static int hn_tx_taskq_cnt = 1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_cnt, CTLFLAG_RDTUN, + &hn_tx_taskq_cnt, 0, "# of TX taskqueues"); + +#define HN_TX_TASKQ_M_INDEP 0 +#define HN_TX_TASKQ_M_GLOBAL 1 +#define HN_TX_TASKQ_M_EVTTQ 2 + +static int hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_taskq_mode, CTLFLAG_RDTUN, + &hn_tx_taskq_mode, 0, "TX taskqueue modes: " + "0 - independent, 1 - share global tx taskqs, 2 - share event taskqs"); #ifndef HN_USE_TXDESC_BUFRING static int hn_use_txdesc_bufring = 0; @@ -398,11 +442,6 @@ static int hn_use_txdesc_bufring = 1; SYSCTL_INT(_hw_hn, OID_AUTO, use_txdesc_bufring, CTLFLAG_RD, &hn_use_txdesc_bufring, 0, "Use buf_ring for TX descriptors"); -/* Bind TX taskqueue to the target CPU */ -static int hn_bind_tx_taskq = -1; -SYSCTL_INT(_hw_hn, OID_AUTO, bind_tx_taskq, CTLFLAG_RDTUN, - &hn_bind_tx_taskq, 0, "Bind TX taskqueue to the specified cpu"); - #ifdef HN_IFSTART_SUPPORT /* Use ifnet.if_start instead of ifnet.if_transmit */ static int hn_use_if_start = 0; @@ -433,8 +472,18 @@ SYSCTL_UINT(_hw_hn, OID_AUTO, lro_mbufq_depth, CTLFLAG_RDTUN, &hn_lro_mbufq_depth, 0, "Depth of LRO mbuf queue"); #endif +/* Packet transmission aggregation size limit */ +static int hn_tx_agg_size = -1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_size, CTLFLAG_RDTUN, + &hn_tx_agg_size, 0, "Packet transmission aggregation size limit"); + +/* Packet transmission aggregation count limit */ +static int hn_tx_agg_pkts = -1; +SYSCTL_INT(_hw_hn, OID_AUTO, tx_agg_pkts, CTLFLAG_RDTUN, + &hn_tx_agg_pkts, 0, "Packet transmission aggregation packet limit"); + static u_int hn_cpu_index; /* next CPU for channel */ -static struct taskqueue *hn_tx_taskq; /* shared TX taskqueue */ +static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */ static const uint8_t hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = { @@ -472,7 +521,7 @@ 
hn_set_lro_lenlim(struct hn_softc *sc, int lenlim) { int i; - for (i = 0; i < sc->hn_rx_ring_inuse; ++i) + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_length_lim = lenlim; } #endif @@ -625,11 +674,25 @@ do { \ #endif /* INET6 || INET */ static int -hn_set_rxfilter(struct hn_softc *sc) +hn_set_rxfilter(struct hn_softc *sc, uint32_t filter) +{ + int error = 0; + + HN_LOCK_ASSERT(sc); + + if (sc->hn_rx_filter != filter) { + error = hn_rndis_set_rxfilter(sc, filter); + if (!error) + sc->hn_rx_filter = filter; + } + return (error); +} + +static int +hn_rxfilter_config(struct hn_softc *sc) { struct ifnet *ifp = sc->hn_ifp; uint32_t filter; - int error = 0; HN_LOCK_ASSERT(sc); @@ -639,26 +702,90 @@ hn_set_rxfilter(struct hn_softc *sc) filter = NDIS_PACKET_TYPE_DIRECTED; if (ifp->if_flags & IFF_BROADCAST) filter |= NDIS_PACKET_TYPE_BROADCAST; -#ifdef notyet - /* - * See the comment in SIOCADDMULTI/SIOCDELMULTI. - */ /* TODO: support multicast list */ if ((ifp->if_flags & IFF_ALLMULTI) || !TAILQ_EMPTY(&ifp->if_multiaddrs)) filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; -#else - /* Always enable ALLMULTI */ - filter |= NDIS_PACKET_TYPE_ALL_MULTICAST; -#endif } + return (hn_set_rxfilter(sc, filter)); +} - if (sc->hn_rx_filter != filter) { - error = hn_rndis_set_rxfilter(sc, filter); - if (!error) - sc->hn_rx_filter = filter; +static void +hn_set_txagg(struct hn_softc *sc) +{ + uint32_t size, pkts; + int i; + + /* + * Setup aggregation size. + */ + if (sc->hn_agg_size < 0) + size = UINT32_MAX; + else + size = sc->hn_agg_size; + + if (sc->hn_rndis_agg_size < size) + size = sc->hn_rndis_agg_size; + + /* NOTE: We only aggregate packets using chimney sending buffers. */ + if (size > (uint32_t)sc->hn_chim_szmax) + size = sc->hn_chim_szmax; + + if (size <= 2 * HN_PKTSIZE_MIN(sc->hn_rndis_agg_align)) { + /* Disable */ + size = 0; + pkts = 0; + goto done; + } + + /* NOTE: Type of the per TX ring setting is 'int'. */ + if (size > INT_MAX) + size = INT_MAX; + + /* + * Setup aggregation packet count. + */ + if (sc->hn_agg_pkts < 0) + pkts = UINT32_MAX; + else + pkts = sc->hn_agg_pkts; + + if (sc->hn_rndis_agg_pkts < pkts) + pkts = sc->hn_rndis_agg_pkts; + + if (pkts <= 1) { + /* Disable */ + size = 0; + pkts = 0; + goto done; + } + + /* NOTE: Type of the per TX ring setting is 'short'. */ + if (pkts > SHRT_MAX) + pkts = SHRT_MAX; + +done: + /* NOTE: Type of the per TX ring setting is 'short'. 
*/ + if (sc->hn_rndis_agg_align > SHRT_MAX) { + /* Disable */ + size = 0; + pkts = 0; + } + + if (bootverbose) { + if_printf(sc->hn_ifp, "TX agg size %u, pkts %u, align %u\n", + size, pkts, sc->hn_rndis_agg_align); + } + + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { + struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + + mtx_lock(&txr->hn_tx_lock); + txr->hn_agg_szmax = size; + txr->hn_agg_pktmax = pkts; + txr->hn_agg_align = sc->hn_rndis_agg_align; + mtx_unlock(&txr->hn_tx_lock); } - return (error); } static int @@ -711,11 +838,12 @@ hn_rss_reconfig(struct hn_softc *sc) } static void -hn_rss_ind_fixup(struct hn_softc *sc, int nchan) +hn_rss_ind_fixup(struct hn_softc *sc) { struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; - int i; + int i, nchan; + nchan = sc->hn_rx_ring_inuse; KASSERT(nchan > 1, ("invalid # of channels %d", nchan)); /* @@ -773,19 +901,6 @@ hn_probe(device_t dev) return ENXIO; } -static void -hn_cpuset_setthread_task(void *xmask, int pending __unused) -{ - cpuset_t *mask = xmask; - int error; - - error = cpuset_setthread(curthread->td_tid, mask); - if (error) { - panic("curthread=%ju: can't pin; error=%d", - (uintmax_t)curthread->td_tid, error); - } -} - static int hn_attach(device_t dev) { @@ -801,28 +916,29 @@ hn_attach(device_t dev) HN_LOCK_INIT(sc); /* + * Initialize these tunables once. + */ + sc->hn_agg_size = hn_tx_agg_size; + sc->hn_agg_pkts = hn_tx_agg_pkts; + + /* * Setup taskqueue for transmission. */ - if (hn_tx_taskq == NULL) { - sc->hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, - taskqueue_thread_enqueue, &sc->hn_tx_taskq); - taskqueue_start_threads(&sc->hn_tx_taskq, 1, PI_NET, "%s tx", - device_get_nameunit(dev)); - if (hn_bind_tx_taskq >= 0) { - int cpu = hn_bind_tx_taskq; - struct task cpuset_task; - cpuset_t cpu_set; - - if (cpu > mp_ncpus - 1) - cpu = mp_ncpus - 1; - CPU_SETOF(cpu, &cpu_set); - TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, - &cpu_set); - taskqueue_enqueue(sc->hn_tx_taskq, &cpuset_task); - taskqueue_drain(sc->hn_tx_taskq, &cpuset_task); + if (hn_tx_taskq_mode == HN_TX_TASKQ_M_INDEP) { + int i; + + sc->hn_tx_taskqs = + malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), + M_DEVBUF, M_WAITOK); + for (i = 0; i < hn_tx_taskq_cnt; ++i) { + sc->hn_tx_taskqs[i] = taskqueue_create("hn_tx", + M_WAITOK, taskqueue_thread_enqueue, + &sc->hn_tx_taskqs[i]); + taskqueue_start_threads(&sc->hn_tx_taskqs[i], 1, PI_NET, + "%s tx%d", device_get_nameunit(dev), i); } - } else { - sc->hn_tx_taskq = hn_tx_taskq; + } else if (hn_tx_taskq_mode == HN_TX_TASKQ_M_GLOBAL) { + sc->hn_tx_taskqs = hn_tx_taskque; } /* @@ -900,8 +1016,25 @@ hn_attach(device_t dev) */ sc->hn_xact = vmbus_xact_ctx_create(bus_get_dma_tag(dev), HN_XACT_REQ_SIZE, HN_XACT_RESP_SIZE, 0); - if (sc->hn_xact == NULL) + if (sc->hn_xact == NULL) { + error = ENXIO; + goto failed; + } + + /* + * Install orphan handler for the revocation of this device's + * primary channel. + * + * NOTE: + * The processing order is critical here: + * Install the orphan handler, _before_ testing whether this + * device's primary channel has been revoked or not. + */ + vmbus_chan_set_orphan(sc->hn_prichan, sc->hn_xact); + if (vmbus_chan_is_revoked(sc->hn_prichan)) { + error = ENXIO; goto failed; + } /* * Attach the synthetic parts, i.e. NVS and RNDIS. 
@@ -956,6 +1089,28 @@ hn_attach(device_t dev) SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rss_ind", CTLTYPE_OPAQUE | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, hn_rss_ind_sysctl, "IU", "RSS indirect table"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_size", + CTLFLAG_RD, &sc->hn_rndis_agg_size, 0, + "RNDIS offered packet transmission aggregation size limit"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_pkts", + CTLFLAG_RD, &sc->hn_rndis_agg_pkts, 0, + "RNDIS offered packet transmission aggregation count limit"); + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "rndis_agg_align", + CTLFLAG_RD, &sc->hn_rndis_agg_align, 0, + "RNDIS packet transmission aggregation alignment"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_size", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_size_sysctl, "I", + "Packet transmission aggregation size, 0 -- disable, -1 -- auto"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pkts", + CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_pkts_sysctl, "I", + "Packet transmission aggregation packets, " + "0 -- disable, -1 -- auto"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "polling", + CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0, + hn_polling_sysctl, "I", + "Polling frequency: [100,1000000], 0 disable polling"); /* * Setup the ifmedia, which has been initialized earlier. @@ -1020,6 +1175,13 @@ hn_attach(device_t dev) /* Enable all available capabilities by default. */ ifp->if_capenable = ifp->if_capabilities; + /* + * Disable IPv6 TSO and TXCSUM by default, they still can + * be enabled through SIOCSIFCAP. + */ + ifp->if_capenable &= ~(IFCAP_TXCSUM_IPV6 | IFCAP_TSO6); + ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO); + if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) { hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU); ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX; @@ -1056,6 +1218,14 @@ hn_detach(device_t dev) struct hn_softc *sc = device_get_softc(dev); struct ifnet *ifp = sc->hn_ifp; + if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) { + /* + * In case that the vmbus missed the orphan handler + * installation. + */ + vmbus_xact_ctx_orphan(sc->hn_xact); + } + if (device_is_attached(dev)) { HN_LOCK(sc); if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { @@ -1077,12 +1247,23 @@ hn_detach(device_t dev) hn_destroy_rx_data(sc); hn_destroy_tx_data(sc); - if (sc->hn_tx_taskq != hn_tx_taskq) - taskqueue_free(sc->hn_tx_taskq); + if (sc->hn_tx_taskqs != NULL && sc->hn_tx_taskqs != hn_tx_taskque) { + int i; + + for (i = 0; i < hn_tx_taskq_cnt; ++i) + taskqueue_free(sc->hn_tx_taskqs[i]); + free(sc->hn_tx_taskqs, M_DEVBUF); + } taskqueue_free(sc->hn_mgmt_taskq0); - if (sc->hn_xact != NULL) + if (sc->hn_xact != NULL) { + /* + * Uninstall the orphan handler _before_ the xact is + * destructed. 
+ */ + vmbus_chan_unset_orphan(sc->hn_prichan); vmbus_xact_ctx_destroy(sc->hn_xact); + } if_free(ifp); @@ -1211,16 +1392,45 @@ hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) KASSERT((txd->flags & HN_TXD_FLAG_ONLIST) == 0, ("put an onlist txd %#x", txd->flags)); + KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, + ("put an onagg txd %#x", txd->flags)); KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); if (atomic_fetchadd_int(&txd->refs, -1) != 1) return 0; + if (!STAILQ_EMPTY(&txd->agg_list)) { + struct hn_txdesc *tmp_txd; + + while ((tmp_txd = STAILQ_FIRST(&txd->agg_list)) != NULL) { + int freed; + + KASSERT(STAILQ_EMPTY(&tmp_txd->agg_list), + ("resursive aggregation on aggregated txdesc")); + KASSERT((tmp_txd->flags & HN_TXD_FLAG_ONAGG), + ("not aggregated txdesc")); + KASSERT((tmp_txd->flags & HN_TXD_FLAG_DMAMAP) == 0, + ("aggregated txdesc uses dmamap")); + KASSERT(tmp_txd->chim_index == HN_NVS_CHIM_IDX_INVALID, + ("aggregated txdesc consumes " + "chimney sending buffer")); + KASSERT(tmp_txd->chim_size == 0, + ("aggregated txdesc has non-zero " + "chimney sending size")); + + STAILQ_REMOVE_HEAD(&txd->agg_list, agg_link); + tmp_txd->flags &= ~HN_TXD_FLAG_ONAGG; + freed = hn_txdesc_put(txr, tmp_txd); + KASSERT(freed, ("failed to free aggregated txdesc")); + } + } + if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { KASSERT((txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("chim txd uses dmamap")); hn_chim_free(txr->hn_sc, txd->chim_index); txd->chim_index = HN_NVS_CHIM_IDX_INVALID; + txd->chim_size = 0; } else if (txd->flags & HN_TXD_FLAG_DMAMAP) { bus_dmamap_sync(txr->hn_tx_data_dtag, txd->data_dmap, BUS_DMASYNC_POSTWRITE); @@ -1243,10 +1453,12 @@ hn_txdesc_put(struct hn_tx_ring *txr, struct hn_txdesc *txd) txr->hn_txdesc_avail++; SLIST_INSERT_HEAD(&txr->hn_txlist, txd, link); mtx_unlock_spin(&txr->hn_txlist_spin); -#else +#else /* HN_USE_TXDESC_BUFRING */ +#ifdef HN_DEBUG atomic_add_int(&txr->hn_txdesc_avail, 1); - buf_ring_enqueue(txr->hn_txdesc_br, txd); #endif + buf_ring_enqueue(txr->hn_txdesc_br, txd); +#endif /* !HN_USE_TXDESC_BUFRING */ return 1; } @@ -1272,11 +1484,16 @@ hn_txdesc_get(struct hn_tx_ring *txr) if (txd != NULL) { #ifdef HN_USE_TXDESC_BUFRING +#ifdef HN_DEBUG atomic_subtract_int(&txr->hn_txdesc_avail, 1); #endif +#endif /* HN_USE_TXDESC_BUFRING */ KASSERT(txd->m == NULL && txd->refs == 0 && + STAILQ_EMPTY(&txd->agg_list) && txd->chim_index == HN_NVS_CHIM_IDX_INVALID && + txd->chim_size == 0 && (txd->flags & HN_TXD_FLAG_ONLIST) && + (txd->flags & HN_TXD_FLAG_ONAGG) == 0 && (txd->flags & HN_TXD_FLAG_DMAMAP) == 0, ("invalid txd")); txd->flags &= ~HN_TXD_FLAG_ONLIST; txd->refs = 1; @@ -1289,10 +1506,26 @@ hn_txdesc_hold(struct hn_txdesc *txd) { /* 0->1 transition will never work */ - KASSERT(txd->refs > 0, ("invalid refs %d", txd->refs)); + KASSERT(txd->refs > 0, ("invalid txd refs %d", txd->refs)); atomic_add_int(&txd->refs, 1); } +static __inline void +hn_txdesc_agg(struct hn_txdesc *agg_txd, struct hn_txdesc *txd) +{ + + KASSERT((agg_txd->flags & HN_TXD_FLAG_ONAGG) == 0, + ("recursive aggregation on aggregating txdesc")); + + KASSERT((txd->flags & HN_TXD_FLAG_ONAGG) == 0, + ("already aggregated")); + KASSERT(STAILQ_EMPTY(&txd->agg_list), + ("recursive aggregation on to-be-aggregated txdesc")); + + txd->flags |= HN_TXD_FLAG_ONAGG; + STAILQ_INSERT_TAIL(&agg_txd->agg_list, txd, agg_link); +} + static bool hn_tx_ring_pending(struct hn_tx_ring *txr) { @@ -1327,7 +1560,7 @@ hn_txpkt_done(struct hn_nvs_sendctx *sndc, struct hn_softc *sc, txr = txd->txr; 
KASSERT(txr->hn_chan == chan, ("channel mismatch, on chan%u, should be chan%u", - vmbus_chan_subidx(chan), vmbus_chan_subidx(txr->hn_chan))); + vmbus_chan_id(chan), vmbus_chan_id(txr->hn_chan))); txr->hn_has_txeof = 1; hn_txdesc_put(txr, txd); @@ -1410,12 +1643,123 @@ hn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize, return (pi->rm_data); } +static __inline int +hn_flush_txagg(struct ifnet *ifp, struct hn_tx_ring *txr) +{ + struct hn_txdesc *txd; + struct mbuf *m; + int error, pkts; + + txd = txr->hn_agg_txd; + KASSERT(txd != NULL, ("no aggregate txdesc")); + + /* + * Since hn_txpkt() will reset this temporary stat, save + * it now, so that oerrors can be updated properly, if + * hn_txpkt() ever fails. + */ + pkts = txr->hn_stat_pkts; + + /* + * Since txd's mbuf will _not_ be freed upon hn_txpkt() + * failure, save it for later freeing, if hn_txpkt() ever + * fails. + */ + m = txd->m; + error = hn_txpkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m is not. */ + m_freem(m); + + txr->hn_flush_failed++; + if_inc_counter(ifp, IFCOUNTER_OERRORS, pkts); + } + + /* Reset all aggregation states. */ + txr->hn_agg_txd = NULL; + txr->hn_agg_szleft = 0; + txr->hn_agg_pktleft = 0; + txr->hn_agg_prevpkt = NULL; + + return (error); +} + +static void * +hn_try_txagg(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, + int pktsize) +{ + void *chim; + + if (txr->hn_agg_txd != NULL) { + if (txr->hn_agg_pktleft >= 1 && txr->hn_agg_szleft > pktsize) { + struct hn_txdesc *agg_txd = txr->hn_agg_txd; + struct rndis_packet_msg *pkt = txr->hn_agg_prevpkt; + int olen; + + /* + * Update the previous RNDIS packet's total length, + * it can be increased due to the mandatory alignment + * padding for this RNDIS packet. And update the + * aggregating txdesc's chimney sending buffer size + * accordingly. + * + * XXX + * Zero-out the padding, as required by the RNDIS spec. + */ + olen = pkt->rm_len; + pkt->rm_len = roundup2(olen, txr->hn_agg_align); + agg_txd->chim_size += pkt->rm_len - olen; + + /* Link this txdesc to the parent. */ + hn_txdesc_agg(agg_txd, txd); + + chim = (uint8_t *)pkt + pkt->rm_len; + /* Save the current packet for later fixup. */ + txr->hn_agg_prevpkt = chim; + + txr->hn_agg_pktleft--; + txr->hn_agg_szleft -= pktsize; + if (txr->hn_agg_szleft <= + HN_PKTSIZE_MIN(txr->hn_agg_align)) { + /* + * Probably can't aggregate more packets, + * flush this aggregating txdesc proactively. + */ + txr->hn_agg_pktleft = 0; + } + /* Done! */ + return (chim); + } + hn_flush_txagg(ifp, txr); + } + KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); + + txr->hn_tx_chimney_tried++; + txd->chim_index = hn_chim_alloc(txr->hn_sc); + if (txd->chim_index == HN_NVS_CHIM_IDX_INVALID) + return (NULL); + txr->hn_tx_chimney++; + + chim = txr->hn_sc->hn_chim + + (txd->chim_index * txr->hn_sc->hn_chim_szmax); + + if (txr->hn_agg_pktmax > 1 && + txr->hn_agg_szmax > pktsize + HN_PKTSIZE_MIN(txr->hn_agg_align)) { + txr->hn_agg_txd = txd; + txr->hn_agg_pktleft = txr->hn_agg_pktmax - 1; + txr->hn_agg_szleft = txr->hn_agg_szmax - pktsize; + txr->hn_agg_prevpkt = chim; + } + return (chim); +} + /* * NOTE: * If this function fails, then both txd and m_head0 will be freed. 
*/ static int -hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) +hn_encap(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd, + struct mbuf **m_head0) { bus_dma_segment_t segs[HN_TX_DATA_SEGCNT_MAX]; int error, nsegs, i; @@ -1423,33 +1767,30 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) struct rndis_packet_msg *pkt; uint32_t *pi_data; void *chim = NULL; - int pktlen; + int pkt_hlen, pkt_size; pkt = txd->rndis_pkt; - if (m_head->m_pkthdr.len + HN_RNDIS_PKT_LEN < txr->hn_chim_size) { - /* - * This packet is small enough to fit into a chimney sending - * buffer. Try allocating one chimney sending buffer now. - */ - txr->hn_tx_chimney_tried++; - txd->chim_index = hn_chim_alloc(txr->hn_sc); - if (txd->chim_index != HN_NVS_CHIM_IDX_INVALID) { - chim = txr->hn_sc->hn_chim + - (txd->chim_index * txr->hn_sc->hn_chim_szmax); - /* - * Directly fill the chimney sending buffer w/ the - * RNDIS packet message. - */ + pkt_size = HN_PKTSIZE(m_head, txr->hn_agg_align); + if (pkt_size < txr->hn_chim_size) { + chim = hn_try_txagg(ifp, txr, txd, pkt_size); + if (chim != NULL) pkt = chim; - } + } else { + if (txr->hn_agg_txd != NULL) + hn_flush_txagg(ifp, txr); } pkt->rm_type = REMOTE_NDIS_PACKET_MSG; pkt->rm_len = sizeof(*pkt) + m_head->m_pkthdr.len; pkt->rm_dataoffset = sizeof(*pkt); pkt->rm_datalen = m_head->m_pkthdr.len; + pkt->rm_oobdataoffset = 0; + pkt->rm_oobdatalen = 0; + pkt->rm_oobdataelements = 0; pkt->rm_pktinfooffset = sizeof(*pkt); pkt->rm_pktinfolen = 0; + pkt->rm_vchandle = 0; + pkt->rm_reserved = 0; if (txr->hn_tx_flags & HN_TX_FLAG_HASHVAL) { /* @@ -1510,7 +1851,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) *pi_data |= NDIS_TXCSUM_INFO_UDPCS; } - pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; + pkt_hlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen; /* Convert RNDIS packet message offsets */ pkt->rm_dataoffset = hn_rndis_pktmsg_offset(pkt->rm_dataoffset); pkt->rm_pktinfooffset = hn_rndis_pktmsg_offset(pkt->rm_pktinfooffset); @@ -1519,25 +1860,36 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) * Fast path: Chimney sending. 
*/ if (chim != NULL) { - KASSERT(txd->chim_index != HN_NVS_CHIM_IDX_INVALID, - ("chimney buffer is not used")); - KASSERT(pkt == chim, ("RNDIS pkt not in chimney buffer")); + struct hn_txdesc *tgt_txd = txd; + + if (txr->hn_agg_txd != NULL) { + tgt_txd = txr->hn_agg_txd; +#ifdef INVARIANTS + *m_head0 = NULL; +#endif + } + + KASSERT(pkt == chim, + ("RNDIS pkt not in chimney sending buffer")); + KASSERT(tgt_txd->chim_index != HN_NVS_CHIM_IDX_INVALID, + ("chimney sending buffer is not used")); + tgt_txd->chim_size += pkt->rm_len; m_copydata(m_head, 0, m_head->m_pkthdr.len, - ((uint8_t *)chim) + pktlen); + ((uint8_t *)chim) + pkt_hlen); - txd->chim_size = pkt->rm_len; txr->hn_gpa_cnt = 0; - txr->hn_tx_chimney++; txr->hn_sendpkt = hn_txpkt_chim; goto done; } + + KASSERT(txr->hn_agg_txd == NULL, ("aggregating sglist txdesc")); KASSERT(txd->chim_index == HN_NVS_CHIM_IDX_INVALID, ("chimney buffer is used")); KASSERT(pkt == txd->rndis_pkt, ("RNDIS pkt not in txdesc")); error = hn_txdesc_dmamap_load(txr, txd, &m_head, segs, &nsegs); - if (error) { + if (__predict_false(error)) { int freed; /* @@ -1551,7 +1903,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) ("fail to free txd upon txdma error")); txr->hn_txdma_failed++; - if_inc_counter(txr->hn_sc->hn_ifp, IFCOUNTER_OERRORS, 1); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); return error; } *m_head0 = m_head; @@ -1562,7 +1914,7 @@ hn_encap(struct hn_tx_ring *txr, struct hn_txdesc *txd, struct mbuf **m_head0) /* send packet with page buffer */ txr->hn_gpa[0].gpa_page = atop(txd->rndis_pkt_paddr); txr->hn_gpa[0].gpa_ofs = txd->rndis_pkt_paddr & PAGE_MASK; - txr->hn_gpa[0].gpa_len = pktlen; + txr->hn_gpa[0].gpa_len = pkt_hlen; /* * Fill the page buffers with mbuf info after the page @@ -1585,6 +1937,12 @@ done: /* Set the completion routine */ hn_nvs_sendctx_init(&txd->send_ctx, hn_txpkt_done, txd); + /* Update temporary stats for later use. */ + txr->hn_stat_pkts++; + txr->hn_stat_size += m_head->m_pkthdr.len; + if (m_head->m_flags & M_MCAST) + txr->hn_stat_mcasts++; + return 0; } @@ -1596,29 +1954,44 @@ done: static int hn_txpkt(struct ifnet *ifp, struct hn_tx_ring *txr, struct hn_txdesc *txd) { - int error, send_failed = 0; + int error, send_failed = 0, has_bpf; again: - /* - * Make sure that txd is not freed before ETHER_BPF_MTAP. - */ - hn_txdesc_hold(txd); + has_bpf = bpf_peers_present(ifp->if_bpf); + if (has_bpf) { + /* + * Make sure that this txd and any aggregated txds are not + * freed before ETHER_BPF_MTAP. 
+ */ + hn_txdesc_hold(txd); + } error = txr->hn_sendpkt(txr, txd); if (!error) { - ETHER_BPF_MTAP(ifp, txd->m); - if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); + if (has_bpf) { + const struct hn_txdesc *tmp_txd; + + ETHER_BPF_MTAP(ifp, txd->m); + STAILQ_FOREACH(tmp_txd, &txd->agg_list, agg_link) + ETHER_BPF_MTAP(ifp, tmp_txd->m); + } + + if_inc_counter(ifp, IFCOUNTER_OPACKETS, txr->hn_stat_pkts); #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif { if_inc_counter(ifp, IFCOUNTER_OBYTES, - txd->m->m_pkthdr.len); - if (txd->m->m_flags & M_MCAST) - if_inc_counter(ifp, IFCOUNTER_OMCASTS, 1); + txr->hn_stat_size); + if (txr->hn_stat_mcasts != 0) { + if_inc_counter(ifp, IFCOUNTER_OMCASTS, + txr->hn_stat_mcasts); + } } - txr->hn_pkts++; + txr->hn_pkts += txr->hn_stat_pkts; + txr->hn_sends++; } - hn_txdesc_put(txr, txd); + if (has_bpf) + hn_txdesc_put(txr, txd); if (__predict_false(error)) { int freed; @@ -1656,7 +2029,13 @@ again: txr->hn_send_failed++; } - return error; + + /* Reset temporary stats, after this sending is done. */ + txr->hn_stat_size = 0; + txr->hn_stat_pkts = 0; + txr->hn_stat_mcasts = 0; + + return (error); } /* @@ -2041,10 +2420,18 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) } if (ifp->if_flags & IFF_UP) { - if (ifp->if_drv_flags & IFF_DRV_RUNNING) - hn_set_rxfilter(sc); - else + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* + * Caller meight hold mutex, e.g. + * bpf; use busy-wait for the RNDIS + * reply. + */ + HN_NO_SLEEPING(sc); + hn_rxfilter_config(sc); + HN_SLEEPING_OK(sc); + } else { hn_init_locked(sc); + } } else { if (ifp->if_drv_flags & IFF_DRV_RUNNING) hn_stop(sc); @@ -2105,27 +2492,23 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) case SIOCADDMULTI: case SIOCDELMULTI: -#ifdef notyet - /* - * XXX - * Multicast uses mutex, while RNDIS RX filter setting - * sleeps. We workaround this by always enabling - * ALLMULTI. ALLMULTI would actually always be on, even - * if we supported the SIOCADDMULTI/SIOCDELMULTI, since - * we don't support multicast address list configuration - * for this driver. - */ HN_LOCK(sc); if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0) { HN_UNLOCK(sc); break; } - if (ifp->if_drv_flags & IFF_DRV_RUNNING) - hn_set_rxfilter(sc); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + /* + * Multicast uses mutex; use busy-wait for + * the RNDIS reply. + */ + HN_NO_SLEEPING(sc); + hn_rxfilter_config(sc); + HN_SLEEPING_OK(sc); + } HN_UNLOCK(sc); -#endif break; case SIOCSIFMEDIA: @@ -2151,6 +2534,9 @@ hn_stop(struct hn_softc *sc) KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); + /* Disable polling. */ + hn_polling(sc, 0); + /* Clear RUNNING bit _before_ hn_suspend_data() */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); hn_suspend_data(sc); @@ -2176,7 +2562,7 @@ hn_init_locked(struct hn_softc *sc) return; /* Configure RX filter */ - hn_set_rxfilter(sc); + hn_rxfilter_config(sc); /* Clear OACTIVE bit. */ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); @@ -2188,6 +2574,10 @@ hn_init_locked(struct hn_softc *sc) /* Everything is ready; unleash! */ atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING); + + /* Re-enable polling if requested. 
*/ + if (sc->hn_pollhz > 0) + hn_polling(sc, sc->hn_pollhz); } static void @@ -2250,7 +2640,7 @@ hn_lro_ackcnt_sysctl(SYSCTL_HANDLER_ARGS) */ --ackcnt; HN_LOCK(sc); - for (i = 0; i < sc->hn_rx_ring_inuse; ++i) + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) sc->hn_rx_ring[i].hn_lro.lro_ackcnt_lim = ackcnt; HN_UNLOCK(sc); return 0; @@ -2274,7 +2664,7 @@ hn_trust_hcsum_sysctl(SYSCTL_HANDLER_ARGS) return error; HN_LOCK(sc); - for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; if (on) @@ -2342,7 +2732,7 @@ hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) uint64_t stat; stat = 0; - for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; stat += *((uint64_t *)((uint8_t *)rxr + ofs)); } @@ -2352,7 +2742,7 @@ hn_rx_stat_u64_sysctl(SYSCTL_HANDLER_ARGS) return error; /* Zero out this stat. */ - for (i = 0; i < sc->hn_rx_ring_inuse; ++i) { + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { rxr = &sc->hn_rx_ring[i]; *((uint64_t *)((uint8_t *)rxr + ofs)) = 0; } @@ -2396,7 +2786,7 @@ hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) u_long stat; stat = 0; - for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; stat += *((u_long *)((uint8_t *)txr + ofs)); } @@ -2406,7 +2796,7 @@ hn_tx_stat_ulong_sysctl(SYSCTL_HANDLER_ARGS) return error; /* Zero out this stat. */ - for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((u_long *)((uint8_t *)txr + ofs)) = 0; } @@ -2428,7 +2818,7 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) return error; HN_LOCK(sc); - for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) { txr = &sc->hn_tx_ring[i]; *((int *)((uint8_t *)txr + ofs)) = conf; } @@ -2438,6 +2828,119 @@ hn_tx_conf_int_sysctl(SYSCTL_HANDLER_ARGS) } static int +hn_txagg_size_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error, size; + + size = sc->hn_agg_size; + error = sysctl_handle_int(oidp, &size, 0, req); + if (error || req->newptr == NULL) + return (error); + + HN_LOCK(sc); + sc->hn_agg_size = size; + hn_set_txagg(sc); + HN_UNLOCK(sc); + + return (0); +} + +static int +hn_txagg_pkts_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int error, pkts; + + pkts = sc->hn_agg_pkts; + error = sysctl_handle_int(oidp, &pkts, 0, req); + if (error || req->newptr == NULL) + return (error); + + HN_LOCK(sc); + sc->hn_agg_pkts = pkts; + hn_set_txagg(sc); + HN_UNLOCK(sc); + + return (0); +} + +static int +hn_txagg_pktmax_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int pkts; + + pkts = sc->hn_tx_ring[0].hn_agg_pktmax; + return (sysctl_handle_int(oidp, &pkts, 0, req)); +} + +static int +hn_txagg_align_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int align; + + align = sc->hn_tx_ring[0].hn_agg_align; + return (sysctl_handle_int(oidp, &align, 0, req)); +} + +static void +hn_chan_polling(struct vmbus_channel *chan, u_int pollhz) +{ + if (pollhz == 0) + vmbus_chan_poll_disable(chan); + else + vmbus_chan_poll_enable(chan, pollhz); +} + +static void +hn_polling(struct hn_softc *sc, u_int pollhz) +{ + int nsubch = sc->hn_rx_ring_inuse - 1; + + HN_LOCK_ASSERT(sc); + + if (nsubch > 0) { + struct vmbus_channel **subch; + int i; + + subch = vmbus_subchan_get(sc->hn_prichan, nsubch); + for (i = 0; i < nsubch; ++i) + hn_chan_polling(subch[i], pollhz); + vmbus_subchan_rel(subch, nsubch); + } + 
hn_chan_polling(sc->hn_prichan, pollhz); +} + +static int +hn_polling_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int pollhz, error; + + pollhz = sc->hn_pollhz; + error = sysctl_handle_int(oidp, &pollhz, 0, req); + if (error || req->newptr == NULL) + return (error); + + if (pollhz != 0 && + (pollhz < VMBUS_CHAN_POLLHZ_MIN || pollhz > VMBUS_CHAN_POLLHZ_MAX)) + return (EINVAL); + + HN_LOCK(sc); + if (sc->hn_pollhz != pollhz) { + sc->hn_pollhz = pollhz; + if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && + (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED)) + hn_polling(sc, sc->hn_pollhz); + } + HN_UNLOCK(sc); + + return (0); +} + +static int hn_ndis_version_sysctl(SYSCTL_HANDLER_ARGS) { struct hn_softc *sc = arg1; @@ -2546,7 +3049,7 @@ hn_rss_ind_sysctl(SYSCTL_HANDLER_ARGS) goto back; sc->hn_flags |= HN_FLAG_HAS_RSSIND; - hn_rss_ind_fixup(sc, sc->hn_rx_ring_inuse); + hn_rss_ind_fixup(sc); error = hn_rss_reconfig(sc); back: HN_UNLOCK(sc); @@ -2853,7 +3356,10 @@ hn_destroy_rx_data(struct hn_softc *sc) int i; if (sc->hn_rxbuf != NULL) { - hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); + if ((sc->hn_flags & HN_FLAG_RXBUF_REF) == 0) + hyperv_dmamem_free(&sc->hn_rxbuf_dma, sc->hn_rxbuf); + else + device_printf(sc->hn_dev, "RXBUF is referenced\n"); sc->hn_rxbuf = NULL; } @@ -2865,7 +3371,12 @@ hn_destroy_rx_data(struct hn_softc *sc) if (rxr->hn_br == NULL) continue; - hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); + if ((rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) == 0) { + hyperv_dmamem_free(&rxr->hn_br_dma, rxr->hn_br); + } else { + device_printf(sc->hn_dev, + "%dth channel bufring is referenced", i); + } rxr->hn_br = NULL; #if defined(INET) || defined(INET6) @@ -2906,7 +3417,12 @@ hn_tx_ring_create(struct hn_softc *sc, int id) M_WAITOK, &txr->hn_tx_lock); #endif - txr->hn_tx_taskq = sc->hn_tx_taskq; + if (hn_tx_taskq_mode == HN_TX_TASKQ_M_EVTTQ) { + txr->hn_tx_taskq = VMBUS_GET_EVENT_TASKQ( + device_get_parent(dev), dev, HN_RING_IDX2CPU(sc, id)); + } else { + txr->hn_tx_taskq = sc->hn_tx_taskqs[id % hn_tx_taskq_cnt]; + } #ifdef HN_IFSTART_SUPPORT if (hn_use_if_start) { @@ -2980,6 +3496,7 @@ hn_tx_ring_create(struct hn_softc *sc, int id) txd->txr = txr; txd->chim_index = HN_NVS_CHIM_IDX_INVALID; + STAILQ_INIT(&txd->agg_list); /* * Allocate and load RNDIS packet message. @@ -3049,9 +3566,11 @@ hn_tx_ring_create(struct hn_softc *sc, int id) if (txr->hn_tx_sysctl_tree != NULL) { child = SYSCTL_CHILDREN(txr->hn_tx_sysctl_tree); +#ifdef HN_DEBUG SYSCTL_ADD_INT(ctx, child, OID_AUTO, "txdesc_avail", CTLFLAG_RD, &txr->hn_txdesc_avail, 0, "# of available TX descs"); +#endif #ifdef HN_IFSTART_SUPPORT if (!hn_use_if_start) #endif @@ -3063,6 +3582,8 @@ hn_tx_ring_create(struct hn_softc *sc, int id) SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "packets", CTLFLAG_RW, &txr->hn_pkts, "# of packets transmitted"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "sends", + CTLFLAG_RW, &txr->hn_sends, "# of sends"); } } @@ -3084,24 +3605,43 @@ hn_txdesc_dmamap_destroy(struct hn_txdesc *txd) } static void +hn_txdesc_gc(struct hn_tx_ring *txr, struct hn_txdesc *txd) +{ + + KASSERT(txd->refs == 0 || txd->refs == 1, + ("invalid txd refs %d", txd->refs)); + + /* Aggregated txds will be freed by their aggregating txd. 
*/ + if (txd->refs > 0 && (txd->flags & HN_TXD_FLAG_ONAGG) == 0) { + int freed; + + freed = hn_txdesc_put(txr, txd); + KASSERT(freed, ("can't free txdesc")); + } +} + +static void hn_tx_ring_destroy(struct hn_tx_ring *txr) { - struct hn_txdesc *txd; + int i; if (txr->hn_txdesc == NULL) return; -#ifndef HN_USE_TXDESC_BUFRING - while ((txd = SLIST_FIRST(&txr->hn_txlist)) != NULL) { - SLIST_REMOVE_HEAD(&txr->hn_txlist, link); - hn_txdesc_dmamap_destroy(txd); - } -#else - mtx_lock(&txr->hn_tx_lock); - while ((txd = buf_ring_dequeue_sc(txr->hn_txdesc_br)) != NULL) - hn_txdesc_dmamap_destroy(txd); - mtx_unlock(&txr->hn_tx_lock); -#endif + /* + * NOTE: + * Because the freeing of aggregated txds will be deferred + * to the aggregating txd, two passes are used here: + * - The first pass GCes any pending txds. This GC is necessary, + * since if the channels are revoked, hypervisor will not + * deliver send-done for all pending txds. + * - The second pass frees the busdma stuffs, i.e. after all txds + * were freed. + */ + for (i = 0; i < txr->hn_txdesc_cnt; ++i) + hn_txdesc_gc(txr, &txr->hn_txdesc[i]); + for (i = 0; i < txr->hn_txdesc_cnt; ++i) + hn_txdesc_dmamap_destroy(&txr->hn_txdesc[i]); if (txr->hn_tx_data_dtag != NULL) bus_dma_tag_destroy(txr->hn_tx_data_dtag); @@ -3177,6 +3717,11 @@ hn_create_tx_data(struct hn_softc *sc, int ring_cnt) CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_txdma_failed), hn_tx_stat_ulong_sysctl, "LU", "# of TX DMA failure"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_flush_failed", + CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, + __offsetof(struct hn_tx_ring, hn_flush_failed), + hn_tx_stat_ulong_sysctl, "LU", + "# of packet transmission aggregation flush failure"); SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "tx_collapsed", CTLTYPE_ULONG | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, __offsetof(struct hn_tx_ring, hn_tx_collapsed), @@ -3213,6 +3758,17 @@ hn_create_tx_data(struct hn_softc *sc, int ring_cnt) CTLFLAG_RD, &sc->hn_tx_ring_cnt, 0, "# created TX rings"); SYSCTL_ADD_INT(ctx, child, OID_AUTO, "tx_ring_inuse", CTLFLAG_RD, &sc->hn_tx_ring_inuse, 0, "# used TX rings"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "agg_szmax", + CTLFLAG_RD, &sc->hn_tx_ring[0].hn_agg_szmax, 0, + "Applied packet transmission aggregation size"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_pktmax", + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_pktmax_sysctl, "I", + "Applied packet transmission aggregation packets"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "agg_align", + CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0, + hn_txagg_align_sysctl, "I", + "Applied packet transmission aggregation alignment"); return 0; } @@ -3222,7 +3778,7 @@ hn_set_chim_size(struct hn_softc *sc, int chim_size) { int i; - for (i = 0; i < sc->hn_tx_ring_inuse; ++i) + for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_chim_size = chim_size; } @@ -3272,12 +3828,10 @@ hn_fixup_tx_data(struct hn_softc *sc) csum_assist |= CSUM_IP_TCP; if (sc->hn_caps & HN_CAP_UDP4CS) csum_assist |= CSUM_IP_UDP; -#ifdef notyet if (sc->hn_caps & HN_CAP_TCP6CS) csum_assist |= CSUM_IP6_TCP; if (sc->hn_caps & HN_CAP_UDP6CS) csum_assist |= CSUM_IP6_UDP; -#endif for (i = 0; i < sc->hn_tx_ring_cnt; ++i) sc->hn_tx_ring[i].hn_csum_assist = csum_assist; @@ -3298,7 +3852,12 @@ hn_destroy_tx_data(struct hn_softc *sc) int i; if (sc->hn_chim != NULL) { - hyperv_dmamem_free(&sc->hn_chim_dma, sc->hn_chim); + if ((sc->hn_flags & HN_FLAG_CHIM_REF) == 0) { + hyperv_dmamem_free(&sc->hn_chim_dma, 
sc->hn_chim); + } else { + device_printf(sc->hn_dev, + "chimney sending buffer is referenced"); + } sc->hn_chim = NULL; } @@ -3332,18 +3891,20 @@ hn_start_locked(struct hn_tx_ring *txr, int len) { struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; + int sched = 0; KASSERT(hn_use_if_start, ("hn_start_locked is called, when if_start is disabled")); KASSERT(txr == &sc->hn_tx_ring[0], ("not the first TX ring")); mtx_assert(&txr->hn_tx_lock, MA_OWNED); + KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) - return 0; + return (0); if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != IFF_DRV_RUNNING) - return 0; + return (0); while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { struct hn_txdesc *txd; @@ -3361,7 +3922,8 @@ hn_start_locked(struct hn_tx_ring *txr, int len) * following up packets) to tx taskqueue. */ IFQ_DRV_PREPEND(&ifp->if_snd, m_head); - return 1; + sched = 1; + break; } #if defined(INET6) || defined(INET) @@ -3382,21 +3944,50 @@ hn_start_locked(struct hn_tx_ring *txr, int len) break; } - error = hn_encap(txr, txd, &m_head); + error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed */ + KASSERT(txr->hn_agg_txd == NULL, + ("encap failed w/ pending aggregating txdesc")); continue; } - error = hn_txpkt(ifp, txr, txd); - if (__predict_false(error)) { - /* txd is freed, but m_head is not */ - IFQ_DRV_PREPEND(&ifp->if_snd, m_head); - atomic_set_int(&ifp->if_drv_flags, IFF_DRV_OACTIVE); - break; + if (txr->hn_agg_pktleft == 0) { + if (txr->hn_agg_txd != NULL) { + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); + error = hn_flush_txagg(ifp, txr); + if (__predict_false(error)) { + atomic_set_int(&ifp->if_drv_flags, + IFF_DRV_OACTIVE); + break; + } + } else { + KASSERT(m_head != NULL, ("mbuf was freed")); + error = hn_txpkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + IFQ_DRV_PREPEND(&ifp->if_snd, m_head); + atomic_set_int(&ifp->if_drv_flags, + IFF_DRV_OACTIVE); + break; + } + } + } +#ifdef INVARIANTS + else { + KASSERT(txr->hn_agg_txd != NULL, + ("no aggregating txdesc")); + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); } +#endif } - return 0; + + /* Flush pending aggerated transmission. */ + if (txr->hn_agg_txd != NULL) + hn_flush_txagg(ifp, txr); + return (sched); } static void @@ -3473,18 +4064,20 @@ hn_xmit(struct hn_tx_ring *txr, int len) struct hn_softc *sc = txr->hn_sc; struct ifnet *ifp = sc->hn_ifp; struct mbuf *m_head; + int sched = 0; mtx_assert(&txr->hn_tx_lock, MA_OWNED); #ifdef HN_IFSTART_SUPPORT KASSERT(hn_use_if_start == 0, ("hn_xmit is called, when if_start is enabled")); #endif + KASSERT(txr->hn_agg_txd == NULL, ("lingering aggregating txdesc")); if (__predict_false(txr->hn_suspended)) - return 0; + return (0); if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || txr->hn_oactive) - return 0; + return (0); while ((m_head = drbr_peek(ifp, txr->hn_mbuf_br)) != NULL) { struct hn_txdesc *txd; @@ -3497,7 +4090,8 @@ hn_xmit(struct hn_tx_ring *txr, int len) * following up packets) to tx taskqueue. 
*/ drbr_putback(ifp, txr->hn_mbuf_br, m_head); - return 1; + sched = 1; + break; } txd = hn_txdesc_get(txr); @@ -3508,25 +4102,53 @@ hn_xmit(struct hn_tx_ring *txr, int len) break; } - error = hn_encap(txr, txd, &m_head); + error = hn_encap(ifp, txr, txd, &m_head); if (error) { /* Both txd and m_head are freed; discard */ + KASSERT(txr->hn_agg_txd == NULL, + ("encap failed w/ pending aggregating txdesc")); drbr_advance(ifp, txr->hn_mbuf_br); continue; } - error = hn_txpkt(ifp, txr, txd); - if (__predict_false(error)) { - /* txd is freed, but m_head is not */ - drbr_putback(ifp, txr->hn_mbuf_br, m_head); - txr->hn_oactive = 1; - break; + if (txr->hn_agg_pktleft == 0) { + if (txr->hn_agg_txd != NULL) { + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); + error = hn_flush_txagg(ifp, txr); + if (__predict_false(error)) { + txr->hn_oactive = 1; + break; + } + } else { + KASSERT(m_head != NULL, ("mbuf was freed")); + error = hn_txpkt(ifp, txr, txd); + if (__predict_false(error)) { + /* txd is freed, but m_head is not */ + drbr_putback(ifp, txr->hn_mbuf_br, + m_head); + txr->hn_oactive = 1; + break; + } + } + } +#ifdef INVARIANTS + else { + KASSERT(txr->hn_agg_txd != NULL, + ("no aggregating txdesc")); + KASSERT(m_head == NULL, + ("pending mbuf for aggregating txdesc")); } +#endif /* Sent */ drbr_advance(ifp, txr->hn_mbuf_br); } - return 0; + + /* Flush pending aggerated transmission. */ + if (txr->hn_agg_txd != NULL) + hn_flush_txagg(ifp, txr); + return (sched); } static int @@ -3695,7 +4317,7 @@ hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) } /* Bind this channel to a proper CPU. */ - vmbus_chan_cpu_set(chan, (sc->hn_cpu + idx) % mp_ncpus); + vmbus_chan_cpu_set(chan, HN_RING_IDX2CPU(sc, idx)); /* * Open this channel @@ -3706,11 +4328,14 @@ hn_chan_attach(struct hn_softc *sc, struct vmbus_channel *chan) cbr.cbr_rxsz = HN_RXBR_SIZE; error = vmbus_chan_open_br(chan, &cbr, NULL, 0, hn_chan_callback, rxr); if (error) { - if_printf(sc->hn_ifp, "open chan%u failed: %d\n", - vmbus_chan_id(chan), error); - rxr->hn_rx_flags &= ~HN_RX_FLAG_ATTACHED; - if (txr != NULL) - txr->hn_tx_flags &= ~HN_TX_FLAG_ATTACHED; + if (error == EISCONN) { + if_printf(sc->hn_ifp, "bufring is connected after " + "chan%u open failure\n", vmbus_chan_id(chan)); + rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; + } else { + if_printf(sc->hn_ifp, "open chan%u failed: %d\n", + vmbus_chan_id(chan), error); + } } return (error); } @@ -3719,7 +4344,7 @@ static void hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) { struct hn_rx_ring *rxr; - int idx; + int idx, error; idx = vmbus_chan_subidx(chan); @@ -3748,7 +4373,15 @@ hn_chan_detach(struct hn_softc *sc, struct vmbus_channel *chan) * NOTE: * Channel closing does _not_ destroy the target channel. */ - vmbus_chan_close(chan); + error = vmbus_chan_close_direct(chan); + if (error == EISCONN) { + if_printf(sc->hn_ifp, "chan%u bufring is connected " + "after being closed\n", vmbus_chan_id(chan)); + rxr->hn_rx_flags |= HN_RX_FLAG_BR_REF; + } else if (error) { + if_printf(sc->hn_ifp, "chan%u close failed: %d\n", + vmbus_chan_id(chan), error); + } } static int @@ -3758,15 +4391,18 @@ hn_attach_subchans(struct hn_softc *sc) int subchan_cnt = sc->hn_rx_ring_inuse - 1; int i, error = 0; - if (subchan_cnt == 0) - return (0); + KASSERT(subchan_cnt > 0, ("no sub-channels")); /* Attach the sub-channels. 
*/ subchans = vmbus_subchan_get(sc->hn_prichan, subchan_cnt); for (i = 0; i < subchan_cnt; ++i) { - error = hn_chan_attach(sc, subchans[i]); - if (error) - break; + int error1; + + error1 = hn_chan_attach(sc, subchans[i]); + if (error1) { + error = error1; + /* Move on; all channels will be detached later. */ + } } vmbus_subchan_rel(subchans, subchan_cnt); @@ -3878,16 +4514,39 @@ hn_synth_alloc_subchans(struct hn_softc *sc, int *nsubch) return (0); } +static bool +hn_synth_attachable(const struct hn_softc *sc) +{ + int i; + + if (sc->hn_flags & HN_FLAG_ERRORS) + return (false); + + for (i = 0; i < sc->hn_rx_ring_cnt; ++i) { + const struct hn_rx_ring *rxr = &sc->hn_rx_ring[i]; + + if (rxr->hn_rx_flags & HN_RX_FLAG_BR_REF) + return (false); + } + return (true); +} + static int hn_synth_attach(struct hn_softc *sc, int mtu) { +#define ATTACHED_NVS 0x0002 +#define ATTACHED_RNDIS 0x0004 + struct ndis_rssprm_toeplitz *rss = &sc->hn_rss; int error, nsubch, nchan, i; - uint32_t old_caps; + uint32_t old_caps, attached = 0; KASSERT((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0, ("synthetic parts were attached")); + if (!hn_synth_attachable(sc)) + return (ENXIO); + /* Save capabilities for later verification. */ old_caps = sc->hn_caps; sc->hn_caps = 0; @@ -3901,21 +4560,23 @@ hn_synth_attach(struct hn_softc *sc, int mtu) */ error = hn_chan_attach(sc, sc->hn_prichan); if (error) - return (error); + goto failed; /* * Attach NVS. */ error = hn_nvs_attach(sc, mtu); if (error) - return (error); + goto failed; + attached |= ATTACHED_NVS; /* * Attach RNDIS _after_ NVS is attached. */ error = hn_rndis_attach(sc, mtu); if (error) - return (error); + goto failed; + attached |= ATTACHED_RNDIS; /* * Make sure capabilities are not changed. @@ -3923,9 +4584,8 @@ hn_synth_attach(struct hn_softc *sc, int mtu) if (device_is_attached(sc->hn_dev) && old_caps != sc->hn_caps) { if_printf(sc->hn_ifp, "caps mismatch old 0x%08x, new 0x%08x\n", old_caps, sc->hn_caps); - /* Restore old capabilities and abort. */ - sc->hn_caps = old_caps; - return ENXIO; + error = ENXIO; + goto failed; } /* @@ -3938,19 +4598,34 @@ hn_synth_attach(struct hn_softc *sc, int mtu) nsubch = sc->hn_rx_ring_cnt - 1; error = hn_synth_alloc_subchans(sc, &nsubch); if (error) - return (error); + goto failed; + /* NOTE: _Full_ synthetic parts detach is required now. */ + sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; + /* + * Set the # of TX/RX rings that could be used according to + * the # of channels that NVS offered. + */ nchan = nsubch + 1; + hn_set_ring_inuse(sc, nchan); if (nchan == 1) { /* Only the primary channel can be used; done */ goto back; } /* - * Configure RSS key and indirect table _after_ all sub-channels - * are allocated. + * Attach the sub-channels. + * + * NOTE: hn_set_ring_inuse() _must_ have been called. */ + error = hn_attach_subchans(sc); + if (error) + goto failed; + /* + * Configure RSS key and indirect table _after_ all sub-channels + * are attached. + */ if ((sc->hn_flags & HN_FLAG_HAS_RSSKEY) == 0) { /* * RSS key is not set yet; set it to the default RSS key. @@ -3978,34 +4653,38 @@ hn_synth_attach(struct hn_softc *sc, int mtu) * # of usable channels may be changed, so we have to * make sure that all entries in RSS indirect table * are valid. + * + * NOTE: hn_set_ring_inuse() _must_ have been called. */ - hn_rss_ind_fixup(sc, nchan); + hn_rss_ind_fixup(sc); } error = hn_rndis_conf_rss(sc, NDIS_RSS_FLAG_NONE); - if (error) { - /* - * Failed to configure RSS key or indirect table; only - * the primary channel can be used. 
- */ - nchan = 1; - } + if (error) + goto failed; back: /* - * Set the # of TX/RX rings that could be used according to - * the # of channels that NVS offered. + * Fixup transmission aggregation setup. */ - hn_set_ring_inuse(sc, nchan); + hn_set_txagg(sc); + return (0); - /* - * Attach the sub-channels, if any. - */ - error = hn_attach_subchans(sc); - if (error) - return (error); +failed: + if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) { + hn_synth_detach(sc); + } else { + if (attached & ATTACHED_RNDIS) + hn_rndis_detach(sc); + if (attached & ATTACHED_NVS) + hn_nvs_detach(sc); + hn_chan_detach(sc, sc->hn_prichan); + /* Restore old capabilities. */ + sc->hn_caps = old_caps; + } + return (error); - sc->hn_flags |= HN_FLAG_SYNTH_ATTACHED; - return (0); +#undef ATTACHED_RNDIS +#undef ATTACHED_NVS } /* @@ -4016,7 +4695,6 @@ back: static void hn_synth_detach(struct hn_softc *sc) { - HN_LOCK_ASSERT(sc); KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED, ("synthetic parts were not attached")); @@ -4052,10 +4730,17 @@ hn_set_ring_inuse(struct hn_softc *sc, int ring_cnt) } static void -hn_chan_drain(struct vmbus_channel *chan) +hn_chan_drain(struct hn_softc *sc, struct vmbus_channel *chan) { - while (!vmbus_chan_rx_empty(chan) || !vmbus_chan_tx_empty(chan)) + /* + * NOTE: + * The TX bufring will not be drained by the hypervisor, + * if the primary channel is revoked. + */ + while (!vmbus_chan_rx_empty(chan) || + (!vmbus_chan_is_revoked(sc->hn_prichan) && + !vmbus_chan_tx_empty(chan))) pause("waitch", 1); vmbus_chan_intr_drain(chan); } @@ -4064,6 +4749,7 @@ static void hn_suspend_data(struct hn_softc *sc) { struct vmbus_channel **subch = NULL; + struct hn_tx_ring *txr; int i, nsubch; HN_LOCK_ASSERT(sc); @@ -4072,26 +4758,29 @@ hn_suspend_data(struct hn_softc *sc) * Suspend TX. */ for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { - struct hn_tx_ring *txr = &sc->hn_tx_ring[i]; + txr = &sc->hn_tx_ring[i]; mtx_lock(&txr->hn_tx_lock); txr->hn_suspended = 1; mtx_unlock(&txr->hn_tx_lock); /* No one is able send more packets now. */ - /* Wait for all pending sends to finish. */ - while (hn_tx_ring_pending(txr)) + /* + * Wait for all pending sends to finish. + * + * NOTE: + * We will _not_ receive all pending send-done, if the + * primary channel is revoked. + */ + while (hn_tx_ring_pending(txr) && + !vmbus_chan_is_revoked(sc->hn_prichan)) pause("hnwtx", 1 /* 1 tick */); - - taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); - taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); } /* * Disable RX by clearing RX filter. */ - sc->hn_rx_filter = NDIS_PACKET_TYPE_NONE; - hn_rndis_set_rxfilter(sc, sc->hn_rx_filter); + hn_set_rxfilter(sc, NDIS_PACKET_TYPE_NONE); /* * Give RNDIS enough time to flush all pending data packets. @@ -4107,12 +4796,27 @@ hn_suspend_data(struct hn_softc *sc) if (subch != NULL) { for (i = 0; i < nsubch; ++i) - hn_chan_drain(subch[i]); + hn_chan_drain(sc, subch[i]); } - hn_chan_drain(sc->hn_prichan); + hn_chan_drain(sc, sc->hn_prichan); if (subch != NULL) vmbus_subchan_rel(subch, nsubch); + + /* + * Drain any pending TX tasks. + * + * NOTE: + * The above hn_chan_drain() can dispatch TX tasks, so the TX + * tasks will have to be drained _after_ the above hn_chan_drain() + * calls. + */ + for (i = 0; i < sc->hn_tx_ring_inuse; ++i) { + txr = &sc->hn_tx_ring[i]; + + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_tx_task); + taskqueue_drain(txr->hn_tx_taskq, &txr->hn_txeof_task); + } } static void @@ -4148,6 +4852,9 @@ static void hn_suspend(struct hn_softc *sc) { + /* Disable polling. 
*/ + hn_polling(sc, 0); + if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_suspend_data(sc); hn_suspend_mgmt(sc); @@ -4180,7 +4887,7 @@ hn_resume_data(struct hn_softc *sc) /* * Re-enable RX. */ - hn_set_rxfilter(sc); + hn_rxfilter_config(sc); /* * Make sure to clear suspend status on "all" TX rings, @@ -4240,6 +4947,13 @@ hn_resume(struct hn_softc *sc) if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) hn_resume_data(sc); hn_resume_mgmt(sc); + + /* + * Re-enable polling if this interface is running and + * the polling is requested. + */ + if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) && sc->hn_pollhz > 0) + hn_polling(sc, sc->hn_pollhz); } static void @@ -4758,27 +5472,42 @@ hn_chan_callback(struct vmbus_channel *chan, void *xrxr) static void hn_tx_taskq_create(void *arg __unused) { + int i; + + /* + * Fix the # of TX taskqueues. + */ + if (hn_tx_taskq_cnt <= 0) + hn_tx_taskq_cnt = 1; + else if (hn_tx_taskq_cnt > mp_ncpus) + hn_tx_taskq_cnt = mp_ncpus; + + /* + * Fix the TX taskqueue mode. + */ + switch (hn_tx_taskq_mode) { + case HN_TX_TASKQ_M_INDEP: + case HN_TX_TASKQ_M_GLOBAL: + case HN_TX_TASKQ_M_EVTTQ: + break; + default: + hn_tx_taskq_mode = HN_TX_TASKQ_M_INDEP; + break; + } if (vm_guest != VM_GUEST_HV) return; - if (!hn_share_tx_taskq) + if (hn_tx_taskq_mode != HN_TX_TASKQ_M_GLOBAL) return; - hn_tx_taskq = taskqueue_create("hn_tx", M_WAITOK, - taskqueue_thread_enqueue, &hn_tx_taskq); - taskqueue_start_threads(&hn_tx_taskq, 1, PI_NET, "hn tx"); - if (hn_bind_tx_taskq >= 0) { - int cpu = hn_bind_tx_taskq; - struct task cpuset_task; - cpuset_t cpu_set; - - if (cpu > mp_ncpus - 1) - cpu = mp_ncpus - 1; - CPU_SETOF(cpu, &cpu_set); - TASK_INIT(&cpuset_task, 0, hn_cpuset_setthread_task, &cpu_set); - taskqueue_enqueue(hn_tx_taskq, &cpuset_task); - taskqueue_drain(hn_tx_taskq, &cpuset_task); + hn_tx_taskque = malloc(hn_tx_taskq_cnt * sizeof(struct taskqueue *), + M_DEVBUF, M_WAITOK); + for (i = 0; i < hn_tx_taskq_cnt; ++i) { + hn_tx_taskque[i] = taskqueue_create("hn_tx", M_WAITOK, + taskqueue_thread_enqueue, &hn_tx_taskque[i]); + taskqueue_start_threads(&hn_tx_taskque[i], 1, PI_NET, + "hn tx%d", i); } } SYSINIT(hn_txtq_create, SI_SUB_DRIVERS, SI_ORDER_SECOND, @@ -4788,8 +5517,13 @@ static void hn_tx_taskq_destroy(void *arg __unused) { - if (hn_tx_taskq != NULL) - taskqueue_free(hn_tx_taskq); + if (hn_tx_taskque != NULL) { + int i; + + for (i = 0; i < hn_tx_taskq_cnt; ++i) + taskqueue_free(hn_tx_taskque[i]); + free(hn_tx_taskque, M_DEVBUF); + } } SYSUNINIT(hn_txtq_destroy, SI_SUB_DRIVERS, SI_ORDER_SECOND, hn_tx_taskq_destroy, NULL); |
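The bulk of this change is the per-ring packet-transmission aggregation over the chimney sending buffer: hn_try_txagg() and hn_flush_txagg(), driven by the budgets hn_agg_pktleft/hn_agg_szleft that hn_set_txagg() derives from the host-offered limits clamped to the chimney buffer size. The fragment below is a minimal standalone sketch of that bookkeeping only, not the driver code: the agg_state struct and the agg_append()/agg_flush() helpers are invented names, and the real driver additionally links the aggregated hn_txdesc structures onto the aggregating one, pads the previous RNDIS packet's rm_len, and grows chim_size accordingly.

/*
 * Minimal sketch of the TX aggregation bookkeeping, with illustrative
 * names only (agg_state, agg_append, agg_flush); see hn_try_txagg()
 * and hn_flush_txagg() in the diff for the real logic.
 */
#include <stdint.h>
#include <stdio.h>

#define ROUNDUP2(x, a)	(((x) + ((a) - 1)) & ~((a) - 1))

struct agg_state {
	uint32_t prev_off;	/* offset of the previous RNDIS packet */
	uint32_t prev_len;	/* its length, already aligned */
	int	 pktleft;	/* remaining packet budget */
	int	 szleft;	/* remaining size budget, in bytes */
	uint32_t align;		/* RNDIS-mandated packet alignment */
};

/* Hand the aggregated chimney buffer to the host and reset the budgets. */
static void
agg_flush(struct agg_state *st)
{
	/* The driver would call hn_txpkt() on the aggregating txdesc here. */
	st->pktleft = 0;
	st->szleft = 0;
	st->prev_off = 0;
	st->prev_len = 0;
}

/*
 * Try to append one packet of 'pktsize' bytes (RNDIS header included,
 * rounded up to the alignment).  Return the offset to copy it to, or -1
 * if the caller has to flush and start a fresh aggregation.
 */
static int
agg_append(struct agg_state *st, int pktsize)
{
	uint32_t off;

	if (st->pktleft < 1 || st->szleft <= pktsize)
		return (-1);

	/* The new packet starts right after the aligned previous one. */
	off = st->prev_off + ROUNDUP2(st->prev_len, st->align);

	st->prev_off = off;
	st->prev_len = (uint32_t)pktsize;
	st->pktleft--;
	st->szleft -= pktsize;
	return ((int)off);
}

int
main(void)
{
	/*
	 * Budgets as hn_set_txagg() might have derived them: 8 packets,
	 * 4KB, 8-byte alignment; the first packet already sits at offset 0.
	 */
	struct agg_state st = {
		.prev_off = 0, .prev_len = 256,
		.pktleft = 8 - 1, .szleft = 4096 - 256, .align = 8,
	};
	int off = agg_append(&st, 512);

	if (off < 0)
		agg_flush(&st);
	else
		printf("next packet at offset %d, %d packets left\n",
		    off, st.pktleft);
	return (0);
}

In the diff itself the same flush also happens proactively: once hn_agg_szleft drops to HN_PKTSIZE_MIN(align) or below, hn_try_txagg() zeroes the packet budget so the next transmit path flushes the aggregating txdesc instead of trying to squeeze in another packet.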