summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorsephe <sephe@FreeBSD.org>2017-08-21 05:25:30 +0000
committersephe <sephe@FreeBSD.org>2017-08-21 05:25:30 +0000
commitca9be1048e1114e0e543779418164a706bcbc1ca (patch)
tree9eca228e481acfeb5ada609b4a1610c691ebd774
parent0bb59cd0c7901cd78dac6d35eddff240cbb564c3 (diff)
downloadFreeBSD-src-ca9be1048e1114e0e543779418164a706bcbc1ca.zip
FreeBSD-src-ca9be1048e1114e0e543779418164a706bcbc1ca.tar.gz
MFC 322299,322483,322485-322487
322299 hyperv/hn: Implement transparent mode network VF. How network VF works with hn(4) on Hyper-V in transparent mode: - Each network VF has a corresponding hn(4). - The network VF and its corresponding hn(4) have the same hardware address. - Once the network VF is attached, the corresponding hn(4) waits several seconds to make sure that the network VF attach routine completes, then: o Set the intersection of the network VF's if_capabilities and the corresponding hn(4)'s if_capabilities to the corresponding hn(4)'s if_capabilities. And adjust the corresponding hn(4) if_capable and if_hwassist accordingly. (*) o Make sure that the corresponding hn(4)'s TSO parameters meet the constraints posed by both the network VF and the corresponding hn(4). (*) o The network VF's if_input is overridden. The overriding if_input changes the input packet's rcvif to the corresponding hn(4). The network layers are tricked into thinking that all packets are received by the corresponding hn(4). o If the corresponding hn(4) was brought up, bring up the network VF. The transmissions dispatched to the corresponding hn(4) are redispatched to the network VF. o Bringing down the corresponding hn(4) also brings down the network VF. o All IOCTLs issued to the corresponding hn(4) are passed through to the network VF; the corresponding hn(4) changes its internal state if necessary. o The media status of the corresponding hn(4) solely relies on the network VF. o If there are multicast filters on the corresponding hn(4), allmulti will be enabled on the network VF. (**) - Once the network VF is detached, undo all changes made to the corresponding hn(4) in the above items. NOTE: No operation should be issued directly to the network VF, if the network VF transparent mode is enabled. The network VF transparent mode can be enabled by setting tunable hw.hn.vf_transparent to 1. The network VF transparent mode is _not_ enabled by default, as of this commit. 
The benefit of the network VF transparent mode is that the network VF attachment and detachment are transparent to all network layers; e.g. live migration detaches and reattaches the network VF. The major drawbacks of the network VF transparent mode: - The netmap(4) support is lost, even if the VF supports it. - ALTQ does not work, since the if_start method cannot be properly supported. (*) These decisions were made so that things will not be messed up too much during the transition period. (**) This does _not_ need to go through the fancy multicast filter management stuff like what vlan(4) has, at least currently: - As of this writing, multicast does not work in Azure. - As of this writing, multicast packets go through the corresponding hn(4). Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D11803 322483 hyperv/hn: Update VF's ibytes properly under transparent VF mode. While I'm here, add a comment about why updating the VF's imcast stat is not necessary. Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D11948 322485 hyperv/hn: Fix/enhance receiving path when VF is activated. - Update hn(4)'s stats properly for non-transparent mode VF. - Allow BPF tapping to hn(4) for non-transparent mode VF. - Don't setup mbuf hash, if 'options RSS' is set. In Azure, when VF is activated, TCP SYN and SYN|ACK go through hn(4) while the rest of segments and ACKs belonging to the same TCP 4-tuple go through the VF. So don't setup mbuf hash, if a VF is activated and 'options RSS' is not enabled. hn(4) and the VF may use neither the same RSS hash key nor the same RSS hash function, so the hash value for packets belonging to the same flow could be different! - Disable LRO. hn(4) will only receive broadcast packets, multicast packets, TCP SYN and SYN|ACK (in Azure), LRO is useless for these packet types. For non-transparent mode, we definitely _cannot_ enable LRO at all, since the LRO flush will use hn(4) as the receiving interface; i.e. 
hn_ifp->if_input(hn_ifp, m). While I'm here, remove unapplied comment and minor style change. Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D11978 322486 hyperv/hn: Minor cleanup Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D11979 322487 hyperv/hn: Re-set datapath after synthetic parts reattached. Do this even for non-transparent mode VF. Better safe than sorry. Sponsored by: Microsoft Differential Revision: https://reviews.freebsd.org/D11981 Approved by: re (delphij)
-rw-r--r--sys/dev/hyperv/netvsc/if_hn.c983
-rw-r--r--sys/dev/hyperv/netvsc/if_hnreg.h2
-rw-r--r--sys/dev/hyperv/netvsc/if_hnvar.h29
3 files changed, 947 insertions, 67 deletions
diff --git a/sys/dev/hyperv/netvsc/if_hn.c b/sys/dev/hyperv/netvsc/if_hn.c
index 8661df3..341a287 100644
--- a/sys/dev/hyperv/netvsc/if_hn.c
+++ b/sys/dev/hyperv/netvsc/if_hn.c
@@ -122,6 +122,8 @@ __FBSDID("$FreeBSD$");
#define HN_VFMAP_SIZE_DEF 8
+#define HN_XPNT_VF_ATTWAIT_MIN 2 /* seconds */
+
/* YYY should get it from the underlying channel */
#define HN_TX_DESC_CNT 512
@@ -258,6 +260,7 @@ static void hn_ifnet_event(void *, struct ifnet *, int);
static void hn_ifaddr_event(void *, struct ifnet *);
static void hn_ifnet_attevent(void *, struct ifnet *);
static void hn_ifnet_detevent(void *, struct ifnet *);
+static void hn_ifnet_lnkevent(void *, struct ifnet *, int);
static bool hn_ismyvf(const struct hn_softc *,
const struct ifnet *);
@@ -265,6 +268,17 @@ static void hn_rxvf_change(struct hn_softc *,
struct ifnet *, bool);
static void hn_rxvf_set(struct hn_softc *, struct ifnet *);
static void hn_rxvf_set_task(void *, int);
+static void hn_xpnt_vf_input(struct ifnet *, struct mbuf *);
+static int hn_xpnt_vf_iocsetflags(struct hn_softc *);
+static int hn_xpnt_vf_iocsetcaps(struct hn_softc *,
+ struct ifreq *);
+static void hn_xpnt_vf_saveifflags(struct hn_softc *);
+static bool hn_xpnt_vf_isready(struct hn_softc *);
+static void hn_xpnt_vf_setready(struct hn_softc *);
+static void hn_xpnt_vf_init_taskfunc(void *, int);
+static void hn_xpnt_vf_init(struct hn_softc *);
+static void hn_xpnt_vf_setenable(struct hn_softc *);
+static void hn_xpnt_vf_setdisable(struct hn_softc *, bool);
static int hn_rndis_rxinfo(const void *, int,
struct hn_rxinfo *);
@@ -315,6 +329,8 @@ static int hn_vf_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_rxvf_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_vflist_sysctl(SYSCTL_HANDLER_ARGS);
static int hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS);
+static int hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS);
static void hn_stop(struct hn_softc *, bool);
static void hn_init_locked(struct hn_softc *);
@@ -345,6 +361,7 @@ static void hn_disable_rx(struct hn_softc *);
static void hn_drain_rxtx(struct hn_softc *, int);
static void hn_polling(struct hn_softc *, u_int);
static void hn_chan_polling(struct vmbus_channel *, u_int);
+static void hn_mtu_change_fixup(struct hn_softc *);
static void hn_update_link_status(struct hn_softc *);
static void hn_change_network(struct hn_softc *);
@@ -520,6 +537,22 @@ SYSCTL_PROC(_hw_hn, OID_AUTO, vflist, CTLFLAG_RD | CTLTYPE_STRING,
SYSCTL_PROC(_hw_hn, OID_AUTO, vfmap, CTLFLAG_RD | CTLTYPE_STRING,
0, 0, hn_vfmap_sysctl, "A", "VF mapping");
+/* Transparent VF */
+static int hn_xpnt_vf = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_transparent, CTLFLAG_RDTUN,
+ &hn_xpnt_vf, 0, "Transparent VF mod");
+
+/* Accurate BPF support for Transparent VF */
+static int hn_xpnt_vf_accbpf = 0;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_accbpf, CTLFLAG_RDTUN,
+ &hn_xpnt_vf_accbpf, 0, "Accurate BPF for transparent VF");
+
+/* Extra wait for transparent VF attach routine; unit: seconds. */
+static int hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
+SYSCTL_INT(_hw_hn, OID_AUTO, vf_xpnt_attwait, CTLFLAG_RWTUN,
+ &hn_xpnt_vf_attwait, 0,
+ "Extra wait for transparent VF attach routing; unit: seconds");
+
static u_int hn_cpu_index; /* next CPU for channel */
static struct taskqueue **hn_tx_taskque;/* shared TX taskqueues */
@@ -536,6 +569,12 @@ hn_rss_key_default[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa
};
+static const struct hyperv_guid hn_guid = {
+ .hv_guid = {
+ 0x63, 0x51, 0x61, 0xf8, 0x3e, 0xdf, 0xc5, 0x46,
+ 0x91, 0x3f, 0xf2, 0xd2, 0xf9, 0x65, 0xed, 0x0e }
+};
+
static device_method_t hn_methods[] = {
/* Device interface */
DEVMETHOD(device_probe, hn_probe),
@@ -796,8 +835,12 @@ hn_rxfilter_config(struct hn_softc *sc)
HN_LOCK_ASSERT(sc);
- if ((ifp->if_flags & IFF_PROMISC) ||
- (sc->hn_flags & HN_FLAG_RXVF)) {
+ /*
+ * If the non-transparent mode VF is activated, we don't know how
+ * its RX filter is configured, so stick the synthetic device in
+ * the promiscuous mode.
+ */
+ if ((ifp->if_flags & IFF_PROMISC) || (sc->hn_flags & HN_FLAG_RXVF)) {
filter = NDIS_PACKET_TYPE_PROMISCUOUS;
} else {
filter = NDIS_PACKET_TYPE_DIRECTED;
@@ -1073,7 +1116,7 @@ hn_rxvf_change(struct hn_softc *sc, struct ifnet *ifp, bool rxvf)
}
hn_nvs_set_datapath(sc,
- rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTHETIC);
+ rxvf ? HN_NVS_DATAPATH_VF : HN_NVS_DATAPATH_SYNTH);
hn_rxvf_set(sc, rxvf ? ifp : NULL);
@@ -1113,6 +1156,375 @@ hn_ifaddr_event(void *arg, struct ifnet *ifp)
hn_rxvf_change(arg, ifp, ifp->if_flags & IFF_UP);
}
+static int
+hn_xpnt_vf_iocsetcaps(struct hn_softc *sc, struct ifreq *ifr)
+{
+ struct ifnet *ifp, *vf_ifp;
+ uint64_t tmp;
+ int error;
+
+ HN_LOCK_ASSERT(sc);
+ ifp = sc->hn_ifp;
+ vf_ifp = sc->hn_vf_ifp;
+
+ /*
+ * Fix up requested capabilities w/ supported capabilities,
+ * since the supported capabilities could have been changed.
+ */
+ ifr->ifr_reqcap &= ifp->if_capabilities;
+ /* Pass SIOCSIFCAP to VF. */
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFCAP, (caddr_t)ifr);
+
+ /*
+ * NOTE:
+ * The error will be propagated to the callers, however, it
+ * is _not_ useful here.
+ */
+
+ /*
+ * Merge VF's enabled capabilities.
+ */
+ ifp->if_capenable = vf_ifp->if_capenable & ifp->if_capabilities;
+
+ tmp = vf_ifp->if_hwassist & HN_CSUM_IP_HWASSIST(sc);
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ tmp = vf_ifp->if_hwassist & HN_CSUM_IP6_HWASSIST(sc);
+ if (ifp->if_capenable & IFCAP_TXCSUM_IPV6)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ tmp = vf_ifp->if_hwassist & CSUM_IP_TSO;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ tmp = vf_ifp->if_hwassist & CSUM_IP6_TSO;
+ if (ifp->if_capenable & IFCAP_TSO6)
+ ifp->if_hwassist |= tmp;
+ else
+ ifp->if_hwassist &= ~tmp;
+
+ return (error);
+}
+
+static int
+hn_xpnt_vf_iocsetflags(struct hn_softc *sc)
+{
+ struct ifnet *vf_ifp;
+ struct ifreq ifr;
+
+ HN_LOCK_ASSERT(sc);
+ vf_ifp = sc->hn_vf_ifp;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+ ifr.ifr_flags = vf_ifp->if_flags & 0xffff;
+ ifr.ifr_flagshigh = vf_ifp->if_flags >> 16;
+ return (vf_ifp->if_ioctl(vf_ifp, SIOCSIFFLAGS, (caddr_t)&ifr));
+}
+
+static void
+hn_xpnt_vf_saveifflags(struct hn_softc *sc)
+{
+ struct ifnet *ifp = sc->hn_ifp;
+ int allmulti = 0;
+
+ HN_LOCK_ASSERT(sc);
+
+ /* XXX vlan(4) style mcast addr maintenance */
+ if (!TAILQ_EMPTY(&ifp->if_multiaddrs))
+ allmulti = IFF_ALLMULTI;
+
+ /* Always set the VF's if_flags */
+ sc->hn_vf_ifp->if_flags = ifp->if_flags | allmulti;
+}
+
+static void
+hn_xpnt_vf_input(struct ifnet *vf_ifp, struct mbuf *m)
+{
+ struct rm_priotracker pt;
+ struct ifnet *hn_ifp = NULL;
+ struct mbuf *mn;
+
+ /*
+ * XXX racy, if hn(4) ever detached.
+ */
+ rm_rlock(&hn_vfmap_lock, &pt);
+ if (vf_ifp->if_index < hn_vfmap_size)
+ hn_ifp = hn_vfmap[vf_ifp->if_index];
+ rm_runlock(&hn_vfmap_lock, &pt);
+
+ if (hn_ifp != NULL) {
+ for (mn = m; mn != NULL; mn = mn->m_nextpkt) {
+ /*
+ * Allow tapping on the VF.
+ */
+ ETHER_BPF_MTAP(vf_ifp, mn);
+
+ /*
+ * Update VF stats.
+ */
+ if ((vf_ifp->if_capenable & IFCAP_HWSTATS) == 0) {
+ if_inc_counter(vf_ifp, IFCOUNTER_IBYTES,
+ mn->m_pkthdr.len);
+ }
+ /*
+ * XXX IFCOUNTER_IMCAST
+ * This stat updating is kinda invasive, since it
+ * requires two checks on the mbuf: the length check
+ * and the ethernet header check. As of this write,
+ * all multicast packets go directly to hn(4), which
+ * makes imcast stat updating in the VF a try in vain.
+ */
+
+ /*
+ * Fix up rcvif and increase hn(4)'s ipackets.
+ */
+ mn->m_pkthdr.rcvif = hn_ifp;
+ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
+ }
+ /*
+ * Go through hn(4)'s if_input.
+ */
+ hn_ifp->if_input(hn_ifp, m);
+ } else {
+ /*
+ * In the middle of the transition; free this
+ * mbuf chain.
+ */
+ while (m != NULL) {
+ mn = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = mn;
+ }
+ }
+}
+
+static void
+hn_mtu_change_fixup(struct hn_softc *sc)
+{
+ struct ifnet *ifp;
+
+ HN_LOCK_ASSERT(sc);
+ ifp = sc->hn_ifp;
+
+ hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
+#if __FreeBSD_version >= 1100099
+ if (sc->hn_rx_ring[0].hn_lro.lro_length_lim < HN_LRO_LENLIM_MIN(ifp))
+ hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
+#endif
+}
+
+static void
+hn_xpnt_vf_setready(struct hn_softc *sc)
+{
+ struct ifnet *ifp, *vf_ifp;
+ struct ifreq ifr;
+
+ HN_LOCK_ASSERT(sc);
+ ifp = sc->hn_ifp;
+ vf_ifp = sc->hn_vf_ifp;
+
+ /*
+ * Mark the VF ready.
+ */
+ sc->hn_vf_rdytick = 0;
+
+ /*
+ * Save information for restoration.
+ */
+ sc->hn_saved_caps = ifp->if_capabilities;
+ sc->hn_saved_tsomax = ifp->if_hw_tsomax;
+ sc->hn_saved_tsosegcnt = ifp->if_hw_tsomaxsegcount;
+ sc->hn_saved_tsosegsz = ifp->if_hw_tsomaxsegsize;
+
+ /*
+ * Intersect supported/enabled capabilities.
+ *
+ * NOTE:
+ * if_hwassist is not changed here.
+ */
+ ifp->if_capabilities &= vf_ifp->if_capabilities;
+ ifp->if_capenable &= ifp->if_capabilities;
+
+ /*
+ * Fix TSO settings.
+ */
+ if (ifp->if_hw_tsomax > vf_ifp->if_hw_tsomax)
+ ifp->if_hw_tsomax = vf_ifp->if_hw_tsomax;
+ if (ifp->if_hw_tsomaxsegcount > vf_ifp->if_hw_tsomaxsegcount)
+ ifp->if_hw_tsomaxsegcount = vf_ifp->if_hw_tsomaxsegcount;
+ if (ifp->if_hw_tsomaxsegsize > vf_ifp->if_hw_tsomaxsegsize)
+ ifp->if_hw_tsomaxsegsize = vf_ifp->if_hw_tsomaxsegsize;
+
+ /*
+ * Change VF's enabled capabilities.
+ */
+ memset(&ifr, 0, sizeof(ifr));
+ strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+ ifr.ifr_reqcap = ifp->if_capenable;
+ hn_xpnt_vf_iocsetcaps(sc, &ifr);
+
+ if (ifp->if_mtu != ETHERMTU) {
+ int error;
+
+ /*
+ * Change VF's MTU.
+ */
+ memset(&ifr, 0, sizeof(ifr));
+ strlcpy(ifr.ifr_name, vf_ifp->if_xname, sizeof(ifr.ifr_name));
+ ifr.ifr_mtu = ifp->if_mtu;
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU, (caddr_t)&ifr);
+ if (error) {
+ if_printf(ifp, "%s SIOCSIFMTU %lu failed\n",
+ vf_ifp->if_xname, ifp->if_mtu);
+ if (ifp->if_mtu > ETHERMTU) {
+ if_printf(ifp, "change MTU to %d\n", ETHERMTU);
+
+ /*
+ * XXX
+ * No need to adjust the synthetic parts' MTU;
+ * failure of the adjustment will cause us
+ * infinite headache.
+ */
+ ifp->if_mtu = ETHERMTU;
+ hn_mtu_change_fixup(sc);
+ }
+ }
+ }
+}
+
+static bool
+hn_xpnt_vf_isready(struct hn_softc *sc)
+{
+
+ HN_LOCK_ASSERT(sc);
+
+ if (!hn_xpnt_vf || sc->hn_vf_ifp == NULL)
+ return (false);
+
+ if (sc->hn_vf_rdytick == 0)
+ return (true);
+
+ if (sc->hn_vf_rdytick > ticks)
+ return (false);
+
+ /* Mark VF as ready. */
+ hn_xpnt_vf_setready(sc);
+ return (true);
+}
+
+static void
+hn_xpnt_vf_setenable(struct hn_softc *sc)
+{
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+ rm_wlock(&sc->hn_vf_lock);
+ sc->hn_xvf_flags |= HN_XVFFLAG_ENABLED;
+ rm_wunlock(&sc->hn_vf_lock);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_rx_flags |= HN_RX_FLAG_XPNT_VF;
+}
+
+static void
+hn_xpnt_vf_setdisable(struct hn_softc *sc, bool clear_vf)
+{
+ int i;
+
+ HN_LOCK_ASSERT(sc);
+
+ /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+ rm_wlock(&sc->hn_vf_lock);
+ sc->hn_xvf_flags &= ~HN_XVFFLAG_ENABLED;
+ if (clear_vf)
+ sc->hn_vf_ifp = NULL;
+ rm_wunlock(&sc->hn_vf_lock);
+
+ for (i = 0; i < sc->hn_rx_ring_cnt; ++i)
+ sc->hn_rx_ring[i].hn_rx_flags &= ~HN_RX_FLAG_XPNT_VF;
+}
+
+static void
+hn_xpnt_vf_init(struct hn_softc *sc)
+{
+ int error;
+
+ HN_LOCK_ASSERT(sc);
+
+ KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
+ ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
+
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "try bringing up %s\n",
+ sc->hn_vf_ifp->if_xname);
+ }
+
+ /*
+ * Bring the VF up.
+ */
+ hn_xpnt_vf_saveifflags(sc);
+ sc->hn_vf_ifp->if_flags |= IFF_UP;
+ error = hn_xpnt_vf_iocsetflags(sc);
+ if (error) {
+ if_printf(sc->hn_ifp, "bringing up %s failed: %d\n",
+ sc->hn_vf_ifp->if_xname, error);
+ return;
+ }
+
+ /*
+ * NOTE:
+ * Datapath setting must happen _after_ bringing the VF up.
+ */
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
+
+ /* Mark transparent mode VF as enabled. */
+ hn_xpnt_vf_setenable(sc);
+}
+
+static void
+hn_xpnt_vf_init_taskfunc(void *xsc, int pending __unused)
+{
+ struct hn_softc *sc = xsc;
+
+ HN_LOCK(sc);
+
+ if ((sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) == 0)
+ goto done;
+ if (sc->hn_vf_ifp == NULL)
+ goto done;
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ goto done;
+
+ if (sc->hn_vf_rdytick != 0) {
+ /* Mark VF as ready. */
+ hn_xpnt_vf_setready(sc);
+ }
+
+ if (sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ /*
+ * Delayed VF initialization.
+ */
+ if (bootverbose) {
+ if_printf(sc->hn_ifp, "delayed initialize %s\n",
+ sc->hn_vf_ifp->if_xname);
+ }
+ hn_xpnt_vf_init(sc);
+ }
+done:
+ HN_UNLOCK(sc);
+}
+
static void
hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
{
@@ -1132,6 +1544,16 @@ hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
goto done;
}
+ if (hn_xpnt_vf && ifp->if_start != NULL) {
+ /*
+ * ifnet.if_start is _not_ supported by transparent
+ * mode VF; mainly due to the IFF_DRV_OACTIVE flag.
+ */
+ if_printf(sc->hn_ifp, "%s uses if_start, which is unsupported "
+ "in transparent VF mode.\n", ifp->if_xname);
+ goto done;
+ }
+
rm_wlock(&hn_vfmap_lock);
if (ifp->if_index >= hn_vfmap_size) {
@@ -1155,7 +1577,37 @@ hn_ifnet_attevent(void *xsc, struct ifnet *ifp)
rm_wunlock(&hn_vfmap_lock);
+ /* NOTE: hn_vf_lock for hn_transmit()/hn_qflush() */
+ rm_wlock(&sc->hn_vf_lock);
+ KASSERT((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) == 0,
+ ("%s: transparent VF was enabled", sc->hn_ifp->if_xname));
sc->hn_vf_ifp = ifp;
+ rm_wunlock(&sc->hn_vf_lock);
+
+ if (hn_xpnt_vf) {
+ int wait_ticks;
+
+ /*
+ * Install if_input for vf_ifp, which does vf_ifp -> hn_ifp.
+ * Save vf_ifp's current if_input for later restoration.
+ */
+ sc->hn_vf_input = ifp->if_input;
+ ifp->if_input = hn_xpnt_vf_input;
+
+ /*
+ * Stop link status management; use the VF's.
+ */
+ hn_suspend_mgmt(sc);
+
+ /*
+ * Give the VF some time to complete its attach routine.
+ */
+ wait_ticks = hn_xpnt_vf_attwait * hz;
+ sc->hn_vf_rdytick = ticks + wait_ticks;
+
+ taskqueue_enqueue_timeout(sc->hn_vf_taskq, &sc->hn_vf_init,
+ wait_ticks);
+ }
done:
HN_UNLOCK(sc);
}
@@ -1173,7 +1625,58 @@ hn_ifnet_detevent(void *xsc, struct ifnet *ifp)
if (!hn_ismyvf(sc, ifp))
goto done;
- sc->hn_vf_ifp = NULL;
+ if (hn_xpnt_vf) {
+ /*
+ * Make sure that the delayed initialization is not running.
+ *
+ * NOTE:
+ * - This lock _must_ be released, since the hn_vf_init task
+ * will try holding this lock.
+ * - It is safe to release this lock here, since the
+ * hn_ifnet_attevent() is interlocked by the hn_vf_ifp.
+ *
+ * XXX racy, if hn(4) ever detached.
+ */
+ HN_UNLOCK(sc);
+ taskqueue_drain_timeout(sc->hn_vf_taskq, &sc->hn_vf_init);
+ HN_LOCK(sc);
+
+ KASSERT(sc->hn_vf_input != NULL, ("%s VF input is not saved",
+ sc->hn_ifp->if_xname));
+ ifp->if_input = sc->hn_vf_input;
+ sc->hn_vf_input = NULL;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
+
+ if (sc->hn_vf_rdytick == 0) {
+ /*
+ * The VF was ready; restore some settings.
+ */
+ sc->hn_ifp->if_capabilities = sc->hn_saved_caps;
+ /*
+ * NOTE:
+ * There is _no_ need to fixup if_capenable and
+ * if_hwassist, since the if_capabilities before
+ * restoration was an intersection of the VF's
+ * if_capabilities and the synthetic device's
+ * if_capabilities.
+ */
+ sc->hn_ifp->if_hw_tsomax = sc->hn_saved_tsomax;
+ sc->hn_ifp->if_hw_tsomaxsegcount =
+ sc->hn_saved_tsosegcnt;
+ sc->hn_ifp->if_hw_tsomaxsegsize = sc->hn_saved_tsosegsz;
+ }
+
+ /*
+ * Resume link status management, which was suspended
+ * by hn_ifnet_attevent().
+ */
+ hn_resume_mgmt(sc);
+ }
+
+ /* Mark transparent mode VF as disabled. */
+ hn_xpnt_vf_setdisable(sc, true /* clear hn_vf_ifp */);
rm_wlock(&hn_vfmap_lock);
@@ -1192,18 +1695,20 @@ done:
HN_UNLOCK(sc);
}
-/* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */
-static const struct hyperv_guid g_net_vsc_device_type = {
- .hv_guid = {0x63, 0x51, 0x61, 0xF8, 0x3E, 0xDF, 0xc5, 0x46,
- 0x91, 0x3F, 0xF2, 0xD2, 0xF9, 0x65, 0xED, 0x0E}
-};
+static void
+hn_ifnet_lnkevent(void *xsc, struct ifnet *ifp, int link_state)
+{
+ struct hn_softc *sc = xsc;
+
+ if (sc->hn_vf_ifp == ifp)
+ if_link_state_change(sc->hn_ifp, link_state);
+}
static int
hn_probe(device_t dev)
{
- if (VMBUS_PROBE_GUID(device_get_parent(dev), dev,
- &g_net_vsc_device_type) == 0) {
+ if (VMBUS_PROBE_GUID(device_get_parent(dev), dev, &hn_guid) == 0) {
device_set_desc(dev, "Hyper-V Network Interface");
return BUS_PROBE_DEFAULT;
}
@@ -1223,6 +1728,9 @@ hn_attach(device_t dev)
sc->hn_dev = dev;
sc->hn_prichan = vmbus_get_channel(dev);
HN_LOCK_INIT(sc);
+ rm_init(&sc->hn_vf_lock, "hnvf");
+ if (hn_xpnt_vf && hn_xpnt_vf_accbpf)
+ sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
/*
* Initialize these tunables once.
@@ -1262,6 +1770,18 @@ hn_attach(device_t dev)
TIMEOUT_TASK_INIT(sc->hn_mgmt_taskq0, &sc->hn_netchg_status, 0,
hn_netchg_status_taskfunc, sc);
+ if (hn_xpnt_vf) {
+ /*
+ * Setup taskqueue for VF tasks, e.g. delayed VF bringing up.
+ */
+ sc->hn_vf_taskq = taskqueue_create("hn_vf", M_WAITOK,
+ taskqueue_thread_enqueue, &sc->hn_vf_taskq);
+ taskqueue_start_threads(&sc->hn_vf_taskq, 1, PI_NET, "%s vf",
+ device_get_nameunit(dev));
+ TIMEOUT_TASK_INIT(sc->hn_vf_taskq, &sc->hn_vf_init, 0,
+ hn_xpnt_vf_init_taskfunc, sc);
+ }
+
/*
* Allocate ifnet and setup its name earlier, so that if_printf
* can be used by functions, which will be called after
@@ -1384,6 +1904,14 @@ hn_attach(device_t dev)
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "hwassist",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
hn_hwassist_sysctl, "A", "hwassist");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_max",
+ CTLFLAG_RD, &ifp->if_hw_tsomax, 0, "max TSO size");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegcnt",
+ CTLFLAG_RD, &ifp->if_hw_tsomaxsegcount, 0,
+ "max # of TSO segments");
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "tso_maxsegsz",
+ CTLFLAG_RD, &ifp->if_hw_tsomaxsegsize, 0,
+ "max size of TSO segment");
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxfilter",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
hn_rxfilter_sysctl, "A", "rxfilter");
@@ -1423,9 +1951,20 @@ hn_attach(device_t dev)
SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf",
CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
hn_vf_sysctl, "A", "Virtual Function's name");
- SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
- CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
- hn_rxvf_sysctl, "A", "activated Virtual Function's name");
+ if (!hn_xpnt_vf) {
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "rxvf",
+ CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_rxvf_sysctl, "A", "activated Virtual Function's name");
+ } else {
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_enabled",
+ CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, sc, 0,
+ hn_xpnt_vf_enabled_sysctl, "I",
+ "Transparent VF enabled");
+ SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "vf_xpnt_accbpf",
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, sc, 0,
+ hn_xpnt_vf_accbpf_sysctl, "I",
+ "Accurate BPF for transparent VF");
+ }
/*
* Setup the ifmedia, which has been initialized earlier.
@@ -1463,7 +2002,7 @@ hn_attach(device_t dev)
ifp->if_qflush = hn_xmit_qflush;
}
- ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO;
+ ifp->if_capabilities |= IFCAP_RXCSUM | IFCAP_LRO | IFCAP_LINKSTATE;
#ifdef foo
/* We can't diff IPv6 packets from IPv4 packets on RX path. */
ifp->if_capabilities |= IFCAP_RXCSUM_IPV6;
@@ -1498,7 +2037,13 @@ hn_attach(device_t dev)
ifp->if_hwassist &= ~(HN_CSUM_IP6_MASK | CSUM_IP6_TSO);
if (ifp->if_capabilities & (IFCAP_TSO6 | IFCAP_TSO4)) {
+ /*
+ * Lock hn_set_tso_maxsize() to simplify its
+ * internal logic.
+ */
+ HN_LOCK(sc);
hn_set_tso_maxsize(sc, hn_tso_maxlen, ETHERMTU);
+ HN_UNLOCK(sc);
ifp->if_hw_tsomaxsegcount = HN_TX_DATA_SEGCNT_MAX;
ifp->if_hw_tsomaxsegsize = PAGE_SIZE;
}
@@ -1519,10 +2064,15 @@ hn_attach(device_t dev)
sc->hn_mgmt_taskq = sc->hn_mgmt_taskq0;
hn_update_link_status(sc);
- sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
- hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
- sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
- hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
+ if (!hn_xpnt_vf) {
+ sc->hn_ifnet_evthand = EVENTHANDLER_REGISTER(ifnet_event,
+ hn_ifnet_event, sc, EVENTHANDLER_PRI_ANY);
+ sc->hn_ifaddr_evthand = EVENTHANDLER_REGISTER(ifaddr_event,
+ hn_ifaddr_event, sc, EVENTHANDLER_PRI_ANY);
+ } else {
+ sc->hn_ifnet_lnkhand = EVENTHANDLER_REGISTER(ifnet_link_event,
+ hn_ifnet_lnkevent, sc, EVENTHANDLER_PRI_ANY);
+ }
/*
* NOTE:
@@ -1549,6 +2099,14 @@ hn_detach(device_t dev)
struct hn_softc *sc = device_get_softc(dev);
struct ifnet *ifp = sc->hn_ifp, *vf_ifp;
+ if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
+ /*
+ * In case that the vmbus missed the orphan handler
+ * installation.
+ */
+ vmbus_xact_ctx_orphan(sc->hn_xact);
+ }
+
if (sc->hn_ifaddr_evthand != NULL)
EVENTHANDLER_DEREGISTER(ifaddr_event, sc->hn_ifaddr_evthand);
if (sc->hn_ifnet_evthand != NULL)
@@ -1561,20 +2119,14 @@ hn_detach(device_t dev)
EVENTHANDLER_DEREGISTER(ifnet_departure_event,
sc->hn_ifnet_dethand);
}
+ if (sc->hn_ifnet_lnkhand != NULL)
+ EVENTHANDLER_DEREGISTER(ifnet_link_event, sc->hn_ifnet_lnkhand);
vf_ifp = sc->hn_vf_ifp;
__compiler_membar();
if (vf_ifp != NULL)
hn_ifnet_detevent(sc, vf_ifp);
- if (sc->hn_xact != NULL && vmbus_chan_is_revoked(sc->hn_prichan)) {
- /*
- * In case that the vmbus missed the orphan handler
- * installation.
- */
- vmbus_xact_ctx_orphan(sc->hn_xact);
- }
-
if (device_is_attached(dev)) {
HN_LOCK(sc);
if (sc->hn_flags & HN_FLAG_SYNTH_ATTACHED) {
@@ -1604,6 +2156,8 @@ hn_detach(device_t dev)
free(sc->hn_tx_taskqs, M_DEVBUF);
}
taskqueue_free(sc->hn_mgmt_taskq0);
+ if (sc->hn_vf_taskq != NULL)
+ taskqueue_free(sc->hn_vf_taskq);
if (sc->hn_xact != NULL) {
/*
@@ -1617,6 +2171,7 @@ hn_detach(device_t dev)
if_free(ifp);
HN_LOCK_DESTROY(sc);
+ rm_destroy(&sc->hn_vf_lock);
return (0);
}
@@ -2458,13 +3013,16 @@ static int
hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
const struct hn_rxinfo *info)
{
- struct ifnet *ifp;
+ struct ifnet *ifp, *hn_ifp = rxr->hn_ifp;
struct mbuf *m_new;
int size, do_lro = 0, do_csum = 1;
int hash_type = M_HASHTYPE_OPAQUE;
- /* If the VF is active, inject the packet through the VF */
- ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : rxr->hn_ifp;
+ /*
+ * If the non-transparent mode VF is active, inject this packet
+ * into the VF.
+ */
+ ifp = rxr->hn_rxvf_ifp ? rxr->hn_rxvf_ifp : hn_ifp;
if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
/*
@@ -2478,10 +3036,15 @@ hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
return (0);
}
+ if (__predict_false(dlen < ETHER_HDR_LEN)) {
+ if_inc_counter(hn_ifp, IFCOUNTER_IERRORS, 1);
+ return (0);
+ }
+
if (dlen <= MHLEN) {
m_new = m_gethdr(M_NOWAIT, MT_DATA);
if (m_new == NULL) {
- if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
+ if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
return (0);
}
memcpy(mtod(m_new, void *), data, dlen);
@@ -2502,7 +3065,7 @@ hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size);
if (m_new == NULL) {
- if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
+ if_inc_counter(hn_ifp, IFCOUNTER_IQDROPS, 1);
return (0);
}
@@ -2510,7 +3073,7 @@ hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
}
m_new->m_pkthdr.rcvif = ifp;
- if (__predict_false((ifp->if_capenable & IFCAP_RXCSUM) == 0))
+ if (__predict_false((hn_ifp->if_capenable & IFCAP_RXCSUM) == 0))
do_csum = 0;
/* receive side checksum offload */
@@ -2551,8 +3114,9 @@ hn_rxpkt(struct hn_rx_ring *rxr, const void *data, int dlen,
int hoff;
hoff = sizeof(*eh);
- if (m_new->m_len < hoff)
- goto skip;
+ /* Checked at the beginning of this function. */
+ KASSERT(m_new->m_len >= hoff, ("not ethernet frame"));
+
eh = mtod(m_new, struct ether_header *);
etype = ntohs(eh->ether_type);
if (etype == ETHERTYPE_VLAN) {
@@ -2607,6 +3171,37 @@ skip:
m_new->m_flags |= M_VLANTAG;
}
+ /*
+ * If VF is activated (transparent/non-transparent mode does not
+ * matter here).
+ *
+ * - Don't setup mbuf hash, if 'options RSS' is set.
+ *
+ * In Azure, when VF is activated, TCP SYN and SYN|ACK go
+ * through hn(4) while the rest of segments and ACKs belonging
+ * to the same TCP 4-tuple go through the VF. So don't setup
+ * mbuf hash, if a VF is activated and 'options RSS' is not
+ * enabled. hn(4) and the VF may use neither the same RSS
+ * hash key nor the same RSS hash function, so the hash value
+ * for packets belonging to the same flow could be different!
+ *
+ * - Disable LRO
+ *
+ * hn(4) will only receive broadcast packets, multicast packets,
+ * TCP SYN and SYN|ACK (in Azure), LRO is useless for these
+ * packet types.
+ *
+ * For non-transparent, we definitely _cannot_ enable LRO at
+ * all, since the LRO flush will use hn(4) as the receiving
+ * interface; i.e. hn_ifp->if_input(hn_ifp, m).
+ */
+ if (hn_ifp != ifp || (rxr->hn_rx_flags & HN_RX_FLAG_XPNT_VF)) {
+ do_lro = 0; /* disable LRO. */
+#ifndef RSS
+ goto skip_hash; /* skip mbuf hash setup */
+#endif
+ }
+
if (info->hash_info != HN_NDIS_HASH_INFO_INVALID) {
rxr->hn_rss_pkts++;
m_new->m_pkthdr.flowid = info->hash_value;
@@ -2654,15 +3249,36 @@ skip:
}
M_HASHTYPE_SET(m_new, hash_type);
- /*
- * Note: Moved RX completion back to hv_nv_on_receive() so all
- * messages (not just data messages) will trigger a response.
- */
+#ifndef RSS
+skip_hash:
+#endif
+ if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
+ if (hn_ifp != ifp) {
+ const struct ether_header *eh;
- ifp->if_ipackets++;
+ /*
+ * Non-transparent mode VF is activated.
+ */
+
+ /*
+ * Allow tapping on hn(4).
+ */
+ ETHER_BPF_MTAP(hn_ifp, m_new);
+
+ /*
+ * Update hn(4)'s stats.
+ */
+ if_inc_counter(hn_ifp, IFCOUNTER_IPACKETS, 1);
+ if_inc_counter(hn_ifp, IFCOUNTER_IBYTES, m_new->m_pkthdr.len);
+ /* Checked at the beginning of this function. */
+ KASSERT(m_new->m_len >= ETHER_HDR_LEN, ("not ethernet frame"));
+ eh = mtod(m_new, struct ether_header *);
+ if (ETHER_IS_MULTICAST(eh->ether_dhost))
+ if_inc_counter(hn_ifp, IFCOUNTER_IMCASTS, 1);
+ }
rxr->hn_pkts++;
- if ((ifp->if_capenable & IFCAP_LRO) && do_lro) {
+ if ((hn_ifp->if_capenable & IFCAP_LRO) && do_lro) {
#if defined(INET) || defined(INET6)
struct lro_ctrl *lro = &rxr->hn_lro;
@@ -2675,9 +3291,7 @@ skip:
}
#endif
}
-
- /* We're not holding the lock here, so don't release it */
- (*ifp->if_input)(ifp, m_new);
+ ifp->if_input(ifp, m_new);
return (0);
}
@@ -2686,7 +3300,8 @@ static int
hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
{
struct hn_softc *sc = ifp->if_softc;
- struct ifreq *ifr = (struct ifreq *)data;
+ struct ifreq *ifr = (struct ifreq *)data, ifr_vf;
+ struct ifnet *vf_ifp;
int mask, error = 0;
switch (cmd) {
@@ -2715,6 +3330,21 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
break;
}
+ if (hn_xpnt_vf_isready(sc)) {
+ vf_ifp = sc->hn_vf_ifp;
+ ifr_vf = *ifr;
+ strlcpy(ifr_vf.ifr_name, vf_ifp->if_xname,
+ sizeof(ifr_vf.ifr_name));
+ error = vf_ifp->if_ioctl(vf_ifp, SIOCSIFMTU,
+ (caddr_t)&ifr_vf);
+ if (error) {
+ HN_UNLOCK(sc);
+ if_printf(ifp, "%s SIOCSIFMTU %d failed: %d\n",
+ vf_ifp->if_xname, ifr->ifr_mtu, error);
+ break;
+ }
+ }
+
/*
* Suspend this interface before the synthetic parts
* are ripped.
@@ -2743,23 +3373,33 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
ifp->if_mtu = ifr->ifr_mtu;
/*
- * Make sure that various parameters based on MTU are
- * still valid, after the MTU change.
+ * Synthetic parts' reattach may change the chimney
+ * sending size; update it.
*/
if (sc->hn_tx_ring[0].hn_chim_size > sc->hn_chim_szmax)
hn_set_chim_size(sc, sc->hn_chim_szmax);
- hn_set_tso_maxsize(sc, hn_tso_maxlen, ifp->if_mtu);
-#if __FreeBSD_version >= 1100099
- if (sc->hn_rx_ring[0].hn_lro.lro_length_lim <
- HN_LRO_LENLIM_MIN(ifp))
- hn_set_lro_lenlim(sc, HN_LRO_LENLIM_MIN(ifp));
-#endif
+
+ /*
+ * Make sure that various parameters based on MTU are
+ * still valid, after the MTU change.
+ */
+ hn_mtu_change_fixup(sc);
/*
* All done! Resume the interface now.
*/
hn_resume(sc);
+ if ((sc->hn_flags & HN_FLAG_RXVF) ||
+ (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+ /*
+ * Since we have reattached the NVS part,
+ * change the datapath to VF again; in case
+ * that it is lost, after the NVS was detached.
+ */
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_VF);
+ }
+
HN_UNLOCK(sc);
break;
@@ -2771,6 +3411,9 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
break;
}
+ if (hn_xpnt_vf_isready(sc))
+ hn_xpnt_vf_saveifflags(sc);
+
if (ifp->if_flags & IFF_UP) {
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
/*
@@ -2781,6 +3424,9 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
HN_NO_SLEEPING(sc);
hn_rxfilter_config(sc);
HN_SLEEPING_OK(sc);
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ error = hn_xpnt_vf_iocsetflags(sc);
} else {
hn_init_locked(sc);
}
@@ -2795,7 +3441,22 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
case SIOCSIFCAP:
HN_LOCK(sc);
- mask = ifr->ifr_reqcap ^ ifp->if_capenable;
+
+ if (hn_xpnt_vf_isready(sc)) {
+ ifr_vf = *ifr;
+ strlcpy(ifr_vf.ifr_name, sc->hn_vf_ifp->if_xname,
+ sizeof(ifr_vf.ifr_name));
+ error = hn_xpnt_vf_iocsetcaps(sc, &ifr_vf);
+ HN_UNLOCK(sc);
+ break;
+ }
+
+ /*
+ * Fix up requested capabilities w/ supported capabilities,
+ * since the supported capabilities could have been changed.
+ */
+ mask = (ifr->ifr_reqcap & ifp->if_capabilities) ^
+ ifp->if_capenable;
if (mask & IFCAP_TXCSUM) {
ifp->if_capenable ^= IFCAP_TXCSUM;
@@ -2860,11 +3521,42 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
HN_SLEEPING_OK(sc);
}
+ /* XXX vlan(4) style mcast addr maintenance */
+ if (hn_xpnt_vf_isready(sc)) {
+ int old_if_flags;
+
+ old_if_flags = sc->hn_vf_ifp->if_flags;
+ hn_xpnt_vf_saveifflags(sc);
+
+ if ((sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) &&
+ ((old_if_flags ^ sc->hn_vf_ifp->if_flags) &
+ IFF_ALLMULTI))
+ error = hn_xpnt_vf_iocsetflags(sc);
+ }
+
HN_UNLOCK(sc);
break;
case SIOCSIFMEDIA:
case SIOCGIFMEDIA:
+ HN_LOCK(sc);
+ if (hn_xpnt_vf_isready(sc)) {
+ /*
+ * SIOCGIFMEDIA expects ifmediareq, so don't
+ * create and pass ifr_vf to the VF here; just
+ * replace the ifr_name.
+ */
+ vf_ifp = sc->hn_vf_ifp;
+ strlcpy(ifr->ifr_name, vf_ifp->if_xname,
+ sizeof(ifr->ifr_name));
+ error = vf_ifp->if_ioctl(vf_ifp, cmd, data);
+ /* Restore the ifr_name. */
+ strlcpy(ifr->ifr_name, ifp->if_xname,
+ sizeof(ifr->ifr_name));
+ HN_UNLOCK(sc);
+ break;
+ }
+ HN_UNLOCK(sc);
error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd);
break;
@@ -2886,11 +3578,35 @@ hn_stop(struct hn_softc *sc, bool detaching)
KASSERT(sc->hn_flags & HN_FLAG_SYNTH_ATTACHED,
("synthetic parts were not attached"));
+ /* Clear RUNNING bit ASAP. */
+ atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
+
/* Disable polling. */
hn_polling(sc, 0);
- /* Clear RUNNING bit _before_ hn_suspend_data() */
- atomic_clear_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
+ KASSERT(sc->hn_vf_ifp != NULL,
+ ("%s: VF is not attached", ifp->if_xname));
+
+ /* Mark transparent mode VF as disabled. */
+ hn_xpnt_vf_setdisable(sc, false /* keep hn_vf_ifp */);
+
+ /*
+ * NOTE:
+ * Datapath setting must happen _before_ bringing
+ * the VF down.
+ */
+ hn_nvs_set_datapath(sc, HN_NVS_DATAPATH_SYNTH);
+
+ /*
+ * Bring the VF down.
+ */
+ hn_xpnt_vf_saveifflags(sc);
+ sc->hn_vf_ifp->if_flags &= ~IFF_UP;
+ hn_xpnt_vf_iocsetflags(sc);
+ }
+
+ /* Suspend data transfers. */
hn_suspend_data(sc);
/* Clear OACTIVE bit. */
@@ -2899,8 +3615,8 @@ hn_stop(struct hn_softc *sc, bool detaching)
sc->hn_tx_ring[i].hn_oactive = 0;
/*
- * If the VF is active, make sure the filter is not 0, even if
- * the synthetic NIC is down.
+ * If the non-transparent mode VF is active, make sure
+ * that the RX filter still allows packet reception.
*/
if (!detaching && (sc->hn_flags & HN_FLAG_RXVF))
hn_rxfilter_config(sc);
@@ -2931,6 +3647,11 @@ hn_init_locked(struct hn_softc *sc)
/* Clear TX 'suspended' bit. */
hn_resume_tx(sc, sc->hn_tx_ring_inuse);
+ if (hn_xpnt_vf_isready(sc)) {
+ /* Initialize transparent VF. */
+ hn_xpnt_vf_init(sc);
+ }
+
/* Everything is ready; unleash! */
atomic_set_int(&ifp->if_drv_flags, IFF_DRV_RUNNING);
@@ -3550,6 +4271,42 @@ hn_vfmap_sysctl(SYSCTL_HANDLER_ARGS)
}
static int
+hn_xpnt_vf_accbpf_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int error, onoff = 0;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF)
+ onoff = 1;
+ error = sysctl_handle_int(oidp, &onoff, 0, req);
+ if (error || req->newptr == NULL)
+ return (error);
+
+ HN_LOCK(sc);
+ /* NOTE: hn_vf_lock for hn_transmit() */
+ rm_wlock(&sc->hn_vf_lock);
+ if (onoff)
+ sc->hn_xvf_flags |= HN_XVFFLAG_ACCBPF;
+ else
+ sc->hn_xvf_flags &= ~HN_XVFFLAG_ACCBPF;
+ rm_wunlock(&sc->hn_vf_lock);
+ HN_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+hn_xpnt_vf_enabled_sysctl(SYSCTL_HANDLER_ARGS)
+{
+ struct hn_softc *sc = arg1;
+ int enabled = 0;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ enabled = 1;
+ return (sysctl_handle_int(oidp, &enabled, 0, req));
+}
+
+static int
hn_check_iplen(const struct mbuf *m, int hoff)
{
const struct ip *ip;
@@ -4265,8 +5022,11 @@ static void
hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
{
struct ifnet *ifp = sc->hn_ifp;
+ u_int hw_tsomax;
int tso_minlen;
+ HN_LOCK_ASSERT(sc);
+
if ((ifp->if_capabilities & (IFCAP_TSO4 | IFCAP_TSO6)) == 0)
return;
@@ -4284,7 +5044,13 @@ hn_set_tso_maxsize(struct hn_softc *sc, int tso_maxlen, int mtu)
tso_maxlen = IP_MAXPACKET;
if (tso_maxlen > sc->hn_ndis_tso_szmax)
tso_maxlen = sc->hn_ndis_tso_szmax;
- ifp->if_hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+ hw_tsomax = tso_maxlen - (ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN);
+
+ if (hn_xpnt_vf_isready(sc)) {
+ if (hw_tsomax > sc->hn_vf_ifp->if_hw_tsomax)
+ hw_tsomax = sc->hn_vf_ifp->if_hw_tsomax;
+ }
+ ifp->if_hw_tsomax = hw_tsomax;
if (bootverbose)
if_printf(ifp, "TSO size max %u\n", ifp->if_hw_tsomax);
}
@@ -4637,6 +5403,59 @@ hn_transmit(struct ifnet *ifp, struct mbuf *m)
struct hn_tx_ring *txr;
int error, idx = 0;
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED) {
+ struct rm_priotracker pt;
+
+ rm_rlock(&sc->hn_vf_lock, &pt);
+ if (__predict_true(sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)) {
+ struct mbuf *m_bpf = NULL;
+ int obytes, omcast;
+
+ obytes = m->m_pkthdr.len;
+ if (m->m_flags & M_MCAST)
+ omcast = 1;
+
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ACCBPF) {
+ if (bpf_peers_present(ifp->if_bpf)) {
+ m_bpf = m_copypacket(m, M_NOWAIT);
+ if (m_bpf == NULL) {
+ /*
+ * Failed to grab a shallow
+ * copy; tap now.
+ */
+ ETHER_BPF_MTAP(ifp, m);
+ }
+ }
+ } else {
+ ETHER_BPF_MTAP(ifp, m);
+ }
+
+ error = sc->hn_vf_ifp->if_transmit(sc->hn_vf_ifp, m);
+ rm_runlock(&sc->hn_vf_lock, &pt);
+
+ if (m_bpf != NULL) {
+ if (!error)
+ ETHER_BPF_MTAP(ifp, m_bpf);
+ m_freem(m_bpf);
+ }
+
+ if (error == ENOBUFS) {
+ if_inc_counter(ifp, IFCOUNTER_OQDROPS, 1);
+ } else if (error) {
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ } else {
+ if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+ if_inc_counter(ifp, IFCOUNTER_OBYTES, obytes);
+ if (omcast) {
+ if_inc_counter(ifp, IFCOUNTER_OMCASTS,
+ omcast);
+ }
+ }
+ return (error);
+ }
+ rm_runlock(&sc->hn_vf_lock, &pt);
+ }
+
#if defined(INET6) || defined(INET)
/*
* Perform TSO packet header fixup now, since the TSO
@@ -4718,11 +5537,17 @@ static void
hn_xmit_qflush(struct ifnet *ifp)
{
struct hn_softc *sc = ifp->if_softc;
+ struct rm_priotracker pt;
int i;
for (i = 0; i < sc->hn_tx_ring_inuse; ++i)
hn_tx_ring_qflush(&sc->hn_tx_ring[i]);
if_qflush(ifp);
+
+ rm_rlock(&sc->hn_vf_lock, &pt);
+ if (sc->hn_xvf_flags & HN_XVFFLAG_ENABLED)
+ sc->hn_vf_ifp->if_qflush(sc->hn_vf_ifp);
+ rm_runlock(&sc->hn_vf_lock, &pt);
}
static void
@@ -5409,6 +6234,11 @@ hn_suspend(struct hn_softc *sc)
/* Disable polling. */
hn_polling(sc, 0);
+ /*
+ * If the non-transparent mode VF is activated, the synthetic
+ * device is receiving packets, so the data path of the
+ * synthetic device must be suspended.
+ */
if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
(sc->hn_flags & HN_FLAG_RXVF))
hn_suspend_data(sc);
@@ -5499,17 +6329,24 @@ static void
hn_resume(struct hn_softc *sc)
{
+ /*
+ * If the non-transparent mode VF is activated, the synthetic
+	 * device has to receive packets, so the data path of the
+ * synthetic device must be resumed.
+ */
if ((sc->hn_ifp->if_drv_flags & IFF_DRV_RUNNING) ||
(sc->hn_flags & HN_FLAG_RXVF))
hn_resume_data(sc);
/*
- * When the VF is activated, the synthetic interface is changed
- * to DOWN in hn_rxvf_change(). Here, if the VF is still active,
- * we don't call hn_resume_mgmt() until the VF is deactivated in
- * hn_rxvf_change().
+ * Don't resume link status change if VF is attached/activated.
+ * - In the non-transparent VF mode, the synthetic device marks
+ * link down until the VF is deactivated; i.e. VF is down.
+ * - In transparent VF mode, VF's media status is used until
+ * the VF is detached.
*/
- if (!(sc->hn_flags & HN_FLAG_RXVF))
+ if ((sc->hn_flags & HN_FLAG_RXVF) == 0 &&
+ !(hn_xpnt_vf && sc->hn_vf_ifp != NULL))
hn_resume_mgmt(sc);
/*
@@ -6039,6 +6876,24 @@ hn_sysinit(void *arg __unused)
{
int i;
+#ifdef HN_IFSTART_SUPPORT
+ /*
+ * Don't use ifnet.if_start if transparent VF mode is requested;
+ * mainly due to the IFF_DRV_OACTIVE flag.
+ */
+ if (hn_xpnt_vf && hn_use_if_start) {
+ hn_use_if_start = 0;
+		printf("hn: transparent VF mode, if_transmit will be used, "
+ "instead of if_start\n");
+ }
+#endif
+ if (hn_xpnt_vf_attwait < HN_XPNT_VF_ATTWAIT_MIN) {
+ printf("hn: invalid transparent VF attach routing "
+ "wait timeout %d, reset to %d\n",
+ hn_xpnt_vf_attwait, HN_XPNT_VF_ATTWAIT_MIN);
+ hn_xpnt_vf_attwait = HN_XPNT_VF_ATTWAIT_MIN;
+ }
+
/*
* Initialize VF map.
*/
diff --git a/sys/dev/hyperv/netvsc/if_hnreg.h b/sys/dev/hyperv/netvsc/if_hnreg.h
index a964b4f..6100306 100644
--- a/sys/dev/hyperv/netvsc/if_hnreg.h
+++ b/sys/dev/hyperv/netvsc/if_hnreg.h
@@ -133,7 +133,7 @@ struct hn_nvs_ndis_init {
} __packed;
CTASSERT(sizeof(struct hn_nvs_ndis_init) >= HN_NVS_REQSIZE_MIN);
-#define HN_NVS_DATAPATH_SYNTHETIC 0
+#define HN_NVS_DATAPATH_SYNTH 0
#define HN_NVS_DATAPATH_VF 1
/* No response */
diff --git a/sys/dev/hyperv/netvsc/if_hnvar.h b/sys/dev/hyperv/netvsc/if_hnvar.h
index 76cd397..06785be 100644
--- a/sys/dev/hyperv/netvsc/if_hnvar.h
+++ b/sys/dev/hyperv/netvsc/if_hnvar.h
@@ -63,6 +63,7 @@ struct hn_rx_ring {
struct hn_tx_ring *hn_txr;
void *hn_pktbuf;
int hn_pktbuf_len;
+ int hn_rx_flags; /* HN_RX_FLAG_ */
uint8_t *hn_rxbuf; /* shadow sc->hn_rxbuf */
int hn_rx_idx;
@@ -82,7 +83,6 @@ struct hn_rx_ring {
/* Rarely used stuffs */
struct sysctl_oid *hn_rx_sysctl_tree;
- int hn_rx_flags;
void *hn_br; /* TX/RX bufring */
struct hyperv_dma hn_br_dma;
@@ -96,6 +96,7 @@ struct hn_rx_ring {
#define HN_RX_FLAG_ATTACHED 0x0001
#define HN_RX_FLAG_BR_REF 0x0002
+#define HN_RX_FLAG_XPNT_VF 0x0004
struct hn_tx_ring {
#ifndef HN_USE_TXDESC_BUFRING
@@ -175,7 +176,6 @@ struct hn_tx_ring {
struct hn_softc {
struct ifnet *hn_ifp;
struct arpcom arpcom;
- struct ifnet *hn_vf_ifp; /* SR-IOV VF */
struct ifmedia hn_media;
device_t hn_dev;
int hn_if_flags;
@@ -186,6 +186,10 @@ struct hn_softc {
int hn_rx_ring_inuse;
struct hn_rx_ring *hn_rx_ring;
+ struct rmlock hn_vf_lock;
+ struct ifnet *hn_vf_ifp; /* SR-IOV VF */
+ uint32_t hn_xvf_flags; /* transparent VF flags */
+
int hn_tx_ring_cnt;
int hn_tx_ring_inuse;
struct hn_tx_ring *hn_tx_ring;
@@ -242,6 +246,24 @@ struct hn_softc {
eventhandler_tag hn_ifnet_evthand;
eventhandler_tag hn_ifnet_atthand;
eventhandler_tag hn_ifnet_dethand;
+ eventhandler_tag hn_ifnet_lnkhand;
+
+ /*
+ * Transparent VF delayed initialization.
+ */
+ int hn_vf_rdytick; /* ticks, 0 == ready */
+ struct taskqueue *hn_vf_taskq;
+ struct timeout_task hn_vf_init;
+
+ /*
+ * Saved information for VF under transparent mode.
+ */
+ void (*hn_vf_input)
+ (struct ifnet *, struct mbuf *);
+ int hn_saved_caps;
+ u_int hn_saved_tsomax;
+ u_int hn_saved_tsosegcnt;
+ u_int hn_saved_tsosegsz;
};
#define HN_FLAG_RXBUF_CONNECTED 0x0001
@@ -256,6 +278,9 @@ struct hn_softc {
#define HN_FLAG_ERRORS (HN_FLAG_RXBUF_REF | HN_FLAG_CHIM_REF)
+#define HN_XVFFLAG_ENABLED 0x0001
+#define HN_XVFFLAG_ACCBPF 0x0002
+
#define HN_NO_SLEEPING(sc) \
do { \
(sc)->hn_flags |= HN_FLAG_NO_SLEEPING; \
OpenPOWER on IntegriCloud