diff options
Diffstat (limited to 'sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c')
-rw-r--r-- | sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c | 390 |
1 files changed, 323 insertions, 67 deletions
diff --git a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c index 33718a9..f8ebd38 100644 --- a/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c +++ b/sys/dev/hyperv/netvsc/hv_netvsc_drv_freebsd.c @@ -69,6 +69,7 @@ __FBSDID("$FreeBSD$"); #include <sys/queue.h> #include <sys/lock.h> #include <sys/sx.h> +#include <sys/sysctl.h> #include <net/if.h> #include <net/if_arp.h> @@ -138,13 +139,14 @@ __FBSDID("$FreeBSD$"); CSUM_IP_ISCSI|CSUM_IP6_UDP|CSUM_IP6_TCP|CSUM_IP6_SCTP| \ CSUM_IP6_TSO|CSUM_IP6_ISCSI) -/* - * Data types - */ - -struct hv_netvsc_driver_context { - uint32_t drv_inited; -}; +/* XXX move to netinet/tcp_lro.h */ +#define HN_LRO_HIWAT_MAX 65535 +#define HN_LRO_HIWAT_DEF HN_LRO_HIWAT_MAX +/* YYY 2*MTU is a bit rough, but should be good enough. */ +#define HN_LRO_HIWAT_MTULIM(ifp) (2 * (ifp)->if_mtu) +#define HN_LRO_HIWAT_ISVALID(sc, hiwat) \ + ((hiwat) >= HN_LRO_HIWAT_MTULIM((sc)->hn_ifp) && \ + (hiwat) <= HN_LRO_HIWAT_MAX) /* * Be aware that this sleepable mutex will exhibit WITNESS errors when @@ -168,9 +170,9 @@ struct hv_netvsc_driver_context { int hv_promisc_mode = 0; /* normal mode by default */ -/* The one and only one */ -static struct hv_netvsc_driver_context g_netvsc_drv; - +/* Trust tcp segments verification on host side. 
*/ +static int hn_trust_hosttcp = 0; +TUNABLE_INT("dev.hn.trust_hosttcp", &hn_trust_hosttcp); /* * Forward declarations @@ -181,6 +183,21 @@ static void hn_ifinit(void *xsc); static int hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); static int hn_start_locked(struct ifnet *ifp); static void hn_start(struct ifnet *ifp); +static int hn_ifmedia_upd(struct ifnet *ifp); +static void hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr); +#ifdef HN_LRO_HIWAT +static int hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS); +#endif +static int hn_check_iplen(const struct mbuf *, int); + +static __inline void +hn_set_lro_hiwat(struct hn_softc *sc, int hiwat) +{ + sc->hn_lro_hiwat = hiwat; +#ifdef HN_LRO_HIWAT + sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; +#endif +} /* * NetVsc get message transport protocol type @@ -238,35 +255,27 @@ static uint32_t get_transport_proto_type(struct mbuf *m_head) return (ret_val); } -/* - * NetVsc driver initialization - * Note: Filter init is no longer required - */ static int -netvsc_drv_init(void) +hn_ifmedia_upd(struct ifnet *ifp __unused) { - return (0); + + return EOPNOTSUPP; } -/* - * NetVsc global initialization entry point - */ static void -netvsc_init(void) +hn_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) { - if (bootverbose) - printf("Netvsc initializing... 
"); + struct hn_softc *sc = ifp->if_softc; - /* - * XXXKYS: cleanup initialization - */ - if (!cold && !g_netvsc_drv.drv_inited) { - g_netvsc_drv.drv_inited = 1; - netvsc_drv_init(); - if (bootverbose) - printf("done!\n"); - } else if (bootverbose) - printf("Already initialized!\n"); + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + if (!sc->hn_carrier) { + ifmr->ifm_active |= IFM_NONE; + return; + } + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= IFM_10G_T | IFM_FDX; } /* {F8615163-DF3E-46c5-913F-F2D2F965ED0E} */ @@ -310,10 +319,10 @@ netvsc_attach(device_t dev) hn_softc_t *sc; int unit = device_get_unit(dev); struct ifnet *ifp; + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; int ret; - netvsc_init(); - sc = device_get_softc(dev); if (sc == NULL) { return (ENOMEM); @@ -322,6 +331,8 @@ netvsc_attach(device_t dev) bzero(sc, sizeof(hn_softc_t)); sc->hn_unit = unit; sc->hn_dev = dev; + sc->hn_lro_hiwat = HN_LRO_HIWAT_DEF; + sc->hn_trust_hosttcp = hn_trust_hosttcp; NV_LOCK_INIT(sc, "NetVSCLock"); @@ -344,14 +355,22 @@ netvsc_attach(device_t dev) ifp->if_snd.ifq_drv_maxlen = 511; IFQ_SET_READY(&ifp->if_snd); + ifmedia_init(&sc->hn_media, 0, hn_ifmedia_upd, hn_ifmedia_sts); + ifmedia_add(&sc->hn_media, IFM_ETHER | IFM_AUTO, 0, NULL); + ifmedia_set(&sc->hn_media, IFM_ETHER | IFM_AUTO); + /* XXX ifmedia_set really should do this for us */ + sc->hn_media.ifm_media = sc->hn_media.ifm_cur->ifm_media; + /* * Tell upper layers that we support full VLAN capability. 
*/ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); ifp->if_capabilities |= - IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO; + IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | + IFCAP_LRO; ifp->if_capenable |= - IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO; + IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_MTU | IFCAP_HWCSUM | IFCAP_TSO | + IFCAP_LRO; /* * Only enable UDP checksum offloading when it is on 2012R2 or * later. UDP checksum offloading doesn't work on earlier @@ -372,8 +391,63 @@ netvsc_attach(device_t dev) sc->hn_carrier = 1; } +#if defined(INET) || defined(INET6) + tcp_lro_init(&sc->hn_lro); + /* Driver private LRO settings */ + sc->hn_lro.ifp = ifp; +#ifdef HN_LRO_HIWAT + sc->hn_lro.lro_hiwat = sc->hn_lro_hiwat; +#endif +#endif /* INET || INET6 */ + ether_ifattach(ifp, device_info.mac_addr); + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_queued", + CTLFLAG_RW, &sc->hn_lro.lro_queued, 0, "LRO queued"); + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "lro_flushed", + CTLFLAG_RW, &sc->hn_lro.lro_flushed, 0, "LRO flushed"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "lro_tried", + CTLFLAG_RW, &sc->hn_lro_tried, "# of LRO tries"); +#ifdef HN_LRO_HIWAT + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "lro_hiwat", + CTLTYPE_INT | CTLFLAG_RW, sc, 0, hn_lro_hiwat_sysctl, + "I", "LRO high watermark"); +#endif + SYSCTL_ADD_INT(ctx, child, OID_AUTO, "trust_hosttcp", + CTLFLAG_RW, &sc->hn_trust_hosttcp, 0, + "Trust tcp segement verification on host side, " + "when csum info is missing"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_ip", + CTLFLAG_RW, &sc->hn_csum_ip, "RXCSUM IP"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_tcp", + CTLFLAG_RW, &sc->hn_csum_tcp, "RXCSUM TCP"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "csum_trusted", + CTLFLAG_RW, &sc->hn_csum_trusted, + "# of TCP segements that we trust host's csum verification"); + 
SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "small_pkts", + CTLFLAG_RW, &sc->hn_small_pkts, "# of small packets received"); + + if (unit == 0) { + struct sysctl_ctx_list *dc_ctx; + struct sysctl_oid_list *dc_child; + devclass_t dc; + + /* + * Add sysctl nodes for devclass + */ + dc = device_get_devclass(dev); + dc_ctx = devclass_get_sysctl_ctx(dc); + dc_child = SYSCTL_CHILDREN(devclass_get_sysctl_tree(dc)); + + SYSCTL_ADD_INT(dc_ctx, dc_child, OID_AUTO, "trust_hosttcp", + CTLFLAG_RD, &hn_trust_hosttcp, 0, + "Trust tcp segement verification on host side, " + "when csum info is missing (global setting)"); + } + return (0); } @@ -383,6 +457,7 @@ netvsc_attach(device_t dev) static int netvsc_detach(device_t dev) { + struct hn_softc *sc = device_get_softc(dev); struct hv_device *hv_device = vmbus_get_devctx(dev); if (bootverbose) @@ -401,6 +476,11 @@ netvsc_detach(device_t dev) hv_rf_on_device_remove(hv_device, HV_RF_NV_DESTROY_CHANNEL); + ifmedia_removeall(&sc->hn_media); +#if defined(INET) || defined(INET6) + tcp_lro_free(&sc->hn_lro); +#endif + return (0); } @@ -887,7 +967,7 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, struct mbuf *m_new; struct ifnet *ifp; device_t dev = device_ctx->device; - int size; + int size, do_lro = 0; if (sc == NULL) { return (0); /* TODO: KYS how can this be! */ @@ -906,40 +986,44 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, */ if (packet->tot_data_buf_len > (ifp->if_mtu + ETHER_HDR_LEN)) { return (0); - } - - /* - * Get an mbuf with a cluster. For packets 2K or less, - * get a standard 2K cluster. For anything larger, get a - * 4K cluster. Any buffers larger than 4K can cause problems - * if looped around to the Hyper-V TX channel, so avoid them. 
- */ - size = MCLBYTES; - - if (packet->tot_data_buf_len > MCLBYTES) { - /* 4096 */ - size = MJUMPAGESIZE; - } + } else if (packet->tot_data_buf_len <= MHLEN) { + m_new = m_gethdr(M_NOWAIT, MT_DATA); + if (m_new == NULL) + return (0); + memcpy(mtod(m_new, void *), packet->data, + packet->tot_data_buf_len); + m_new->m_pkthdr.len = m_new->m_len = packet->tot_data_buf_len; + sc->hn_small_pkts++; + } else { + /* + * Get an mbuf with a cluster. For packets 2K or less, + * get a standard 2K cluster. For anything larger, get a + * 4K cluster. Any buffers larger than 4K can cause problems + * if looped around to the Hyper-V TX channel, so avoid them. + */ + size = MCLBYTES; + if (packet->tot_data_buf_len > MCLBYTES) { + /* 4096 */ + size = MJUMPAGESIZE; + } - m_new = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, size); + m_new = m_getjcl(M_NOWAIT, MT_DATA, M_PKTHDR, size); + if (m_new == NULL) { + device_printf(dev, "alloc mbuf failed.\n"); + return (0); + } - if (m_new == NULL) { - device_printf(dev, "alloc mbuf failed.\n"); - return (0); + hv_m_append(m_new, packet->tot_data_buf_len, packet->data); } - - hv_m_append(m_new, packet->tot_data_buf_len, - packet->data); - m_new->m_pkthdr.rcvif = ifp; /* receive side checksum offload */ - m_new->m_pkthdr.csum_flags = 0; if (NULL != csum_info) { /* IP csum offload */ if (csum_info->receive.ip_csum_succeeded) { m_new->m_pkthdr.csum_flags |= (CSUM_IP_CHECKED | CSUM_IP_VALID); + sc->hn_csum_ip++; } /* TCP csum offload */ @@ -947,9 +1031,50 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, m_new->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PSEUDO_HDR); m_new->m_pkthdr.csum_data = 0xffff; + sc->hn_csum_tcp++; + } + + if (csum_info->receive.ip_csum_succeeded && + csum_info->receive.tcp_csum_succeeded) + do_lro = 1; + } else { + const struct ether_header *eh; + uint16_t etype; + int hoff; + + hoff = sizeof(*eh); + if (m_new->m_len < hoff) + goto skip; + eh = mtod(m_new, struct ether_header *); + etype = 
ntohs(eh->ether_type); + if (etype == ETHERTYPE_VLAN) { + const struct ether_vlan_header *evl; + + hoff = sizeof(*evl); + if (m_new->m_len < hoff) + goto skip; + evl = mtod(m_new, struct ether_vlan_header *); + etype = ntohs(evl->evl_proto); } - } + if (etype == ETHERTYPE_IP) { + int pr; + + pr = hn_check_iplen(m_new, hoff); + if (pr == IPPROTO_TCP) { + if (sc->hn_trust_hosttcp) { + sc->hn_csum_trusted++; + m_new->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | + CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m_new->m_pkthdr.csum_data = 0xffff; + } + /* Rely on SW csum verification though... */ + do_lro = 1; + } + } + } +skip: if ((packet->vlan_tci != 0) && (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) != 0) { m_new->m_pkthdr.ether_vtag = packet->vlan_tci; @@ -963,12 +1088,41 @@ netvsc_recv(struct hv_device *device_ctx, netvsc_packet *packet, ifp->if_ipackets++; + if ((ifp->if_capenable & IFCAP_LRO) && do_lro) { +#if defined(INET) || defined(INET6) + struct lro_ctrl *lro = &sc->hn_lro; + + if (lro->lro_cnt) { + sc->hn_lro_tried++; + if (tcp_lro_rx(lro, m_new, 0) == 0) { + /* DONE! */ + return 0; + } + } +#endif + } + /* We're not holding the lock here, so don't release it */ (*ifp->if_input)(ifp, m_new); return (0); } +void +netvsc_recv_rollup(struct hv_device *device_ctx) +{ +#if defined(INET) || defined(INET6) + hn_softc_t *sc = device_get_softc(device_ctx->device); + struct lro_ctrl *lro = &sc->hn_lro; + struct lro_entry *queued; + + while ((queued = SLIST_FIRST(&lro->lro_active)) != NULL) { + SLIST_REMOVE_HEAD(&lro->lro_active, next); + tcp_lro_flush(lro, queued); + } +#endif +} + /* * Rules for using sc->temp_unusable: * 1. sc->temp_unusable can only be read or written while holding NV_LOCK() @@ -1024,7 +1178,13 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) /* Obtain and record requested MTU */ ifp->if_mtu = ifr->ifr_mtu; - + /* + * Make sure that LRO high watermark is still valid, + * after MTU change (the 2*MTU limit). 
+ */ + if (!HN_LRO_HIWAT_ISVALID(sc, sc->hn_lro_hiwat)) + hn_set_lro_hiwat(sc, HN_LRO_HIWAT_MTULIM(ifp)); + do { NV_LOCK(sc); if (!sc->temp_unusable) { @@ -1149,6 +1309,8 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) ifp->if_capenable |= IFCAP_RXCSUM; } } + if (mask & IFCAP_LRO) + ifp->if_capenable ^= IFCAP_LRO; if (mask & IFCAP_TSO4) { ifp->if_capenable ^= IFCAP_TSO4; @@ -1173,10 +1335,11 @@ hn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) error = 0; } #endif - /* FALLTHROUGH */ + error = EINVAL; + break; case SIOCSIFMEDIA: case SIOCGIFMEDIA: - error = EINVAL; + error = ifmedia_ioctl(ifp, ifr, &sc->hn_media, cmd); break; default: error = ether_ioctl(ifp, cmd, data); @@ -1294,6 +1457,102 @@ hn_watchdog(struct ifnet *ifp) } #endif +#ifdef HN_LRO_HIWAT +static int +hn_lro_hiwat_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct hn_softc *sc = arg1; + int hiwat, error; + + hiwat = sc->hn_lro_hiwat; + error = sysctl_handle_int(oidp, &hiwat, 0, req); + if (error || req->newptr == NULL) + return error; + + if (!HN_LRO_HIWAT_ISVALID(sc, hiwat)) + return EINVAL; + + if (sc->hn_lro_hiwat != hiwat) + hn_set_lro_hiwat(sc, hiwat); + return 0; +} +#endif /* HN_LRO_HIWAT */ + +static int +hn_check_iplen(const struct mbuf *m, int hoff) +{ + const struct ip *ip; + int len, iphlen, iplen; + const struct tcphdr *th; + int thoff; /* TCP data offset */ + + len = hoff + sizeof(struct ip); + + /* The packet must be at least the size of an IP header. */ + if (m->m_pkthdr.len < len) + return IPPROTO_DONE; + + /* The fixed IP header must reside completely in the first mbuf. */ + if (m->m_len < len) + return IPPROTO_DONE; + + ip = mtodo(m, hoff); + + /* Bound check the packet's stated IP header length. */ + iphlen = ip->ip_hl << 2; + if (iphlen < sizeof(struct ip)) /* minimum header length */ + return IPPROTO_DONE; + + /* The full IP header must reside completely in the one mbuf. 
*/ + if (m->m_len < hoff + iphlen) + return IPPROTO_DONE; + + iplen = ntohs(ip->ip_len); + + /* + * Check that the amount of data in the buffers is at + * least as much as the IP header would have us expect. + */ + if (m->m_pkthdr.len < hoff + iplen) + return IPPROTO_DONE; + + /* + * Ignore IP fragments. + */ + if (ntohs(ip->ip_off) & (IP_OFFMASK | IP_MF)) + return IPPROTO_DONE; + + /* + * The TCP/IP or UDP/IP header must be entirely contained within + * the first mbuf of a packet. + */ + switch (ip->ip_p) { + case IPPROTO_TCP: + if (iplen < iphlen + sizeof(struct tcphdr)) + return IPPROTO_DONE; + if (m->m_len < hoff + iphlen + sizeof(struct tcphdr)) + return IPPROTO_DONE; + th = (const struct tcphdr *)((const uint8_t *)ip + iphlen); + thoff = th->th_off << 2; + if (thoff < sizeof(struct tcphdr) || thoff + iphlen > iplen) + return IPPROTO_DONE; + if (m->m_len < hoff + iphlen + thoff) + return IPPROTO_DONE; + break; + case IPPROTO_UDP: + if (iplen < iphlen + sizeof(struct udphdr)) + return IPPROTO_DONE; + if (m->m_len < hoff + iphlen + sizeof(struct udphdr)) + return IPPROTO_DONE; + break; + default: + if (iplen < iphlen) + return IPPROTO_DONE; + break; + } + return ip->ip_p; +} + static device_method_t netvsc_methods[] = { /* Device interface */ DEVMETHOD(device_probe, netvsc_probe), @@ -1315,6 +1574,3 @@ static devclass_t netvsc_devclass; DRIVER_MODULE(hn, vmbus, netvsc_driver, netvsc_devclass, 0, 0); MODULE_VERSION(hn, 1); MODULE_DEPEND(hn, vmbus, 1, 1, 1); -SYSINIT(netvsc_initx, SI_SUB_KTHREAD_IDLE, SI_ORDER_MIDDLE + 1, netvsc_init, - NULL); - |