/*-
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/linker_set.h>
#include <sys/select.h>
#include <sys/uio.h>
#include <sys/ioctl.h>

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#include <assert.h>
#include <md5.h>
#include <pthread.h>

#include "bhyverun.h"
#include "pci_emul.h"
#include "mevent.h"
#include "virtio.h"

#define VTNET_RINGSZ    256

#define VTNET_MAXSEGS   32

/*
 * PCI config-space register offsets
 */
#define VTNET_R_CFG0    24
#define VTNET_R_CFG1    25
#define VTNET_R_CFG2    26
#define VTNET_R_CFG3    27
#define VTNET_R_CFG4    28
#define VTNET_R_CFG5    29
#define VTNET_R_CFG6    30
#define VTNET_R_CFG7    31
#define VTNET_R_MAX     31

#define VTNET_REGSZ     VTNET_R_MAX+1

/*
 * Host capabilities
 */
#define VTNET_S_HOSTCAPS      \
  ( 0x00000020 |  /* host supplies MAC */ \
    0x00008000 |  /* host can merge Rx buffers */ \
    0x00010000 )  /* config status available */

/*
 * Queue definitions.
 */
#define VTNET_RXQ       0
#define VTNET_TXQ       1
#define VTNET_CTLQ      2

#define VTNET_MAXQ      3

static int use_msix = 1;

struct vring_hqueue {
        /* Internal state */
        uint16_t        hq_size;
        uint16_t        hq_cur_aidx;    /* trails behind 'avail_idx' */

        /* Host-context pointers to the queue */
        struct virtio_desc *hq_dtable;
        uint16_t        *hq_avail_flags;
        uint16_t        *hq_avail_idx;  /* monotonically increasing */
        uint16_t        *hq_avail_ring;

        uint16_t        *hq_used_flags;
        uint16_t        *hq_used_idx;   /* monotonically increasing */
        struct virtio_used *hq_used_ring;
};

/*
 * Fixed network header size
 */
struct virtio_net_rxhdr {
        uint8_t         vrh_flags;
        uint8_t         vrh_gso_type;
        uint16_t        vrh_hdr_len;
        uint16_t        vrh_gso_size;
        uint16_t        vrh_csum_start;
        uint16_t        vrh_csum_offset;
        uint16_t        vrh_bufs;
} __packed;

/*
 * Debug printf
 */
static int pci_vtnet_debug;
#define DPRINTF(params) if (pci_vtnet_debug) printf params
#define WPRINTF(params) printf params

/*
 * Per-device softc
 */
struct pci_vtnet_softc {
        struct pci_devinst *vsc_pi;
        pthread_mutex_t vsc_mtx;
        struct mevent   *vsc_mevp;

        int             vsc_curq;
        int             vsc_status;
        int             vsc_isr;
        int             vsc_tapfd;
        int             vsc_rx_ready;
        int             vsc_rxpend;

        uint32_t        vsc_features;
        uint8_t         vsc_macaddr[6];

        uint64_t        vsc_pfn[VTNET_MAXQ];
        struct vring_hqueue vsc_hq[VTNET_MAXQ];
        uint16_t        vsc_msix_table_idx[VTNET_MAXQ];
};

/*
 * Return the size of IO BAR that maps virtio header and device specific
 * region. The size would vary depending on whether MSI-X is enabled or
 * not.
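 *
 * (Explanatory note: the legacy virtio header occupies the first 20 bytes
 * of the BAR — host/guest features, queue PFN, queue size/select/notify,
 * status and ISR. When MSI-X is enabled, two extra 16-bit vector registers
 * (config vector and queue vector) sit between that header and the
 * device-specific config, so the non-MSI-X BAR is smaller by
 * (VTCFG_R_CFG1 - VTCFG_R_MSIX) bytes; offsets are as defined in virtio.h.)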
 */
static uint64_t
pci_vtnet_iosize(struct pci_devinst *pi)
{
        if (pci_msix_enabled(pi))
                return (VTNET_REGSZ);
        else
                return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
}

/*
 * Return the number of available descriptors in the vring taking care
 * of the 16-bit index wraparound.
 */
static int
hq_num_avail(struct vring_hqueue *hq)
{
        int ndesc;

        if (*hq->hq_avail_idx >= hq->hq_cur_aidx)
                ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx;
        else
                ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1;

        assert(ndesc >= 0 && ndesc <= hq->hq_size);

        return (ndesc);
}

static uint16_t
pci_vtnet_qsize(int qnum)
{
        /* XXX no ctl queue currently */
        if (qnum == VTNET_CTLQ) {
                return (0);
        }

        /* XXX fixed currently. Maybe different for tx/rx/ctl */
        return (VTNET_RINGSZ);
}

static void
pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
{
        struct vring_hqueue *hq;

        assert(ring < VTNET_MAXQ);

        hq = &sc->vsc_hq[ring];

        /*
         * Reset all soft state
         */
        hq->hq_cur_aidx = 0;
}

static void
pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
{
        if (value == 0) {
                DPRINTF(("vtnet: device reset requested !\n"));
                pci_vtnet_ring_reset(sc, VTNET_RXQ);
                pci_vtnet_ring_reset(sc, VTNET_TXQ);
                sc->vsc_rx_ready = 0;
        }

        sc->vsc_status = value;
}

/*
 * Called to send a buffer chain out to the tap device
 */
static void
pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
                 int len)
{
        char pad[60];

        if (sc->vsc_tapfd == -1)
                return;

        /*
         * If the length is < 60, pad out to that and add the
         * extra zero'd segment to the iov. It is guaranteed that
         * there is always an extra iov available by the caller.
         */
        if (len < 60) {
                memset(pad, 0, 60 - len);
                iov[iovcnt].iov_base = pad;
                iov[iovcnt].iov_len = 60 - len;
                iovcnt++;
        }
        (void) writev(sc->vsc_tapfd, iov, iovcnt);
}

/*
 * Called when there is read activity on the tap file descriptor.
 * Each buffer posted by the guest is assumed to be able to contain
 * an entire ethernet frame + rx header.
 *
 * MP note: the dummybuf is only used for discarding frames, so there
 * is no need for it to be per-vtnet or locked.
 */
static uint8_t dummybuf[2048];

static void
pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
{
        struct virtio_desc *vd;
        struct virtio_used *vu;
        struct vring_hqueue *hq;
        struct virtio_net_rxhdr *vrx;
        uint8_t *buf;
        int i;
        int len;
        int ndescs;
        int didx, uidx, aidx;   /* descriptor, avail and used index */

        /*
         * Should never be called without a valid tap fd
         */
        assert(sc->vsc_tapfd != -1);

        /*
         * But, will be called when the rx ring hasn't yet
         * been set up.
         */
        if (sc->vsc_rx_ready == 0) {
                /*
                 * Drop the packet and try later.
                 */
                (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
                return;
        }

        /*
         * Calculate the number of available rx buffers
         */
        hq = &sc->vsc_hq[VTNET_RXQ];

        ndescs = hq_num_avail(hq);

        if (ndescs == 0) {
                /*
                 * Need to wait for host notification to read
                 */
                if (sc->vsc_rxpend == 0) {
                        WPRINTF(("vtnet: no rx descriptors !\n"));
                        sc->vsc_rxpend = 1;
                }
                /*
                 * Drop the packet and try later
                 */
                (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
                return;
        }

        aidx = hq->hq_cur_aidx;
        uidx = *hq->hq_used_idx;
        for (i = 0; i < ndescs; i++) {
                /*
                 * 'aidx' indexes into an array of descriptor indexes
                 */
                didx = hq->hq_avail_ring[aidx % hq->hq_size];
                assert(didx >= 0 && didx < hq->hq_size);

                vd = &hq->hq_dtable[didx];

                /*
                 * Get a pointer to the rx header, and use the
                 * data immediately following it for the packet buffer.
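                 *
                 * (Explanatory note: a read() on a tap device returns at
                 * most one frame, so each posted buffer receives a single
                 * frame after its virtio_net_rxhdr; the read length below
                 * is the buffer size minus that header.)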
                 */
                vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr);
                buf = (uint8_t *)(vrx + 1);

                len = read(sc->vsc_tapfd, buf,
                           vd->vd_len - sizeof(struct virtio_net_rxhdr));

                if (len < 0 && errno == EWOULDBLOCK) {
                        break;
                }

                /*
                 * The only valid field in the rx packet header is the
                 * number of buffers, which is always 1 without TSO
                 * support.
                 */
                memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
                vrx->vrh_bufs = 1;

                /*
                 * Write this descriptor into the used ring
                 */
                vu = &hq->hq_used_ring[uidx % hq->hq_size];
                vu->vu_idx = didx;
                vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);

                uidx++;
                aidx++;
        }

        /*
         * Update the used pointer, and signal an interrupt if allowed
         */
        *hq->hq_used_idx = uidx;
        hq->hq_cur_aidx = aidx;

        if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
                if (use_msix) {
                        pci_generate_msix(sc->vsc_pi,
                                          sc->vsc_msix_table_idx[VTNET_RXQ]);
                } else {
                        sc->vsc_isr |= 1;
                        pci_generate_msi(sc->vsc_pi, 0);
                }
        }
}

static void
pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
{
        struct pci_vtnet_softc *sc = param;

        pthread_mutex_lock(&sc->vsc_mtx);
        pci_vtnet_tap_rx(sc);
        pthread_mutex_unlock(&sc->vsc_mtx);
}

static void
pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
{
        /*
         * A qnotify means that the rx process can now begin
         */
        if (sc->vsc_rx_ready == 0) {
                sc->vsc_rx_ready = 1;
        }

        /*
         * If the rx queue was empty, attempt to receive a
         * packet that was previously blocked due to no rx bufs
         * available
         */
        if (sc->vsc_rxpend) {
                WPRINTF(("vtnet: rx resumed\n\r"));
                sc->vsc_rxpend = 0;
                pci_vtnet_tap_rx(sc);
        }
}

static void
pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
{
        struct iovec iov[VTNET_MAXSEGS + 1];
        struct virtio_desc *vd;
        struct virtio_used *vu;
        int i;
        int plen;
        int tlen;
        int uidx, aidx, didx;

        uidx = *hq->hq_used_idx;
        aidx = hq->hq_cur_aidx;
        didx = hq->hq_avail_ring[aidx % hq->hq_size];
        assert(didx >= 0 && didx < hq->hq_size);

        vd = &hq->hq_dtable[didx];

        /*
         * Run through the chain of descriptors, ignoring the
         * first header descriptor. However, include the header
         * length in the total length that will be put into the
         * used queue.
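         *
         * (Explanatory note: descriptors are linked through vd_next for
         * as long as VRING_DESC_F_NEXT is set; the data segments are
         * collected into one iovec array and handed to pci_vtnet_tap_tx()
         * for a single writev() to the tap device.)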
         */
        tlen = vd->vd_len;
        vd = &hq->hq_dtable[vd->vd_next];

        for (i = 0, plen = 0;
             i < VTNET_MAXSEGS;
             i++, vd = &hq->hq_dtable[vd->vd_next]) {
                iov[i].iov_base = paddr_guest2host(vd->vd_addr);
                iov[i].iov_len = vd->vd_len;
                plen += vd->vd_len;
                tlen += vd->vd_len;
                if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
                        break;
        }
        assert(i < VTNET_MAXSEGS);

        DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
        pci_vtnet_tap_tx(sc, iov, i + 1, plen);

        /*
         * Return this chain back to the host
         */
        vu = &hq->hq_used_ring[uidx % hq->hq_size];
        vu->vu_idx = didx;
        vu->vu_tlen = tlen;
        hq->hq_cur_aidx = aidx + 1;
        *hq->hq_used_idx = uidx + 1;

        /*
         * Generate an interrupt if able
         */
        if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
                if (use_msix) {
                        pci_generate_msix(sc->vsc_pi,
                                          sc->vsc_msix_table_idx[VTNET_TXQ]);
                } else {
                        sc->vsc_isr |= 1;
                        pci_generate_msi(sc->vsc_pi, 0);
                }
        }
}

static void
pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
{
        struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
        int i;
        int ndescs;

        /*
         * Calculate number of ring entries to process
         */
        ndescs = hq_num_avail(hq);

        if (ndescs == 0)
                return;

        /*
         * Run through all the entries, placing them into iovecs and
         * sending when an end-of-packet is found
         */
        for (i = 0; i < ndescs; i++)
                pci_vtnet_proctx(sc, hq);
}

static void
pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
{
        DPRINTF(("vtnet: control qnotify!\n\r"));
}

static void
pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
{
        struct vring_hqueue *hq;
        int qnum = sc->vsc_curq;

        assert(qnum < VTNET_MAXQ);

        sc->vsc_pfn[qnum] = pfn << VRING_PFN;

        /*
         * Set up host pointers to the various parts of the
         * queue
         */
        hq = &sc->vsc_hq[qnum];
        hq->hq_size = pci_vtnet_qsize(qnum);

        hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN);
        hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size);
        hq->hq_avail_idx = hq->hq_avail_flags + 1;
        hq->hq_avail_ring = hq->hq_avail_flags + 2;
        hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
                                                 VRING_ALIGN);
        hq->hq_used_idx = hq->hq_used_flags + 1;
        hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);

        /*
         * Initialize queue indexes
         */
        hq->hq_cur_aidx = 0;
}

static int
pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
{
        MD5_CTX mdctx;
        unsigned char digest[16];
        char nstr[80];
        struct pci_vtnet_softc *sc;
        const char *env_msi;

        /*
         * Access to guest memory is required.
         * Fail if memory not mapped
         */
        if (paddr_guest2host(0) == NULL)
                return (1);

        sc = malloc(sizeof(struct pci_vtnet_softc));
        memset(sc, 0, sizeof(struct pci_vtnet_softc));

        pi->pi_arg = sc;
        sc->vsc_pi = pi;

        pthread_mutex_init(&sc->vsc_mtx, NULL);

        /*
         * Use MSI if set by user
         */
        if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
                if (strcasecmp(env_msi, "yes") == 0)
                        use_msix = 0;
        }

        /*
         * Attempt to open the tap device
         */
        sc->vsc_tapfd = -1;
        if (opts != NULL) {
                char tbuf[80];

                strcpy(tbuf, "/dev/");
                strlcat(tbuf, opts, sizeof(tbuf));

                sc->vsc_tapfd = open(tbuf, O_RDWR);
                if (sc->vsc_tapfd == -1) {
                        WPRINTF(("open of tap device %s failed\n", tbuf));
                } else {
                        /*
                         * Set non-blocking and register for read
                         * notifications with the event loop
                         */
                        int opt = 1;
                        if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
                                WPRINTF(("tap device O_NONBLOCK failed\n"));
                                close(sc->vsc_tapfd);
                                sc->vsc_tapfd = -1;
                        }

                        sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
                                                  EVF_READ,
                                                  pci_vtnet_tap_callback,
                                                  sc);
                        if (sc->vsc_mevp == NULL) {
                                WPRINTF(("Could not register event\n"));
                                close(sc->vsc_tapfd);
                                sc->vsc_tapfd = -1;
                        }
                }
        }

        /*
         * The MAC address is the standard NetApp OUI of 00-a0-98,
         * followed by an MD5 of the vm name. The slot/func number is
         * prepended to this for slots other than 1:0, so that
         * a bootloader can netboot from the equivalent of slot 1.
         */
        if (pi->pi_slot == 1 && pi->pi_func == 0) {
                strncpy(nstr, vmname, sizeof(nstr));
        } else {
                snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
                         pi->pi_func, vmname);
        }

        MD5Init(&mdctx);
        MD5Update(&mdctx, nstr, strlen(nstr));
        MD5Final(digest, &mdctx);

        sc->vsc_macaddr[0] = 0x00;
        sc->vsc_macaddr[1] = 0xa0;
        sc->vsc_macaddr[2] = 0x98;
        sc->vsc_macaddr[3] = digest[0];
        sc->vsc_macaddr[4] = digest[1];
        sc->vsc_macaddr[5] = digest[2];

        /* initialize config space */
        pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
        pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
        pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
        pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);

        if (use_msix) {
                /* MSI-X support */
                int i;

                for (i = 0; i < VTNET_MAXQ; i++)
                        sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;

                /*
                 * BAR 1 used to map MSI-X table and PBA
                 */
                if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
                        return (1);
        } else {
                /* MSI support */
                pci_emul_add_msicap(pi, 1);
        }

        pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);

        return (0);
}

/*
 * Function pointer array to handle queue notifications
 */
static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
        pci_vtnet_ping_rxq,
        pci_vtnet_ping_txq,
        pci_vtnet_ping_ctlq
};

static uint64_t
vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
{
        /*
         * Device specific offsets used by guest would change based on
         * whether MSI-X capability is enabled or not
         */
        if (!pci_msix_enabled(pi)) {
                if (offset >= VTCFG_R_MSIX)
                        return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
        }

        return (offset);
}

static void
pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
                int baridx, uint64_t offset, int size, uint64_t value)
{
        struct pci_vtnet_softc *sc = pi->pi_arg;
        void *ptr;

        if (use_msix) {
                if (baridx == pci_msix_table_bar(pi) ||
                    baridx == pci_msix_pba_bar(pi)) {
                        pci_emul_msix_twrite(pi, offset, size, value);
                        return;
                }
        }

        assert(baridx == 0);

        if (offset + size > pci_vtnet_iosize(pi)) {
                DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
                         offset, size));
                return;
        }

        pthread_mutex_lock(&sc->vsc_mtx);

        offset = vtnet_adjust_offset(pi, offset);

        switch (offset) {
        case VTCFG_R_GUESTCAP:
                assert(size == 4);
                sc->vsc_features = value & VTNET_S_HOSTCAPS;
                break;
        case VTCFG_R_PFN:
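                /*
                 * (Explanatory note: the guest writes the guest-physical
                 * page frame number of the ring for the queue previously
                 * selected via VTCFG_R_QSEL; pci_vtnet_ring_init() maps
                 * it and derives the avail/used ring pointers.)
                 */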
                assert(size == 4);
                pci_vtnet_ring_init(sc, value);
                break;
        case VTCFG_R_QSEL:
                assert(size == 2);
                assert(value < VTNET_MAXQ);
                sc->vsc_curq = value;
                break;
        case VTCFG_R_QNOTIFY:
                assert(size == 2);
                assert(value < VTNET_MAXQ);
                (*pci_vtnet_qnotify[value])(sc);
                break;
        case VTCFG_R_STATUS:
                assert(size == 1);
                pci_vtnet_update_status(sc, value);
                break;
        case VTCFG_R_CFGVEC:
                assert(size == 2);
                sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
                break;
        case VTCFG_R_QVEC:
                assert(size == 2);
                assert(sc->vsc_curq != VTNET_CTLQ);
                sc->vsc_msix_table_idx[sc->vsc_curq] = value;
                break;
        case VTNET_R_CFG0:
        case VTNET_R_CFG1:
        case VTNET_R_CFG2:
        case VTNET_R_CFG3:
        case VTNET_R_CFG4:
        case VTNET_R_CFG5:
                assert((size + offset) <= (VTNET_R_CFG5 + 1));
                ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
                /*
                 * The driver is allowed to change the MAC address
                 */
                sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
                if (size == 1) {
                        *(uint8_t *) ptr = value;
                } else if (size == 2) {
                        *(uint16_t *) ptr = value;
                } else {
                        *(uint32_t *) ptr = value;
                }
                break;
        case VTCFG_R_HOSTCAP:
        case VTCFG_R_QNUM:
        case VTCFG_R_ISR:
        case VTNET_R_CFG6:
        case VTNET_R_CFG7:
                DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
                break;
        default:
                DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
                value = 0;
                break;
        }

        pthread_mutex_unlock(&sc->vsc_mtx);
}

uint64_t
pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
               int baridx, uint64_t offset, int size)
{
        struct pci_vtnet_softc *sc = pi->pi_arg;
        void *ptr;
        uint64_t value;

        if (use_msix) {
                if (baridx == pci_msix_table_bar(pi) ||
                    baridx == pci_msix_pba_bar(pi)) {
                        return (pci_emul_msix_tread(pi, offset, size));
                }
        }

        assert(baridx == 0);

        if (offset + size > pci_vtnet_iosize(pi)) {
                DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
                         offset, size));
                return (0);
        }

        pthread_mutex_lock(&sc->vsc_mtx);

        offset = vtnet_adjust_offset(pi, offset);

        switch (offset) {
        case VTCFG_R_HOSTCAP:
                assert(size == 4);
                value = VTNET_S_HOSTCAPS;
                break;
        case VTCFG_R_GUESTCAP:
                assert(size == 4);
                value = sc->vsc_features;       /* XXX never read ? */
                break;
        case VTCFG_R_PFN:
                assert(size == 4);
                value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
                break;
        case VTCFG_R_QNUM:
                assert(size == 2);
                value = pci_vtnet_qsize(sc->vsc_curq);
                break;
        case VTCFG_R_QSEL:
                assert(size == 2);
                value = sc->vsc_curq;   /* XXX never read ? */
                break;
        case VTCFG_R_QNOTIFY:
                assert(size == 2);
                value = sc->vsc_curq;   /* XXX never read ? */
                break;
        case VTCFG_R_STATUS:
                assert(size == 1);
                value = sc->vsc_status;
                break;
        case VTCFG_R_ISR:
                assert(size == 1);
                value = sc->vsc_isr;
                sc->vsc_isr = 0;        /* a read clears this flag */
                break;
        case VTCFG_R_CFGVEC:
                assert(size == 2);
                value = sc->vsc_msix_table_idx[VTNET_CTLQ];
                break;
        case VTCFG_R_QVEC:
                assert(size == 2);
                assert(sc->vsc_curq != VTNET_CTLQ);
                value = sc->vsc_msix_table_idx[sc->vsc_curq];
                break;
        case VTNET_R_CFG0:
        case VTNET_R_CFG1:
        case VTNET_R_CFG2:
        case VTNET_R_CFG3:
        case VTNET_R_CFG4:
        case VTNET_R_CFG5:
                assert((size + offset) <= (VTNET_R_CFG5 + 1));
                ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
                if (size == 1) {
                        value = *(uint8_t *) ptr;
                } else if (size == 2) {
                        value = *(uint16_t *) ptr;
                } else {
                        value = *(uint32_t *) ptr;
                }
                break;
        case VTNET_R_CFG6:
                assert(size != 4);
                value = 0x01;   /* XXX link always up */
                break;
        case VTNET_R_CFG7:
                assert(size == 1);
                value = 0;      /* XXX link status in LSB */
                break;
        default:
                DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
                value = 0;
                break;
        }

        pthread_mutex_unlock(&sc->vsc_mtx);

        return (value);
}

struct pci_devemu pci_de_vnet = {
        .pe_emu =       "virtio-net",
        .pe_init =      pci_vtnet_init,
        .pe_barwrite =  pci_vtnet_write,
        .pe_barread =   pci_vtnet_read
};
PCI_EMUL_SET(pci_de_vnet);
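
/*
 * Usage sketch (illustrative): the device is attached from the bhyve
 * command line with something like "-s 2:0,virtio-net,tap0"; the trailing
 * option reaches pci_vtnet_init() as 'opts' and is opened as /dev/tap0.
 * PCI_EMUL_SET() registers pci_de_vnet in the linker set that the PCI
 * emulation code scans when matching the "virtio-net" emulation name.
 */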