diff options
Diffstat (limited to 'usr.sbin/bhyve/pci_virtio_net.c')
-rw-r--r-- | usr.sbin/bhyve/pci_virtio_net.c | 781 |
1 files changed, 781 insertions, 0 deletions
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c new file mode 100644 index 0000000..3f6f88a --- /dev/null +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -0,0 +1,781 @@ +/*- + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/select.h> +#include <sys/uio.h> +#include <sys/ioctl.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> +#include <assert.h> +#include <md5.h> +#include <pthread.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "mevent.h" +#include "virtio.h" + +#define VTNET_RINGSZ 256 + +#define VTNET_MAXSEGS 32 + +/* + * PCI config-space register offsets + */ +#define VTNET_R_CFG0 20 +#define VTNET_R_CFG1 21 +#define VTNET_R_CFG2 22 +#define VTNET_R_CFG3 23 +#define VTNET_R_CFG4 24 +#define VTNET_R_CFG5 25 +#define VTNET_R_CFG6 26 +#define VTNET_R_CFG7 27 +#define VTNET_R_MAX 27 + +#define VTNET_REGSZ VTNET_R_MAX+1 + +/* + * Host capabilities + */ +#define VTNET_S_HOSTCAPS \ + ( 0x00000020 | /* host supplies MAC */ \ + 0x00008000 | /* host can merge Rx buffers */ \ + 0x00010000 ) /* config status available */ + +/* + * Queue definitions. + */ +#define VTNET_RXQ 0 +#define VTNET_TXQ 1 +#define VTNET_CTLQ 2 + +#define VTNET_MAXQ 3 + +struct vring_hqueue { + /* Internal state */ + uint16_t hq_size; + uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ + + /* Host-context pointers to the queue */ + struct virtio_desc *hq_dtable; + uint16_t *hq_avail_flags; + uint16_t *hq_avail_idx; /* monotonically increasing */ + uint16_t *hq_avail_ring; + + uint16_t *hq_used_flags; + uint16_t *hq_used_idx; /* monotonically increasing */ + struct virtio_used *hq_used_ring; +}; + +/* + * Fixed network header size + */ +struct virtio_net_rxhdr { + uint8_t vrh_flags; + uint8_t vrh_gso_type; + uint16_t vrh_hdr_len; + uint16_t vrh_gso_size; + uint16_t vrh_csum_start; + uint16_t vrh_csum_offset; + uint16_t vrh_bufs; +} __packed; + +/* + * Debug printf + */ +static int pci_vtnet_debug; +#define DPRINTF(params) if (pci_vtnet_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtnet_softc { + struct pci_devinst *vsc_pi; + pthread_mutex_t vsc_mtx; + struct mevent *vsc_mevp; + + int vsc_curq; + int vsc_status; + int vsc_isr; + int vsc_tapfd; + int vsc_rx_ready; + int vsc_rxpend; + + uint32_t vsc_features; + uint8_t vsc_macaddr[6]; + + uint64_t vsc_pfn[VTNET_MAXQ]; + struct vring_hqueue vsc_hq[VTNET_MAXQ]; +}; + +/* + * Return the number of available descriptors in the vring taking care + * of the 16-bit index wraparound. + */ +static int +hq_num_avail(struct vring_hqueue *hq) +{ + int ndesc; + + if (*hq->hq_avail_idx >= hq->hq_cur_aidx) + ndesc = *hq->hq_avail_idx - hq->hq_cur_aidx; + else + ndesc = UINT16_MAX - hq->hq_cur_aidx + *hq->hq_avail_idx + 1; + + assert(ndesc >= 0 && ndesc <= hq->hq_size); + + return (ndesc); +} + +static uint16_t +pci_vtnet_qsize(int qnum) +{ + /* XXX no ctl queue currently */ + if (qnum == VTNET_CTLQ) { + return (0); + } + + /* XXX fixed currently. Maybe different for tx/rx/ctl */ + return (VTNET_RINGSZ); +} + +static void +pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring) +{ + struct vring_hqueue *hq; + + assert(ring < VTNET_MAXQ); + + hq = &sc->vsc_hq[ring]; + + /* + * Reset all soft state + */ + hq->hq_cur_aidx = 0; +} + +static void +pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value) +{ + + if (value == 0) { + DPRINTF(("vtnet: device reset requested !\n")); + pci_vtnet_ring_reset(sc, VTNET_RXQ); + pci_vtnet_ring_reset(sc, VTNET_TXQ); + sc->vsc_rx_ready = 0; + } + + sc->vsc_status = value; +} + +/* + * Called to send a buffer chain out to the tap device + */ +static void +pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + char pad[60]; + + if (sc->vsc_tapfd == -1) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. + */ + if (len < 60) { + memset(pad, 0, 60 - len); + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) writev(sc->vsc_tapfd, iov, iovcnt); +} + +/* + * Called when there is read activity on the tap file descriptor. + * Each buffer posted by the guest is assumed to be able to contain + * an entire ethernet frame + rx header. + * MP note: the dummybuf is only used for discarding frames, so there + * is no need for it to be per-vtnet or locked. + */ +static uint8_t dummybuf[2048]; + +static void +pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) +{ + struct virtio_desc *vd; + struct virtio_used *vu; + struct vring_hqueue *hq; + struct virtio_net_rxhdr *vrx; + uint8_t *buf; + int i; + int len; + int ndescs; + int didx, uidx, aidx; /* descriptor, avail and used index */ + + /* + * Should never be called without a valid tap fd + */ + assert(sc->vsc_tapfd != -1); + + /* + * But, will be called when the rx ring hasn't yet + * been set up. + */ + if (sc->vsc_rx_ready == 0) { + /* + * Drop the packet and try later. + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); + return; + } + + /* + * Calculate the number of available rx buffers + */ + hq = &sc->vsc_hq[VTNET_RXQ]; + + ndescs = hq_num_avail(hq); + + if (ndescs == 0) { + /* + * Need to wait for host notification to read + */ + if (sc->vsc_rxpend == 0) { + WPRINTF(("vtnet: no rx descriptors !\n")); + sc->vsc_rxpend = 1; + } + + /* + * Drop the packet and try later + */ + (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); + return; + } + + aidx = hq->hq_cur_aidx; + uidx = *hq->hq_used_idx; + for (i = 0; i < ndescs; i++) { + /* + * 'aidx' indexes into the an array of descriptor indexes + */ + didx = hq->hq_avail_ring[aidx % hq->hq_size]; + assert(didx >= 0 && didx < hq->hq_size); + + vd = &hq->hq_dtable[didx]; + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = (struct virtio_net_rxhdr *)paddr_guest2host(vd->vd_addr); + buf = (uint8_t *)(vrx + 1); + + len = read(sc->vsc_tapfd, buf, + vd->vd_len - sizeof(struct virtio_net_rxhdr)); + + if (len < 0 && errno == EWOULDBLOCK) { + break; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers, which is always 1 without TSO + * support. + */ + memset(vrx, 0, sizeof(struct virtio_net_rxhdr)); + vrx->vrh_bufs = 1; + + /* + * Write this descriptor into the used ring + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr); + uidx++; + aidx++; + } + + /* + * Update the used pointer, and signal an interrupt if allowed + */ + *hq->hq_used_idx = uidx; + hq->hq_cur_aidx = aidx; + + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } +} + +static void +pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) +{ + struct pci_vtnet_softc *sc = param; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtnet_tap_rx(sc); + pthread_mutex_unlock(&sc->vsc_mtx); + +} + +static void +pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc) +{ + /* + * A qnotify means that the rx process can now begin + */ + if (sc->vsc_rx_ready == 0) { + sc->vsc_rx_ready = 1; + } + + /* + * If the rx queue was empty, attempt to receive a + * packet that was previously blocked due to no rx bufs + * available + */ + if (sc->vsc_rxpend) { + WPRINTF(("vtnet: rx resumed\n\r")); + sc->vsc_rxpend = 0; + pci_vtnet_tap_rx(sc); + } +} + +static void +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq) +{ + struct iovec iov[VTNET_MAXSEGS + 1]; + struct virtio_desc *vd; + struct virtio_used *vu; + int i; + int plen; + int tlen; + int uidx, aidx, didx; + + uidx = *hq->hq_used_idx; + aidx = hq->hq_cur_aidx; + didx = hq->hq_avail_ring[aidx % hq->hq_size]; + assert(didx >= 0 && didx < hq->hq_size); + + vd = &hq->hq_dtable[didx]; + + /* + * Run through the chain of descriptors, ignoring the + * first header descriptor. However, include the header + * length in the total length that will be put into the + * used queue. + */ + tlen = vd->vd_len; + vd = &hq->hq_dtable[vd->vd_next]; + + for (i = 0, plen = 0; + i < VTNET_MAXSEGS; + i++, vd = &hq->hq_dtable[vd->vd_next]) { + iov[i].iov_base = paddr_guest2host(vd->vd_addr); + iov[i].iov_len = vd->vd_len; + plen += vd->vd_len; + tlen += vd->vd_len; + + if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + } + assert(i < VTNET_MAXSEGS); + + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1)); + pci_vtnet_tap_tx(sc, iov, i + 1, plen); + + /* + * Return this chain back to the host + */ + vu = &hq->hq_used_ring[uidx % hq->hq_size]; + vu->vu_idx = didx; + vu->vu_tlen = tlen; + hq->hq_cur_aidx = aidx + 1; + *hq->hq_used_idx = uidx + 1; + + /* + * Generate an interrupt if able + */ + if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { + sc->vsc_isr |= 1; + pci_generate_msi(sc->vsc_pi, 0); + } +} + +static void +pci_vtnet_ping_txq(struct pci_vtnet_softc *sc) +{ + struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ]; + int i; + int ndescs; + + /* + * Calculate number of ring entries to process + */ + ndescs = hq_num_avail(hq); + + if (ndescs == 0) + return; + + /* + * Run through all the entries, placing them into iovecs and + * sending when an end-of-packet is found + */ + for (i = 0; i < ndescs; i++) + pci_vtnet_proctx(sc, hq); +} + +static void +pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc) +{ + + DPRINTF(("vtnet: control qnotify!\n\r")); +} + +static void +pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn) +{ + struct vring_hqueue *hq; + int qnum = sc->vsc_curq; + + assert(qnum < VTNET_MAXQ); + + sc->vsc_pfn[qnum] = pfn << VRING_PFN; + + /* + * Set up host pointers to the various parts of the + * queue + */ + hq = &sc->vsc_hq[qnum]; + hq->hq_size = pci_vtnet_qsize(qnum); + + hq->hq_dtable = paddr_guest2host(pfn << VRING_PFN); + hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); + hq->hq_avail_idx = hq->hq_avail_flags + 1; + hq->hq_avail_ring = hq->hq_avail_flags + 2; + hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, + VRING_ALIGN); + hq->hq_used_idx = hq->hq_used_flags + 1; + hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); + + /* + * Initialize queue indexes + */ + hq->hq_cur_aidx = 0; +} + +static int +pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct pci_vtnet_softc *sc; + + /* + * Access to guest memory is required. Fail if + * memory not mapped + */ + if (paddr_guest2host(0) == NULL) + return (1); + + sc = malloc(sizeof(struct pci_vtnet_softc)); + memset(sc, 0, sizeof(struct pci_vtnet_softc)); + + pi->pi_arg = sc; + sc->vsc_pi = pi; + + pthread_mutex_init(&sc->vsc_mtx, NULL); + + /* + * Attempt to open the tap device + */ + sc->vsc_tapfd = -1; + if (opts != NULL) { + char tbuf[80]; + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, opts, sizeof(tbuf)); + + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + } else { + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_tap_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + } + } + + /* + * The MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the vm name. The slot/func number is + * prepended to this for slots other than 1:0, so that + * a bootloader can netboot from the equivalent of slot 1. + */ + if (pi->pi_slot == 1 && pi->pi_func == 0) { + strncpy(nstr, vmname, sizeof(nstr)); + } else { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + } + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->vsc_macaddr[0] = 0x00; + sc->vsc_macaddr[1] = 0xa0; + sc->vsc_macaddr[2] = 0x98; + sc->vsc_macaddr[3] = digest[0]; + sc->vsc_macaddr[4] = digest[1]; + sc->vsc_macaddr[5] = digest[2]; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_emul_add_msicap(pi, 1); + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ); + + return (0); +} + +/* + * Function pointer array to handle queue notifications + */ +static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = { + pci_vtnet_ping_rxq, + pci_vtnet_ping_txq, + pci_vtnet_ping_ctlq +}; + +static void +pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_vtnet_softc *sc = pi->pi_arg; + void *ptr; + + assert(baridx == 0); + + if (offset + size > VTNET_REGSZ) { + DPRINTF(("vtnet_write: 2big, offset %ld size %d\n", + offset, size)); + return; + } + + pthread_mutex_lock(&sc->vsc_mtx); + + switch (offset) { + case VTCFG_R_GUESTCAP: + assert(size == 4); + sc->vsc_features = value & VTNET_S_HOSTCAPS; + break; + case VTCFG_R_PFN: + assert(size == 4); + pci_vtnet_ring_init(sc, value); + break; + case VTCFG_R_QSEL: + assert(size == 2); + assert(value < VTNET_MAXQ); + sc->vsc_curq = value; + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + assert(value < VTNET_MAXQ); + (*pci_vtnet_qnotify[value])(sc); + break; + case VTCFG_R_STATUS: + assert(size == 1); + pci_vtnet_update_status(sc, value); + break; + case VTNET_R_CFG0: + case VTNET_R_CFG1: + case VTNET_R_CFG2: + case VTNET_R_CFG3: + case VTNET_R_CFG4: + case VTNET_R_CFG5: + assert((size + offset) <= (VTNET_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0]; + /* + * The driver is allowed to change the MAC address + */ + sc->vsc_macaddr[offset - VTNET_R_CFG0] = value; + if (size == 1) { + *(uint8_t *) ptr = value; + } else if (size == 2) { + *(uint16_t *) ptr = value; + } else { + *(uint32_t *) ptr = value; + } + break; + case VTCFG_R_HOSTCAP: + case VTCFG_R_QNUM: + case VTCFG_R_ISR: + case VTNET_R_CFG6: + case VTNET_R_CFG7: + DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset)); + break; + default: + DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); +} + +uint64_t +pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_vtnet_softc *sc = pi->pi_arg; + void *ptr; + uint64_t value; + + assert(baridx == 0); + + if (offset + size > VTNET_REGSZ) { + DPRINTF(("vtnet_read: 2big, offset %ld size %d\n", + offset, size)); + return (0); + } + + pthread_mutex_lock(&sc->vsc_mtx); + + switch (offset) { + case VTCFG_R_HOSTCAP: + assert(size == 4); + value = VTNET_S_HOSTCAPS; + break; + case VTCFG_R_GUESTCAP: + assert(size == 4); + value = sc->vsc_features; /* XXX never read ? */ + break; + case VTCFG_R_PFN: + assert(size == 4); + value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; + break; + case VTCFG_R_QNUM: + assert(size == 2); + value = pci_vtnet_qsize(sc->vsc_curq); + break; + case VTCFG_R_QSEL: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_QNOTIFY: + assert(size == 2); + value = sc->vsc_curq; /* XXX never read ? */ + break; + case VTCFG_R_STATUS: + assert(size == 1); + value = sc->vsc_status; + break; + case VTCFG_R_ISR: + assert(size == 1); + value = sc->vsc_isr; + sc->vsc_isr = 0; /* a read clears this flag */ + break; + case VTNET_R_CFG0: + case VTNET_R_CFG1: + case VTNET_R_CFG2: + case VTNET_R_CFG3: + case VTNET_R_CFG4: + case VTNET_R_CFG5: + assert((size + offset) <= (VTNET_R_CFG5 + 1)); + ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0]; + if (size == 1) { + value = *(uint8_t *) ptr; + } else if (size == 2) { + value = *(uint16_t *) ptr; + } else { + value = *(uint32_t *) ptr; + } + break; + case VTNET_R_CFG6: + assert(size != 4); + value = 0x01; /* XXX link always up */ + break; + case VTNET_R_CFG7: + assert(size == 1); + value = 0; /* XXX link status in LSB */ + break; + default: + DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset)); + value = 0; + break; + } + + pthread_mutex_unlock(&sc->vsc_mtx); + + return (value); +} + +struct pci_devemu pci_de_vnet = { + .pe_emu = "virtio-net", + .pe_init = pci_vtnet_init, + .pe_barwrite = pci_vtnet_write, + .pe_barread = pci_vtnet_read +}; +PCI_EMUL_SET(pci_de_vnet); |