diff options
author | grehan <grehan@FreeBSD.org> | 2013-07-17 23:37:33 +0000 |
---|---|---|
committer | grehan <grehan@FreeBSD.org> | 2013-07-17 23:37:33 +0000 |
commit | a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb (patch) | |
tree | 064903149f0c797df3873dc7997273f417c81f93 /usr.sbin | |
parent | b8663d4c053e282b686f3e2a2d625b21b5944176 (diff) | |
download | FreeBSD-src-a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb.zip FreeBSD-src-a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb.tar.gz |
Major rework of the virtio code. Split out common parts, and modify
the net/block devices accordingly.
Submitted by: Chris Torek torek at torek dot net
Reviewed by: grehan
Diffstat (limited to 'usr.sbin')
-rw-r--r-- | usr.sbin/bhyve/Makefile | 2 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_virtio_block.c | 475 | ||||
-rw-r--r-- | usr.sbin/bhyve/pci_virtio_net.c | 750 | ||||
-rw-r--r-- | usr.sbin/bhyve/virtio.c | 745 | ||||
-rw-r--r-- | usr.sbin/bhyve/virtio.h | 397 |
5 files changed, 1408 insertions, 961 deletions
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile index e6aa8b2..17355c3 100644 --- a/usr.sbin/bhyve/Makefile +++ b/usr.sbin/bhyve/Makefile @@ -10,7 +10,7 @@ SRCS= acpi.c atpic.c bhyverun.c consport.c dbgport.c elcr.c inout.c SRCS+= ioapic.c mem.c mevent.c mptbl.c SRCS+= pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c SRCS+= pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c -SRCS+= xmsr.c spinup_ap.c +SRCS+= virtio.c xmsr.c spinup_ap.c .PATH: ${.CURDIR}/../../sys/amd64/vmm SRCS+= vmm_instruction_emul.c diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c index 5c42dc2..4395410 100644 --- a/usr.sbin/bhyve/pci_virtio_block.c +++ b/usr.sbin/bhyve/pci_virtio_block.c @@ -53,14 +53,6 @@ __FBSDID("$FreeBSD$"); #define VTBLK_RINGSZ 64 -#define VTBLK_CFGSZ 28 - -#define VTBLK_R_CFG VTCFG_R_CFG1 -#define VTBLK_R_CFG_END VTBLK_R_CFG + VTBLK_CFGSZ -1 -#define VTBLK_R_MAX VTBLK_R_CFG_END - -#define VTBLK_REGSZ VTBLK_R_MAX+1 - #define VTBLK_MAXSEGS 32 #define VTBLK_S_OK 0 @@ -71,28 +63,10 @@ __FBSDID("$FreeBSD$"); */ #define VTBLK_S_HOSTCAPS \ ( 0x00000004 | /* host maximum request segments */ \ - 0x10000000 ) /* supports indirect descriptors */ - -static int use_msix = 1; - -struct vring_hqueue { - /* Internal state */ - uint16_t hq_size; - uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ - - /* Host-context pointers to the queue */ - struct virtio_desc *hq_dtable; - uint16_t *hq_avail_flags; - uint16_t *hq_avail_idx; /* monotonically increasing */ - uint16_t *hq_avail_ring; - - uint16_t *hq_used_flags; - uint16_t *hq_used_idx; /* monotonically increasing */ - struct virtio_used *hq_used_ring; -}; + VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* - * Config space + * Config space "registers" */ struct vtblk_config { uint64_t vbc_capacity; @@ -104,7 +78,6 @@ struct vtblk_config { uint32_t vbc_blk_size; uint32_t vbc_sectors_max; } __packed; -CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ); /* 
* Fixed-size block header @@ -129,113 +102,69 @@ static int pci_vtblk_debug; * Per-device softc */ struct pci_vtblk_softc { - struct pci_devinst *vbsc_pi; + struct virtio_softc vbsc_vs; + struct vqueue_info vbsc_vq; int vbsc_fd; - int vbsc_status; - int vbsc_isr; - int vbsc_lastq; - uint32_t vbsc_features; - uint64_t vbsc_pfn; - struct vring_hqueue vbsc_q; struct vtblk_config vbsc_cfg; - uint16_t msix_table_idx_req; - uint16_t msix_table_idx_cfg; }; -#define vtblk_ctx(sc) ((sc)->vbsc_pi->pi_vmctx) - -/* - * Return the size of IO BAR that maps virtio header and device specific - * region. The size would vary depending on whether MSI-X is enabled or - * not - */ -static uint64_t -pci_vtblk_iosize(struct pci_devinst *pi) -{ - - if (pci_msix_enabled(pi)) - return (VTBLK_REGSZ); - else - return (VTBLK_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX)); -} - -/* - * Return the number of available descriptors in the vring taking care - * of the 16-bit index wraparound. - */ -static int -hq_num_avail(struct vring_hqueue *hq) -{ - uint16_t ndesc; - - /* - * We're just computing (a-b) in GF(216). - * - * The only glitch here is that in standard C, - * uint16_t promotes to (signed) int when int has - * more than 16 bits (pretty much always now), so - * we have to force it back to unsigned. 
- */ - ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx; - - assert(ndesc <= hq->hq_size); - return (ndesc); -} +static void pci_vtblk_reset(void *); +static void pci_vtblk_notify(void *, struct vqueue_info *); +static int pci_vtblk_cfgread(void *, int, int, uint32_t *); +static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtblk_vi_consts = { + "vtblk", /* our name */ + 1, /* we support 1 virtqueue */ + sizeof(struct vtblk_config), /* config reg size */ + pci_vtblk_reset, /* reset */ + pci_vtblk_notify, /* device-wide qnotify */ + pci_vtblk_cfgread, /* read PCI config */ + pci_vtblk_cfgwrite, /* write PCI config */ + VTBLK_S_HOSTCAPS, /* our capabilities */ +}; static void -pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value) +pci_vtblk_reset(void *vsc) { - if (value == 0) { - DPRINTF(("vtblk: device reset requested !\n")); - sc->vbsc_isr = 0; - sc->msix_table_idx_req = VIRTIO_MSI_NO_VECTOR; - sc->msix_table_idx_cfg = VIRTIO_MSI_NO_VECTOR; - sc->vbsc_features = 0; - sc->vbsc_pfn = 0; - sc->vbsc_lastq = 0; - memset(&sc->vbsc_q, 0, sizeof(struct vring_hqueue)); - } + struct pci_vtblk_softc *sc = vsc; - sc->vbsc_status = value; + DPRINTF(("vtblk: device reset requested !\n")); + vi_reset_dev(&sc->vbsc_vs); } static void -pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq) +pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { - struct iovec iov[VTBLK_MAXSEGS]; struct virtio_blk_hdr *vbh; - struct virtio_desc *vd, *vid; - struct virtio_used *vu; uint8_t *status; - int i; + int i, n; int err; int iolen; - int uidx, aidx, didx; - int indirect, writeop, type; + int writeop, type; off_t offset; + struct iovec iov[VTBLK_MAXSEGS + 2]; + uint16_t flags[VTBLK_MAXSEGS + 2]; - uidx = *hq->hq_used_idx; - aidx = hq->hq_cur_aidx; - didx = hq->hq_avail_ring[aidx % hq->hq_size]; - assert(didx >= 0 && didx < hq->hq_size); - - vd = &hq->hq_dtable[didx]; - - indirect = ((vd->vd_flags & 
VRING_DESC_F_INDIRECT) != 0); - - if (indirect) { - vid = paddr_guest2host(vtblk_ctx(sc), vd->vd_addr, vd->vd_len); - vd = &vid[0]; - } + n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags); /* - * The first descriptor will be the read-only fixed header + * The first descriptor will be the read-only fixed header, + * and the last is for status (hence +2 above and below). + * The remaining iov's are the actual data I/O vectors. + * + * XXX - note - this fails on crash dump, which does a + * VIRTIO_BLK_T_FLUSH with a zero transfer length */ - vbh = paddr_guest2host(vtblk_ctx(sc), vd->vd_addr, - sizeof(struct virtio_blk_hdr)); - assert(vd->vd_len == sizeof(struct virtio_blk_hdr)); - assert(vd->vd_flags & VRING_DESC_F_NEXT); - assert((vd->vd_flags & VRING_DESC_F_WRITE) == 0); + assert (n >= 3 && n < VTBLK_MAXSEGS + 2); + + assert((flags[0] & VRING_DESC_F_WRITE) == 0); + assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); + vbh = iov[0].iov_base; + + status = iov[--n].iov_base; + assert(iov[n].iov_len == 1); + assert(flags[n] & VRING_DESC_F_WRITE); /* * XXX @@ -247,120 +176,44 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq) offset = vbh->vbh_sector * DEV_BSIZE; - /* - * Build up the iovec based on the guest's data descriptors - */ - i = iolen = 0; - while (1) { - if (indirect) - vd = &vid[i + 1]; /* skip first indirect desc */ - else - vd = &hq->hq_dtable[vd->vd_next]; - - if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0) - break; - - if (i == VTBLK_MAXSEGS) - break; - + iolen = 0; + for (i = 1; i < n; i++) { /* * - write op implies read-only descriptor, * - read op implies write-only descriptor, * therefore test the inverse of the descriptor bit * to the op. 
*/ - assert(((vd->vd_flags & VRING_DESC_F_WRITE) == 0) == - writeop); - - iov[i].iov_base = paddr_guest2host(vtblk_ctx(sc), - vd->vd_addr, - vd->vd_len); - iov[i].iov_len = vd->vd_len; - iolen += vd->vd_len; - i++; + assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); + iolen += iov[i].iov_len; } - /* Lastly, get the address of the status byte */ - status = paddr_guest2host(vtblk_ctx(sc), vd->vd_addr, 1); - assert(vd->vd_len == 1); - assert((vd->vd_flags & VRING_DESC_F_NEXT) == 0); - assert(vd->vd_flags & VRING_DESC_F_WRITE); - DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", - writeop ? "write" : "read", iolen, i, offset)); + writeop ? "write" : "read", iolen, i - 1, offset)); if (writeop) - err = pwritev(sc->vbsc_fd, iov, i, offset); + err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset); else - err = preadv(sc->vbsc_fd, iov, i, offset); + err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset); *status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK; /* - * Return the single descriptor back to the host + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. 
*/ - vu = &hq->hq_used_ring[uidx % hq->hq_size]; - vu->vu_idx = didx; - vu->vu_tlen = 1; - hq->hq_cur_aidx++; - *hq->hq_used_idx += 1; - - /* - * Generate an interrupt if able - */ - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { - if (use_msix) { - pci_generate_msix(sc->vbsc_pi, sc->msix_table_idx_req); - } else if (sc->vbsc_isr == 0) { - sc->vbsc_isr = 1; - pci_generate_msi(sc->vbsc_pi, 0); - } - } + vq_relchain(vq, 1); } static void -pci_vtblk_qnotify(struct pci_vtblk_softc *sc) +pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { - struct vring_hqueue *hq = &sc->vbsc_q; - int ndescs; + struct pci_vtblk_softc *sc = vsc; - while ((ndescs = hq_num_avail(hq)) != 0) { - /* - * Run through all the entries, placing them into iovecs and - * sending when an end-of-packet is found - */ - pci_vtblk_proc(sc, hq); - } -} - -static void -pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn) -{ - struct vring_hqueue *hq; - - sc->vbsc_pfn = pfn << VRING_PFN; - - /* - * Set up host pointers to the various parts of the - * queue - */ - hq = &sc->vbsc_q; - hq->hq_size = VTBLK_RINGSZ; - - hq->hq_dtable = paddr_guest2host(vtblk_ctx(sc), pfn << VRING_PFN, - vring_size(VTBLK_RINGSZ)); - hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); - hq->hq_avail_idx = hq->hq_avail_flags + 1; - hq->hq_avail_ring = hq->hq_avail_flags + 2; - hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, - VRING_ALIGN); - hq->hq_used_idx = hq->hq_used_flags + 1; - hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); - - /* - * Initialize queue indexes - */ - hq->hq_cur_aidx = 0; + vq_startchains(vq); + while (vq_has_descs(vq)) + pci_vtblk_proc(sc, vq); + vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
*/ } static int @@ -371,6 +224,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) off_t size; int fd; int sectsz; + int use_msix; const char *env_msi; if (opts == NULL) { @@ -412,10 +266,14 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc = malloc(sizeof(struct pci_vtblk_softc)); memset(sc, 0, sizeof(struct pci_vtblk_softc)); - pi->pi_arg = sc; - sc->vbsc_pi = pi; + /* record fd of storage device/file */ sc->vbsc_fd = fd; + /* init virtio softc and virtqueues */ + vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq); + sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ; + /* sc->vbsc_vq.vq_notify = we have no per-queue notify */ + /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / sectsz; sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; @@ -426,206 +284,51 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->vbsc_cfg.vbc_geom_s = 0; sc->vbsc_cfg.vbc_sectors_max = 0; - /* initialize config space */ + /* + * Should we move some of this into virtio.c? Could + * have the device, class, and subdev_0 as fields in + * the virtio constants structure. 
+ */ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK); pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + use_msix = 1; if ((env_msi = getenv("BHYVE_USE_MSI"))) { if (strcasecmp(env_msi, "yes") == 0) use_msix = 0; } - - if (use_msix) { - /* MSI-X Support */ - sc->msix_table_idx_req = VIRTIO_MSI_NO_VECTOR; - sc->msix_table_idx_cfg = VIRTIO_MSI_NO_VECTOR; - - if (pci_emul_add_msixcap(pi, 2, 1)) - return (1); - } else { - /* MSI Support */ - pci_emul_add_msicap(pi, 1); - } - - pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ); - + if (vi_intr_init(&sc->vbsc_vs, 1, use_msix)) + return (1); + vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } -static uint64_t -vtblk_adjust_offset(struct pci_devinst *pi, uint64_t offset) -{ - /* - * Device specific offsets used by guest would change - * based on whether MSI-X capability is enabled or not - */ - if (!pci_msix_enabled(pi)) { - if (offset >= VTCFG_R_MSIX) - return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX)); - } - - return (offset); -} - -static void -pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value) +static int +pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value) { - struct pci_vtblk_softc *sc = pi->pi_arg; - if (use_msix) { - if (baridx == pci_msix_table_bar(pi) || - baridx == pci_msix_pba_bar(pi)) { - pci_emul_msix_twrite(pi, offset, size, value); - return; - } - } - - assert(baridx == 0); - - if (offset + size > pci_vtblk_iosize(pi)) { - DPRINTF(("vtblk_write: 2big, offset %ld size %d\n", - offset, size)); - return; - } - - offset = vtblk_adjust_offset(pi, offset); - - switch (offset) { - case VTCFG_R_GUESTCAP: - assert(size == 4); - sc->vbsc_features = value & VTBLK_S_HOSTCAPS; - break; - case VTCFG_R_PFN: - assert(size == 4); - pci_vtblk_ring_init(sc, value); - break; - case VTCFG_R_QSEL: - assert(size == 2); - sc->vbsc_lastq 
= value; - break; - case VTCFG_R_QNOTIFY: - assert(size == 2); - assert(value == 0); - pci_vtblk_qnotify(sc); - break; - case VTCFG_R_STATUS: - assert(size == 1); - pci_vtblk_update_status(sc, value); - break; - case VTCFG_R_CFGVEC: - assert(size == 2); - sc->msix_table_idx_cfg = value; - break; - case VTCFG_R_QVEC: - assert(size == 2); - sc->msix_table_idx_req = value; - break; - case VTCFG_R_HOSTCAP: - case VTCFG_R_QNUM: - case VTCFG_R_ISR: - case VTBLK_R_CFG ... VTBLK_R_CFG_END: - DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset)); - break; - default: - DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset)); - value = 0; - break; - } + DPRINTF(("vtblk: write to readonly reg %d\n\r", offset)); + return (1); } -uint64_t -pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size) +static int +pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) { - struct pci_vtblk_softc *sc = pi->pi_arg; + struct pci_vtblk_softc *sc = vsc; void *ptr; - uint32_t value; - if (use_msix) { - if (baridx == pci_msix_table_bar(pi) || - baridx == pci_msix_pba_bar(pi)) { - return (pci_emul_msix_tread(pi, offset, size)); - } - } - - assert(baridx == 0); - - if (offset + size > pci_vtblk_iosize(pi)) { - DPRINTF(("vtblk_read: 2big, offset %ld size %d\n", - offset, size)); - return (0); - } - - offset = vtblk_adjust_offset(pi, offset); - - switch (offset) { - case VTCFG_R_HOSTCAP: - assert(size == 4); - value = VTBLK_S_HOSTCAPS; - break; - case VTCFG_R_GUESTCAP: - assert(size == 4); - value = sc->vbsc_features; /* XXX never read ? */ - break; - case VTCFG_R_PFN: - assert(size == 4); - value = sc->vbsc_pfn >> VRING_PFN; - break; - case VTCFG_R_QNUM: - value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0; - break; - case VTCFG_R_QSEL: - assert(size == 2); - value = sc->vbsc_lastq; /* XXX never read ? */ - break; - case VTCFG_R_QNOTIFY: - assert(size == 2); - value = 0; /* XXX never read ? 
*/ - break; - case VTCFG_R_STATUS: - assert(size == 1); - value = sc->vbsc_status; - break; - case VTCFG_R_ISR: - assert(size == 1); - value = sc->vbsc_isr; - sc->vbsc_isr = 0; /* a read clears this flag */ - break; - case VTCFG_R_CFGVEC: - assert(size == 2); - value = sc->msix_table_idx_cfg; - break; - case VTCFG_R_QVEC: - assert(size == 2); - value = sc->msix_table_idx_req; - break; - case VTBLK_R_CFG ... VTBLK_R_CFG_END: - assert(size + offset <= (VTBLK_R_CFG_END + 1)); - ptr = (uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG; - if (size == 1) { - value = *(uint8_t *) ptr; - } else if (size == 2) { - value = *(uint16_t *) ptr; - } else { - value = *(uint32_t *) ptr; - } - break; - default: - DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset)); - value = 0; - break; - } - - return (value); + /* our caller has already verified offset and size */ + ptr = (uint8_t *)&sc->vbsc_cfg + offset; + memcpy(retval, ptr, size); + return (0); } struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, - .pe_barwrite = pci_vtblk_write, - .pe_barread = pci_vtblk_read + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vblk); diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c index 19f9ffe..2939949 100644 --- a/usr.sbin/bhyve/pci_virtio_net.c +++ b/usr.sbin/bhyve/pci_virtio_net.c @@ -59,56 +59,49 @@ __FBSDID("$FreeBSD$"); #define VTNET_MAXSEGS 32 /* - * PCI config-space register offsets + * Host capabilities. Note that we only offer a few of these. 
*/ -#define VTNET_R_CFG0 24 -#define VTNET_R_CFG1 25 -#define VTNET_R_CFG2 26 -#define VTNET_R_CFG3 27 -#define VTNET_R_CFG4 28 -#define VTNET_R_CFG5 29 -#define VTNET_R_CFG6 30 -#define VTNET_R_CFG7 31 -#define VTNET_R_MAX 31 - -#define VTNET_REGSZ VTNET_R_MAX+1 +#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */ +#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */ +#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */ +#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */ +#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */ +#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */ +#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */ +#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */ +#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */ +#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */ +#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */ +#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */ +#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */ +#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */ +#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */ +#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */ +#define VIRTIO_NET_F_GUEST_ANNOUNCE \ + (1 << 21) /* guest can send gratuitous pkts */ -/* - * Host capabilities - */ #define VTNET_S_HOSTCAPS \ - ( 0x00000020 | /* host supplies MAC */ \ - 0x00008000 | /* host can merge Rx buffers */ \ - 0x00010000 | /* config status available */ \ + ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ VIRTIO_F_NOTIFY_ON_EMPTY) /* + * PCI config-space "registers" + */ +struct virtio_net_config { + uint8_t mac[6]; + uint16_t status; +} __packed; + +/* * Queue definitions. 
*/ #define VTNET_RXQ 0 #define VTNET_TXQ 1 -#define VTNET_CTLQ 2 +#define VTNET_CTLQ 2 /* NB: not yet supported */ #define VTNET_MAXQ 3 -static int use_msix = 1; - -struct vring_hqueue { - /* Internal state */ - uint16_t hq_size; - uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */ - - /* Host-context pointers to the queue */ - struct virtio_desc *hq_dtable; - uint16_t *hq_avail_flags; - uint16_t *hq_avail_idx; /* monotonically increasing */ - uint16_t *hq_avail_ring; - - uint16_t *hq_used_flags; - uint16_t *hq_used_idx; /* monotonically increasing */ - struct virtio_used *hq_used_ring; -}; - /* * Fixed network header size */ @@ -133,23 +126,17 @@ static int pci_vtnet_debug; * Per-device softc */ struct pci_vtnet_softc { - struct pci_devinst *vsc_pi; + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTNET_MAXQ - 1]; pthread_mutex_t vsc_mtx; struct mevent *vsc_mevp; - int vsc_curq; - int vsc_status; - int vsc_isr; int vsc_tapfd; int vsc_rx_ready; - int resetting; + volatile int resetting; /* set and checked outside lock */ uint32_t vsc_features; - uint8_t vsc_macaddr[6]; - - uint64_t vsc_pfn[VTNET_MAXQ]; - struct vring_hqueue vsc_hq[VTNET_MAXQ]; - uint16_t vsc_msix_table_idx[VTNET_MAXQ]; + struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; int rx_in_progress; @@ -159,73 +146,22 @@ struct pci_vtnet_softc { pthread_cond_t tx_cond; int tx_in_progress; }; -#define vtnet_ctx(sc) ((sc)->vsc_pi->pi_vmctx) -#define notify_on_empty(sc) ((sc)->vsc_features & VIRTIO_F_NOTIFY_ON_EMPTY) - -/* - * Return the size of IO BAR that maps virtio header and device specific - * region. The size would vary depending on whether MSI-X is enabled or - * not. - */ -static uint64_t -pci_vtnet_iosize(struct pci_devinst *pi) -{ - if (pci_msix_enabled(pi)) - return (VTNET_REGSZ); - else - return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX)); -} - -/* - * Return the number of available descriptors in the vring taking care - * of the 16-bit index wraparound. 
- */ -static int -hq_num_avail(struct vring_hqueue *hq) -{ - uint16_t ndesc; - - /* - * We're just computing (a-b) mod 2^16 - * - * The only glitch here is that in standard C, - * uint16_t promotes to (signed) int when int has - * more than 16 bits (pretty much always now), so - * we have to force it back to unsigned. - */ - ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx; - - assert(ndesc <= hq->hq_size); - - return (ndesc); -} - -static uint16_t -pci_vtnet_qsize(int qnum) -{ - /* XXX no ctl queue currently */ - if (qnum == VTNET_CTLQ) { - return (0); - } - - /* XXX fixed currently. Maybe different for tx/rx/ctl */ - return (VTNET_RINGSZ); -} - -static void -pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring) -{ - struct vring_hqueue *hq; - - assert(ring < VTNET_MAXQ); - - hq = &sc->vsc_hq[ring]; - /* - * Reset all soft state - */ - hq->hq_cur_aidx = 0; -} +static void pci_vtnet_reset(void *); +/* static void pci_vtnet_notify(void *, struct vqueue_info *); */ +static int pci_vtnet_cfgread(void *, int, int, uint32_t *); +static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); + +static struct virtio_consts vtnet_vi_consts = { + "vtnet", /* our name */ + VTNET_MAXQ - 1, /* we currently support 2 virtqueues */ + sizeof(struct virtio_net_config), /* config reg size */ + pci_vtnet_reset, /* reset */ + NULL, /* device-wide qnotify -- not used */ + pci_vtnet_cfgread, /* read PCI config */ + pci_vtnet_cfgwrite, /* write PCI config */ + VTNET_S_HOSTCAPS, /* our capabilities */ +}; /* * If the transmit thread is active then stall until it is done. @@ -260,48 +196,27 @@ pci_vtnet_rxwait(struct pci_vtnet_softc *sc) } static void -pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value) +pci_vtnet_reset(void *vsc) { - int i; - - if (value == 0) { - DPRINTF(("vtnet: device reset requested !\n")); - - sc->resetting = 1; + struct pci_vtnet_softc *sc = vsc; - /* - * Wait for the transmit and receive threads to finish their - * processing. 
- */ - pci_vtnet_txwait(sc); - pci_vtnet_rxwait(sc); - - sc->vsc_rx_ready = 0; - pci_vtnet_ring_reset(sc, VTNET_RXQ); - pci_vtnet_ring_reset(sc, VTNET_TXQ); + DPRINTF(("vtnet: device reset requested !\n")); - for (i = 0; i < VTNET_MAXQ; i++) - sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; + sc->resetting = 1; - sc->vsc_isr = 0; - sc->vsc_features = 0; - - sc->resetting = 0; - } + /* + * Wait for the transmit and receive threads to finish their + * processing. + */ + pci_vtnet_txwait(sc); + pci_vtnet_rxwait(sc); - sc->vsc_status = value; -} + sc->vsc_rx_ready = 0; -static void -vtnet_generate_interrupt(struct pci_vtnet_softc *sc, int qidx) -{ + /* now reset rings, MSI-X vectors, and negotiated capabilities */ + vi_reset_dev(&sc->vsc_vs); - if (use_msix) { - pci_generate_msix(sc->vsc_pi, sc->vsc_msix_table_idx[qidx]); - } else { - sc->vsc_isr |= 1; - pci_generate_msi(sc->vsc_pi, 0); - } + sc->resetting = 0; } /* @@ -311,7 +226,7 @@ static void pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) { - char pad[60]; + static char pad[60]; /* all zero bytes */ if (sc->vsc_tapfd == -1) return; @@ -322,7 +237,6 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, * there is always an extra iov available by the caller. 
*/ if (len < 60) { - memset(pad, 0, 60 - len); iov[iovcnt].iov_base = pad; iov[iovcnt].iov_len = 60 - len; iovcnt++; @@ -342,15 +256,11 @@ static uint8_t dummybuf[2048]; static void pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) { - struct virtio_desc *vd; - struct virtio_used *vu; - struct vring_hqueue *hq; + struct vqueue_info *vq; struct virtio_net_rxhdr *vrx; uint8_t *buf; - int i; int len; - int ndescs; - int didx, uidx, aidx; /* descriptor, avail and used index */ + struct iovec iov; /* * Should never be called without a valid tap fd @@ -370,47 +280,45 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) } /* - * Calculate the number of available rx buffers + * Check for available rx buffers */ - hq = &sc->vsc_hq[VTNET_RXQ]; - - ndescs = hq_num_avail(hq); - - if (ndescs == 0) { + vq = &sc->vsc_queues[VTNET_RXQ]; + vq_startchains(vq); + if (!vq_has_descs(vq)) { /* - * Drop the packet and try later + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. */ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf)); - - if (notify_on_empty(sc)) - vtnet_generate_interrupt(sc, VTNET_RXQ); - + vq_endchains(vq, 1); return; } - aidx = hq->hq_cur_aidx; - uidx = *hq->hq_used_idx; - for (i = 0; i < ndescs; i++) { + do { /* - * 'aidx' indexes into the an array of descriptor indexes + * Get descriptor chain, which should have just + * one descriptor in it. + * ??? allow guests to use multiple descs? */ - didx = hq->hq_avail_ring[aidx % hq->hq_size]; - assert(didx >= 0 && didx < hq->hq_size); - - vd = &hq->hq_dtable[didx]; + assert(vq_getchain(vq, &iov, 1, NULL) == 1); /* * Get a pointer to the rx header, and use the * data immediately following it for the packet buffer. 
*/ - vrx = paddr_guest2host(vtnet_ctx(sc), vd->vd_addr, vd->vd_len); + vrx = iov.iov_base; buf = (uint8_t *)(vrx + 1); len = read(sc->vsc_tapfd, buf, - vd->vd_len - sizeof(struct virtio_net_rxhdr)); + iov.iov_len - sizeof(struct virtio_net_rxhdr)); if (len < 0 && errno == EWOULDBLOCK) { - break; + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_endchains(vq, 0); + return; } /* @@ -422,23 +330,13 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) vrx->vrh_bufs = 1; /* - * Write this descriptor into the used ring + * Release this chain and handle more chains. */ - vu = &hq->hq_used_ring[uidx % hq->hq_size]; - vu->vu_idx = didx; - vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr); - uidx++; - aidx++; - } - - /* - * Update the used pointer, and signal an interrupt if allowed - */ - *hq->hq_used_idx = uidx; - hq->hq_cur_aidx = aidx; + vq_relchain(vq, len + sizeof(struct virtio_net_rxhdr)); + } while (vq_has_descs(vq)); - if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) - vtnet_generate_interrupt(sc, VTNET_RXQ); + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. 
*/ + vq_endchains(vq, 1); } static void @@ -455,8 +353,10 @@ pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) } static void -pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc) +pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) { + struct pci_vtnet_softc *sc = vsc; + /* * A qnotify means that the rx process can now begin */ @@ -466,71 +366,42 @@ pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc) } static void -pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq) +pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) { struct iovec iov[VTNET_MAXSEGS + 1]; - struct virtio_desc *vd; - struct virtio_used *vu; - int i; - int plen; - int tlen; - int uidx, aidx, didx; - - uidx = *hq->hq_used_idx; - aidx = hq->hq_cur_aidx; - didx = hq->hq_avail_ring[aidx % hq->hq_size]; - assert(didx >= 0 && didx < hq->hq_size); - - vd = &hq->hq_dtable[didx]; + int i, n; + int plen, tlen; /* - * Run through the chain of descriptors, ignoring the - * first header descriptor. However, include the header - * length in the total length that will be put into the - * used queue. + * Obtain chain of descriptors. The first one is + * really the header descriptor, so we need to sum + * up two lengths: packet length and transfer length. 
*/ - tlen = vd->vd_len; - vd = &hq->hq_dtable[vd->vd_next]; - - for (i = 0, plen = 0; - i < VTNET_MAXSEGS; - i++, vd = &hq->hq_dtable[vd->vd_next]) { - iov[i].iov_base = paddr_guest2host(vtnet_ctx(sc), - vd->vd_addr, vd->vd_len); - iov[i].iov_len = vd->vd_len; - plen += vd->vd_len; - tlen += vd->vd_len; - - if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0) - break; + n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + plen = 0; + tlen = iov[0].iov_len; + for (i = 1; i < n; i++) { + plen += iov[i].iov_len; + tlen += iov[i].iov_len; } - assert(i < VTNET_MAXSEGS); - DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1)); - pci_vtnet_tap_tx(sc, iov, i + 1, plen); + DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); + pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); - /* - * Return this chain back to the host - */ - vu = &hq->hq_used_ring[uidx % hq->hq_size]; - vu->vu_idx = didx; - vu->vu_tlen = tlen; - hq->hq_cur_aidx = aidx + 1; - *hq->hq_used_idx = uidx + 1; + /* chain is processed, release it and set tlen */ + vq_relchain(vq, tlen); } static void -pci_vtnet_ping_txq(struct pci_vtnet_softc *sc) +pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) { - struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ]; - int ndescs; + struct pci_vtnet_softc *sc = vsc; /* - * Calculate number of ring entries to process + * Any ring entries to process? 
*/ - ndescs = hq_num_avail(hq); - - if (ndescs == 0) + if (!vq_has_descs(vq)) return; /* Signal the tx thread for processing */ @@ -546,97 +417,65 @@ pci_vtnet_ping_txq(struct pci_vtnet_softc *sc) static void * pci_vtnet_tx_thread(void *param) { - struct pci_vtnet_softc *sc = (struct pci_vtnet_softc *) param; - struct vring_hqueue *hq; - int i, ndescs, error; - - hq = &sc->vsc_hq[VTNET_TXQ]; - - /* - * Let us wait till the tx queue pointers get initialised & - * first tx signaled + struct pci_vtnet_softc *sc = param; + struct vqueue_info *vq; + int have_work, error; + + vq = &sc->vsc_queues[VTNET_TXQ]; + + /* + * Let us wait till the tx queue pointers get initialised & + * first tx signaled */ pthread_mutex_lock(&sc->tx_mtx); error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); assert(error == 0); - + for (;;) { - pthread_mutex_lock(&sc->tx_mtx); - for (;;) { + /* note - tx mutex is locked here */ + do { if (sc->resetting) - ndescs = 0; + have_work = 0; else - ndescs = hq_num_avail(hq); - - if (ndescs != 0) - break; - - sc->tx_in_progress = 0; - error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); - assert(error == 0); - } + have_work = vq_has_descs(vq); + + if (!have_work) { + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, + &sc->tx_mtx); + assert(error == 0); + } + } while (!have_work); sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); - while (ndescs > 0) { + vq_startchains(vq); + do { /* - * Run through all the entries, placing them into - * iovecs and sending when an end-of-packet is found + * Run through entries, placing them into + * iovecs and sending when an end-of-packet + * is found */ - for (i = 0; i < ndescs; i++) - pci_vtnet_proctx(sc, hq); - - ndescs = hq_num_avail(hq); - } + pci_vtnet_proctx(sc, vq); + } while (vq_has_descs(vq)); /* * Generate an interrupt if needed. 
*/ - if (notify_on_empty(sc) || - (*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) - vtnet_generate_interrupt(sc, VTNET_TXQ); - } -} + vq_endchains(vq, 1); -static void -pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc) -{ - - DPRINTF(("vtnet: control qnotify!\n\r")); + pthread_mutex_lock(&sc->tx_mtx); + } } +#ifdef notyet static void -pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn) +pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { - struct vring_hqueue *hq; - int qnum = sc->vsc_curq; - - assert(qnum < VTNET_MAXQ); - - sc->vsc_pfn[qnum] = pfn << VRING_PFN; - - /* - * Set up host pointers to the various parts of the - * queue - */ - hq = &sc->vsc_hq[qnum]; - hq->hq_size = pci_vtnet_qsize(qnum); - - hq->hq_dtable = paddr_guest2host(vtnet_ctx(sc), pfn << VRING_PFN, - vring_size(hq->hq_size)); - hq->hq_avail_flags = (uint16_t *)(hq->hq_dtable + hq->hq_size); - hq->hq_avail_idx = hq->hq_avail_flags + 1; - hq->hq_avail_ring = hq->hq_avail_flags + 2; - hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring, - VRING_ALIGN); - hq->hq_used_idx = hq->hq_used_flags + 1; - hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2); - /* - * Initialize queue indexes - */ - hq->hq_cur_aidx = 0; + DPRINTF(("vtnet: control qnotify!\n\r")); } +#endif static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) @@ -674,18 +513,27 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) char *devname; char *vtopts; int mac_provided; + int use_msix; sc = malloc(sizeof(struct pci_vtnet_softc)); memset(sc, 0, sizeof(struct pci_vtnet_softc)); - pi->pi_arg = sc; - sc->vsc_pi = pi; - pthread_mutex_init(&sc->vsc_mtx, NULL); + + vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; + sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; 
+#ifdef notyet + sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; + sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; +#endif /* * Use MSI if set by user */ + use_msix = 1; if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) { if (strcasecmp(env_msi, "yes") == 0) use_msix = 0; @@ -705,7 +553,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) (void) strsep(&vtopts, ","); if (vtopts != NULL) { - err = pci_vtnet_parsemac(vtopts, sc->vsc_macaddr); + err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac); if (err != 0) { free(devname); return (err); @@ -757,12 +605,12 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) MD5Update(&mdctx, nstr, strlen(nstr)); MD5Final(digest, &mdctx); - sc->vsc_macaddr[0] = 0x00; - sc->vsc_macaddr[1] = 0xa0; - sc->vsc_macaddr[2] = 0x98; - sc->vsc_macaddr[3] = digest[0]; - sc->vsc_macaddr[4] = digest[1]; - sc->vsc_macaddr[5] = digest[2]; + sc->vsc_config.mac[0] = 0x00; + sc->vsc_config.mac[1] = 0xa0; + sc->vsc_config.mac[2] = 0x98; + sc->vsc_config.mac[3] = digest[0]; + sc->vsc_config.mac[4] = digest[1]; + sc->vsc_config.mac[5] = digest[2]; } /* initialize config space */ @@ -770,25 +618,16 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); - - if (use_msix) { - /* MSI-X support */ - int i; - - for (i = 0; i < VTNET_MAXQ; i++) - sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR; - /* - * BAR 1 used to map MSI-X table and PBA - */ - if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1)) - return (1); - } else { - /* MSI support */ - pci_emul_add_msicap(pi, 1); - } + /* link always up */ + sc->vsc_config.status = 1; - pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ); + /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ + if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) + return (1); + + /* use BAR 0 to map config regs 
in IO space */ + vi_set_io_bar(&sc->vsc_vs, 0); sc->resetting = 0; @@ -796,7 +635,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->rx_mtx, NULL); /* - * Initialize tx semaphore & spawn TX processing thread + * Initialize tx semaphore & spawn TX processing thread. * As of now, only one thread for TX desc processing is * spawned. */ @@ -810,234 +649,41 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (0); } -/* - * Function pointer array to handle queue notifications - */ -static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = { - pci_vtnet_ping_rxq, - pci_vtnet_ping_txq, - pci_vtnet_ping_ctlq -}; - -static uint64_t -vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset) -{ - /* - * Device specific offsets used by guest would change based on - * whether MSI-X capability is enabled or not - */ - if (!pci_msix_enabled(pi)) { - if (offset >= VTCFG_R_MSIX) - return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX)); - } - - return (offset); -} - -static void -pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value) +static int +pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) { - struct pci_vtnet_softc *sc = pi->pi_arg; + struct pci_vtnet_softc *sc = vsc; void *ptr; - if (use_msix) { - if (baridx == pci_msix_table_bar(pi) || - baridx == pci_msix_pba_bar(pi)) { - pci_emul_msix_twrite(pi, offset, size, value); - return; - } - } - - assert(baridx == 0); - - if (offset + size > pci_vtnet_iosize(pi)) { - DPRINTF(("vtnet_write: 2big, offset %ld size %d\n", - offset, size)); - return; - } - - pthread_mutex_lock(&sc->vsc_mtx); - - offset = vtnet_adjust_offset(pi, offset); - - switch (offset) { - case VTCFG_R_GUESTCAP: - assert(size == 4); - sc->vsc_features = value & VTNET_S_HOSTCAPS; - break; - case VTCFG_R_PFN: - assert(size == 4); - pci_vtnet_ring_init(sc, value); - break; - case VTCFG_R_QSEL: - 
assert(size == 2); - assert(value < VTNET_MAXQ); - sc->vsc_curq = value; - break; - case VTCFG_R_QNOTIFY: - assert(size == 2); - assert(value < VTNET_MAXQ); - (*pci_vtnet_qnotify[value])(sc); - break; - case VTCFG_R_STATUS: - assert(size == 1); - pci_vtnet_update_status(sc, value); - break; - case VTCFG_R_CFGVEC: - assert(size == 2); - sc->vsc_msix_table_idx[VTNET_CTLQ] = value; - break; - case VTCFG_R_QVEC: - assert(size == 2); - assert(sc->vsc_curq != VTNET_CTLQ); - sc->vsc_msix_table_idx[sc->vsc_curq] = value; - break; - case VTNET_R_CFG0: - case VTNET_R_CFG1: - case VTNET_R_CFG2: - case VTNET_R_CFG3: - case VTNET_R_CFG4: - case VTNET_R_CFG5: - assert((size + offset) <= (VTNET_R_CFG5 + 1)); - ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0]; + if (offset < 6) { + assert(offset + size <= 6); /* * The driver is allowed to change the MAC address */ - sc->vsc_macaddr[offset - VTNET_R_CFG0] = value; - if (size == 1) { - *(uint8_t *) ptr = value; - } else if (size == 2) { - *(uint16_t *) ptr = value; - } else { - *(uint32_t *) ptr = value; - } - break; - case VTCFG_R_HOSTCAP: - case VTCFG_R_QNUM: - case VTCFG_R_ISR: - case VTNET_R_CFG6: - case VTNET_R_CFG7: - DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset)); - break; - default: - DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset)); - value = 0; - break; + ptr = &sc->vsc_config.mac[offset]; + memcpy(ptr, &value, size); + } else { + DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); + return (1); } - - pthread_mutex_unlock(&sc->vsc_mtx); + return (0); } -uint64_t -pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size) +static int +pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) { - struct pci_vtnet_softc *sc = pi->pi_arg; + struct pci_vtnet_softc *sc = vsc; void *ptr; - uint64_t value; - - if (use_msix) { - if (baridx == pci_msix_table_bar(pi) || - baridx == pci_msix_pba_bar(pi)) { - return (pci_emul_msix_tread(pi, offset, 
size)); - } - } - assert(baridx == 0); - - if (offset + size > pci_vtnet_iosize(pi)) { - DPRINTF(("vtnet_read: 2big, offset %ld size %d\n", - offset, size)); - return (0); - } - - pthread_mutex_lock(&sc->vsc_mtx); - - offset = vtnet_adjust_offset(pi, offset); - - switch (offset) { - case VTCFG_R_HOSTCAP: - assert(size == 4); - value = VTNET_S_HOSTCAPS; - break; - case VTCFG_R_GUESTCAP: - assert(size == 4); - value = sc->vsc_features; /* XXX never read ? */ - break; - case VTCFG_R_PFN: - assert(size == 4); - value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN; - break; - case VTCFG_R_QNUM: - assert(size == 2); - value = pci_vtnet_qsize(sc->vsc_curq); - break; - case VTCFG_R_QSEL: - assert(size == 2); - value = sc->vsc_curq; /* XXX never read ? */ - break; - case VTCFG_R_QNOTIFY: - assert(size == 2); - value = sc->vsc_curq; /* XXX never read ? */ - break; - case VTCFG_R_STATUS: - assert(size == 1); - value = sc->vsc_status; - break; - case VTCFG_R_ISR: - assert(size == 1); - value = sc->vsc_isr; - sc->vsc_isr = 0; /* a read clears this flag */ - break; - case VTCFG_R_CFGVEC: - assert(size == 2); - value = sc->vsc_msix_table_idx[VTNET_CTLQ]; - break; - case VTCFG_R_QVEC: - assert(size == 2); - assert(sc->vsc_curq != VTNET_CTLQ); - value = sc->vsc_msix_table_idx[sc->vsc_curq]; - break; - case VTNET_R_CFG0: - case VTNET_R_CFG1: - case VTNET_R_CFG2: - case VTNET_R_CFG3: - case VTNET_R_CFG4: - case VTNET_R_CFG5: - assert((size + offset) <= (VTNET_R_CFG5 + 1)); - ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0]; - if (size == 1) { - value = *(uint8_t *) ptr; - } else if (size == 2) { - value = *(uint16_t *) ptr; - } else { - value = *(uint32_t *) ptr; - } - break; - case VTNET_R_CFG6: - assert(size != 4); - value = 0x01; /* XXX link always up */ - break; - case VTNET_R_CFG7: - assert(size == 1); - value = 0; /* XXX link status in LSB */ - break; - default: - DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset)); - value = 0; - break; - } - - 
pthread_mutex_unlock(&sc->vsc_mtx); - - return (value); + ptr = (uint8_t *)&sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); } struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, - .pe_barwrite = pci_vtnet_write, - .pe_barread = pci_vtnet_read + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read }; PCI_EMUL_SET(pci_de_vnet); diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c new file mode 100644 index 0000000..cdc9228 --- /dev/null +++ b/usr.sbin/bhyve/virtio.c @@ -0,0 +1,745 @@ +/*- + * Copyright (c) 2013 Chris Torek <torek @ torek net> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/uio.h> + +#include <stdio.h> +#include <stdint.h> +#include <pthread.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +/* + * Functions for dealing with generalized "virtual devices" as + * defined by <https://www.google.com/#output=search&q=virtio+spec> + */ + +/* + * In case we decide to relax the "virtio softc comes at the + * front of virtio-based device softc" constraint, let's use + * this to convert. + */ +#define DEV_SOFTC(vs) ((void *)(vs)) + +/* + * Link a virtio_softc to its constants, the device softc, and + * the PCI emulation. + */ +void +vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues) +{ + int i; + + /* vs and dev_softc addresses must match */ + assert((void *)vs == dev_softc); + vs->vs_vc = vc; + vs->vs_pi = pi; + pi->pi_arg = vs; + + vs->vs_queues = queues; + for (i = 0; i < vc->vc_nvq; i++) { + queues[i].vq_vs = vs; + queues[i].vq_num = i; + } +} + +/* + * Reset device (device-wide). This erases all queues, i.e., + * all the queues become invalid (though we don't wipe out the + * internal pointers, we just clear the VQ_ALLOC flag). + * + * It resets negotiated features to "none". + * + * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR. + */ +void +vi_reset_dev(struct virtio_softc *vs) +{ + struct vqueue_info *vq; + int i, nvq; + + nvq = vs->vs_vc->vc_nvq; + for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { + vq->vq_flags = 0; + vq->vq_last_avail = 0; + vq->vq_pfn = 0; + vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; + } + vs->vs_negotiated_caps = 0; + vs->vs_curq = 0; + /* vs->vs_status = 0; -- redundant */ + vs->vs_isr = 0; + vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR; +} + +/* + * Set I/O BAR (usually 0) to map PCI config registers. 
+ */ +void +vi_set_io_bar(struct virtio_softc *vs, int barnum) +{ + size_t size; + + /* + * ??? should we use CFG0 if MSI-X is disabled? + * Existing code did not... + */ + size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize; + pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size); +} + +/* + * Initialize MSI-X vector capabilities if we're to use MSI-X, + * or MSI capabilities if not. + * + * We assume we want one MSI-X vector per queue, here, plus one + * for the config vec. + */ +int +vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) +{ + int nvec; + + if (use_msix) { + vs->vs_flags |= VIRTIO_USE_MSIX; + vi_reset_dev(vs); /* set all vectors to NO_VECTOR */ + nvec = vs->vs_vc->vc_nvq + 1; + if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum)) + return (1); + } else { + vs->vs_flags &= ~VIRTIO_USE_MSIX; + pci_emul_add_msicap(vs->vs_pi, barnum); + } + return (0); +} + +/* + * Initialize the currently-selected virtio queue (vs->vs_curq). + * The guest just gave us a page frame number, from which we can + * calculate the addresses of the queue. + */ +void +vi_vq_init(struct virtio_softc *vs, uint32_t pfn) +{ + struct vqueue_info *vq; + uint64_t phys; + size_t size; + char *base; + + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_pfn = pfn; + phys = pfn << VRING_PFN; + size = vring_size(vq->vq_qsize); + base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size); + + /* First page(s) are descriptors... */ + vq->vq_desc = (struct virtio_desc *)base; + base += vq->vq_qsize * sizeof(struct virtio_desc); + + /* ... immediately followed by "avail" ring (entirely uint16_t's) */ + vq->vq_avail = (struct vring_avail *)base; + base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t); + + /* Then it's rounded up to the next page... */ + base = (char *)roundup2((uintptr_t)base, VRING_ALIGN); + + /* ... and the last page(s) are the used ring. */ + vq->vq_used = (struct vring_used *)base; + + /* Mark queue as allocated, and start at 0 when we use it. 
*/ + vq->vq_flags = VQ_ALLOC; + vq->vq_last_avail = 0; +} + +/* + * Helper inline for vq_getchain(): record the i'th "real" + * descriptor. + */ +static inline void +_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, + struct iovec *iov, int n_iov, uint16_t *flags) { + + if (i >= n_iov) + return; + iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len); + iov[i].iov_len = vd->vd_len; + if (flags != NULL) + flags[i] = vd->vd_flags; +} +#define VQ_MAX_DESCRIPTORS 512 /* see below */ + +/* + * Examine the chain of descriptors starting at the "next one" to + * make sure that they describe a sensible request. If so, return + * the number of "real" descriptors that would be needed/used in + * acting on this request. This may be smaller than the number of + * available descriptors, e.g., if there are two available but + * they are two separate requests, this just returns 1. Or, it + * may be larger: if there are indirect descriptors involved, + * there may only be one descriptor available but it may be an + * indirect pointing to eight more. We return 8 in this case, + * i.e., we do not count the indirect descriptors, only the "real" + * ones. + * + * Basically, this vets the vd_flags and vd_next field of each + * descriptor and tells you how many are involved. Since some may + * be indirect, this also needs the vmctx (in the pci_devinst + * at vs->vs_pi) so that it can find indirect descriptors. + * + * As we process each descriptor, we copy and adjust it (guest to + * host address wise, also using the vmtctx) into the given iov[] + * array (of the given size). If the array overflows, we stop + * placing values into the array but keep processing descriptors, + * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1. 
+ * So you, the caller, must not assume that iov[] is as big as the + * return value (you can process the same thing twice to allocate + * a larger iov array if needed, or supply a zero length to find + * out how much space is needed). + * + * If you want to verify the WRITE flag on each descriptor, pass a + * non-NULL "flags" pointer to an array of "uint16_t" of the same size + * as n_iov and we'll copy each vd_flags field after unwinding any + * indirects. + * + * If some descriptor(s) are invalid, this prints a diagnostic message + * and returns -1. If no descriptors are ready now it simply returns 0. + * + * You are assumed to have done a vq_ring_ready() if needed (note + * that vq_has_descs() does one). + */ +int +vq_getchain(struct vqueue_info *vq, + struct iovec *iov, int n_iov, uint16_t *flags) +{ + int i; + u_int ndesc, n_indir; + u_int idx, head, next; + volatile struct virtio_desc *vdir, *vindir, *vp; + struct vmctx *ctx; + struct virtio_softc *vs; + const char *name; + + vs = vq->vq_vs; + name = vs->vs_vc->vc_name; + + /* + * Note: it's the responsibility of the guest not to + * update vq->vq_avail->va_idx until all of the descriptors + * the guest has written are valid (including all their + * vd_next fields and vd_flags). + * + * Compute (last_avail - va_idx) in integers mod 2**16. This is + * the number of descriptors the device has made available + * since the last time we updated vq->vq_last_avail. + * + * We just need to do the subtraction as an unsigned int, + * then trim off excess bits. + */ + idx = vq->vq_last_avail; + ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx); + if (ndesc == 0) + return (0); + if (ndesc > vq->vq_qsize) { + /* XXX need better way to diagnose issues */ + fprintf(stderr, + "%s: ndesc (%u) out of range, driver confused?\r\n", + name, (u_int)ndesc); + return (-1); + } + + /* + * Now count/parse "involved" descriptors starting from + * the head of the chain. 
+ * + * To prevent loops, we could be more complicated and + * check whether we're re-visiting a previously visited + * index, but we just abort if the count gets excessive. + */ + ctx = vs->vs_pi->pi_vmctx; + head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + next = head; + for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { + if (next >= vq->vq_qsize) { + fprintf(stderr, + "%s: descriptor index %u out of range, " + "driver confused?\r\n", + name, next); + return (-1); + } + vdir = &vq->vq_desc[next]; + if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { + _vq_record(i, vdir, ctx, iov, n_iov, flags); + i++; + } else if ((vs->vs_negotiated_caps & + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + fprintf(stderr, + "%s: descriptor has forbidden INDIRECT flag, " + "driver confused?\r\n", + name); + return (-1); + } else { + n_indir = vdir->vd_len / 16; + if ((vdir->vd_len & 0xf) || n_indir == 0) { + fprintf(stderr, + "%s: invalid indir len 0x%x, " + "driver confused?\r\n", + name, (u_int)vdir->vd_len); + return (-1); + } + vindir = paddr_guest2host(ctx, + vdir->vd_addr, vdir->vd_len); + /* + * Indirects start at the 0th, then follow + * their own embedded "next"s until those run + * out. Each one's indirect flag must be off + * (we don't really have to check, could just + * ignore errors...). + */ + next = 0; + for (;;) { + vp = &vindir[next]; + if (vp->vd_flags & VRING_DESC_F_INDIRECT) { + fprintf(stderr, + "%s: indirect desc has INDIR flag," + " driver confused?\r\n", + name); + return (-1); + } + _vq_record(i, vp, ctx, iov, n_iov, flags); + if (++i > VQ_MAX_DESCRIPTORS) + goto loopy; + if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0) + break; + next = vp->vd_next; + if (next >= n_indir) { + fprintf(stderr, + "%s: invalid next %u > %u, " + "driver confused?\r\n", + name, (u_int)next, n_indir); + return (-1); + } + } + } + if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) + return (i); + } +loopy: + fprintf(stderr, + "%s: descriptor loop? 
count > %d - driver confused?\r\n",
+	    name, i);
+	return (-1);
+}
+
+/*
+ * Return the currently-first request chain to the guest, setting
+ * its I/O length to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint32_t iolen)
+{
+	uint16_t head, uidx, mask;
+	volatile struct vring_used *vuh;
+	volatile struct virtio_used *vue;
+
+	/*
+	 * Notes:
+	 *  - mask is N-1 where N is a power of 2 so computes x % N
+	 *  - vuh points to the "used" data shared with guest
+	 *  - vue points to the "used" ring entry we want to update
+	 *  - head is the same value we compute in vq_getchain().
+	 *
+	 * (I apologize for the two fields named vu_idx; the
+	 * virtio spec calls the one that vue points to, "id"...)
+	 */
+	mask = vq->vq_qsize - 1;
+	vuh = vq->vq_used;
+	head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask];
+
+	uidx = vuh->vu_idx;
+	vue = &vuh->vu_ring[uidx++ & mask];
+	vue->vu_idx = head; /* ie, vue->id = head */
+	vue->vu_tlen = iolen;
+	vuh->vu_idx = uidx;
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one.  If driver used all the available
+ * chains, used_all should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt.  Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when he decided to finish interrupt
+ * processing -- it's possible that descriptors became available after
+ * that point.  (It's also typically a constant 1/True as well.)
+ */ +void +vq_endchains(struct vqueue_info *vq, int used_all_avail) +{ + struct virtio_softc *vs; + uint16_t event_idx, new_idx, old_idx; + int intr; + + /* + * Interrupt generation: if we're using EVENT_IDX, + * interrupt if we've crossed the event threshold. + * Otherwise interrupt is generated if we added "used" entries, + * but suppressed by VRING_AVAIL_F_NO_INTERRUPT. + * + * In any case, though, if NOTIFY_ON_EMPTY is set and the + * entire avail was processed, we need to interrupt always. + */ + vs = vq->vq_vs; + new_idx = vq->vq_used->vu_idx; + old_idx = vq->vq_save_used; + if (used_all_avail && + (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) + intr = 1; + else if (vs->vs_flags & VIRTIO_EVENT_IDX) { + event_idx = VQ_USED_EVENT_IDX(vq); + /* + * This calculation is per docs and the kernel + * (see src/sys/dev/virtio/virtio_ring.h). + */ + intr = (uint16_t)(new_idx - event_idx - 1) < + (uint16_t)(new_idx - old_idx); + } else { + intr = new_idx != old_idx && + !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT); + } + if (intr) + vq_interrupt(vs, vq); +} + +/* Note: these are in sorted order to make for a fast search */ +static struct config_reg { + uint16_t cr_offset; /* register offset */ + uint8_t cr_size; /* size (bytes) */ + uint8_t cr_ro; /* true => reg is read only */ + const char *cr_name; /* name of reg */ +} config_regs[] = { + { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" }, + { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" }, + { VTCFG_R_PFN, 4, 0, "PFN" }, + { VTCFG_R_QNUM, 2, 1, "QNUM" }, + { VTCFG_R_QSEL, 2, 0, "QSEL" }, + { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" }, + { VTCFG_R_STATUS, 1, 0, "STATUS" }, + { VTCFG_R_ISR, 1, 0, "ISR" }, + { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" }, + { VTCFG_R_QVEC, 2, 0, "QVEC" }, +}; + +static inline struct config_reg * +vi_find_cr(int offset) { + u_int hi, lo, mid; + struct config_reg *cr; + + lo = 0; + hi = sizeof(config_regs) / sizeof(*config_regs) - 1; + while (hi >= lo) { + mid = (hi + lo) >> 1; + cr = &config_regs[mid]; + if 
(cr->cr_offset == offset) + return (cr); + if (cr->cr_offset < offset) + lo = mid + 1; + else + hi = mid - 1; + } + return (NULL); +} + +/* + * Handle pci config space reads. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. + */ +uint64_t +vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct virtio_softc *vs = pi->pi_arg; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + uint32_t value; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + return (pci_emul_msix_tread(pi, offset, size)); + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + * If that fails, fall into general code. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? 
vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size) { + if (cr != NULL) { + /* offset must be OK, so size must be bad */ + fprintf(stderr, + "%s: read from %s: bad size %d\r\n", + name, cr->cr_name, size); + } else { + fprintf(stderr, + "%s: read from bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_HOSTCAP: + value = vc->vc_hv_caps; + break; + case VTCFG_R_GUESTCAP: + value = vs->vs_negotiated_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq < vc->vc_nvq) + value = vs->vs_queues[vs->vs_curq].vq_pfn; + break; + case VTCFG_R_QNUM: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_qsize : 0; + break; + case VTCFG_R_QSEL: + value = vs->vs_curq; + break; + case VTCFG_R_QNOTIFY: + value = 0; /* XXX */ + break; + case VTCFG_R_STATUS: + value = vs->vs_status; + break; + case VTCFG_R_ISR: + value = vs->vs_isr; + vs->vs_isr = 0; /* a read clears this flag */ + break; + case VTCFG_R_CFGVEC: + value = vs->vs_msix_cfg_idx; + break; + case VTCFG_R_QVEC: + value = vs->vs_curq < vc->vc_nvq ? + vs->vs_queues[vs->vs_curq].vq_msix_idx : + VIRTIO_MSI_NO_VECTOR; + break; + } +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); + return (value); +} + +/* + * Handle pci config space writes. + * If it's to the MSI-X info, do that. + * If it's part of the virtio standard stuff, do that. + * Otherwise dispatch to the actual driver. 
+ */ +void +vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct virtio_softc *vs = pi->pi_arg; + struct vqueue_info *vq; + struct virtio_consts *vc; + struct config_reg *cr; + uint64_t virtio_config_size, max; + const char *name; + uint32_t newoff; + int error; + + if (vs->vs_flags & VIRTIO_USE_MSIX) { + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + } + + /* XXX probably should do something better than just assert() */ + assert(baridx == 0); + + if (vs->vs_mtx) + pthread_mutex_lock(vs->vs_mtx); + + vc = vs->vs_vc; + name = vc->vc_name; + + if (size != 1 && size != 2 && size != 4) + goto bad; + + if (pci_msix_enabled(pi)) + virtio_config_size = VTCFG_R_CFG1; + else + virtio_config_size = VTCFG_R_CFG0; + + if (offset >= virtio_config_size) { + /* + * Subtract off the standard size (including MSI-X + * registers if enabled) and dispatch to underlying driver. + */ + newoff = offset - virtio_config_size; + max = vc->vc_cfgsize ? 
vc->vc_cfgsize : 0x100000000; + if (newoff + size > max) + goto bad; + error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value); + if (!error) + goto done; + } + +bad: + cr = vi_find_cr(offset); + if (cr == NULL || cr->cr_size != size || cr->cr_ro) { + if (cr != NULL) { + /* offset must be OK, wrong size and/or reg is R/O */ + if (cr->cr_size != size) + fprintf(stderr, + "%s: write to %s: bad size %d\r\n", + name, cr->cr_name, size); + if (cr->cr_ro) + fprintf(stderr, + "%s: write to read-only reg %s\r\n", + name, cr->cr_name); + } else { + fprintf(stderr, + "%s: write to bad offset/size %jd/%d\r\n", + name, (uintmax_t)offset, size); + } + goto done; + } + + switch (offset) { + case VTCFG_R_GUESTCAP: + vs->vs_negotiated_caps = value & vc->vc_hv_caps; + break; + case VTCFG_R_PFN: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vi_vq_init(vs, value); + break; + case VTCFG_R_QSEL: + /* + * Note that the guest is allowed to select an + * invalid queue; we just need to return a QNUM + * of 0 while the bad queue is selected. 
+ */ + vs->vs_curq = value; + break; + case VTCFG_R_QNOTIFY: + if (value >= vc->vc_nvq) { + fprintf(stderr, "%s: queue %d notify out of range\r\n", + name, (int)value); + goto done; + } + vq = &vs->vs_queues[value]; + if (vq->vq_notify) + (*vq->vq_notify)(DEV_SOFTC(vs), vq); + else if (vc->vc_qnotify) + (*vc->vc_qnotify)(DEV_SOFTC(vs), vq); + else + fprintf(stderr, + "%s: qnotify queue %d: missing vq/vc notify\r\n", + name, (int)value); + break; + case VTCFG_R_STATUS: + vs->vs_status = value; + if (value == 0) + (*vc->vc_reset)(DEV_SOFTC(vs)); + break; + case VTCFG_R_CFGVEC: + vs->vs_msix_cfg_idx = value; + break; + case VTCFG_R_QVEC: + if (vs->vs_curq >= vc->vc_nvq) + goto bad_qindex; + vq = &vs->vs_queues[vs->vs_curq]; + vq->vq_msix_idx = value; + break; + } + goto done; + +bad_qindex: + fprintf(stderr, + "%s: write config reg %s: curq %d >= max %d\r\n", + name, cr->cr_name, vs->vs_curq, vc->vc_nvq); +done: + if (vs->vs_mtx) + pthread_mutex_unlock(vs->vs_mtx); +} diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h index a512381..8975bf7 100644 --- a/usr.sbin/bhyve/virtio.h +++ b/usr.sbin/bhyve/virtio.h @@ -1,5 +1,5 @@ /*- - * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Chris Torek <torek @ torek net> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -11,10 +11,10 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -29,47 +29,195 @@ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ +/* + * These are derived from several virtio specifications. + * + * Some useful links: + * https://github.com/rustyrussell/virtio-spec + * http://people.redhat.com/pbonzini/virtio-spec.pdf + */ + +/* + * A virtual device has zero or more "virtual queues" (virtqueue). + * Each virtqueue uses at least two 4096-byte pages, laid out thus: + * + * +-----------------------------------------------+ + * | "desc": <N> descriptors, 16 bytes each | + * | ----------------------------------------- | + * | "avail": 2 uint16; <N> uint16; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * | "used": 2 x uint16; <N> elems; 1 uint16 | + * | ----------------------------------------- | + * | pad to 4k boundary | + * +-----------------------------------------------+ + * + * The number <N> that appears here is always a power of two and is + * limited to no more than 32768 (as it must fit in a 16-bit field). + * If <N> is sufficiently large, the above will occupy more than + * two pages. In any case, all pages must be physically contiguous + * within the guest's physical address space. + * + * The <N> 16-byte "desc" descriptors consist of a 64-bit guest + * physical address <addr>, a 32-bit length <len>, a 16-bit + * <flags>, and a 16-bit <next> field (all in guest byte order). 
+ * + * There are three flags that may be set: + * NEXT descriptor is chained, so use its "next" field + * WRITE descriptor is for host to write into guest RAM + * (else host is to read from guest RAM) + * INDIRECT descriptor address field is (guest physical) + * address of a linear array of descriptors + * + * Unless INDIRECT is set, <len> is the number of bytes that may + * be read/written from guest physical address <addr>. If + * INDIRECT is set, WRITE is ignored and <len> provides the length + * of the indirect descriptors (and <len> must be a multiple of + * 16). Note that NEXT may still be set in the main descriptor + * pointing to the indirect, and should be set in each indirect + * descriptor that uses the next descriptor (these should generally + * be numbered sequentially). However, INDIRECT must not be set + * in the indirect descriptors. Upon reaching an indirect descriptor + * without a NEXT bit, control returns to the direct descriptors. + * + * Except inside an indirect, each <next> value must be in the + * range [0 .. N) (i.e., the half-open interval). (Inside an + * indirect, each <next> must be in the range [0 .. <len>/16).) + * + * The "avail" data structures reside in the same pages as the + * "desc" structures since both together are used by the device to + * pass information to the hypervisor's virtual driver. These + * begin with a 16-bit <flags> field and 16-bit index <idx>, then + * have <N> 16-bit <ring> values, followed by one final 16-bit + * field <used_event>. The <N> <ring> entries are simply indices + * into the descriptor ring (and thus must meet the same + * constraints as each <next> value). However, <idx> is counted + * up from 0 (initially) and simply wraps around after 65535; it + * is taken mod <N> to find the next available entry. + * + * The "used" ring occupies a separate page or pages, and contains + * values written from the virtual driver back to the guest OS. 
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there + * are <N> "vring_used" elements, followed by a 16-bit <avail_event>. + * The <N> "vring_used" elements consist of a 32-bit <id> and a + * 32-bit <len> (vu_tlen below). The <id> is simply the index of + * the head of a descriptor chain the guest made available + * earlier, and the <len> is the number of bytes actually written, + * e.g., in the case of a network driver that provided a large + * receive buffer but received only a small amount of data. + * + * The two event fields, <used_event> and <avail_event>, in the + * avail and used rings (respectively -- note the reversal!), are + * always provided, but are used only if the virtual device + * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature + * negotiation. Similarly, both rings provide a flag -- + * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in + * their <flags> field, indicating that the guest does not need an + * interrupt, or that the hypervisor driver does not need a + * notify, when descriptors are added to the corresponding ring. + * (These are provided only for interrupt optimization and need + * not be implemented.) 
+ */ #define VRING_ALIGN 4096 #define VRING_DESC_F_NEXT (1 << 0) #define VRING_DESC_F_WRITE (1 << 1) #define VRING_DESC_F_INDIRECT (1 << 2) +struct virtio_desc { /* AKA vring_desc */ + uint64_t vd_addr; /* guest physical address */ + uint32_t vd_len; /* length of scatter/gather seg */ + uint16_t vd_flags; /* VRING_DESC_F_* */ + uint16_t vd_next; /* next desc if F_NEXT */ +} __packed; + +struct virtio_used { /* AKA vring_used_elem */ + uint32_t vu_idx; /* head of used descriptor chain */ + uint32_t vu_tlen; /* length written-to */ +} __packed; + #define VRING_AVAIL_F_NO_INTERRUPT 1 -#define VIRTIO_MSI_NO_VECTOR 0xFFFF -struct virtio_desc { - uint64_t vd_addr; - uint32_t vd_len; - uint16_t vd_flags; - uint16_t vd_next; +struct vring_avail { + uint16_t va_flags; /* VRING_AVAIL_F_* */ + uint16_t va_idx; /* counts to 65535, then cycles */ + uint16_t va_ring[]; /* size N, reported in QNUM value */ +/* uint16_t va_used_event; -- after N ring entries */ } __packed; -struct virtio_used { - uint32_t vu_idx; - uint32_t vu_tlen; +#define VRING_USED_F_NO_NOTIFY 1 +struct vring_used { + uint16_t vu_flags; /* VRING_USED_F_* */ + uint16_t vu_idx; /* counts to 65535, then cycles */ + struct virtio_used vu_ring[]; /* size N */ +/* uint16_t vu_avail_event; -- after N ring entries */ } __packed; /* + * The address of any given virtual queue is determined by a single + * Page Frame Number register. The guest writes the PFN into the + * PCI config space. However, a device that has two or more + * virtqueues can have a different PFN, and size, for each queue. + * The number of queues is determinable via the PCI config space + * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means + * queue #0, 1 means queue #1, etc. Once a queue is selected, the + * remaining PFN and QNUM registers refer to that queue. + * + * QNUM is a read-only register containing a nonzero power of two + * that indicates the (hypervisor's) queue size. 
Or, if reading it + * produces zero, the hypervisor does not have a corresponding + * queue. (The number of possible queues depends on the virtual + * device. The block device has just one; the network device + * provides either two -- 0 = receive, 1 = transmit -- or three, + * with 2 = control.) + * + * PFN is a read/write register giving the physical page address of + * the virtqueue in guest memory (the guest must allocate enough space + * based on the hypervisor's provided QNUM). + * + * QNOTIFY is effectively write-only: when the guest writes a queue + * number to the register, the hypervisor should scan the specified + * virtqueue. (Reading QNOTIFY currently always gets 0). + */ + +/* * PFN register shift amount */ -#define VRING_PFN 12 +#define VRING_PFN 12 /* * Virtio device types + * + * XXX Should really be merged with <dev/virtio/virtio.h> defines */ -#define VIRTIO_TYPE_NET 1 -#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_NET 1 +#define VIRTIO_TYPE_BLOCK 2 +#define VIRTIO_TYPE_CONSOLE 3 +#define VIRTIO_TYPE_ENTROPY 4 +#define VIRTIO_TYPE_BALLOON 5 +#define VIRTIO_TYPE_IOMEMORY 6 +#define VIRTIO_TYPE_RPMSG 7 +#define VIRTIO_TYPE_SCSI 8 +#define VIRTIO_TYPE_9P 9 + +/* experimental IDs start at 65535 and work down */ /* * PCI vendor/device IDs */ -#define VIRTIO_VENDOR 0x1AF4 -#define VIRTIO_DEV_NET 0x1000 -#define VIRTIO_DEV_BLOCK 0x1001 +#define VIRTIO_VENDOR 0x1AF4 +#define VIRTIO_DEV_NET 0x1000 +#define VIRTIO_DEV_BLOCK 0x1001 /* - * PCI config space constants + * PCI config space constants. + * + * If MSI-X is enabled, the ISR register is generally not used, + * and the configuration vector and queue vector appear at offsets + * 20 and 22 with the remaining configuration registers at 24. + * If MSI-X is not enabled, those two registers disappear and + * the remaining configuration registers start at offset 20. 
*/ #define VTCFG_R_HOSTCAP 0 #define VTCFG_R_GUESTCAP 4 @@ -85,22 +233,227 @@ struct virtio_used { #define VTCFG_R_CFG1 24 /* With MSI-X */ #define VTCFG_R_MSIX 20 -/* Feature flags */ +/* + * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, + * but a guest writing 0 to this register means "please reset". + */ +#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */ +#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */ +#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */ +#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */ + +/* + * Bits in VTCFG_R_ISR. These apply only if not using MSI-X. + * + * (We don't [yet?] ever use CONF_CHANGED.) + */ +#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ +#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ + +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * Feature flags. + * Note: bits 0 through 23 are reserved to each device type. + */ #define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) /* From section 2.3, "Virtqueue Configuration", of the virtio specification */ -static inline u_int +static inline size_t vring_size(u_int qsz) { - u_int size; + size_t size; + /* constant 3 below = va_flags, va_idx, va_used_event */ size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz); size = roundup2(size, VRING_ALIGN); + /* constant 3 below = vu_flags, vu_idx, vu_avail_event */ size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz; size = roundup2(size, VRING_ALIGN); return (size); } +struct vmctx; +struct pci_devinst; +struct vqueue_info; + +/* + * A virtual device, with some number (possibly 0) of virtual + * queues and some size (possibly 0) of configuration-space + * registers private to the device. 
The virtio_softc should come + * at the front of each "derived class", so that a pointer to the + * virtio_softc is also a pointer to the more specific, derived- + * from-virtio driver's softc. + * + * Note: inside each hypervisor virtio driver, changes to these + * data structures must be locked against other threads, if any. + * Except for PCI config space register read/write, we assume each + * driver does the required locking, but we need a pointer to the + * lock (if there is one) for PCI config space read/write ops. + * + * When the guest reads or writes the device's config space, the + * generic layer checks for operations on the special registers + * described above. If the offset of the register(s) being read + * or written is past the CFG area (CFG0 or CFG1), the request is + * passed on to the virtual device, after subtracting off the + * generic-layer size. (So, drivers can just use the offset as + * an offset into "struct config", for instance.) + * + * (The virtio layer also makes sure that the read or write is to/ + * from a "good" config offset, hence vc_cfgsize, and on BAR #0. + * However, the driver must verify the read or write size and offset + * and that no one is writing a readonly register.) + * + * The BROKED flag ("this thing done gone and broked") is for future + * use. + */ +#define VIRTIO_USE_MSIX 0x01 +#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */ +#define VIRTIO_BROKED 0x08 /* ??? 
*/ + +struct virtio_softc { + struct virtio_consts *vs_vc; /* constants (see below) */ + int vs_flags; /* VIRTIO_* flags from above */ + pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */ + struct pci_devinst *vs_pi; /* PCI device instance */ + uint32_t vs_negotiated_caps; /* negotiated capabilities */ + struct vqueue_info *vs_queues; /* one per vc_nvq */ + int vs_curq; /* current queue */ + uint8_t vs_status; /* value from last status write */ + uint8_t vs_isr; /* ISR flags, if not MSI-X */ + uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */ +}; + +struct virtio_consts { + const char *vc_name; /* name of driver (for diagnostics) */ + int vc_nvq; /* number of virtual queues */ + size_t vc_cfgsize; /* size of dev-specific config regs */ + void (*vc_reset)(void *); /* called on virtual device reset */ + void (*vc_qnotify)(void *, struct vqueue_info *); + /* called on QNOTIFY if no VQ notify */ + int (*vc_cfgread)(void *, int, int, uint32_t *); + /* called to read config regs */ + int (*vc_cfgwrite)(void *, int, int, uint32_t); + /* called to write config regs */ + uint32_t vc_hv_caps; /* hypervisor-provided capabilities */ +}; + +/* + * Data structure allocated (statically) per virtual queue. + * + * Drivers may change vq_qsize after a reset. When the guest OS + * requests a device reset, the hypervisor first calls + * vs->vs_vc->vc_reset(); then the data structure below is + * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq). + * + * The remaining fields should only be fussed-with by the generic + * code. + * + * Note: the addresses of vq_desc, vq_avail, and vq_used are all + * computable from each other, but it's a lot simpler if we just + * keep a pointer to each one. The event indices are similarly + * (but more easily) computable, and this time we'll compute them: + * they're just XX_ring[N]. + */ +#define VQ_ALLOC 0x01 /* set once we have a pfn */ +#define VQ_BROKED 0x02 /* ??? 
*/ +struct vqueue_info { + uint16_t vq_qsize; /* size of this queue (a power of 2) */ + void (*vq_notify)(void *, struct vqueue_info *); + /* called instead of vc_notify, if not NULL */ + + struct virtio_softc *vq_vs; /* backpointer to softc */ + uint16_t vq_num; /* we're the num'th queue in the softc */ + + uint16_t vq_flags; /* flags (see above) */ + uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */ + uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */ + uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */ + + uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */ + + volatile struct virtio_desc *vq_desc; /* descriptor array */ + volatile struct vring_avail *vq_avail; /* the "avail" ring */ + volatile struct vring_used *vq_used; /* the "used" ring */ + +}; +/* as noted above, these are sort of backwards, name-wise */ +#define VQ_AVAIL_EVENT_IDX(vq) \ + (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize]) +#define VQ_USED_EVENT_IDX(vq) \ + ((vq)->vq_avail->va_ring[(vq)->vq_qsize]) + +/* + * Is this ring ready for I/O? + */ +static inline int +vq_ring_ready(struct vqueue_info *vq) +{ + + return (vq->vq_flags & VQ_ALLOC); +} + +/* + * Are there "available" descriptors? (This does not count + * how many, just returns True if there are some.) + */ +static inline int +vq_has_descs(struct vqueue_info *vq) +{ + + return (vq_ring_ready(vq) && vq->vq_last_avail != + vq->vq_avail->va_idx); +} + +/* + * Called by virtio driver as it starts processing chains. Each + * completed chain (obtained from vq_getchain()) is released by + * calling vq_relchain(), then when all are done, vq_endchains() + * can tell if / how-many chains were processed and know whether + * and how to generate an interrupt. 
+ */ +static inline void +vq_startchains(struct vqueue_info *vq) +{ + + vq->vq_save_used = vq->vq_used->vu_idx; +} + +/* + * Deliver an interrupt to guest on the given virtual queue + * (if possible, or a generic MSI interrupt if not using MSI-X). + */ +static inline void +vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) +{ + + if (vs->vs_flags & VIRTIO_USE_MSIX) + pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); + else { + vs->vs_isr |= VTCFG_ISR_QUEUES; + pci_generate_msi(vs->vs_pi, 0); + } +} + +struct iovec; +void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc, + void *dev_softc, struct pci_devinst *pi, + struct vqueue_info *queues); +int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); +void vi_reset_dev(struct virtio_softc *); +void vi_set_io_bar(struct virtio_softc *, int); + +int vq_getchain(struct vqueue_info *vq, + struct iovec *iov, int n_iov, uint16_t *flags); +void vq_relchain(struct vqueue_info *vq, uint32_t iolen); +void vq_endchains(struct vqueue_info *vq, int used_all_avail); + +uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size); +void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value); #endif /* _VIRTIO_H_ */ |