Major rework of the virtio code. Split out common parts, and modify
the net/block devices accordingly.

Submitted by: Chris Torek (torek at torek dot net)
Reviewed by: grehan
author: grehan <grehan@FreeBSD.org> 2013-07-17 23:37:33 +0000
committer: grehan <grehan@FreeBSD.org> 2013-07-17 23:37:33 +0000
commit: a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb (patch)
tree: 064903149f0c797df3873dc7997273f417c81f93 /usr.sbin
parent: b8663d4c053e282b686f3e2a2d625b21b5944176 (diff)
download: FreeBSD-src-a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb.zip
FreeBSD-src-a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb.tar.gz
5 files changed, 1408 insertions, 961 deletions
diff --git a/usr.sbin/bhyve/Makefile b/usr.sbin/bhyve/Makefile
index e6aa8b2..17355c3 100644
--- a/usr.sbin/bhyve/Makefile
+++ b/usr.sbin/bhyve/Makefile
@@ -10,7 +10,7 @@ SRCS=	acpi.c atpic.c bhyverun.c consport.c dbgport.c elcr.c inout.c
 SRCS+=  ioapic.c mem.c mevent.c mptbl.c
 SRCS+=	pci_emul.c pci_hostbridge.c pci_passthru.c pci_virtio_block.c
 SRCS+=	pci_virtio_net.c pci_uart.c pit_8254.c pmtmr.c post.c rtc.c
-SRCS+=	xmsr.c spinup_ap.c
+SRCS+=	virtio.c xmsr.c spinup_ap.c
 
 .PATH:	${.CURDIR}/../../sys/amd64/vmm
 SRCS+=	vmm_instruction_emul.c
diff --git a/usr.sbin/bhyve/pci_virtio_block.c b/usr.sbin/bhyve/pci_virtio_block.c
index 5c42dc2..4395410 100644
--- a/usr.sbin/bhyve/pci_virtio_block.c
+++ b/usr.sbin/bhyve/pci_virtio_block.c
@@ -53,14 +53,6 @@ __FBSDID("$FreeBSD$");
 
 #define VTBLK_RINGSZ	64
 
-#define VTBLK_CFGSZ	28
-
-#define VTBLK_R_CFG		VTCFG_R_CFG1 
-#define VTBLK_R_CFG_END		VTBLK_R_CFG + VTBLK_CFGSZ -1
-#define VTBLK_R_MAX		VTBLK_R_CFG_END
-
-#define VTBLK_REGSZ		VTBLK_R_MAX+1
-
 #define VTBLK_MAXSEGS	32
 
 #define VTBLK_S_OK	0
@@ -71,28 +63,10 @@ __FBSDID("$FreeBSD$");
  */
 #define VTBLK_S_HOSTCAPS      \
   ( 0x00000004 |	/* host maximum request segments */ \
-    0x10000000 )	/* supports indirect descriptors */
-
-static int use_msix = 1;
-
-struct vring_hqueue {
-	/* Internal state */
-	uint16_t	hq_size;
-	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */
-
-	 /* Host-context pointers to the queue */
-	struct virtio_desc *hq_dtable;
-	uint16_t	*hq_avail_flags;
-	uint16_t	*hq_avail_idx;		/* monotonically increasing */
-	uint16_t	*hq_avail_ring;
-
-	uint16_t	*hq_used_flags;
-	uint16_t	*hq_used_idx;		/* monotonically increasing */
-	struct virtio_used *hq_used_ring;
-};
+    VIRTIO_RING_F_INDIRECT_DESC )	/* indirect descriptors */
 
 /*
- * Config space
+ * Config space "registers"
  */
 struct vtblk_config {
 	uint64_t	vbc_capacity;
@@ -104,7 +78,6 @@ struct vtblk_config {
 	uint32_t	vbc_blk_size;
 	uint32_t	vbc_sectors_max;
 } __packed;
-CTASSERT(sizeof(struct vtblk_config) == VTBLK_CFGSZ);
 
 /*
  * Fixed-size block header
@@ -129,113 +102,69 @@ static int pci_vtblk_debug;
  * Per-device softc
  */
 struct pci_vtblk_softc {
-	struct pci_devinst *vbsc_pi;
+	struct virtio_softc vbsc_vs;
+	struct vqueue_info vbsc_vq;
 	int		vbsc_fd;
-	int		vbsc_status;
-	int		vbsc_isr;
-	int		vbsc_lastq;
-	uint32_t	vbsc_features;
-	uint64_t	vbsc_pfn;
-	struct vring_hqueue vbsc_q;
 	struct vtblk_config vbsc_cfg;	
-	uint16_t	msix_table_idx_req;
-	uint16_t	msix_table_idx_cfg;
 };
-#define	vtblk_ctx(sc)	((sc)->vbsc_pi->pi_vmctx)
-
-/* 
- * Return the size of IO BAR that maps virtio header and device specific
- * region. The size would vary depending on whether MSI-X is enabled or 
- * not
- */ 
-static uint64_t
-pci_vtblk_iosize(struct pci_devinst *pi)
-{
-
-	if (pci_msix_enabled(pi)) 
-		return (VTBLK_REGSZ);
-	else
-		return (VTBLK_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
-}
-
-/*
- * Return the number of available descriptors in the vring taking care
- * of the 16-bit index wraparound.
- */
-static int
-hq_num_avail(struct vring_hqueue *hq)
-{
-	uint16_t ndesc;
-
-	/*
-	 * We're just computing (a-b) in GF(216).
-	 *
-	 * The only glitch here is that in standard C,
-	 * uint16_t promotes to (signed) int when int has
-	 * more than 16 bits (pretty much always now), so
-	 * we have to force it back to unsigned.
-	 */
-	ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
-
-	assert(ndesc <= hq->hq_size);
 
-	return (ndesc);
-}
+static void pci_vtblk_reset(void *);
+static void pci_vtblk_notify(void *, struct vqueue_info *);
+static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
+static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtblk_vi_consts = {
+	"vtblk",		/* our name */
+	1,			/* we support 1 virtqueue */
+	sizeof(struct vtblk_config), /* config reg size */
+	pci_vtblk_reset,	/* reset */
+	pci_vtblk_notify,	/* device-wide qnotify */
+	pci_vtblk_cfgread,	/* read PCI config */
+	pci_vtblk_cfgwrite,	/* write PCI config */
+	VTBLK_S_HOSTCAPS,	/* our capabilities */
+};
 
 static void
-pci_vtblk_update_status(struct pci_vtblk_softc *sc, uint32_t value)
+pci_vtblk_reset(void *vsc)
 {
-	if (value == 0) {
-		DPRINTF(("vtblk: device reset requested !\n"));
-		sc->vbsc_isr = 0;
-		sc->msix_table_idx_req = VIRTIO_MSI_NO_VECTOR;
-		sc->msix_table_idx_cfg = VIRTIO_MSI_NO_VECTOR;
-		sc->vbsc_features = 0;
-		sc->vbsc_pfn = 0;
-		sc->vbsc_lastq = 0;
-		memset(&sc->vbsc_q, 0, sizeof(struct vring_hqueue));
-	}
+	struct pci_vtblk_softc *sc = vsc;
 
-	sc->vbsc_status = value;
+	DPRINTF(("vtblk: device reset requested !\n"));
+	vi_reset_dev(&sc->vbsc_vs);
 }
 
 static void
-pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
 {
-	struct iovec iov[VTBLK_MAXSEGS];
 	struct virtio_blk_hdr *vbh;
-	struct virtio_desc *vd, *vid;
-	struct virtio_used *vu;
 	uint8_t *status;
-	int i;
+	int i, n;
 	int err;
 	int iolen;
-	int uidx, aidx, didx;
-	int indirect, writeop, type;
+	int writeop, type;
 	off_t offset;
+	struct iovec iov[VTBLK_MAXSEGS + 2];
+	uint16_t flags[VTBLK_MAXSEGS + 2];
 
-	uidx = *hq->hq_used_idx;
-	aidx = hq->hq_cur_aidx;
-	didx = hq->hq_avail_ring[aidx % hq->hq_size];
-	assert(didx >= 0 && didx < hq->hq_size);
-
-	vd = &hq->hq_dtable[didx];
-
-	indirect = ((vd->vd_flags & VRING_DESC_F_INDIRECT) != 0);
-
-	if (indirect) {
-		vid = paddr_guest2host(vtblk_ctx(sc), vd->vd_addr, vd->vd_len);
-		vd = &vid[0];
-	}
+	n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags);
 
 	/*
-	 * The first descriptor will be the read-only fixed header
+	 * The first descriptor will be the read-only fixed header,
+	 * and the last is for status (hence +2 above and below).
+	 * The remaining iov's are the actual data I/O vectors.
+	 *
+	 * XXX - note - this fails on crash dump, which does a
+	 * VIRTIO_BLK_T_FLUSH with a zero transfer length
 	 */
-	vbh = paddr_guest2host(vtblk_ctx(sc), vd->vd_addr,
-			    sizeof(struct virtio_blk_hdr));
-	assert(vd->vd_len == sizeof(struct virtio_blk_hdr));
-	assert(vd->vd_flags & VRING_DESC_F_NEXT);
-	assert((vd->vd_flags & VRING_DESC_F_WRITE) == 0);
+	assert (n >= 3 && n < VTBLK_MAXSEGS + 2);
+
+	assert((flags[0] & VRING_DESC_F_WRITE) == 0);
+	assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+	vbh = iov[0].iov_base;
+
+	status = iov[--n].iov_base;
+	assert(iov[n].iov_len == 1);
+	assert(flags[n] & VRING_DESC_F_WRITE);
 
 	/*
 	 * XXX
@@ -247,120 +176,44 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vring_hqueue *hq)
 
 	offset = vbh->vbh_sector * DEV_BSIZE;
 
-	/*
-	 * Build up the iovec based on the guest's data descriptors
-	 */
-	i = iolen = 0;
-	while (1) {
-		if (indirect)
-			vd = &vid[i + 1];	/* skip first indirect desc */
-		else
-			vd = &hq->hq_dtable[vd->vd_next];
-
-		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
-			break;
-
-		if (i == VTBLK_MAXSEGS)
-			break;
-
+	iolen = 0;
+	for (i = 1; i < n; i++) {
 		/*
 		 * - write op implies read-only descriptor,
 		 * - read op implies write-only descriptor,
 		 * therefore test the inverse of the descriptor bit
 		 * to the op.
 		 */
-		assert(((vd->vd_flags & VRING_DESC_F_WRITE) == 0) ==
-		       writeop);
-
-		iov[i].iov_base = paddr_guest2host(vtblk_ctx(sc),
-						   vd->vd_addr,
-						   vd->vd_len);
-		iov[i].iov_len = vd->vd_len;
-		iolen += vd->vd_len;
-		i++;
+		assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
+		iolen += iov[i].iov_len;
 	}
 
-	/* Lastly, get the address of the status byte */
-	status = paddr_guest2host(vtblk_ctx(sc), vd->vd_addr, 1);
-	assert(vd->vd_len == 1);
-	assert((vd->vd_flags & VRING_DESC_F_NEXT) == 0);
-	assert(vd->vd_flags & VRING_DESC_F_WRITE);
-
 	DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", 
-		 writeop ? "write" : "read", iolen, i, offset));
+		 writeop ? "write" : "read", iolen, i - 1, offset));
 
 	if (writeop)
-		err = pwritev(sc->vbsc_fd, iov, i, offset);
+		err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset);
 	else
-		err = preadv(sc->vbsc_fd, iov, i, offset);
+		err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset);
 
 	*status = err < 0 ? VTBLK_S_IOERR : VTBLK_S_OK;
 
 	/*
-	 * Return the single descriptor back to the host
+	 * Return the descriptor back to the host.
+	 * We wrote 1 byte (our status) to host.
 	 */
-	vu = &hq->hq_used_ring[uidx % hq->hq_size];
-	vu->vu_idx = didx;
-	vu->vu_tlen = 1;
-	hq->hq_cur_aidx++;
-	*hq->hq_used_idx += 1;
-
-	/*
-	 * Generate an interrupt if able
-	 */
-	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) { 
-		if (use_msix) {
-			pci_generate_msix(sc->vbsc_pi, sc->msix_table_idx_req);	
-		} else if (sc->vbsc_isr == 0) {
-			sc->vbsc_isr = 1;
-			pci_generate_msi(sc->vbsc_pi, 0);
-		}
-	}
+	vq_relchain(vq, 1);
 }
 
 static void
-pci_vtblk_qnotify(struct pci_vtblk_softc *sc)
+pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
 {
-	struct vring_hqueue *hq = &sc->vbsc_q;
-	int ndescs;
+	struct pci_vtblk_softc *sc = vsc;
 
-	while ((ndescs = hq_num_avail(hq)) != 0) {
-		/*
-		 * Run through all the entries, placing them into iovecs and
-		 * sending when an end-of-packet is found
-		 */
- 		pci_vtblk_proc(sc, hq);
- 	}
-}
-
-static void
-pci_vtblk_ring_init(struct pci_vtblk_softc *sc, uint64_t pfn)
-{
-	struct vring_hqueue *hq;
-
-	sc->vbsc_pfn = pfn << VRING_PFN;
-	
-	/*
-	 * Set up host pointers to the various parts of the
-	 * queue
-	 */
-	hq = &sc->vbsc_q;
-	hq->hq_size = VTBLK_RINGSZ;
-
-	hq->hq_dtable = paddr_guest2host(vtblk_ctx(sc), pfn << VRING_PFN,
-					 vring_size(VTBLK_RINGSZ));
-	hq->hq_avail_flags =  (uint16_t *)(hq->hq_dtable + hq->hq_size);
-	hq->hq_avail_idx = hq->hq_avail_flags + 1;
-	hq->hq_avail_ring = hq->hq_avail_flags + 2;
-	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
-						 VRING_ALIGN);
-	hq->hq_used_idx = hq->hq_used_flags + 1;
-	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
-
-	/*
-	 * Initialize queue indexes
-	 */
-	hq->hq_cur_aidx = 0;
+	vq_startchains(vq);
+	while (vq_has_descs(vq))
+		pci_vtblk_proc(sc, vq);
+	vq_endchains(vq, 1);	/* Generate interrupt if appropriate. */
 }
 
 static int
@@ -371,6 +224,7 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	off_t size;	
 	int fd;
 	int sectsz;
+	int use_msix;
 	const char *env_msi;
 
 	if (opts == NULL) {
@@ -412,10 +266,14 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	sc = malloc(sizeof(struct pci_vtblk_softc));
 	memset(sc, 0, sizeof(struct pci_vtblk_softc));
 
-	pi->pi_arg = sc;
-	sc->vbsc_pi = pi;
+	/* record fd of storage device/file */
 	sc->vbsc_fd = fd;
 
+	/* init virtio softc and virtqueues */
+	vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
+	sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+	/* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
 	/* setup virtio block config space */
 	sc->vbsc_cfg.vbc_capacity = size / sectsz;
 	sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
@@ -426,206 +284,51 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	sc->vbsc_cfg.vbc_geom_s = 0;
 	sc->vbsc_cfg.vbc_sectors_max = 0;
 
-	/* initialize config space */
+	/*
+	 * Should we move some of this into virtio.c?  Could
+	 * have the device, class, and subdev_0 as fields in
+	 * the virtio constants structure.
+	 */
 	pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
 
+	use_msix = 1;
 	if ((env_msi = getenv("BHYVE_USE_MSI"))) {
 		if (strcasecmp(env_msi, "yes") == 0)
 			use_msix = 0;
 	} 
-
-	if (use_msix) {
-		/* MSI-X Support */
-		sc->msix_table_idx_req = VIRTIO_MSI_NO_VECTOR;	
-		sc->msix_table_idx_cfg = VIRTIO_MSI_NO_VECTOR;	
-		
-		if (pci_emul_add_msixcap(pi, 2, 1))
-			return (1);
-	} else {
-		/* MSI Support */	
-		pci_emul_add_msicap(pi, 1);
-	}	
-	
-	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTBLK_REGSZ);
-
+	if (vi_intr_init(&sc->vbsc_vs, 1, use_msix))
+		return (1);
+	vi_set_io_bar(&sc->vbsc_vs, 0);
 	return (0);
 }
 
-static uint64_t
-vtblk_adjust_offset(struct pci_devinst *pi, uint64_t offset)
-{
-	/*
-	 * Device specific offsets used by guest would change 
-	 * based on whether MSI-X capability is enabled or not
-	 */ 
-	if (!pci_msix_enabled(pi)) {
-		if (offset >= VTCFG_R_MSIX) 
-			return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
-	}
-
-	return (offset);
-}
-
-static void
-pci_vtblk_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
-		int baridx, uint64_t offset, int size, uint64_t value)
+static int
+pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
 {
-	struct pci_vtblk_softc *sc = pi->pi_arg;
 
-	if (use_msix) {
-		if (baridx == pci_msix_table_bar(pi) ||
-		    baridx == pci_msix_pba_bar(pi)) {
-			pci_emul_msix_twrite(pi, offset, size, value);
-			return;
-		}
-	}
-	
-	assert(baridx == 0);
-
-	if (offset + size > pci_vtblk_iosize(pi)) {
-		DPRINTF(("vtblk_write: 2big, offset %ld size %d\n",
-			 offset, size));
-		return;
-	}
-
-	offset = vtblk_adjust_offset(pi, offset);
-	
-	switch (offset) {
-	case VTCFG_R_GUESTCAP:
-		assert(size == 4);
-		sc->vbsc_features = value & VTBLK_S_HOSTCAPS;
-		break;
-	case VTCFG_R_PFN:
-		assert(size == 4);
-		pci_vtblk_ring_init(sc, value);
-		break;
-	case VTCFG_R_QSEL:
-		assert(size == 2);
-		sc->vbsc_lastq = value;
-		break;
-	case VTCFG_R_QNOTIFY:
-		assert(size == 2);
-		assert(value == 0);
-		pci_vtblk_qnotify(sc);
-		break;
-	case VTCFG_R_STATUS:
-		assert(size == 1);
-		pci_vtblk_update_status(sc, value);
-		break;
-	case VTCFG_R_CFGVEC:
-		assert(size == 2);
-		sc->msix_table_idx_cfg = value;	
-		break;	
-	case VTCFG_R_QVEC:
-		assert(size == 2);
-		sc->msix_table_idx_req = value;
-		break;	
-	case VTCFG_R_HOSTCAP:
-	case VTCFG_R_QNUM:
-	case VTCFG_R_ISR:
-	case VTBLK_R_CFG ... VTBLK_R_CFG_END:
-		DPRINTF(("vtblk: write to readonly reg %ld\n\r", offset));
-		break;
-	default:
-		DPRINTF(("vtblk: unknown i/o write offset %ld\n\r", offset));
-		value = 0;
-		break;
-	}
+	DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
+	return (1);
 }
 
-uint64_t
-pci_vtblk_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
-	       int baridx, uint64_t offset, int size)
+static int
+pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
-	struct pci_vtblk_softc *sc = pi->pi_arg;
+	struct pci_vtblk_softc *sc = vsc;
 	void *ptr;
-	uint32_t value;
 
-	if (use_msix) {
-		if (baridx == pci_msix_table_bar(pi) ||
-		    baridx == pci_msix_pba_bar(pi)) {
-			return (pci_emul_msix_tread(pi, offset, size));
-		}
-	}
-
-	assert(baridx == 0);
-
-	if (offset + size > pci_vtblk_iosize(pi)) {
-		DPRINTF(("vtblk_read: 2big, offset %ld size %d\n",
-			 offset, size));
-		return (0);
-	}
-
-	offset = vtblk_adjust_offset(pi, offset);
-
-	switch (offset) {
-	case VTCFG_R_HOSTCAP:
-		assert(size == 4);
-		value = VTBLK_S_HOSTCAPS;
-		break;
-	case VTCFG_R_GUESTCAP:
-		assert(size == 4);
-		value = sc->vbsc_features; /* XXX never read ? */
-		break;
-	case VTCFG_R_PFN:
-		assert(size == 4);
-		value = sc->vbsc_pfn >> VRING_PFN;
-		break;
-	case VTCFG_R_QNUM:
-		value = (sc->vbsc_lastq == 0) ? VTBLK_RINGSZ: 0;
-		break;
-	case VTCFG_R_QSEL:
-		assert(size == 2);
-		value = sc->vbsc_lastq; /* XXX never read ? */
-		break;
-	case VTCFG_R_QNOTIFY:
-		assert(size == 2);
-		value = 0; /* XXX never read ? */
-		break;
-	case VTCFG_R_STATUS:
-		assert(size == 1);
-		value = sc->vbsc_status;
-		break;
-	case VTCFG_R_ISR:
-		assert(size == 1);
-		value = sc->vbsc_isr;
-		sc->vbsc_isr = 0;     /* a read clears this flag */
-		break;
-	case VTCFG_R_CFGVEC:
-		assert(size == 2);
-		value = sc->msix_table_idx_cfg;
-		break;
-	case VTCFG_R_QVEC:
-		assert(size == 2);
-		value = sc->msix_table_idx_req;
-		break;	
-	case VTBLK_R_CFG ... VTBLK_R_CFG_END:
-		assert(size + offset <= (VTBLK_R_CFG_END + 1));
-		ptr = (uint8_t *)&sc->vbsc_cfg + offset - VTBLK_R_CFG;
-		if (size == 1) {
-			value = *(uint8_t *) ptr;
-		} else if (size == 2) {
-			value = *(uint16_t *) ptr;
-		} else {
-			value = *(uint32_t *) ptr;
-		}
-		break;
-	default:
-		DPRINTF(("vtblk: unknown i/o read offset %ld\n\r", offset));
-		value = 0;
-		break;
-	}
-
-	return (value);
+	/* our caller has already verified offset and size */
+	ptr = (uint8_t *)&sc->vbsc_cfg + offset;
+	memcpy(retval, ptr, size);
+	return (0);
 }
 
 struct pci_devemu pci_de_vblk = {
 	.pe_emu =	"virtio-blk",
 	.pe_init =	pci_vtblk_init,
-	.pe_barwrite =	pci_vtblk_write,
-	.pe_barread =	pci_vtblk_read
+	.pe_barwrite =	vi_pci_write,
+	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr.sbin/bhyve/pci_virtio_net.c b/usr.sbin/bhyve/pci_virtio_net.c
index 19f9ffe..2939949 100644
--- a/usr.sbin/bhyve/pci_virtio_net.c
+++ b/usr.sbin/bhyve/pci_virtio_net.c
@@ -59,56 +59,49 @@ __FBSDID("$FreeBSD$");
 #define VTNET_MAXSEGS	32
 
 /*
- * PCI config-space register offsets
+ * Host capabilities.  Note that we only offer a few of these.
  */
-#define VTNET_R_CFG0	24
-#define VTNET_R_CFG1	25
-#define VTNET_R_CFG2	26
-#define VTNET_R_CFG3	27
-#define VTNET_R_CFG4	28
-#define VTNET_R_CFG5	29
-#define VTNET_R_CFG6	30
-#define VTNET_R_CFG7	31
-#define VTNET_R_MAX	31
-
-#define VTNET_REGSZ	VTNET_R_MAX+1
+#define	VIRTIO_NET_F_CSUM	(1 <<  0) /* host handles partial cksum */
+#define	VIRTIO_NET_F_GUEST_CSUM	(1 <<  1) /* guest handles partial cksum */
+#define	VIRTIO_NET_F_MAC	(1 <<  5) /* host supplies MAC */
+#define	VIRTIO_NET_F_GSO_DEPREC	(1 <<  6) /* deprecated: host handles GSO */
+#define	VIRTIO_NET_F_GUEST_TSO4	(1 <<  7) /* guest can rcv TSOv4 */
+#define	VIRTIO_NET_F_GUEST_TSO6	(1 <<  8) /* guest can rcv TSOv6 */
+#define	VIRTIO_NET_F_GUEST_ECN	(1 <<  9) /* guest can rcv TSO with ECN */
+#define	VIRTIO_NET_F_GUEST_UFO	(1 << 10) /* guest can rcv UFO */
+#define	VIRTIO_NET_F_HOST_TSO4	(1 << 11) /* host can rcv TSOv4 */
+#define	VIRTIO_NET_F_HOST_TSO6	(1 << 12) /* host can rcv TSOv6 */
+#define	VIRTIO_NET_F_HOST_ECN	(1 << 13) /* host can rcv TSO with ECN */
+#define	VIRTIO_NET_F_HOST_UFO	(1 << 14) /* host can rcv UFO */
+#define	VIRTIO_NET_F_MRG_RXBUF	(1 << 15) /* host can merge RX buffers */
+#define	VIRTIO_NET_F_STATUS	(1 << 16) /* config status field available */
+#define	VIRTIO_NET_F_CTRL_VQ	(1 << 17) /* control channel available */
+#define	VIRTIO_NET_F_CTRL_RX	(1 << 18) /* control channel RX mode support */
+#define	VIRTIO_NET_F_CTRL_VLAN	(1 << 19) /* control channel VLAN filtering */
+#define	VIRTIO_NET_F_GUEST_ANNOUNCE \
+				(1 << 21) /* guest can send gratuitous pkts */
 
-/*
- * Host capabilities
- */
 #define VTNET_S_HOSTCAPS      \
-  ( 0x00000020 |	/* host supplies MAC */ \
-    0x00008000 |	/* host can merge Rx buffers */ \
-    0x00010000 |	/* config status available */ \
+  ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
     VIRTIO_F_NOTIFY_ON_EMPTY)
 
 /*
+ * PCI config-space "registers"
+ */
+struct virtio_net_config {
+	uint8_t  mac[6];
+	uint16_t status;
+} __packed;
+
+/*
  * Queue definitions.
  */
 #define VTNET_RXQ	0
 #define VTNET_TXQ	1
-#define VTNET_CTLQ	2
+#define VTNET_CTLQ	2	/* NB: not yet supported */
 
 #define VTNET_MAXQ	3
 
-static int use_msix = 1;
-
-struct vring_hqueue {
-	/* Internal state */
-	uint16_t	hq_size;
-	uint16_t	hq_cur_aidx;		/* trails behind 'avail_idx' */
-
-	 /* Host-context pointers to the queue */
-	struct virtio_desc *hq_dtable;
-	uint16_t	*hq_avail_flags;
-	uint16_t	*hq_avail_idx;		/* monotonically increasing */
-	uint16_t	*hq_avail_ring;
-
-	uint16_t	*hq_used_flags;
-	uint16_t	*hq_used_idx;		/* monotonically increasing */
-	struct virtio_used *hq_used_ring;
-};
-
 /*
  * Fixed network header size
  */
@@ -133,23 +126,17 @@ static int pci_vtnet_debug;
  * Per-device softc
  */
 struct pci_vtnet_softc {
-	struct pci_devinst *vsc_pi;
+	struct virtio_softc vsc_vs;
+	struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
 	pthread_mutex_t vsc_mtx;
 	struct mevent	*vsc_mevp;
 
-	int		vsc_curq;
-	int		vsc_status;
-	int		vsc_isr;
 	int		vsc_tapfd;
 	int		vsc_rx_ready;
-	int		resetting;
+	volatile int	resetting;	/* set and checked outside lock */
 
 	uint32_t	vsc_features;
-	uint8_t		vsc_macaddr[6];
-
-	uint64_t	vsc_pfn[VTNET_MAXQ];
-	struct	vring_hqueue vsc_hq[VTNET_MAXQ];
-	uint16_t	vsc_msix_table_idx[VTNET_MAXQ];
+	struct virtio_net_config vsc_config;
 
 	pthread_mutex_t	rx_mtx;
 	int		rx_in_progress;
@@ -159,73 +146,22 @@ struct pci_vtnet_softc {
 	pthread_cond_t	tx_cond;
 	int		tx_in_progress;
 };
-#define	vtnet_ctx(sc)		((sc)->vsc_pi->pi_vmctx)
-#define	notify_on_empty(sc)	((sc)->vsc_features & VIRTIO_F_NOTIFY_ON_EMPTY)
-
-/*
- * Return the size of IO BAR that maps virtio header and device specific
- * region. The size would vary depending on whether MSI-X is enabled or
- * not.
- */
-static uint64_t
-pci_vtnet_iosize(struct pci_devinst *pi)
-{
-	if (pci_msix_enabled(pi))
-		return (VTNET_REGSZ);
-	else
-		return (VTNET_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
-}
-
-/*
- * Return the number of available descriptors in the vring taking care
- * of the 16-bit index wraparound.
- */
-static int
-hq_num_avail(struct vring_hqueue *hq)
-{
-	uint16_t ndesc;
-
-	/*
-	 * We're just computing (a-b) mod 2^16
-	 *
-	 * The only glitch here is that in standard C,
-	 * uint16_t promotes to (signed) int when int has
-	 * more than 16 bits (pretty much always now), so
-	 * we have to force it back to unsigned.
-	 */
-	ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
-
-	assert(ndesc <= hq->hq_size);
-
-	return (ndesc);
-}
-
-static uint16_t
-pci_vtnet_qsize(int qnum)
-{
-	/* XXX no ctl queue currently */
-	if (qnum == VTNET_CTLQ) {
-		return (0);
-	}
-
-	/* XXX fixed currently. Maybe different for tx/rx/ctl */
-	return (VTNET_RINGSZ);
-}
-
-static void
-pci_vtnet_ring_reset(struct pci_vtnet_softc *sc, int ring)
-{
-	struct vring_hqueue *hq;
-
-	assert(ring < VTNET_MAXQ);
-
-	hq = &sc->vsc_hq[ring];
 
-	/*
-	 * Reset all soft state
-	 */
-	hq->hq_cur_aidx = 0;
-}
+static void pci_vtnet_reset(void *);
+/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
+static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
+static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtnet_vi_consts = {
+	"vtnet",		/* our name */
+	VTNET_MAXQ - 1,		/* we currently support 2 virtqueues */
+	sizeof(struct virtio_net_config), /* config reg size */
+	pci_vtnet_reset,	/* reset */
+	NULL,			/* device-wide qnotify -- not used */
+	pci_vtnet_cfgread,	/* read PCI config */
+	pci_vtnet_cfgwrite,	/* write PCI config */
+	VTNET_S_HOSTCAPS,	/* our capabilities */
+};
 
 /*
  * If the transmit thread is active then stall until it is done.
@@ -260,48 +196,27 @@ pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
 }
 
 static void
-pci_vtnet_update_status(struct pci_vtnet_softc *sc, uint32_t value)
+pci_vtnet_reset(void *vsc)
 {
-	int i;
-
-	if (value == 0) {
-		DPRINTF(("vtnet: device reset requested !\n"));
-		
-		sc->resetting = 1;
+	struct pci_vtnet_softc *sc = vsc;
 
-		/*
-		 * Wait for the transmit and receive threads to finish their
-		 * processing.
-		 */
-		pci_vtnet_txwait(sc);
-		pci_vtnet_rxwait(sc);
-
-		sc->vsc_rx_ready = 0;
-		pci_vtnet_ring_reset(sc, VTNET_RXQ);
-		pci_vtnet_ring_reset(sc, VTNET_TXQ);
+	DPRINTF(("vtnet: device reset requested !\n"));
 
-		for (i = 0; i < VTNET_MAXQ; i++)
-			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
+	sc->resetting = 1;
 
-		sc->vsc_isr = 0;
-		sc->vsc_features = 0;
-
-		sc->resetting = 0;
-	}
+	/*
+	 * Wait for the transmit and receive threads to finish their
+	 * processing.
+	 */
+	pci_vtnet_txwait(sc);
+	pci_vtnet_rxwait(sc);
 
-	sc->vsc_status = value;
-}
+	sc->vsc_rx_ready = 0;
 
-static void
-vtnet_generate_interrupt(struct pci_vtnet_softc *sc, int qidx)
-{
+	/* now reset rings, MSI-X vectors, and negotiated capabilities */
+	vi_reset_dev(&sc->vsc_vs);
 
-	if (use_msix) {
-		pci_generate_msix(sc->vsc_pi, sc->vsc_msix_table_idx[qidx]);
-	} else {
-		sc->vsc_isr |= 1;
-		pci_generate_msi(sc->vsc_pi, 0);
-	}
+	sc->resetting = 0;
 }
 
 /*
@@ -311,7 +226,7 @@ static void
 pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 		 int len)
 {
-	char pad[60];
+	static char pad[60]; /* all zero bytes */
 
 	if (sc->vsc_tapfd == -1)
 		return;
@@ -322,7 +237,6 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
 	 * there is always an extra iov available by the caller.
 	 */
 	if (len < 60) {
-		memset(pad, 0, 60 - len);
 		iov[iovcnt].iov_base = pad;
 		iov[iovcnt].iov_len = 60 - len;
 		iovcnt++;
@@ -342,15 +256,11 @@ static uint8_t dummybuf[2048];
 static void
 pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
 {
-	struct virtio_desc *vd;
-	struct virtio_used *vu;
-	struct vring_hqueue *hq;
+	struct vqueue_info *vq;
 	struct virtio_net_rxhdr *vrx;
 	uint8_t *buf;
-	int i;
 	int len;
-	int ndescs;
-	int didx, uidx, aidx;	/* descriptor, avail and used index */
+	struct iovec iov;
 
 	/*
 	 * Should never be called without a valid tap fd
@@ -370,47 +280,45 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
 	}
 
 	/*
-	 * Calculate the number of available rx buffers
+	 * Check for available rx buffers
 	 */
-	hq = &sc->vsc_hq[VTNET_RXQ];
-
-	ndescs = hq_num_avail(hq);
-
-	if (ndescs == 0) {
+	vq = &sc->vsc_queues[VTNET_RXQ];
+	vq_startchains(vq);
+	if (!vq_has_descs(vq)) {
 		/*
-		 * Drop the packet and try later
+		 * Drop the packet and try later.  Interrupt on
+		 * empty, if that's negotiated.
 		 */
 		(void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
-
-		if (notify_on_empty(sc))
-			vtnet_generate_interrupt(sc, VTNET_RXQ);
-
+		vq_endchains(vq, 1);
 		return;
 	}
 
-	aidx = hq->hq_cur_aidx;
-	uidx = *hq->hq_used_idx;
-	for (i = 0; i < ndescs; i++) {
+	do {
 		/*
-		 * 'aidx' indexes into the an array of descriptor indexes
+		 * Get descriptor chain, which should have just
+		 * one descriptor in it.
+		 * ??? allow guests to use multiple descs?
 		 */
-		didx = hq->hq_avail_ring[aidx % hq->hq_size];
-		assert(didx >= 0 && didx < hq->hq_size);
-
-		vd = &hq->hq_dtable[didx];
+		assert(vq_getchain(vq, &iov, 1, NULL) == 1);
 
 		/*
 		 * Get a pointer to the rx header, and use the
 		 * data immediately following it for the packet buffer.
 		 */
-		vrx = paddr_guest2host(vtnet_ctx(sc), vd->vd_addr, vd->vd_len);
+		vrx = iov.iov_base;
 		buf = (uint8_t *)(vrx + 1);
 
 		len = read(sc->vsc_tapfd, buf,
-			   vd->vd_len - sizeof(struct virtio_net_rxhdr));
+			   iov.iov_len - sizeof(struct virtio_net_rxhdr));
 
 		if (len < 0 && errno == EWOULDBLOCK) {
-			break;
+			/*
+			 * No more packets, but still some avail ring
+			 * entries.  Interrupt if needed/appropriate.
+			 */
+			vq_endchains(vq, 0);
+			return;
 		}
 
 		/*
@@ -422,23 +330,13 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
 		vrx->vrh_bufs = 1;
 
 		/*
-		 * Write this descriptor into the used ring
+		 * Release this chain and handle more chains.
 		 */
-		vu = &hq->hq_used_ring[uidx % hq->hq_size];
-		vu->vu_idx = didx;
-		vu->vu_tlen = len + sizeof(struct virtio_net_rxhdr);
-		uidx++;
-		aidx++;
-	}
-
-	/*
-	 * Update the used pointer, and signal an interrupt if allowed
-	 */
-	*hq->hq_used_idx = uidx;
-	hq->hq_cur_aidx = aidx;
+		vq_relchain(vq, len + sizeof(struct virtio_net_rxhdr));
+	} while (vq_has_descs(vq));
 
-	if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0)
-		vtnet_generate_interrupt(sc, VTNET_RXQ);
+	/* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+	vq_endchains(vq, 1);
 }
 
 static void
@@ -455,8 +353,10 @@ pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
 }
 
 static void
-pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
+pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
 {
+	struct pci_vtnet_softc *sc = vsc;
+
 	/*
 	 * A qnotify means that the rx process can now begin
 	 */
@@ -466,71 +366,42 @@ pci_vtnet_ping_rxq(struct pci_vtnet_softc *sc)
 }
 
 static void
-pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vring_hqueue *hq)
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
 {
 	struct iovec iov[VTNET_MAXSEGS + 1];
-	struct virtio_desc *vd;
-	struct virtio_used *vu;
-	int i;
-	int plen;
-	int tlen;
-	int uidx, aidx, didx;
-
-	uidx = *hq->hq_used_idx;
-	aidx = hq->hq_cur_aidx;
-	didx = hq->hq_avail_ring[aidx % hq->hq_size];
-	assert(didx >= 0 && didx < hq->hq_size);
-
-	vd = &hq->hq_dtable[didx];
+	int i, n;
+	int plen, tlen;
 
 	/*
-	 * Run through the chain of descriptors, ignoring the
-	 * first header descriptor. However, include the header
-	 * length in the total length that will be put into the
-	 * used queue.
+	 * Obtain chain of descriptors.  The first one is
+	 * really the header descriptor, so we need to sum
+	 * up two lengths: packet length and transfer length.
 	 */
-	tlen = vd->vd_len;
-	vd = &hq->hq_dtable[vd->vd_next];
-
-	for (i = 0, plen = 0;
-	     i < VTNET_MAXSEGS;
-	     i++, vd = &hq->hq_dtable[vd->vd_next]) {
-		iov[i].iov_base = paddr_guest2host(vtnet_ctx(sc),
-						   vd->vd_addr, vd->vd_len);
-		iov[i].iov_len = vd->vd_len;
-		plen += vd->vd_len;
-		tlen += vd->vd_len;
-
-		if ((vd->vd_flags & VRING_DESC_F_NEXT) == 0)
-			break;
+	n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL);
+	assert(n >= 1 && n <= VTNET_MAXSEGS);
+	plen = 0;
+	tlen = iov[0].iov_len;
+	for (i = 1; i < n; i++) {
+		plen += iov[i].iov_len;
+		tlen += iov[i].iov_len;
 	}
-	assert(i < VTNET_MAXSEGS);
 
-	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, i + 1));
-	pci_vtnet_tap_tx(sc, iov, i + 1, plen);
+	DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
+	pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen);
 
-	/*
-	 * Return this chain back to the host
-	 */
-	vu = &hq->hq_used_ring[uidx % hq->hq_size];
-	vu->vu_idx = didx;
-	vu->vu_tlen = tlen;
-	hq->hq_cur_aidx = aidx + 1;
-	*hq->hq_used_idx = uidx + 1;
+	/* chain is processed, release it and set tlen */
+	vq_relchain(vq, tlen);
 }
 
 static void
-pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
+pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
 {
-	struct vring_hqueue *hq = &sc->vsc_hq[VTNET_TXQ];
-	int ndescs;
+	struct pci_vtnet_softc *sc = vsc;
 
 	/*
-	 * Calculate number of ring entries to process
+	 * Any ring entries to process?
 	 */
-	ndescs = hq_num_avail(hq);
-
-	if (ndescs == 0)
+	if (!vq_has_descs(vq))
 		return;
 
 	/* Signal the tx thread for processing */
@@ -546,97 +417,65 @@ pci_vtnet_ping_txq(struct pci_vtnet_softc *sc)
 static void *
 pci_vtnet_tx_thread(void *param)
 {
-	struct pci_vtnet_softc *sc = (struct pci_vtnet_softc *) param;
-	struct vring_hqueue *hq; 
-	int i, ndescs, error;
-	
-	hq = &sc->vsc_hq[VTNET_TXQ];
-	
-	/* 
-	 * Let us wait till the tx queue pointers get initialised & 
-	 * first tx signaled 
+	struct pci_vtnet_softc *sc = param;
+	struct vqueue_info *vq;
+	int have_work, error;
+
+	vq = &sc->vsc_queues[VTNET_TXQ];
+
+	/*
+	 * Let us wait till the tx queue pointers get initialised &
+	 * first tx signaled
 	 */
 	pthread_mutex_lock(&sc->tx_mtx);
 	error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
 	assert(error == 0);
-	
+
 	for (;;) {
-		pthread_mutex_lock(&sc->tx_mtx);
-		for (;;) {
+		/* note - tx mutex is locked here */
+		do {
 			if (sc->resetting)
-				ndescs = 0;
+				have_work = 0;
 			else
-				ndescs = hq_num_avail(hq);
-			
-			if (ndescs != 0) 
-				break;
-
-			sc->tx_in_progress = 0;
-			error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
-			assert(error == 0);
-		}
+				have_work = vq_has_descs(vq);
+
+			if (!have_work) {
+				sc->tx_in_progress = 0;
+				error = pthread_cond_wait(&sc->tx_cond,
+							  &sc->tx_mtx);
+				assert(error == 0);
+			}
+		} while (!have_work);
 		sc->tx_in_progress = 1;
 		pthread_mutex_unlock(&sc->tx_mtx);
 
-		while (ndescs > 0) {
+		vq_startchains(vq);
+		do {
 			/*
-			 * Run through all the entries, placing them into
-			 * iovecs and sending when an end-of-packet is found
+			 * Run through entries, placing them into
+			 * iovecs and sending when an end-of-packet
+			 * is found
 			 */
-			for (i = 0; i < ndescs; i++)
-				pci_vtnet_proctx(sc, hq);
-
-			ndescs = hq_num_avail(hq);
-		}
+			pci_vtnet_proctx(sc, vq);
+		} while (vq_has_descs(vq));
 
 		/*
 		 * Generate an interrupt if needed.
 		 */
-		if (notify_on_empty(sc) ||
-		    (*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0)
-			vtnet_generate_interrupt(sc, VTNET_TXQ);
-	}
-}	
+		vq_endchains(vq, 1);
 
-static void
-pci_vtnet_ping_ctlq(struct pci_vtnet_softc *sc)
-{
-
-	DPRINTF(("vtnet: control qnotify!\n\r"));	
+		pthread_mutex_lock(&sc->tx_mtx);
+	}
 }
 
+#ifdef notyet
 static void
-pci_vtnet_ring_init(struct pci_vtnet_softc *sc, uint64_t pfn)
+pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
 {
-	struct vring_hqueue *hq;
-	int qnum = sc->vsc_curq;
-
-	assert(qnum < VTNET_MAXQ);
-
-	sc->vsc_pfn[qnum] = pfn << VRING_PFN;
-	
-	/*
-	 * Set up host pointers to the various parts of the
-	 * queue
-	 */
-	hq = &sc->vsc_hq[qnum];
-	hq->hq_size = pci_vtnet_qsize(qnum);
-
-	hq->hq_dtable = paddr_guest2host(vtnet_ctx(sc), pfn << VRING_PFN,
-					 vring_size(hq->hq_size));
-	hq->hq_avail_flags =  (uint16_t *)(hq->hq_dtable + hq->hq_size);
-	hq->hq_avail_idx = hq->hq_avail_flags + 1;
-	hq->hq_avail_ring = hq->hq_avail_flags + 2;
-	hq->hq_used_flags = (uint16_t *)roundup2((uintptr_t)hq->hq_avail_ring,
-						 VRING_ALIGN);
-	hq->hq_used_idx = hq->hq_used_flags + 1;
-	hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
 
-	/*
-	 * Initialize queue indexes
-	 */
-	hq->hq_cur_aidx = 0;
+	DPRINTF(("vtnet: control qnotify!\n\r"));
 }
+#endif
 
 static int
 pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
@@ -674,18 +513,27 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	char *devname;
 	char *vtopts;
 	int mac_provided;
+	int use_msix;
 
 	sc = malloc(sizeof(struct pci_vtnet_softc));
 	memset(sc, 0, sizeof(struct pci_vtnet_softc));
 
-	pi->pi_arg = sc;
-	sc->vsc_pi = pi;
-
 	pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+	vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
+	sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
+	sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
+	sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
+#ifdef notyet
+	sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
+        sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
+#endif
  
 	/*
 	 * Use MSI if set by user
 	 */
+	use_msix = 1;
 	if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
 		if (strcasecmp(env_msi, "yes") == 0)
 			use_msix = 0;
@@ -705,7 +553,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 		(void) strsep(&vtopts, ",");
 
 		if (vtopts != NULL) {
-			err = pci_vtnet_parsemac(vtopts, sc->vsc_macaddr);
+			err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
 			if (err != 0) {
 				free(devname);
 				return (err);
@@ -757,12 +605,12 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 		MD5Update(&mdctx, nstr, strlen(nstr));
 		MD5Final(digest, &mdctx);
 
-		sc->vsc_macaddr[0] = 0x00;
-		sc->vsc_macaddr[1] = 0xa0;
-		sc->vsc_macaddr[2] = 0x98;
-		sc->vsc_macaddr[3] = digest[0];
-		sc->vsc_macaddr[4] = digest[1];
-		sc->vsc_macaddr[5] = digest[2];
+		sc->vsc_config.mac[0] = 0x00;
+		sc->vsc_config.mac[1] = 0xa0;
+		sc->vsc_config.mac[2] = 0x98;
+		sc->vsc_config.mac[3] = digest[0];
+		sc->vsc_config.mac[4] = digest[1];
+		sc->vsc_config.mac[5] = digest[2];
 	}
 
 	/* initialize config space */
@@ -770,25 +618,16 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
 	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
 	pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
-	
-	if (use_msix) {
-		/* MSI-X support */
-		int i;
-
-		for (i = 0; i < VTNET_MAXQ; i++)
-			sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
 
-		/*
-		 * BAR 1 used to map MSI-X table and PBA
-		 */
-		if (pci_emul_add_msixcap(pi, VTNET_MAXQ, 1))
-			return (1);
-	} else {
-		/* MSI support */
-		pci_emul_add_msicap(pi, 1);
-	}
+	/* link always up */
+	sc->vsc_config.status = 1;
 	
-	pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VTNET_REGSZ);
+	/* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
+	if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
+		return (1);
+
+	/* use BAR 0 to map config regs in IO space */
+	vi_set_io_bar(&sc->vsc_vs, 0);
 
 	sc->resetting = 0;
 
@@ -796,7 +635,7 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	pthread_mutex_init(&sc->rx_mtx, NULL); 
 
 	/* 
-	 * Initialize tx semaphore & spawn TX processing thread
+	 * Initialize tx semaphore & spawn TX processing thread.
 	 * As of now, only one thread for TX desc processing is
 	 * spawned. 
 	 */
@@ -810,234 +649,41 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	return (0);
 }
 
-/*
- * Function pointer array to handle queue notifications
- */
-static void (*pci_vtnet_qnotify[VTNET_MAXQ])(struct pci_vtnet_softc *) = {
-	pci_vtnet_ping_rxq,
-	pci_vtnet_ping_txq,
-	pci_vtnet_ping_ctlq
-};
-
-static uint64_t
-vtnet_adjust_offset(struct pci_devinst *pi, uint64_t offset)
-{
-	/*
-	 * Device specific offsets used by guest would change based on
-	 * whether MSI-X capability is enabled or not
-	 */
-	if (!pci_msix_enabled(pi)) {
-		if (offset >= VTCFG_R_MSIX)
-			return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
-	}
-
-	return (offset);
-}
-
-static void
-pci_vtnet_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
-		int baridx, uint64_t offset, int size, uint64_t value)
+static int
+pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
 {
-	struct pci_vtnet_softc *sc = pi->pi_arg;
+	struct pci_vtnet_softc *sc = vsc;
 	void *ptr;
 
-	if (use_msix) {
-		if (baridx == pci_msix_table_bar(pi) ||
-		    baridx == pci_msix_pba_bar(pi)) {
-			pci_emul_msix_twrite(pi, offset, size, value);
-			return;
-		}
-	}
-
-	assert(baridx == 0);
-
-	if (offset + size > pci_vtnet_iosize(pi)) {
-		DPRINTF(("vtnet_write: 2big, offset %ld size %d\n",
-			 offset, size));
-		return;
-	}
-
-	pthread_mutex_lock(&sc->vsc_mtx);
-
-	offset = vtnet_adjust_offset(pi, offset);
-
-	switch (offset) {
-	case VTCFG_R_GUESTCAP:
-		assert(size == 4);
-		sc->vsc_features = value & VTNET_S_HOSTCAPS;
-		break;
-	case VTCFG_R_PFN:
-		assert(size == 4);
-		pci_vtnet_ring_init(sc, value);
-		break;
-	case VTCFG_R_QSEL:
-		assert(size == 2);
-		assert(value < VTNET_MAXQ);
-		sc->vsc_curq = value;
-		break;
-	case VTCFG_R_QNOTIFY:
-		assert(size == 2);
-		assert(value < VTNET_MAXQ);
-		(*pci_vtnet_qnotify[value])(sc);
-		break;
-	case VTCFG_R_STATUS:
-		assert(size == 1);
-		pci_vtnet_update_status(sc, value);
-		break;
-	case VTCFG_R_CFGVEC:
-		assert(size == 2);
-		sc->vsc_msix_table_idx[VTNET_CTLQ] = value;
-		break;
-	case VTCFG_R_QVEC:
-		assert(size == 2);
-		assert(sc->vsc_curq != VTNET_CTLQ);
-		sc->vsc_msix_table_idx[sc->vsc_curq] = value;
-		break;
-	case VTNET_R_CFG0:
-	case VTNET_R_CFG1:
-	case VTNET_R_CFG2:
-	case VTNET_R_CFG3:
-	case VTNET_R_CFG4:
-	case VTNET_R_CFG5:
-		assert((size + offset) <= (VTNET_R_CFG5 + 1));
-		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
+	if (offset < 6) {
+		assert(offset + size <= 6);
 		/*
 		 * The driver is allowed to change the MAC address
 		 */
-		sc->vsc_macaddr[offset - VTNET_R_CFG0] = value;
-		if (size == 1) {
-			*(uint8_t *) ptr = value;
-		} else if (size == 2) {
-			*(uint16_t *) ptr = value;
-		} else {
-			*(uint32_t *) ptr = value;
-		}
-		break;
-	case VTCFG_R_HOSTCAP:
-	case VTCFG_R_QNUM:
-	case VTCFG_R_ISR:
-	case VTNET_R_CFG6:
-	case VTNET_R_CFG7:
-		DPRINTF(("vtnet: write to readonly reg %ld\n\r", offset));
-		break;
-	default:
-		DPRINTF(("vtnet: unknown i/o write offset %ld\n\r", offset));
-		value = 0;
-		break;
+		ptr = &sc->vsc_config.mac[offset];
+		memcpy(ptr, &value, size);
+	} else {
+		DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+		return (1);
 	}
-
-	pthread_mutex_unlock(&sc->vsc_mtx);
+	return (0);
 }
 
-uint64_t
-pci_vtnet_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
-	       int baridx, uint64_t offset, int size)
+static int
+pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
 {
-	struct pci_vtnet_softc *sc = pi->pi_arg;
+	struct pci_vtnet_softc *sc = vsc;
 	void *ptr;
-	uint64_t value;
-
-	if (use_msix) {
-		if (baridx == pci_msix_table_bar(pi) ||
-		    baridx == pci_msix_pba_bar(pi)) {
-			return (pci_emul_msix_tread(pi, offset, size));
-		}
-	}
 
-	assert(baridx == 0);
-
-	if (offset + size > pci_vtnet_iosize(pi)) {
-		DPRINTF(("vtnet_read: 2big, offset %ld size %d\n",
-			 offset, size));
-		return (0);
-	}
-
-	pthread_mutex_lock(&sc->vsc_mtx);
-
-	offset = vtnet_adjust_offset(pi, offset);
-
-	switch (offset) {
-	case VTCFG_R_HOSTCAP:
-		assert(size == 4);
-		value = VTNET_S_HOSTCAPS;
-		break;
-	case VTCFG_R_GUESTCAP:
-		assert(size == 4);
-		value = sc->vsc_features; /* XXX never read ? */
-		break;
-	case VTCFG_R_PFN:
-		assert(size == 4);
-		value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
-		break;
-	case VTCFG_R_QNUM:
-		assert(size == 2);
-		value = pci_vtnet_qsize(sc->vsc_curq);
-		break;
-	case VTCFG_R_QSEL:
-		assert(size == 2);
-		value = sc->vsc_curq;  /* XXX never read ? */
-		break;
-	case VTCFG_R_QNOTIFY:
-		assert(size == 2);
-		value = sc->vsc_curq;  /* XXX never read ? */
-		break;
-	case VTCFG_R_STATUS:
-		assert(size == 1);
-		value = sc->vsc_status;
-		break;
-	case VTCFG_R_ISR:
-		assert(size == 1);
-		value = sc->vsc_isr;
-		sc->vsc_isr = 0;     /* a read clears this flag */
-		break;
-	case VTCFG_R_CFGVEC:
-		assert(size == 2);
-		value = sc->vsc_msix_table_idx[VTNET_CTLQ];
-		break;
-	case VTCFG_R_QVEC:
-		assert(size == 2);
-		assert(sc->vsc_curq != VTNET_CTLQ);
-		value = sc->vsc_msix_table_idx[sc->vsc_curq];
-		break;
-	case VTNET_R_CFG0:
-	case VTNET_R_CFG1:
-	case VTNET_R_CFG2:
-	case VTNET_R_CFG3:
-	case VTNET_R_CFG4:
-	case VTNET_R_CFG5:
-		assert((size + offset) <= (VTNET_R_CFG5 + 1));
-		ptr = &sc->vsc_macaddr[offset - VTNET_R_CFG0];
-		if (size == 1) {
-			value = *(uint8_t *) ptr;
-		} else if (size == 2) {
-			value = *(uint16_t *) ptr;
-		} else {
-			value = *(uint32_t *) ptr;
-		}
-		break;
-	case VTNET_R_CFG6:
-		assert(size != 4);
-		value = 0x01; /* XXX link always up */
-		break;
-	case VTNET_R_CFG7:
-		assert(size == 1);
-		value = 0; /* XXX link status in LSB */
-		break;
-	default:
-		DPRINTF(("vtnet: unknown i/o read offset %ld\n\r", offset));
-		value = 0;
-		break;
-	}
-
-	pthread_mutex_unlock(&sc->vsc_mtx);
-
-	return (value);
+	ptr = (uint8_t *)&sc->vsc_config + offset;
+	memcpy(retval, ptr, size);
+	return (0);
 }
 
 struct pci_devemu pci_de_vnet = {
 	.pe_emu = 	"virtio-net",
 	.pe_init =	pci_vtnet_init,
-	.pe_barwrite =	pci_vtnet_write,
-	.pe_barread =	pci_vtnet_read
+	.pe_barwrite =	vi_pci_write,
+	.pe_barread =	vi_pci_read
 };
 PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr.sbin/bhyve/virtio.c b/usr.sbin/bhyve/virtio.c
new file mode 100644
index 0000000..cdc9228
--- /dev/null
+++ b/usr.sbin/bhyve/virtio.c
@@ -0,0 +1,745 @@
+/*-
+ * Copyright (c) 2013  Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <pthread.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+/*
+ * Functions for dealing with generalized "virtual devices" as
+ * defined by <https://www.google.com/#output=search&q=virtio+spec>
+ */
+
+/*
+ * In case we decide to relax the "virtio softc comes at the
+ * front of virtio-based device softc" constraint, let's use
+ * this to convert.
+ */
+#define DEV_SOFTC(vs) ((void *)(vs))
+
+/*
+ * Wire up a virtio_softc: remember its constants and PCI device
+ * instance, make it the PCI instance's private argument, and give
+ * each of the vc_nvq queues a back pointer plus its own index.
+ */
+void
+vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+		void *dev_softc, struct pci_devinst *pi,
+		struct vqueue_info *queues)
+{
+	struct vqueue_info *vq;
+	int qnum;
+
+	/* DEV_SOFTC() is a plain cast, so both must be the same address */
+	assert(dev_softc == (void *)vs);
+
+	vs->vs_queues = queues;
+	vs->vs_pi = pi;
+	vs->vs_vc = vc;
+	pi->pi_arg = vs;
+
+	qnum = 0;
+	while (qnum < vc->vc_nvq) {
+		vq = &queues[qnum];
+		vq->vq_vs = vs;
+		vq->vq_num = qnum;
+		qnum++;
+	}
+}
+
+/*
+ * Device-wide reset.  Every queue is invalidated: its flags (and
+ * with them VQ_ALLOC) are cleared and its avail cursor and PFN are
+ * zeroed, while the ring pointers computed earlier are left alone.
+ *
+ * Negotiated features return to "none", the selected queue and the
+ * ISR go back to 0, and all MSI-X vector assignments (per queue and
+ * for config changes) are set to NO_VECTOR.
+ */
+void
+vi_reset_dev(struct virtio_softc *vs)
+{
+	struct vqueue_info *queues;
+	int qnum, nqueues;
+
+	queues = vs->vs_queues;
+	nqueues = vs->vs_vc->vc_nvq;
+	for (qnum = 0; qnum < nqueues; qnum++) {
+		queues[qnum].vq_flags = 0;
+		queues[qnum].vq_last_avail = 0;
+		queues[qnum].vq_pfn = 0;
+		queues[qnum].vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
+	}
+
+	/* vs_status is intentionally not touched (noted as redundant) */
+	vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
+	vs->vs_isr = 0;
+	vs->vs_curq = 0;
+	vs->vs_negotiated_caps = 0;
+}
+
+/*
+ * Allocate I/O-space BAR `barnum' (usually 0), sized to hold the
+ * common virtio registers followed by the device's own config area.
+ */
+void
+vi_set_io_bar(struct virtio_softc *vs, int barnum)
+{
+	size_t barsize;
+
+	/*
+	 * The BAR is always sized as though the MSI-X registers were
+	 * present (device config starting at CFG1), as the previous
+	 * code did.
+	 * ??? should CFG0 be used instead when MSI-X is disabled?
+	 */
+	barsize = vs->vs_vc->vc_cfgsize;
+	barsize += VTCFG_R_CFG1;
+	pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, barsize);
+}
+
+/*
+ * Initialize MSI-X vector capabilities if we're to use MSI-X,
+ * or MSI capabilities if not.
+ *
+ * We assume we want one MSI-X vector per queue, here, plus one
+ * for the config vec.
+ *
+ * vs:       device state; VIRTIO_USE_MSIX in vs_flags is set or
+ *           cleared to match use_msix.
+ * barnum:   BAR that will hold the MSI-X table and PBA.  Only
+ *           meaningful when use_msix is nonzero.
+ * use_msix: nonzero to advertise MSI-X, zero to advertise plain MSI.
+ *
+ * Returns 0 on success, 1 if the MSI-X capability could not be added.
+ */
+int
+vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
+{
+	int nvec;
+
+	if (use_msix) {
+		vs->vs_flags |= VIRTIO_USE_MSIX;
+		vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
+		nvec = vs->vs_vc->vc_nvq + 1;
+		if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
+			return (1);
+	} else {
+		vs->vs_flags &= ~VIRTIO_USE_MSIX;
+		/*
+		 * Plain MSI uses a single message.  The second argument
+		 * is a message count, not a BAR number, so barnum must
+		 * not be passed here.
+		 */
+		pci_emul_add_msicap(vs->vs_pi, 1);
+	}
+	return (0);
+}
+
+/*
+ * Initialize the currently-selected virtio queue (vs->vs_curq).
+ * The guest just gave us a page frame number, from which we can
+ * calculate the addresses of the queue.
+ *
+ * vs:  device state; vs_curq must already index a valid queue
+ *      (vi_pci_write() checks this before calling).
+ * pfn: guest page frame number of the ring, as written to the
+ *      PFN register.
+ *
+ * On return the queue's desc/avail/used pointers refer to the host
+ * mapping of the guest ring, VQ_ALLOC is set, and consumption of
+ * the avail ring restarts at index 0.
+ */
+void
+vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
+{
+	struct vqueue_info *vq;
+	uint64_t phys;
+	size_t size;
+	char *base;
+
+	vq = &vs->vs_queues[vs->vs_curq];
+	vq->vq_pfn = pfn;
+	/*
+	 * Widen before shifting: pfn is only 32 bits wide, so shifting
+	 * it in place would drop the upper address bits and map the
+	 * wrong memory for rings at high guest-physical addresses.
+	 */
+	phys = (uint64_t)pfn << VRING_PFN;
+	size = vring_size(vq->vq_qsize);
+	base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size);
+
+	/* First page(s) are descriptors... */
+	vq->vq_desc = (struct virtio_desc *)base;
+	base += vq->vq_qsize * sizeof(struct virtio_desc);
+
+	/* ... immediately followed by "avail" ring (entirely uint16_t's) */
+	vq->vq_avail = (struct vring_avail *)base;
+	base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
+
+	/* Then it's rounded up to the next page... */
+	base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);
+
+	/* ... and the last page(s) are the used ring. */
+	vq->vq_used = (struct vring_used *)base;
+
+	/* Mark queue as allocated, and start at 0 when we use it. */
+	vq->vq_flags = VQ_ALLOC;
+	vq->vq_last_avail = 0;
+}
+
+/*
+ * Helper inline for vq_getchain(): record the i'th "real"
+ * descriptor.
+ *
+ * Translates the descriptor's guest address to a host pointer and
+ * stores pointer and length in iov[i]; if flags is non-NULL the
+ * descriptor's vd_flags are stored in flags[i].  Does nothing when
+ * i is outside the caller's array (i >= n_iov), which lets the
+ * caller keep counting descriptors after the array is full.
+ */
+static inline void
+_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
+	   struct iovec *iov, int n_iov, uint16_t *flags) {
+	uint64_t addr;
+	uint32_t len;
+
+	if (i >= n_iov)
+		return;
+	/*
+	 * The descriptor lives in memory the guest can modify at any
+	 * time.  Fetch address and length exactly once so that the
+	 * length recorded in the iovec is the same one that was used
+	 * to translate (and bounds-check) the guest address.
+	 */
+	addr = vd->vd_addr;
+	len = vd->vd_len;
+	iov[i].iov_base = paddr_guest2host(ctx, addr, len);
+	iov[i].iov_len = len;
+	if (flags != NULL)
+		flags[i] = vd->vd_flags;
+}
+#define	VQ_MAX_DESCRIPTORS	512	/* see below */
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request.  If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request.  This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1.  Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more.  We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the vd_flags and vd_next field of each
+ * descriptor and tells you how many are involved.  Since some may
+ * be indirect, this also needs the vmctx (in the pci_devinst
+ * at vs->vs_pi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise, also using the vmctx) into the given iov[]
+ * array (of the given size).  If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If you want to verify the WRITE flag on each descriptor, pass a
+ * non-NULL "flags" pointer to an array of "uint16_t" of the same size
+ * as n_iov and we'll copy each vd_flags field after unwinding any
+ * indirects.
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1.  If no descriptors are ready now it simply returns 0.
+ *
+ * This function only inspects the chain: it does not advance
+ * vq->vq_last_avail, so calling it again without an intervening
+ * vq_relchain() examines the same chain.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq,
+	    struct iovec *iov, int n_iov, uint16_t *flags)
+{
+	int i;
+	u_int ndesc, n_indir;
+	u_int idx, head, next;
+	volatile struct virtio_desc *vdir, *vindir, *vp;
+	struct vmctx *ctx;
+	struct virtio_softc *vs;
+	const char *name;
+
+	vs = vq->vq_vs;
+	name = vs->vs_vc->vc_name;
+
+	/*
+	 * Note: it's the responsibility of the guest not to
+	 * update vq->vq_avail->va_idx until all of the descriptors
+	 * the guest has written are valid (including all their
+	 * vd_next fields and vd_flags).
+	 *
+	 * Compute (va_idx - last_avail) in integers mod 2**16.  This is
+	 * the number of avail-ring entries the guest driver has made
+	 * available since the last time we updated vq->vq_last_avail.
+	 *
+	 * We just need to do the subtraction as an unsigned int,
+	 * then trim off excess bits.
+	 */
+	idx = vq->vq_last_avail;
+	ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
+	if (ndesc == 0)
+		return (0);
+	if (ndesc > vq->vq_qsize) {
+		/* XXX need better way to diagnose issues */
+		fprintf(stderr,
+		    "%s: ndesc (%u) out of range, driver confused?\r\n",
+		    name, (u_int)ndesc);
+		return (-1);
+	}
+
+	/*
+	 * Now count/parse "involved" descriptors starting from
+	 * the head of the chain.
+	 *
+	 * To prevent loops, we could be more complicated and
+	 * check whether we're re-visiting a previously visited
+	 * index, but we just abort if the count gets excessive.
+	 *
+	 * The ring index is reduced with (vq_qsize - 1) as a mask,
+	 * which assumes vq_qsize is a power of two.  i counts the
+	 * "real" descriptors recorded so far and becomes the return
+	 * value.
+	 */
+	ctx = vs->vs_pi->pi_vmctx;
+	head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
+	next = head;
+	for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
+		if (next >= vq->vq_qsize) {
+			fprintf(stderr,
+			    "%s: descriptor index %u out of range, "
+			    "driver confused?\r\n",
+			    name, next);
+			return (-1);
+		}
+		vdir = &vq->vq_desc[next];
+		if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+			_vq_record(i, vdir, ctx, iov, n_iov, flags);
+			i++;
+		} else if ((vs->vs_negotiated_caps &
+		    VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+			fprintf(stderr,
+			    "%s: descriptor has forbidden INDIRECT flag, "
+			    "driver confused?\r\n",
+			    name);
+			return (-1);
+		} else {
+			n_indir = vdir->vd_len / 16;	/* 16 bytes per desc */
+			if ((vdir->vd_len & 0xf) || n_indir == 0) {
+				fprintf(stderr,
+				    "%s: invalid indir len 0x%x, "
+				    "driver confused?\r\n",
+				    name, (u_int)vdir->vd_len);
+				return (-1);
+			}
+			vindir = paddr_guest2host(ctx,
+			    vdir->vd_addr, vdir->vd_len);
+			/*
+			 * Indirects start at the 0th, then follow
+			 * their own embedded "next"s until those run
+			 * out.  Each one's indirect flag must be off
+			 * (we don't really have to check, could just
+			 * ignore errors...).
+			 *
+			 * "next" is reused here as the index into the
+			 * indirect table; the outer loop's step reloads
+			 * it from vdir->vd_next afterwards.  A next
+			 * index at or beyond n_indir is rejected (the
+			 * diagnostic prints ">" but the test is ">=").
+			 *
+			 * NOTE(review): vd_len is fetched from guest
+			 * memory several times above (count, alignment
+			 * check, mapping); nothing here guards against
+			 * the guest changing it in between -- confirm
+			 * whether that matters.
+			 */
+			next = 0;
+			for (;;) {
+				vp = &vindir[next];
+				if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
+					fprintf(stderr,
+					    "%s: indirect desc has INDIR flag,"
+					    " driver confused?\r\n",
+					    name);
+					return (-1);
+				}
+				_vq_record(i, vp, ctx, iov, n_iov, flags);
+				if (++i > VQ_MAX_DESCRIPTORS)
+					goto loopy;
+				if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
+					break;
+				next = vp->vd_next;
+				if (next >= n_indir) {
+					fprintf(stderr,
+					    "%s: invalid next %u > %u, "
+					    "driver confused?\r\n",
+					    name, (u_int)next, n_indir);
+					return (-1);
+				}
+			}
+		}
+		if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
+			return (i);	/* end of chain: normal exit */
+	}
+loopy:
+	fprintf(stderr,
+	    "%s: descriptor loop? count > %d - driver confused?\r\n",
+	    name, i);
+	return (-1);
+}
+
+/*
+ * Return the currently-first request chain to the guest, setting
+ * its I/O length to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ *
+ * Consumes one avail-ring entry (vq_last_avail is advanced) and
+ * publishes one used-ring entry (the shared vu_idx is advanced).
+ * No interrupt is raised here; see vq_endchains().
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint32_t iolen)
+{
+	uint16_t head, uidx, mask;
+	volatile struct vring_used *vuh;
+	volatile struct virtio_used *vue;
+
+	/*
+	 * Notes:
+	 *  - mask is N-1 where N is a power of 2 so computes x % N
+	 *  - vuh points to the "used" data shared with guest
+	 *  - vue points to the "used" ring entry we want to update
+	 *  - head is the same value we compute in vq_getchain().
+	 *  - the ring entry is filled in before the shared vu_idx
+	 *    is stored.  NOTE(review): no explicit memory barrier
+	 *    separates the two -- confirm that is sufficient.
+	 *
+	 * (I apologize for the two fields named vu_idx; the
+	 * virtio spec calls the one that vue points to, "id"...)
+	 */
+	mask = vq->vq_qsize - 1;
+	vuh = vq->vq_used;
+	head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask];
+
+	uidx = vuh->vu_idx;
+	vue = &vuh->vu_ring[uidx++ & mask];
+	vue->vu_idx = head; /* ie, vue->id = head */
+	vue->vu_tlen = iolen;
+	vuh->vu_idx = uidx;	/* publish: guest may now see the entry */
+}
+
+/*
+ * Called once the driver has finished a batch of "available" chains
+ * and has called vq_relchain() on each.  Decides whether the guest
+ * needs an interrupt and, if so, delivers it via vq_interrupt().
+ *
+ * used_all_avail should be nonzero if the driver consumed every
+ * available chain.  It is supplied by the caller because it is a
+ * snapshot taken when the caller decided to stop; more descriptors
+ * may have become available since.  (It is typically a constant 1.)
+ *
+ * Decision, in order:
+ *  - NOTIFY_ON_EMPTY negotiated and used_all_avail set: interrupt,
+ *    even if the used index did not move.
+ *  - EVENT_IDX in use: interrupt if the used index crossed the
+ *    guest's event threshold since vq_save_used was recorded.
+ *  - otherwise: interrupt if "used" entries were added and the
+ *    guest has not set VRING_AVAIL_F_NO_INTERRUPT.
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+	struct virtio_softc *vs = vq->vq_vs;
+	uint16_t used_now, used_before, threshold;
+	int notify;
+
+	used_now = vq->vq_used->vu_idx;
+	used_before = vq->vq_save_used;
+
+	if (used_all_avail &&
+	    (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY) != 0) {
+		notify = 1;
+	} else if ((vs->vs_flags & VIRTIO_EVENT_IDX) != 0) {
+		threshold = VQ_USED_EVENT_IDX(vq);
+		/*
+		 * Same wrap-safe comparison as the docs and the kernel
+		 * use (see src/sys/dev/virtio/virtio_ring.h).
+		 */
+		notify = (uint16_t)(used_now - threshold - 1) <
+		    (uint16_t)(used_now - used_before);
+	} else if (used_now == used_before) {
+		/* nothing was returned to the guest */
+		notify = 0;
+	} else {
+		notify = (vq->vq_avail->va_flags &
+		    VRING_AVAIL_F_NO_INTERRUPT) == 0;
+	}
+
+	if (notify)
+		vq_interrupt(vs, vq);
+}
+
+/*
+ * Table of the common virtio registers handled by vi_pci_read()
+ * and vi_pci_write(): offset, required access size, read-only flag
+ * and a name used in diagnostics.
+ *
+ * Note: these are in sorted order to make for a fast search:
+ * vi_find_cr() binary-searches on cr_offset, so new entries must
+ * keep the table in ascending offset order (presumably the
+ * VTCFG_R_* values increase in the order listed -- verify against
+ * virtio.h).
+ */
+static struct config_reg {
+	uint16_t	cr_offset;	/* register offset */
+	uint8_t		cr_size;	/* size (bytes) */
+	uint8_t		cr_ro;		/* true => reg is read only */
+	const char	*cr_name;	/* name of reg */
+} config_regs[] = {
+	{ VTCFG_R_HOSTCAP,	4, 1, "HOSTCAP" },
+	{ VTCFG_R_GUESTCAP,	4, 0, "GUESTCAP" },
+	{ VTCFG_R_PFN,		4, 0, "PFN" },
+	{ VTCFG_R_QNUM,		2, 1, "QNUM" },
+	{ VTCFG_R_QSEL,		2, 0, "QSEL" },
+	{ VTCFG_R_QNOTIFY,	2, 0, "QNOTIFY" },
+	{ VTCFG_R_STATUS,	1, 0, "STATUS" },
+	{ VTCFG_R_ISR,		1, 0, "ISR" },	/* writes pass, but are ignored */
+	{ VTCFG_R_CFGVEC,	2, 0, "CFGVEC" },
+	{ VTCFG_R_QVEC,		2, 0, "QVEC" },
+};
+
+/*
+ * Look up the common virtio register that starts at `offset'.
+ *
+ * Binary search over config_regs[], which must be sorted by
+ * cr_offset.  Returns the matching entry, or NULL when no register
+ * starts at that offset.
+ */
+static inline struct config_reg *
+vi_find_cr(int offset) {
+	int hi, lo, mid;
+	struct config_reg *cr;
+
+	/*
+	 * The bounds are signed on purpose: when the probe lands on
+	 * entry 0 and the key sorts before it, hi becomes -1 and the
+	 * loop ends.  With unsigned bounds it would wrap around to
+	 * UINT_MAX and the next probe would index far outside the
+	 * table.
+	 */
+	lo = 0;
+	hi = (int)(sizeof(config_regs) / sizeof(*config_regs)) - 1;
+	while (hi >= lo) {
+		mid = lo + (hi - lo) / 2;
+		cr = &config_regs[mid];
+		if (cr->cr_offset == offset)
+			return (cr);
+		if (cr->cr_offset < offset)
+			lo = mid + 1;
+		else
+			hi = mid - 1;
+	}
+	return (NULL);
+}
+
+/*
+ * Handle guest reads from a virtio device's BARs (the common virtio
+ * registers plus the device-specific config area).
+ * If it's to the MSI-X info, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ *
+ * Returns the value read.  "value" starts out as all-ones for the
+ * access size and is returned unchanged when the access is rejected
+ * (bad size, unknown offset, wrong size for a known register -- each
+ * with a diagnostic) or when PFN is read with an invalid queue
+ * selected.
+ *
+ * The device mutex (vs_mtx), if one is set, is held from after the
+ * MSI-X check until return.
+ */
+uint64_t
+vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+	    int baridx, uint64_t offset, int size)
+{
+	struct virtio_softc *vs = pi->pi_arg;
+	struct virtio_consts *vc;
+	struct config_reg *cr;
+	uint64_t virtio_config_size, max;
+	const char *name;
+	uint32_t newoff;
+	uint32_t value;
+	int error;
+
+	if (vs->vs_flags & VIRTIO_USE_MSIX) {
+		if (baridx == pci_msix_table_bar(pi) ||
+		    baridx == pci_msix_pba_bar(pi)) {
+			return (pci_emul_msix_tread(pi, offset, size));
+		}
+	}
+
+	/* XXX probably should do something better than just assert() */
+	assert(baridx == 0);
+
+	if (vs->vs_mtx)
+		pthread_mutex_lock(vs->vs_mtx);
+
+	vc = vs->vs_vc;
+	name = vc->vc_name;
+	value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
+
+	if (size != 1 && size != 2 && size != 4)
+		goto bad;
+
+	if (pci_msix_enabled(pi))
+		virtio_config_size = VTCFG_R_CFG1;
+	else
+		virtio_config_size = VTCFG_R_CFG0;
+
+	if (offset >= virtio_config_size) {
+		/*
+		 * Subtract off the standard size (including MSI-X
+		 * registers if enabled) and dispatch to underlying driver.
+		 * If that fails, fall into general code.
+		 *
+		 * A vc_cfgsize of 0 disables the bounds check here
+		 * (max becomes 2**32), leaving it to the driver.
+		 */
+		newoff = offset - virtio_config_size;
+		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
+		if (newoff + size > max)
+			goto bad;
+		error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
+		if (!error)
+			goto done;
+	}
+
+bad:	/* also the normal path for the common registers */
+	cr = vi_find_cr(offset);
+	if (cr == NULL || cr->cr_size != size) {
+		if (cr != NULL) {
+			/* offset must be OK, so size must be bad */
+			fprintf(stderr,
+			    "%s: read from %s: bad size %d\r\n",
+			    name, cr->cr_name, size);
+		} else {
+			fprintf(stderr,
+			    "%s: read from bad offset/size %jd/%d\r\n",
+			    name, (uintmax_t)offset, size);
+		}
+		goto done;
+	}
+
+	switch (offset) {
+	case VTCFG_R_HOSTCAP:
+		value = vc->vc_hv_caps;
+		break;
+	case VTCFG_R_GUESTCAP:
+		value = vs->vs_negotiated_caps;
+		break;
+	case VTCFG_R_PFN:
+		if (vs->vs_curq < vc->vc_nvq)
+			value = vs->vs_queues[vs->vs_curq].vq_pfn;
+		break;
+	case VTCFG_R_QNUM:
+		value = vs->vs_curq < vc->vc_nvq ?
+		    vs->vs_queues[vs->vs_curq].vq_qsize : 0;
+		break;
+	case VTCFG_R_QSEL:
+		value = vs->vs_curq;
+		break;
+	case VTCFG_R_QNOTIFY:
+		value = 0;	/* XXX */
+		break;
+	case VTCFG_R_STATUS:
+		value = vs->vs_status;
+		break;
+	case VTCFG_R_ISR:
+		value = vs->vs_isr;
+		vs->vs_isr = 0;		/* a read clears this flag */
+		break;
+	case VTCFG_R_CFGVEC:
+		value = vs->vs_msix_cfg_idx;
+		break;
+	case VTCFG_R_QVEC:
+		value = vs->vs_curq < vc->vc_nvq ?
+		    vs->vs_queues[vs->vs_curq].vq_msix_idx :
+		    VIRTIO_MSI_NO_VECTOR;
+		break;
+	}
+done:
+	if (vs->vs_mtx)
+		pthread_mutex_unlock(vs->vs_mtx);
+	return (value);
+}
+
+/*
+ * Handle guest writes to a virtio device's BARs (the common virtio
+ * registers plus the device-specific config area).
+ * If it's to the MSI-X info, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ *
+ * Rejected writes (bad size, unknown offset, wrong size for a known
+ * register, read-only register, queue index out of range) print a
+ * diagnostic and are otherwise ignored.
+ *
+ * The device mutex (vs_mtx), if one is set, is held from after the
+ * MSI-X check until return, which includes the queue-notify and
+ * reset callbacks made from here.
+ */
+void
+vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+	     int baridx, uint64_t offset, int size, uint64_t value)
+{
+	struct virtio_softc *vs = pi->pi_arg;
+	struct vqueue_info *vq;
+	struct virtio_consts *vc;
+	struct config_reg *cr;
+	uint64_t virtio_config_size, max;
+	const char *name;
+	uint32_t newoff;
+	int error;
+
+	if (vs->vs_flags & VIRTIO_USE_MSIX) {
+		if (baridx == pci_msix_table_bar(pi) ||
+		    baridx == pci_msix_pba_bar(pi)) {
+			pci_emul_msix_twrite(pi, offset, size, value);
+			return;
+		}
+	}
+
+	/* XXX probably should do something better than just assert() */
+	assert(baridx == 0);
+
+	if (vs->vs_mtx)
+		pthread_mutex_lock(vs->vs_mtx);
+
+	vc = vs->vs_vc;
+	name = vc->vc_name;
+
+	if (size != 1 && size != 2 && size != 4)
+		goto bad;
+
+	if (pci_msix_enabled(pi))
+		virtio_config_size = VTCFG_R_CFG1;
+	else
+		virtio_config_size = VTCFG_R_CFG0;
+
+	if (offset >= virtio_config_size) {
+		/*
+		 * Subtract off the standard size (including MSI-X
+		 * registers if enabled) and dispatch to underlying driver.
+		 *
+		 * A vc_cfgsize of 0 disables the bounds check here
+		 * (max becomes 2**32), leaving it to the driver.  If the
+		 * driver rejects the write we fall into the general code.
+		 */
+		newoff = offset - virtio_config_size;
+		max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
+		if (newoff + size > max)
+			goto bad;
+		error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+		if (!error)
+			goto done;
+	}
+
+bad:	/* also the normal path for the common registers */
+	cr = vi_find_cr(offset);
+	if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
+		if (cr != NULL) {
+			/* offset must be OK, wrong size and/or reg is R/O */
+			if (cr->cr_size != size)
+				fprintf(stderr,
+				    "%s: write to %s: bad size %d\r\n",
+				    name, cr->cr_name, size);
+			if (cr->cr_ro)
+				fprintf(stderr,
+				    "%s: write to read-only reg %s\r\n",
+				    name, cr->cr_name);
+		} else {
+			fprintf(stderr,
+			    "%s: write to bad offset/size %jd/%d\r\n",
+			    name, (uintmax_t)offset, size);
+		}
+		goto done;
+	}
+
+	switch (offset) {
+	case VTCFG_R_GUESTCAP:
+		vs->vs_negotiated_caps = value & vc->vc_hv_caps;
+		break;
+	case VTCFG_R_PFN:
+		if (vs->vs_curq >= vc->vc_nvq)
+			goto bad_qindex;
+		vi_vq_init(vs, value);
+		break;
+	case VTCFG_R_QSEL:
+		/*
+		 * Note that the guest is allowed to select an
+		 * invalid queue; we just need to return a QNUM
+		 * of 0 while the bad queue is selected.
+		 */
+		vs->vs_curq = value;
+		break;
+	case VTCFG_R_QNOTIFY:
+		if (value >= vc->vc_nvq) {
+			fprintf(stderr, "%s: queue %d notify out of range\r\n",
+				name, (int)value);
+			goto done;
+		}
+		vq = &vs->vs_queues[value];
+		if (vq->vq_notify)	/* per-queue handler wins */
+			(*vq->vq_notify)(DEV_SOFTC(vs), vq);
+		else if (vc->vc_qnotify)	/* else device-wide handler */
+			(*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+		else
+			fprintf(stderr,
+			    "%s: qnotify queue %d: missing vq/vc notify\r\n",
+				name, (int)value);
+		break;
+	case VTCFG_R_STATUS:
+		vs->vs_status = value;
+		if (value == 0)		/* writing 0 requests a reset */
+			(*vc->vc_reset)(DEV_SOFTC(vs));
+		break;
+	case VTCFG_R_CFGVEC:
+		vs->vs_msix_cfg_idx = value;
+		break;
+	case VTCFG_R_QVEC:
+		if (vs->vs_curq >= vc->vc_nvq)
+			goto bad_qindex;
+		vq = &vs->vs_queues[vs->vs_curq];
+		vq->vq_msix_idx = value;
+		break;
+	}
+	goto done;
+
+bad_qindex:	/* cr is non-NULL here: only reached from the switch */
+	fprintf(stderr,
+	    "%s: write config reg %s: curq %d >= max %d\r\n",
+	    name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
+done:
+	if (vs->vs_mtx)
+		pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/usr.sbin/bhyve/virtio.h b/usr.sbin/bhyve/virtio.h
index a512381..8975bf7 100644
--- a/usr.sbin/bhyve/virtio.h
+++ b/usr.sbin/bhyve/virtio.h
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2011 NetApp, Inc.
+ * Copyright (c) 2013  Chris Torek <torek @ torek net>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -11,10 +11,10 @@
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
- * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -29,47 +29,195 @@
 #ifndef	_VIRTIO_H_
 #define	_VIRTIO_H_
 
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ *    https://github.com/rustyrussell/virtio-spec
+ *    http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ *      +-----------------------------------------------+
+ *      |    "desc":  <N> descriptors, 16 bytes each    |
+ *      |   -----------------------------------------   |
+ *      |   "avail":   2 uint16; <N> uint16; 1 uint16   |
+ *      |   -----------------------------------------   |
+ *      |              pad to 4k boundary               |
+ *      +-----------------------------------------------+
+ *      |   "used": 2 x uint16; <N> elems; 1 uint16     |
+ *      |   -----------------------------------------   |
+ *      |              pad to 4k boundary               |
+ *      +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages.  In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set:
+ *	NEXT    descriptor is chained, so use its "next" field
+ *	WRITE   descriptor is for host to write into guest RAM
+ *		(else host is to read from guest RAM)
+ *	INDIRECT   descriptor address field is (guest physical)
+ *		address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>.  If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16).  Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially).  However, INDIRECT must not be set
+ * in the indirect descriptors.  Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval).  (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver.  These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>.  The <N> <ring> entries are simply indices
+ * into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value).  However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "virtio_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "virtio_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below).  The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation.  Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
 #define VRING_ALIGN	4096
 
 #define VRING_DESC_F_NEXT	(1 << 0)
 #define VRING_DESC_F_WRITE	(1 << 1)
 #define VRING_DESC_F_INDIRECT	(1 << 2)
 
+struct virtio_desc {			/* AKA vring_desc */
+	uint64_t	vd_addr;	/* guest physical address */
+	uint32_t	vd_len;		/* length of scatter/gather seg */
+	uint16_t	vd_flags;	/* VRING_DESC_F_* */
+	uint16_t	vd_next;	/* next desc if F_NEXT */
+} __packed;
+
+struct virtio_used {			/* AKA vring_used_elem */
+	uint32_t	vu_idx;		/* head of used descriptor chain */
+	uint32_t	vu_tlen;	/* length written-to */
+} __packed;
+
 #define VRING_AVAIL_F_NO_INTERRUPT   1
-#define VIRTIO_MSI_NO_VECTOR	0xFFFF
 
-struct virtio_desc {
-	uint64_t	vd_addr;
-	uint32_t	vd_len;
-	uint16_t	vd_flags;
-	uint16_t	vd_next;
+struct vring_avail {
+	uint16_t	va_flags;	/* VRING_AVAIL_F_* */
+	uint16_t	va_idx;		/* counts to 65535, then cycles */
+	uint16_t	va_ring[];	/* size N, reported in QNUM value */
+/*	uint16_t	va_used_event;	-- after N ring entries */
 } __packed;
 
-struct virtio_used {
-	uint32_t	vu_idx;
-	uint32_t	vu_tlen;
+#define	VRING_USED_F_NO_NOTIFY		1
+struct vring_used {
+	uint16_t	vu_flags;	/* VRING_USED_F_* */
+	uint16_t	vu_idx;		/* counts to 65535, then cycles */
+	struct virtio_used vu_ring[];	/* size N */
+/*	uint16_t	vu_avail_event;	-- after N ring entries */
 } __packed;
 
 /*
+ * The address of any given virtual queue is determined by a single
+ * Page Frame Number register.  The guest writes the PFN into the
+ * PCI config space.  However, a device that has two or more
+ * virtqueues can have a different PFN, and size, for each queue.
+ * The number of queues is determinable via the PCI config space
+ * VTCFG_R_QSEL register.  Writes to QSEL select the queue: 0 means
+ * queue #0, 1 means queue#1, etc.  Once a queue is selected, the
+ * remaining PFN and QNUM registers refer to that queue.
+ *
+ * QNUM is a read-only register containing a nonzero power of two
+ * that indicates the (hypervisor's) queue size.  Or, if reading it
+ * produces zero, the hypervisor does not have a corresponding
+ * queue.  (The number of possible queues depends on the virtual
+ * device.  The block device has just one; the network device
+ * provides either two -- 0 = receive, 1 = transmit -- or three,
+ * with 2 = control.)
+ *
+ * PFN is a read/write register giving the physical page address of
+ * the virtqueue in guest memory (the guest must allocate enough space
+ * based on the hypervisor's provided QNUM).
+ *
+ * QNOTIFY is effectively write-only: when the guest writes a queue
+ * number to the register, the hypervisor should scan the specified
+ * virtqueue. (Reading QNOTIFY currently always gets 0).
+ */
+
+/*
  * PFN register shift amount
  */
-#define VRING_PFN		12
+#define VRING_PFN               12
 
 /*
  * Virtio device types
+ *
+ * XXX Should really be merged with <dev/virtio/virtio.h> defines
  */
-#define VIRTIO_TYPE_NET		1
-#define VIRTIO_TYPE_BLOCK	2
+#define	VIRTIO_TYPE_NET		1
+#define	VIRTIO_TYPE_BLOCK	2
+#define	VIRTIO_TYPE_CONSOLE	3
+#define	VIRTIO_TYPE_ENTROPY	4
+#define	VIRTIO_TYPE_BALLOON	5
+#define	VIRTIO_TYPE_IOMEMORY	6
+#define	VIRTIO_TYPE_RPMSG	7
+#define	VIRTIO_TYPE_SCSI	8
+#define	VIRTIO_TYPE_9P		9
+
+/* experimental IDs start at 65535 and work down */
 
 /*
  * PCI vendor/device IDs
  */
-#define VIRTIO_VENDOR		0x1AF4
-#define VIRTIO_DEV_NET		0x1000
-#define VIRTIO_DEV_BLOCK	0x1001
+#define	VIRTIO_VENDOR		0x1AF4
+#define	VIRTIO_DEV_NET		0x1000
+#define	VIRTIO_DEV_BLOCK	0x1001
 
 /*
- * PCI config space constants
+ * PCI config space constants.
+ *
+ * If MSI-X is enabled, the ISR register is generally not used,
+ * and the configuration vector and queue vector appear at offsets
+ * 20 and 22 with the remaining configuration registers at 24.
+ * If MSI-X is not enabled, those two registers disappear and
+ * the remaining configuration registers start at offset 20.
  */
 #define VTCFG_R_HOSTCAP		0
 #define VTCFG_R_GUESTCAP	4
@@ -85,22 +233,227 @@ struct virtio_used {
 #define VTCFG_R_CFG1		24	/* With MSI-X */
 #define VTCFG_R_MSIX		20
 
-/* Feature flags */
+/*
+ * Bits in VTCFG_R_STATUS.  Guests need not actually set any of these,
+ * but a guest writing 0 to this register means "please reset".
+ */
+#define	VTCFG_STATUS_ACK	0x01	/* guest OS has acknowledged dev */
+#define	VTCFG_STATUS_DRIVER	0x02	/* guest OS driver is loaded */
+#define	VTCFG_STATUS_DRIVER_OK	0x04	/* guest OS driver ready */
+#define	VTCFG_STATUS_FAILED	0x80	/* guest has given up on this dev */
+
+/*
+ * Bits in VTCFG_R_ISR.  These apply only if not using MSI-X.
+ *
+ * (We don't [yet?] ever use CONF_CHANGED.)
+ */
+#define	VTCFG_ISR_QUEUES	0x01	/* re-scan queues */
+#define	VTCFG_ISR_CONF_CHANGED	0x80	/* configuration changed */
+
+#define VIRTIO_MSI_NO_VECTOR	0xFFFF
+
+/*
+ * Feature flags.
+ * Note: bits 0 through 23 are reserved to each device type.
+ */
 #define	VIRTIO_F_NOTIFY_ON_EMPTY	(1 << 24)
+#define	VIRTIO_RING_F_INDIRECT_DESC	(1 << 28)
+#define	VIRTIO_RING_F_EVENT_IDX		(1 << 29)
 
 /* From section 2.3, "Virtqueue Configuration", of the virtio specification */
-static inline u_int
+static inline size_t
 vring_size(u_int qsz)
 {
-	u_int size;
+	size_t size;
 
+	/* constant 3 below = va_flags, va_idx, va_used_event */
 	size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz);
 	size = roundup2(size, VRING_ALIGN);
 
+	/* constant 3 below = vu_flags, vu_idx, vu_avail_event */
 	size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
 	size = roundup2(size, VRING_ALIGN);
 
 	return (size);
 }
 
+struct vmctx;
+struct pci_devinst;
+struct vqueue_info;
+
+/*
+ * A virtual device, with some number (possibly 0) of virtual
+ * queues and some size (possibly 0) of configuration-space
+ * registers private to the device.  The virtio_softc should come
+ * at the front of each "derived class", so that a pointer to the
+ * virtio_softc is also a pointer to the more specific, derived-
+ * from-virtio driver's softc.
+ *
+ * Note: inside each hypervisor virtio driver, changes to these
+ * data structures must be locked against other threads, if any.
+ * Except for PCI config space register read/write, we assume each
+ * driver does the required locking, but we need a pointer to the
+ * lock (if there is one) for PCI config space read/write ops.
+ *
+ * When the guest reads or writes the device's config space, the
+ * generic layer checks for operations on the special registers
+ * described above.  If the offset of the register(s) being read
+ * or written is past the CFG area (CFG0 or CFG1), the request is
+ * passed on to the virtual device, after subtracting off the
+ * generic-layer size.  (So, drivers can just use the offset as
+ * an offset into "struct config", for instance.)
+ *
+ * (The virtio layer also makes sure that the read or write is to/
+ * from a "good" config offset, hence vc_cfgsize, and on BAR #0.
+ * However, the driver must verify the read or write size and offset
+ * and that no one is writing a readonly register.)
+ *
+ * The BROKED flag ("this thing done gone and broked") is for future
+ * use.
+ */
+#define	VIRTIO_USE_MSIX		0x01
+#define	VIRTIO_EVENT_IDX	0x02	/* use the event-index values */
+#define	VIRTIO_BROKED		0x08	/* ??? */
+
+struct virtio_softc {
+	struct virtio_consts *vs_vc;	/* constants (see below) */
+	int	vs_flags;		/* VIRTIO_* flags from above */
+	pthread_mutex_t *vs_mtx;	/* POSIX mutex, if any */
+	struct pci_devinst *vs_pi;	/* PCI device instance */
+	uint32_t vs_negotiated_caps;	/* negotiated capabilities */
+	struct vqueue_info *vs_queues;	/* one per vc_nvq */
+	int	vs_curq;		/* current queue */
+	uint8_t	vs_status;		/* value from last status write */
+	uint8_t	vs_isr;			/* ISR flags, if not MSI-X */
+	uint16_t vs_msix_cfg_idx;	/* MSI-X vector for config event */
+};
+
+struct virtio_consts {
+	const char *vc_name;		/* name of driver (for diagnostics) */
+	int	vc_nvq;			/* number of virtual queues */
+	size_t	vc_cfgsize;		/* size of dev-specific config regs */
+	void	(*vc_reset)(void *);	/* called on virtual device reset */
+	void	(*vc_qnotify)(void *, struct vqueue_info *);
+					/* called on QNOTIFY if no VQ notify */
+	int	(*vc_cfgread)(void *, int, int, uint32_t *);
+					/* called to read config regs */
+	int	(*vc_cfgwrite)(void *, int, int, uint32_t);
+					/* called to write config regs */
+	uint32_t vc_hv_caps;		/* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset.  When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one.  The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+#define	VQ_ALLOC	0x01	/* set once we have a pfn */
+#define	VQ_BROKED	0x02	/* ??? */
+struct vqueue_info {
+	uint16_t vq_qsize;	/* size of this queue (a power of 2) */
+	void	(*vq_notify)(void *, struct vqueue_info *);
+				/* called instead of vc_qnotify, if not NULL */
+
+	struct virtio_softc *vq_vs;	/* backpointer to softc */
+	uint16_t vq_num;	/* we're the num'th queue in the softc */
+
+	uint16_t vq_flags;	/* VQ_* flags (see above) */
+	uint16_t vq_last_avail;	/* a recent value of vq_avail->va_idx */
+	uint16_t vq_save_used;	/* saved vq_used->vu_idx; see vq_endchains */
+	uint16_t vq_msix_idx;	/* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
+
+	uint32_t vq_pfn;	/* PFN of virt queue (not shifted!) */
+
+	volatile struct virtio_desc *vq_desc;	/* descriptor array */
+	volatile struct vring_avail *vq_avail;	/* the "avail" ring */
+	volatile struct vring_used *vq_used;	/* the "used" ring */
+
+};
+/* as noted above, these are sort of backwards, name-wise */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+	(*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+	((vq)->vq_avail->va_ring[(vq)->vq_qsize])
+
+/*
+ * Is this ring ready for I/O?
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+
+	return (vq->vq_flags & VQ_ALLOC);
+}
+
+/*
+ * Are there "available" descriptors?  (This does not count
+ * how many, just returns True if there are some.)
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+	return (vq_ring_ready(vq) && vq->vq_last_avail !=
+	    vq->vq_avail->va_idx);
+}
+
+/*
+ * Called by virtio driver as it starts processing chains.  Each
+ * completed chain (obtained from vq_getchain()) is released by
+ * calling vq_relchain(), then when all are done, vq_endchains()
+ * can tell if / how-many chains were processed and know whether
+ * and how to generate an interrupt.
+ */
+static inline void
+vq_startchains(struct vqueue_info *vq)
+{
+
+	vq->vq_save_used = vq->vq_used->vu_idx;
+}
+
+/*
+ * Deliver an interrupt to guest on the given virtual queue
+ * (if possible, or a generic MSI interrupt if not using MSI-X).
+ */
+static inline void
+vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
+{
+
+	if (vs->vs_flags & VIRTIO_USE_MSIX)
+		pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
+	else {
+		vs->vs_isr |= VTCFG_ISR_QUEUES;
+		pci_generate_msi(vs->vs_pi, 0);
+	}
+}
+
+struct iovec;
+void	vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+			void *dev_softc, struct pci_devinst *pi,
+			struct vqueue_info *queues);
+int	vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void	vi_reset_dev(struct virtio_softc *);
+void	vi_set_io_bar(struct virtio_softc *, int);
+
+int	vq_getchain(struct vqueue_info *vq,
+		    struct iovec *iov, int n_iov, uint16_t *flags);
+void	vq_relchain(struct vqueue_info *vq, uint32_t iolen);
+void	vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+		     int baridx, uint64_t offset, int size);
+void	vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+		     int baridx, uint64_t offset, int size, uint64_t value);
 #endif	/* _VIRTIO_H_ */
author	grehan <grehan@FreeBSD.org>	2013-07-17 23:37:33 +0000
committer	grehan <grehan@FreeBSD.org>	2013-07-17 23:37:33 +0000
commit	a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb (patch)
tree	064903149f0c797df3873dc7997273f417c81f93 /usr.sbin
parent	b8663d4c053e282b686f3e2a2d625b21b5944176 (diff)
download	FreeBSD-src-a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb.zip FreeBSD-src-a6cf66c6cfea279d740ce36eac27ac9ec27ae0cb.tar.gz