path: root/sys/dev/virtio
author      grehan <grehan@FreeBSD.org>    2011-11-18 05:43:43 +0000
committer   grehan <grehan@FreeBSD.org>    2011-11-18 05:43:43 +0000
commit      1a42b19ed0cb934631927da4b71fde34c8afdb34 (patch)
tree        4d3e53fb89135e392dab5a548569b236b9ba213e /sys/dev/virtio
parent      7b8778fe5a226bee40cd88fc68926e898e9bd8f7 (diff)
download    FreeBSD-src-1a42b19ed0cb934631927da4b71fde34c8afdb34.zip
            FreeBSD-src-1a42b19ed0cb934631927da4b71fde34c8afdb34.tar.gz
Import virtio base, PCI front-end, and net/block/balloon drivers.
Tested on Qemu/KVM, VirtualBox, and BHyVe. Currently built as modules-only
on i386/amd64. Man pages not yet hooked up, pending review.

Submitted by:   Bryan Venteicher  bryanv at daemoninthecloset dot org
Reviewed by:    bz
MFC after:      4 weeks or so
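Since the drivers are modules-only for now, a minimal usage sketch follows; it assumes the module build (whose Makefiles live outside sys/dev/virtio) installs the drivers under their conventional file names, which are not part of this diff:

    # Hypothetical usage sketch -- module file names are assumed, not confirmed by this commit.
    # Loading the PCI front-end pulls in the virtio core via its MODULE_DEPEND() entry.
    kldload virtio_pci
    kldload virtio_blk if_vtnet virtio_balloon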
Diffstat (limited to 'sys/dev/virtio')
-rw-r--r--  sys/dev/virtio/balloon/virtio_balloon.c    569
-rw-r--r--  sys/dev/virtio/balloon/virtio_balloon.h     41
-rw-r--r--  sys/dev/virtio/block/virtio_blk.c         1149
-rw-r--r--  sys/dev/virtio/block/virtio_blk.h          106
-rw-r--r--  sys/dev/virtio/network/if_vtnet.c         2746
-rw-r--r--  sys/dev/virtio/network/if_vtnetvar.h       240
-rw-r--r--  sys/dev/virtio/network/virtio_net.h        138
-rw-r--r--  sys/dev/virtio/pci/virtio_pci.c           1081
-rw-r--r--  sys/dev/virtio/pci/virtio_pci.h             64
-rw-r--r--  sys/dev/virtio/virtio.c                    283
-rw-r--r--  sys/dev/virtio/virtio.h                    130
-rw-r--r--  sys/dev/virtio/virtio_bus_if.m              92
-rw-r--r--  sys/dev/virtio/virtio_if.m                  43
-rw-r--r--  sys/dev/virtio/virtio_ring.h               119
-rw-r--r--  sys/dev/virtio/virtqueue.c                 755
-rw-r--r--  sys/dev/virtio/virtqueue.h                  98
16 files changed, 7654 insertions, 0 deletions
diff --git a/sys/dev/virtio/balloon/virtio_balloon.c b/sys/dev/virtio/balloon/virtio_balloon.c
new file mode 100644
index 0000000..ef7aca9
--- /dev/null
+++ b/sys/dev/virtio/balloon/virtio_balloon.c
@@ -0,0 +1,569 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for VirtIO memory balloon devices. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/endian.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sglist.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/balloon/virtio_balloon.h>
+
+#include "virtio_if.h"
+
+struct vtballoon_softc {
+ device_t vtballoon_dev;
+ struct mtx vtballoon_mtx;
+ uint64_t vtballoon_features;
+ uint32_t vtballoon_flags;
+#define VTBALLOON_FLAG_DETACH 0x01
+
+ struct virtqueue *vtballoon_inflate_vq;
+ struct virtqueue *vtballoon_deflate_vq;
+
+ uint32_t vtballoon_desired_npages;
+ uint32_t vtballoon_current_npages;
+ TAILQ_HEAD(,vm_page) vtballoon_pages;
+
+ struct proc *vtballoon_kproc;
+ uint32_t *vtballoon_page_frames;
+ int vtballoon_timeout;
+};
+
+static struct virtio_feature_desc vtballoon_feature_desc[] = {
+ { VIRTIO_BALLOON_F_MUST_TELL_HOST, "MustTellHost" },
+ { VIRTIO_BALLOON_F_STATS_VQ, "StatsVq" },
+
+ { 0, NULL }
+};
+
+static int vtballoon_probe(device_t);
+static int vtballoon_attach(device_t);
+static int vtballoon_detach(device_t);
+static int vtballoon_config_change(device_t);
+
+static void vtballoon_negotiate_features(struct vtballoon_softc *);
+static int vtballoon_alloc_virtqueues(struct vtballoon_softc *);
+
+static int vtballoon_vq_intr(void *);
+
+static void vtballoon_inflate(struct vtballoon_softc *, int);
+static void vtballoon_deflate(struct vtballoon_softc *, int);
+
+static void vtballoon_send_page_frames(struct vtballoon_softc *,
+ struct virtqueue *, int);
+
+static void vtballoon_pop(struct vtballoon_softc *);
+static void vtballoon_stop(struct vtballoon_softc *);
+
+static vm_page_t
+ vtballoon_alloc_page(struct vtballoon_softc *);
+static void vtballoon_free_page(struct vtballoon_softc *, vm_page_t);
+
+static int vtballoon_sleep(struct vtballoon_softc *);
+static void vtballoon_thread(void *);
+static void vtballoon_add_sysctl(struct vtballoon_softc *);
+
+/* Features desired/implemented by this driver. */
+#define VTBALLOON_FEATURES 0
+
+/* Timeout between retries when the balloon needs inflating. */
+#define VTBALLOON_LOWMEM_TIMEOUT hz
+
+/*
+ * Maximum number of pages we'll request to inflate or deflate
+ * the balloon in one virtqueue request. Both Linux and NetBSD
+ * have settled on 256, doing up to 1MB at a time.
+ */
+#define VTBALLOON_PAGES_PER_REQUEST 256
+
+#define VTBALLOON_MTX(_sc) &(_sc)->vtballoon_mtx
+#define VTBALLOON_LOCK_INIT(_sc, _name) mtx_init(VTBALLOON_MTX((_sc)), _name, \
+ "VirtIO Balloon Lock", MTX_SPIN)
+#define VTBALLOON_LOCK(_sc) mtx_lock_spin(VTBALLOON_MTX((_sc)))
+#define VTBALLOON_UNLOCK(_sc) mtx_unlock_spin(VTBALLOON_MTX((_sc)))
+#define VTBALLOON_LOCK_DESTROY(_sc) mtx_destroy(VTBALLOON_MTX((_sc)))
+
+static device_method_t vtballoon_methods[] = {
+ /* Device methods. */
+ DEVMETHOD(device_probe, vtballoon_probe),
+ DEVMETHOD(device_attach, vtballoon_attach),
+ DEVMETHOD(device_detach, vtballoon_detach),
+
+ /* VirtIO methods. */
+ DEVMETHOD(virtio_config_change, vtballoon_config_change),
+
+ { 0, 0 }
+};
+
+static driver_t vtballoon_driver = {
+ "vtballoon",
+ vtballoon_methods,
+ sizeof(struct vtballoon_softc)
+};
+static devclass_t vtballoon_devclass;
+
+DRIVER_MODULE(virtio_balloon, virtio_pci, vtballoon_driver,
+ vtballoon_devclass, 0, 0);
+MODULE_VERSION(virtio_balloon, 1);
+MODULE_DEPEND(virtio_balloon, virtio, 1, 1, 1);
+
+static int
+vtballoon_probe(device_t dev)
+{
+
+ if (virtio_get_device_type(dev) != VIRTIO_ID_BALLOON)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Balloon Adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtballoon_attach(device_t dev)
+{
+ struct vtballoon_softc *sc;
+ int error;
+
+ sc = device_get_softc(dev);
+ sc->vtballoon_dev = dev;
+
+ VTBALLOON_LOCK_INIT(sc, device_get_nameunit(dev));
+ TAILQ_INIT(&sc->vtballoon_pages);
+
+ vtballoon_add_sysctl(sc);
+
+ virtio_set_feature_desc(dev, vtballoon_feature_desc);
+ vtballoon_negotiate_features(sc);
+
+ sc->vtballoon_page_frames = malloc(VTBALLOON_PAGES_PER_REQUEST *
+ sizeof(uint32_t), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sc->vtballoon_page_frames == NULL) {
+ error = ENOMEM;
+ device_printf(dev,
+ "cannot allocate page frame request array\n");
+ goto fail;
+ }
+
+ error = vtballoon_alloc_virtqueues(sc);
+ if (error) {
+ device_printf(dev, "cannot allocate virtqueues\n");
+ goto fail;
+ }
+
+ error = virtio_setup_intr(dev, INTR_TYPE_MISC);
+ if (error) {
+ device_printf(dev, "cannot setup virtqueue interrupts\n");
+ goto fail;
+ }
+
+ error = kproc_create(vtballoon_thread, sc, &sc->vtballoon_kproc,
+ 0, 0, "virtio_balloon");
+ if (error) {
+ device_printf(dev, "cannot create balloon kproc\n");
+ goto fail;
+ }
+
+ virtqueue_enable_intr(sc->vtballoon_inflate_vq);
+ virtqueue_enable_intr(sc->vtballoon_deflate_vq);
+
+fail:
+ if (error)
+ vtballoon_detach(dev);
+
+ return (error);
+}
+
+static int
+vtballoon_detach(device_t dev)
+{
+ struct vtballoon_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtballoon_kproc != NULL) {
+ VTBALLOON_LOCK(sc);
+ sc->vtballoon_flags |= VTBALLOON_FLAG_DETACH;
+ wakeup_one(sc);
+ msleep_spin(sc->vtballoon_kproc, VTBALLOON_MTX(sc),
+ "vtbdth", 0);
+ VTBALLOON_UNLOCK(sc);
+
+ sc->vtballoon_kproc = NULL;
+ }
+
+ if (device_is_attached(dev)) {
+ vtballoon_pop(sc);
+ vtballoon_stop(sc);
+ }
+
+ if (sc->vtballoon_page_frames != NULL) {
+ free(sc->vtballoon_page_frames, M_DEVBUF);
+ sc->vtballoon_page_frames = NULL;
+ }
+
+ VTBALLOON_LOCK_DESTROY(sc);
+
+ return (0);
+}
+
+static int
+vtballoon_config_change(device_t dev)
+{
+ struct vtballoon_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBALLOON_LOCK(sc);
+ wakeup_one(sc);
+ VTBALLOON_UNLOCK(sc);
+
+ return (1);
+}
+
+static void
+vtballoon_negotiate_features(struct vtballoon_softc *sc)
+{
+ device_t dev;
+ uint64_t features;
+
+ dev = sc->vtballoon_dev;
+ features = virtio_negotiate_features(dev, VTBALLOON_FEATURES);
+ sc->vtballoon_features = features;
+}
+
+static int
+vtballoon_alloc_virtqueues(struct vtballoon_softc *sc)
+{
+ device_t dev;
+ struct vq_alloc_info vq_info[2];
+ int nvqs;
+
+ dev = sc->vtballoon_dev;
+ nvqs = 2;
+
+ VQ_ALLOC_INFO_INIT(&vq_info[0], 0, vtballoon_vq_intr, sc,
+ &sc->vtballoon_inflate_vq, "%s inflate", device_get_nameunit(dev));
+
+ VQ_ALLOC_INFO_INIT(&vq_info[1], 0, vtballoon_vq_intr, sc,
+ &sc->vtballoon_deflate_vq, "%s deflate", device_get_nameunit(dev));
+
+ return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info));
+}
+
+static int
+vtballoon_vq_intr(void *xsc)
+{
+ struct vtballoon_softc *sc;
+
+ sc = xsc;
+
+ VTBALLOON_LOCK(sc);
+ wakeup_one(sc);
+ VTBALLOON_UNLOCK(sc);
+
+ return (1);
+}
+
+static void
+vtballoon_inflate(struct vtballoon_softc *sc, int npages)
+{
+ struct virtqueue *vq;
+ vm_page_t m;
+ int i;
+
+ vq = sc->vtballoon_inflate_vq;
+ m = NULL;
+
+ if (npages > VTBALLOON_PAGES_PER_REQUEST)
+ npages = VTBALLOON_PAGES_PER_REQUEST;
+ KASSERT(npages > 0, ("balloon doesn't need inflating?"));
+
+ for (i = 0; i < npages; i++) {
+ if ((m = vtballoon_alloc_page(sc)) == NULL)
+ break;
+
+ sc->vtballoon_page_frames[i] =
+ VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT;
+
+ KASSERT(m->queue == PQ_NONE, ("allocated page on queue"));
+ TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, pageq);
+ }
+
+ if (i > 0)
+ vtballoon_send_page_frames(sc, vq, i);
+
+ if (m == NULL)
+ sc->vtballoon_timeout = VTBALLOON_LOWMEM_TIMEOUT;
+}
+
+static void
+vtballoon_deflate(struct vtballoon_softc *sc, int npages)
+{
+ TAILQ_HEAD(, vm_page) free_pages;
+ struct virtqueue *vq;
+ vm_page_t m;
+ int i;
+
+ vq = sc->vtballoon_deflate_vq;
+ TAILQ_INIT(&free_pages);
+
+ if (npages > VTBALLOON_PAGES_PER_REQUEST)
+ npages = VTBALLOON_PAGES_PER_REQUEST;
+ KASSERT(npages > 0, ("balloon doesn't need deflating?"));
+
+ for (i = 0; i < npages; i++) {
+ m = TAILQ_FIRST(&sc->vtballoon_pages);
+ KASSERT(m != NULL, ("no more pages to deflate"));
+
+ sc->vtballoon_page_frames[i] =
+ VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT;
+
+ TAILQ_REMOVE(&sc->vtballoon_pages, m, pageq);
+ TAILQ_INSERT_TAIL(&free_pages, m, pageq);
+ }
+
+ if (i > 0) {
+ /* Always tell host first before freeing the pages. */
+ vtballoon_send_page_frames(sc, vq, i);
+
+ while ((m = TAILQ_FIRST(&free_pages)) != NULL) {
+ TAILQ_REMOVE(&free_pages, m, pageq);
+ vtballoon_free_page(sc, m);
+ }
+ }
+
+ KASSERT((TAILQ_EMPTY(&sc->vtballoon_pages) &&
+ sc->vtballoon_current_npages == 0) ||
+ (!TAILQ_EMPTY(&sc->vtballoon_pages) &&
+ sc->vtballoon_current_npages != 0), ("balloon empty?"));
+}
+
+static void
+vtballoon_send_page_frames(struct vtballoon_softc *sc, struct virtqueue *vq,
+ int npages)
+{
+ struct sglist sg;
+ struct sglist_seg segs[1];
+ void *c;
+ int error;
+
+ sglist_init(&sg, 1, segs);
+
+ error = sglist_append(&sg, sc->vtballoon_page_frames,
+ npages * sizeof(uint32_t));
+ KASSERT(error == 0, ("error adding page frames to sglist"));
+
+ error = virtqueue_enqueue(vq, vq, &sg, 1, 0);
+ KASSERT(error == 0, ("error enqueuing page frames to virtqueue"));
+
+ /*
+ * Inflate and deflate operations are done synchronously. The
+ * interrupt handler will wake us up.
+ */
+ VTBALLOON_LOCK(sc);
+ virtqueue_notify(vq);
+
+ while ((c = virtqueue_dequeue(vq, NULL)) == NULL)
+ msleep_spin(sc, VTBALLOON_MTX(sc), "vtbspf", 0);
+ VTBALLOON_UNLOCK(sc);
+
+ KASSERT(c == vq, ("unexpected balloon operation response"));
+}
+
+static void
+vtballoon_pop(struct vtballoon_softc *sc)
+{
+
+ while (!TAILQ_EMPTY(&sc->vtballoon_pages))
+ vtballoon_deflate(sc, sc->vtballoon_current_npages);
+}
+
+static void
+vtballoon_stop(struct vtballoon_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtballoon_inflate_vq);
+ virtqueue_disable_intr(sc->vtballoon_deflate_vq);
+
+ virtio_stop(sc->vtballoon_dev);
+}
+
+static vm_page_t
+vtballoon_alloc_page(struct vtballoon_softc *sc)
+{
+ vm_page_t m;
+
+ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
+ VM_ALLOC_NOOBJ);
+ if (m != NULL)
+ sc->vtballoon_current_npages++;
+
+ return (m);
+}
+
+static void
+vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m)
+{
+
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
+ sc->vtballoon_current_npages--;
+}
+
+static uint32_t
+vtballoon_desired_size(struct vtballoon_softc *sc)
+{
+ uint32_t desired;
+
+ desired = virtio_read_dev_config_4(sc->vtballoon_dev,
+ offsetof(struct virtio_balloon_config, num_pages));
+
+ return (le32toh(desired));
+}
+
+static void
+vtballoon_update_size(struct vtballoon_softc *sc)
+{
+
+ virtio_write_dev_config_4(sc->vtballoon_dev,
+ offsetof(struct virtio_balloon_config, actual),
+ htole32(sc->vtballoon_current_npages));
+
+}
+
+static int
+vtballoon_sleep(struct vtballoon_softc *sc)
+{
+ int rc, timeout;
+ uint32_t current, desired;
+
+ rc = 0;
+ current = sc->vtballoon_current_npages;
+
+ VTBALLOON_LOCK(sc);
+ for (;;) {
+ if (sc->vtballoon_flags & VTBALLOON_FLAG_DETACH) {
+ rc = 1;
+ break;
+ }
+
+ desired = vtballoon_desired_size(sc);
+ sc->vtballoon_desired_npages = desired;
+
+ /*
+ * If given, use non-zero timeout on the first time through
+ * the loop. On subsequent times, timeout will be zero so
+ * we will reevaluate the desired size of the balloon and
+ * break out to retry if needed.
+ */
+ timeout = sc->vtballoon_timeout;
+ sc->vtballoon_timeout = 0;
+
+ if (current > desired)
+ break;
+ if (current < desired && timeout == 0)
+ break;
+
+ msleep_spin(sc, VTBALLOON_MTX(sc), "vtbslp", timeout);
+ }
+ VTBALLOON_UNLOCK(sc);
+
+ return (rc);
+}
+
+static void
+vtballoon_thread(void *xsc)
+{
+ struct vtballoon_softc *sc;
+ uint32_t current, desired;
+
+ sc = xsc;
+
+ for (;;) {
+ if (vtballoon_sleep(sc) != 0)
+ break;
+
+ current = sc->vtballoon_current_npages;
+ desired = sc->vtballoon_desired_npages;
+
+ if (desired != current) {
+ if (desired > current)
+ vtballoon_inflate(sc, desired - current);
+ else
+ vtballoon_deflate(sc, current - desired);
+
+ vtballoon_update_size(sc);
+ }
+ }
+
+ kproc_exit(0);
+}
+
+static void
+vtballoon_add_sysctl(struct vtballoon_softc *sc)
+{
+ device_t dev;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *tree;
+ struct sysctl_oid_list *child;
+
+ dev = sc->vtballoon_dev;
+ ctx = device_get_sysctl_ctx(dev);
+ tree = device_get_sysctl_tree(dev);
+ child = SYSCTL_CHILDREN(tree);
+
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "desired",
+ CTLFLAG_RD, &sc->vtballoon_desired_npages, sizeof(uint32_t),
+ "Desired balloon size in pages");
+
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "current",
+ CTLFLAG_RD, &sc->vtballoon_current_npages, sizeof(uint32_t),
+ "Current balloon size in pages");
+}
diff --git a/sys/dev/virtio/balloon/virtio_balloon.h b/sys/dev/virtio/balloon/virtio_balloon.h
new file mode 100644
index 0000000..cea84ba
--- /dev/null
+++ b/sys/dev/virtio/balloon/virtio_balloon.h
@@ -0,0 +1,41 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_BALLOON_H
+#define _VIRTIO_BALLOON_H
+
+#include <sys/types.h>
+
+/* Feature bits. */
+#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0x1 /* Tell before reclaiming pages */
+#define VIRTIO_BALLOON_F_STATS_VQ 0x2 /* Memory stats virtqueue */
+
+/* Size of a PFN in the balloon interface. */
+#define VIRTIO_BALLOON_PFN_SHIFT 12
+
+struct virtio_balloon_config {
+ /* Number of pages host wants Guest to give up. */
+ uint32_t num_pages;
+
+ /* Number of pages we've actually got in balloon. */
+ uint32_t actual;
+};
+
+#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
+#define VIRTIO_BALLOON_S_SWAP_OUT 1 /* Amount of memory swapped out */
+#define VIRTIO_BALLOON_S_MAJFLT 2 /* Number of major faults */
+#define VIRTIO_BALLOON_S_MINFLT 3 /* Number of minor faults */
+#define VIRTIO_BALLOON_S_MEMFREE 4 /* Total amount of free memory */
+#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */
+#define VIRTIO_BALLOON_S_NR 6
+
+struct virtio_balloon_stat {
+ uint16_t tag;
+ uint64_t val;
+} __packed;
+
+#endif /* _VIRTIO_BALLOON_H */
diff --git a/sys/dev/virtio/block/virtio_blk.c b/sys/dev/virtio/block/virtio_blk.c
new file mode 100644
index 0000000..09783a8
--- /dev/null
+++ b/sys/dev/virtio/block/virtio_blk.c
@@ -0,0 +1,1149 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for VirtIO block devices. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+
+#include <geom/geom_disk.h>
+#include <vm/uma.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/block/virtio_blk.h>
+
+#include "virtio_if.h"
+
+struct vtblk_request {
+ struct virtio_blk_outhdr vbr_hdr;
+ struct bio *vbr_bp;
+ uint8_t vbr_ack;
+
+ TAILQ_ENTRY(vtblk_request) vbr_link;
+};
+
+struct vtblk_softc {
+ device_t vtblk_dev;
+ struct mtx vtblk_mtx;
+ uint64_t vtblk_features;
+ uint32_t vtblk_flags;
+#define VTBLK_FLAG_INDIRECT 0x0001
+#define VTBLK_FLAG_READONLY 0x0002
+#define VTBLK_FLAG_DETACHING 0x0004
+#define VTBLK_FLAG_SUSPENDED 0x0008
+#define VTBLK_FLAG_DUMPING 0x0010
+
+ struct virtqueue *vtblk_vq;
+ struct sglist *vtblk_sglist;
+ struct disk *vtblk_disk;
+
+ struct bio_queue_head vtblk_bioq;
+ TAILQ_HEAD(, vtblk_request)
+ vtblk_req_free;
+ TAILQ_HEAD(, vtblk_request)
+ vtblk_req_ready;
+
+ struct taskqueue *vtblk_tq;
+ struct task vtblk_intr_task;
+
+ int vtblk_sector_size;
+ int vtblk_max_nsegs;
+ int vtblk_unit;
+ int vtblk_request_count;
+
+ struct vtblk_request vtblk_dump_request;
+};
+
+static struct virtio_feature_desc vtblk_feature_desc[] = {
+ { VIRTIO_BLK_F_BARRIER, "HostBarrier" },
+ { VIRTIO_BLK_F_SIZE_MAX, "MaxSegSize" },
+ { VIRTIO_BLK_F_SEG_MAX, "MaxNumSegs" },
+ { VIRTIO_BLK_F_GEOMETRY, "DiskGeometry" },
+ { VIRTIO_BLK_F_RO, "ReadOnly" },
+ { VIRTIO_BLK_F_BLK_SIZE, "BlockSize" },
+ { VIRTIO_BLK_F_SCSI, "SCSICmds" },
+ { VIRTIO_BLK_F_FLUSH, "FlushCmd" },
+ { VIRTIO_BLK_F_TOPOLOGY, "Topology" },
+
+ { 0, NULL }
+};
+
+static int vtblk_modevent(module_t, int, void *);
+
+static int vtblk_probe(device_t);
+static int vtblk_attach(device_t);
+static int vtblk_detach(device_t);
+static int vtblk_suspend(device_t);
+static int vtblk_resume(device_t);
+static int vtblk_shutdown(device_t);
+
+static void vtblk_negotiate_features(struct vtblk_softc *);
+static int vtblk_maximum_segments(struct vtblk_softc *,
+ struct virtio_blk_config *);
+static int vtblk_alloc_virtqueue(struct vtblk_softc *);
+static void vtblk_alloc_disk(struct vtblk_softc *,
+ struct virtio_blk_config *);
+static void vtblk_create_disk(struct vtblk_softc *);
+
+static int vtblk_open(struct disk *);
+static int vtblk_close(struct disk *);
+static int vtblk_ioctl(struct disk *, u_long, void *, int,
+ struct thread *);
+static int vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
+static void vtblk_strategy(struct bio *);
+
+static void vtblk_startio(struct vtblk_softc *);
+static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
+static int vtblk_execute_request(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static int vtblk_vq_intr(void *);
+static void vtblk_intr_task(void *, int);
+
+static void vtblk_stop(struct vtblk_softc *);
+
+static void vtblk_get_ident(struct vtblk_softc *);
+static void vtblk_prepare_dump(struct vtblk_softc *);
+static int vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
+static int vtblk_flush_dump(struct vtblk_softc *);
+static int vtblk_poll_request(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static void vtblk_drain_vq(struct vtblk_softc *, int);
+static void vtblk_drain(struct vtblk_softc *);
+
+static int vtblk_alloc_requests(struct vtblk_softc *);
+static void vtblk_free_requests(struct vtblk_softc *);
+static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
+static void vtblk_enqueue_request(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
+static void vtblk_enqueue_ready(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static void vtblk_bio_error(struct bio *, int);
+
+/* Tunables. */
+static int vtblk_no_ident = 0;
+TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
+
+/* Features desired/implemented by this driver. */
+#define VTBLK_FEATURES \
+ (VIRTIO_BLK_F_BARRIER | \
+ VIRTIO_BLK_F_SIZE_MAX | \
+ VIRTIO_BLK_F_SEG_MAX | \
+ VIRTIO_BLK_F_GEOMETRY | \
+ VIRTIO_BLK_F_RO | \
+ VIRTIO_BLK_F_BLK_SIZE | \
+ VIRTIO_BLK_F_FLUSH | \
+ VIRTIO_RING_F_INDIRECT_DESC)
+
+#define VTBLK_MTX(_sc) &(_sc)->vtblk_mtx
+#define VTBLK_LOCK_INIT(_sc, _name) \
+ mtx_init(VTBLK_MTX((_sc)), (_name), \
+ "VTBLK Lock", MTX_DEF)
+#define VTBLK_LOCK(_sc) mtx_lock(VTBLK_MTX((_sc)))
+#define VTBLK_TRYLOCK(_sc) mtx_trylock(VTBLK_MTX((_sc)))
+#define VTBLK_UNLOCK(_sc) mtx_unlock(VTBLK_MTX((_sc)))
+#define VTBLK_LOCK_DESTROY(_sc) mtx_destroy(VTBLK_MTX((_sc)))
+#define VTBLK_LOCK_ASSERT(_sc) mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
+#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
+ mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
+
+#define VTBLK_BIO_SEGMENTS(_bp) sglist_count((_bp)->bio_data, (_bp)->bio_bcount)
+
+#define VTBLK_DISK_NAME "vtbd"
+
+/*
+ * Each block request uses at least two segments - one for the header
+ * and one for the status.
+ */
+#define VTBLK_MIN_SEGMENTS 2
+
+static uma_zone_t vtblk_req_zone;
+
+static device_method_t vtblk_methods[] = {
+ /* Device methods. */
+ DEVMETHOD(device_probe, vtblk_probe),
+ DEVMETHOD(device_attach, vtblk_attach),
+ DEVMETHOD(device_detach, vtblk_detach),
+ DEVMETHOD(device_suspend, vtblk_suspend),
+ DEVMETHOD(device_resume, vtblk_resume),
+ DEVMETHOD(device_shutdown, vtblk_shutdown),
+
+ { 0, 0 }
+};
+
+static driver_t vtblk_driver = {
+ "vtblk",
+ vtblk_methods,
+ sizeof(struct vtblk_softc)
+};
+static devclass_t vtblk_devclass;
+
+DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
+ vtblk_modevent, 0);
+MODULE_VERSION(virtio_blk, 1);
+MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
+
+static int
+vtblk_modevent(module_t mod, int type, void *unused)
+{
+ int error;
+
+ error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ vtblk_req_zone = uma_zcreate("vtblk_request",
+ sizeof(struct vtblk_request),
+ NULL, NULL, NULL, NULL, 0, 0);
+ break;
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
+ if (uma_zone_get_cur(vtblk_req_zone) > 0)
+ error = EBUSY;
+ else if (type == MOD_UNLOAD) {
+ uma_zdestroy(vtblk_req_zone);
+ vtblk_req_zone = NULL;
+ }
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+vtblk_probe(device_t dev)
+{
+
+ if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Block Adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtblk_attach(device_t dev)
+{
+ struct vtblk_softc *sc;
+ struct virtio_blk_config blkcfg;
+ int error;
+
+ sc = device_get_softc(dev);
+ sc->vtblk_dev = dev;
+ sc->vtblk_unit = device_get_unit(dev);
+
+ VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
+
+ bioq_init(&sc->vtblk_bioq);
+ TAILQ_INIT(&sc->vtblk_req_free);
+ TAILQ_INIT(&sc->vtblk_req_ready);
+
+ virtio_set_feature_desc(dev, vtblk_feature_desc);
+ vtblk_negotiate_features(sc);
+
+ if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
+ sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
+ sc->vtblk_flags |= VTBLK_FLAG_READONLY;
+
+ /* Get local copy of config. */
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) == 0) {
+ bzero(&blkcfg, sizeof(struct virtio_blk_config));
+ virtio_read_device_config(dev, 0, &blkcfg,
+ offsetof(struct virtio_blk_config, physical_block_exp));
+ } else
+ virtio_read_device_config(dev, 0, &blkcfg,
+ sizeof(struct virtio_blk_config));
+
+ /*
+ * With the current sglist(9) implementation, it is not easy
+ * for us to support a maximum segment size as adjacent
+ * segments are coalesced. For now, just make sure it's larger
+ * than the maximum supported transfer size.
+ */
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
+ if (blkcfg.size_max < MAXPHYS) {
+ error = ENOTSUP;
+ device_printf(dev, "host requires unsupported "
+ "maximum segment size feature\n");
+ goto fail;
+ }
+ }
+
+ sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
+
+ /*
+ * Allocate working sglist. The number of segments may be too
+ * large to safely store on the stack.
+ */
+ sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
+ if (sc->vtblk_sglist == NULL) {
+ error = ENOMEM;
+ device_printf(dev, "cannot allocate sglist\n");
+ goto fail;
+ }
+
+ error = vtblk_alloc_virtqueue(sc);
+ if (error) {
+ device_printf(dev, "cannot allocate virtqueue\n");
+ goto fail;
+ }
+
+ error = vtblk_alloc_requests(sc);
+ if (error) {
+ device_printf(dev, "cannot preallocate requests\n");
+ goto fail;
+ }
+
+ vtblk_alloc_disk(sc, &blkcfg);
+
+ TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
+ sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &sc->vtblk_tq);
+ if (sc->vtblk_tq == NULL) {
+ error = ENOMEM;
+ device_printf(dev, "cannot allocate taskqueue\n");
+ goto fail;
+ }
+ taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
+ device_get_nameunit(dev));
+
+ error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
+ if (error) {
+ device_printf(dev, "cannot setup virtqueue interrupt\n");
+ goto fail;
+ }
+
+ vtblk_create_disk(sc);
+
+ virtqueue_enable_intr(sc->vtblk_vq);
+
+fail:
+ if (error)
+ vtblk_detach(dev);
+
+ return (error);
+}
+
+static int
+vtblk_detach(device_t dev)
+{
+ struct vtblk_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBLK_LOCK(sc);
+ sc->vtblk_flags |= VTBLK_FLAG_DETACHING;
+ if (device_is_attached(dev))
+ vtblk_stop(sc);
+ VTBLK_UNLOCK(sc);
+
+ if (sc->vtblk_tq != NULL) {
+ taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
+ taskqueue_free(sc->vtblk_tq);
+ sc->vtblk_tq = NULL;
+ }
+
+ vtblk_drain(sc);
+
+ if (sc->vtblk_disk != NULL) {
+ disk_destroy(sc->vtblk_disk);
+ sc->vtblk_disk = NULL;
+ }
+
+ if (sc->vtblk_sglist != NULL) {
+ sglist_free(sc->vtblk_sglist);
+ sc->vtblk_sglist = NULL;
+ }
+
+ VTBLK_LOCK_DESTROY(sc);
+
+ return (0);
+}
+
+static int
+vtblk_suspend(device_t dev)
+{
+ struct vtblk_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBLK_LOCK(sc);
+ sc->vtblk_flags |= VTBLK_FLAG_SUSPENDED;
+ /* TODO Wait for any inflight IO to complete? */
+ VTBLK_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtblk_resume(device_t dev)
+{
+ struct vtblk_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBLK_LOCK(sc);
+ sc->vtblk_flags &= ~VTBLK_FLAG_SUSPENDED;
+ /* TODO Resume IO? */
+ VTBLK_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtblk_shutdown(device_t dev)
+{
+
+ return (0);
+}
+
+static int
+vtblk_open(struct disk *dp)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ return (sc->vtblk_flags & VTBLK_FLAG_DETACHING ? ENXIO : 0);
+}
+
+static int
+vtblk_close(struct disk *dp)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ return (0);
+}
+
+static int
+vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
+ struct thread *td)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ return (ENOTTY);
+}
+
+static int
+vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
+ size_t length)
+{
+ struct disk *dp;
+ struct vtblk_softc *sc;
+ int error;
+
+ dp = arg;
+ error = 0;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ if (VTBLK_TRYLOCK(sc) == 0) {
+ device_printf(sc->vtblk_dev,
+ "softc already locked, cannot dump...\n");
+ return (EBUSY);
+ }
+
+ if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
+ vtblk_prepare_dump(sc);
+ sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
+ }
+
+ if (length > 0)
+ error = vtblk_write_dump(sc, virtual, offset, length);
+ else if (virtual == NULL && offset == 0)
+ error = vtblk_flush_dump(sc);
+
+ VTBLK_UNLOCK(sc);
+
+ return (error);
+}
+
+static void
+vtblk_strategy(struct bio *bp)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = bp->bio_disk->d_drv1) == NULL) {
+ vtblk_bio_error(bp, EINVAL);
+ return;
+ }
+
+ /*
+ * Fail any write if RO. Unfortunately, there does not seem to
+ * be a better way to report our readonly'ness to GEOM above.
+ */
+ if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
+ (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
+ vtblk_bio_error(bp, EROFS);
+ return;
+ }
+
+ /*
+ * Prevent read/write buffers spanning too many segments from
+ * getting into the queue. This should only trip if d_maxsize
+ * was incorrectly set.
+ */
+ if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
+ KASSERT(VTBLK_BIO_SEGMENTS(bp) <= sc->vtblk_max_nsegs -
+ VTBLK_MIN_SEGMENTS,
+ ("bio spanned too many segments: %d, max: %d",
+ VTBLK_BIO_SEGMENTS(bp),
+ sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS));
+ }
+
+ VTBLK_LOCK(sc);
+ if ((sc->vtblk_flags & VTBLK_FLAG_DETACHING) == 0) {
+ bioq_disksort(&sc->vtblk_bioq, bp);
+ vtblk_startio(sc);
+ } else
+ vtblk_bio_error(bp, ENXIO);
+ VTBLK_UNLOCK(sc);
+}
+
+static void
+vtblk_negotiate_features(struct vtblk_softc *sc)
+{
+ device_t dev;
+ uint64_t features;
+
+ dev = sc->vtblk_dev;
+ features = VTBLK_FEATURES;
+
+ sc->vtblk_features = virtio_negotiate_features(dev, features);
+}
+
+static int
+vtblk_maximum_segments(struct vtblk_softc *sc,
+ struct virtio_blk_config *blkcfg)
+{
+ device_t dev;
+ int nsegs;
+
+ dev = sc->vtblk_dev;
+ nsegs = VTBLK_MIN_SEGMENTS;
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
+ nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
+ if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
+ nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
+ } else
+ nsegs += 1;
+
+ return (nsegs);
+}
+
+static int
+vtblk_alloc_virtqueue(struct vtblk_softc *sc)
+{
+ device_t dev;
+ struct vq_alloc_info vq_info;
+
+ dev = sc->vtblk_dev;
+
+ VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
+ vtblk_vq_intr, sc, &sc->vtblk_vq,
+ "%s request", device_get_nameunit(dev));
+
+ return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
+}
+
+static void
+vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
+{
+ device_t dev;
+ struct disk *dp;
+
+ dev = sc->vtblk_dev;
+
+ sc->vtblk_disk = dp = disk_alloc();
+ dp->d_open = vtblk_open;
+ dp->d_close = vtblk_close;
+ dp->d_ioctl = vtblk_ioctl;
+ dp->d_strategy = vtblk_strategy;
+ dp->d_name = VTBLK_DISK_NAME;
+ dp->d_unit = sc->vtblk_unit;
+ dp->d_drv1 = sc;
+
+ if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
+ dp->d_dump = vtblk_dump;
+
+ /* Capacity is always in 512-byte units. */
+ dp->d_mediasize = blkcfg->capacity * 512;
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
+ sc->vtblk_sector_size = blkcfg->blk_size;
+ else
+ sc->vtblk_sector_size = 512;
+ dp->d_sectorsize = sc->vtblk_sector_size;
+
+ /*
+ * The VirtIO maximum I/O size is given in terms of segments.
+ * However, FreeBSD limits I/O size by logical buffer size, not
+ * by physically contiguous pages. Therefore, we have to assume
+ * no pages are contiguous. This may impose an artificially low
+ * maximum I/O size. But in practice, since QEMU advertises 128
+ * segments, this gives us a maximum IO size of 125 * PAGE_SIZE,
+ * which is typically greater than MAXPHYS. Eventually we should
+ * just advertise MAXPHYS and split buffers that are too big.
+ *
+ * Note we must subtract one additional segment in case of non
+ * page aligned buffers.
+ */
+ dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
+ PAGE_SIZE;
+ if (dp->d_maxsize < PAGE_SIZE)
+ dp->d_maxsize = PAGE_SIZE; /* XXX */
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
+ dp->d_fwsectors = blkcfg->geometry.sectors;
+ dp->d_fwheads = blkcfg->geometry.heads;
+ }
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
+ dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
+}
+
+static void
+vtblk_create_disk(struct vtblk_softc *sc)
+{
+ struct disk *dp;
+
+ dp = sc->vtblk_disk;
+
+ /*
+ * Retrieving the identification string must be done after
+ * the virtqueue interrupt is setup otherwise it will hang.
+ */
+ vtblk_get_ident(sc);
+
+ device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
+ (uintmax_t) dp->d_mediasize >> 20,
+ (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
+ dp->d_sectorsize);
+
+ disk_create(dp, DISK_VERSION);
+}
+
+static void
+vtblk_startio(struct vtblk_softc *sc)
+{
+ struct virtqueue *vq;
+ struct vtblk_request *req;
+ int enq;
+
+ vq = sc->vtblk_vq;
+ enq = 0;
+
+ VTBLK_LOCK_ASSERT(sc);
+
+ if (sc->vtblk_flags & VTBLK_FLAG_SUSPENDED)
+ return;
+
+ while (!virtqueue_full(vq)) {
+ if ((req = vtblk_dequeue_ready(sc)) == NULL)
+ req = vtblk_bio_request(sc);
+ if (req == NULL)
+ break;
+
+ if (vtblk_execute_request(sc, req) != 0) {
+ vtblk_enqueue_ready(sc, req);
+ break;
+ }
+
+ enq++;
+ }
+
+ if (enq > 0)
+ virtqueue_notify(vq);
+}
+
+static struct vtblk_request *
+vtblk_bio_request(struct vtblk_softc *sc)
+{
+ struct bio_queue_head *bioq;
+ struct vtblk_request *req;
+ struct bio *bp;
+
+ bioq = &sc->vtblk_bioq;
+
+ if (bioq_first(bioq) == NULL)
+ return (NULL);
+
+ req = vtblk_dequeue_request(sc);
+ if (req == NULL)
+ return (NULL);
+
+ bp = bioq_takefirst(bioq);
+ req->vbr_bp = bp;
+ req->vbr_ack = -1;
+ req->vbr_hdr.ioprio = 1;
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
+ break;
+ case BIO_READ:
+ req->vbr_hdr.type = VIRTIO_BLK_T_IN;
+ req->vbr_hdr.sector = bp->bio_offset / 512;
+ break;
+ case BIO_WRITE:
+ req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
+ req->vbr_hdr.sector = bp->bio_offset / 512;
+ break;
+ default:
+ KASSERT(0, ("bio with unhandled cmd: %d", bp->bio_cmd));
+ req->vbr_hdr.type = -1;
+ break;
+ }
+
+ if (bp->bio_flags & BIO_ORDERED)
+ req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
+
+ return (req);
+}
+
+static int
+vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+ struct sglist *sg;
+ struct bio *bp;
+ int writable, error;
+
+ sg = sc->vtblk_sglist;
+ bp = req->vbr_bp;
+ writable = 0;
+
+ VTBLK_LOCK_ASSERT(sc);
+
+ sglist_reset(sg);
+ error = sglist_append(sg, &req->vbr_hdr,
+ sizeof(struct virtio_blk_outhdr));
+ KASSERT(error == 0, ("error adding header to sglist"));
+ KASSERT(sg->sg_nseg == 1,
+ ("header spanned multiple segments: %d", sg->sg_nseg));
+
+ if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
+ error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
+ KASSERT(error == 0, ("error adding buffer to sglist"));
+
+ /* BIO_READ means the host writes into our buffer. */
+ if (bp->bio_cmd == BIO_READ)
+ writable += sg->sg_nseg - 1;
+ }
+
+ error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
+ KASSERT(error == 0, ("error adding ack to sglist"));
+ writable++;
+
+ KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
+ ("fewer than min segments: %d", sg->sg_nseg));
+
+ error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
+ sg->sg_nseg - writable, writable);
+
+ return (error);
+}
+
+static int
+vtblk_vq_intr(void *xsc)
+{
+ struct vtblk_softc *sc;
+
+ sc = xsc;
+
+ virtqueue_disable_intr(sc->vtblk_vq);
+ taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);
+
+ return (1);
+}
+
+static void
+vtblk_intr_task(void *arg, int pending)
+{
+ struct vtblk_softc *sc;
+ struct vtblk_request *req;
+ struct virtqueue *vq;
+ struct bio *bp;
+
+ sc = arg;
+ vq = sc->vtblk_vq;
+
+ VTBLK_LOCK(sc);
+ if (sc->vtblk_flags & VTBLK_FLAG_DETACHING) {
+ VTBLK_UNLOCK(sc);
+ return;
+ }
+
+ while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
+ bp = req->vbr_bp;
+
+ if (req->vbr_ack == VIRTIO_BLK_S_OK)
+ bp->bio_resid = 0;
+ else {
+ bp->bio_flags |= BIO_ERROR;
+ if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP)
+ bp->bio_error = ENOTSUP;
+ else
+ bp->bio_error = EIO;
+ }
+
+ biodone(bp);
+ vtblk_enqueue_request(sc, req);
+ }
+
+ vtblk_startio(sc);
+
+ if (virtqueue_enable_intr(vq) != 0) {
+ virtqueue_disable_intr(vq);
+ VTBLK_UNLOCK(sc);
+ taskqueue_enqueue_fast(sc->vtblk_tq,
+ &sc->vtblk_intr_task);
+ return;
+ }
+
+ VTBLK_UNLOCK(sc);
+}
+
+static void
+vtblk_stop(struct vtblk_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtblk_vq);
+ virtio_stop(sc->vtblk_dev);
+}
+
+static void
+vtblk_get_ident(struct vtblk_softc *sc)
+{
+ struct bio buf;
+ struct disk *dp;
+ struct vtblk_request *req;
+ int len, error;
+
+ dp = sc->vtblk_disk;
+ len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
+
+ if (vtblk_no_ident != 0)
+ return;
+
+ req = vtblk_dequeue_request(sc);
+ if (req == NULL)
+ return;
+
+ req->vbr_ack = -1;
+ req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
+ req->vbr_hdr.ioprio = 1;
+ req->vbr_hdr.sector = 0;
+
+ req->vbr_bp = &buf;
+ bzero(&buf, sizeof(struct bio));
+
+ buf.bio_cmd = BIO_READ;
+ buf.bio_data = dp->d_ident;
+ buf.bio_bcount = len;
+
+ VTBLK_LOCK(sc);
+ error = vtblk_poll_request(sc, req);
+ vtblk_enqueue_request(sc, req);
+ VTBLK_UNLOCK(sc);
+
+ if (error) {
+ device_printf(sc->vtblk_dev,
+ "error getting device identifier: %d\n", error);
+ }
+}
+
+static void
+vtblk_prepare_dump(struct vtblk_softc *sc)
+{
+ device_t dev;
+ struct virtqueue *vq;
+
+ dev = sc->vtblk_dev;
+ vq = sc->vtblk_vq;
+
+ vtblk_stop(sc);
+
+ /*
+ * Drain all requests caught in-flight in the virtqueue,
+ * skipping biodone(). When dumping, only one request is
+ * outstanding at a time, and we just poll the virtqueue
+ * for the response.
+ */
+ vtblk_drain_vq(sc, 1);
+
+ if (virtio_reinit(dev, sc->vtblk_features) != 0)
+ panic("cannot reinit VirtIO block device during dump");
+
+ virtqueue_disable_intr(vq);
+ virtio_reinit_complete(dev);
+}
+
+static int
+vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
+ size_t length)
+{
+ struct bio buf;
+ struct vtblk_request *req;
+
+ req = &sc->vtblk_dump_request;
+ req->vbr_ack = -1;
+ req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
+ req->vbr_hdr.ioprio = 1;
+ req->vbr_hdr.sector = offset / 512;
+
+ req->vbr_bp = &buf;
+ bzero(&buf, sizeof(struct bio));
+
+ buf.bio_cmd = BIO_WRITE;
+ buf.bio_data = virtual;
+ buf.bio_bcount = length;
+
+ return (vtblk_poll_request(sc, req));
+}
+
+static int
+vtblk_flush_dump(struct vtblk_softc *sc)
+{
+ struct bio buf;
+ struct vtblk_request *req;
+
+ req = &sc->vtblk_dump_request;
+ req->vbr_ack = -1;
+ req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
+ req->vbr_hdr.ioprio = 1;
+ req->vbr_hdr.sector = 0;
+
+ req->vbr_bp = &buf;
+ bzero(&buf, sizeof(struct bio));
+
+ buf.bio_cmd = BIO_FLUSH;
+
+ return (vtblk_poll_request(sc, req));
+}
+
+static int
+vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+ device_t dev;
+ struct virtqueue *vq;
+ struct vtblk_request *r;
+ int error;
+
+ dev = sc->vtblk_dev;
+ vq = sc->vtblk_vq;
+
+ if (!virtqueue_empty(vq))
+ return (EBUSY);
+
+ error = vtblk_execute_request(sc, req);
+ if (error)
+ return (error);
+
+ virtqueue_notify(vq);
+
+ r = virtqueue_poll(vq, NULL);
+ KASSERT(r == req, ("unexpected request response"));
+
+ if (req->vbr_ack != VIRTIO_BLK_S_OK) {
+ error = req->vbr_ack == VIRTIO_BLK_S_UNSUPP ? ENOTSUP : EIO;
+ if (bootverbose)
+ device_printf(dev,
+ "vtblk_poll_request: IO error: %d\n", error);
+ }
+
+ return (error);
+}
+
+static void
+vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
+{
+ struct virtqueue *vq;
+ struct vtblk_request *req;
+ int last;
+
+ vq = sc->vtblk_vq;
+ last = 0;
+
+ while ((req = virtqueue_drain(vq, &last)) != NULL) {
+ if (!skip_done)
+ vtblk_bio_error(req->vbr_bp, ENXIO);
+
+ vtblk_enqueue_request(sc, req);
+ }
+
+ KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
+}
+
+static void
+vtblk_drain(struct vtblk_softc *sc)
+{
+ struct bio_queue_head *bioq;
+ struct vtblk_request *req;
+ struct bio *bp;
+
+ bioq = &sc->vtblk_bioq;
+
+ if (sc->vtblk_vq != NULL)
+ vtblk_drain_vq(sc, 0);
+
+ while ((req = vtblk_dequeue_ready(sc)) != NULL) {
+ vtblk_bio_error(req->vbr_bp, ENXIO);
+ vtblk_enqueue_request(sc, req);
+ }
+
+ while (bioq_first(bioq) != NULL) {
+ bp = bioq_takefirst(bioq);
+ vtblk_bio_error(bp, ENXIO);
+ }
+
+ vtblk_free_requests(sc);
+}
+
+static int
+vtblk_alloc_requests(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+ int i, size;
+
+ size = virtqueue_size(sc->vtblk_vq);
+
+ /*
+ * Preallocate sufficient requests to keep the virtqueue full. Each
+ * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
+ * the number allocated when indirect descriptors are not available.
+ */
+ if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
+ size /= VTBLK_MIN_SEGMENTS;
+
+ for (i = 0; i < size; i++) {
+ req = uma_zalloc(vtblk_req_zone, M_NOWAIT);
+ if (req == NULL)
+ return (ENOMEM);
+
+ sc->vtblk_request_count++;
+ vtblk_enqueue_request(sc, req);
+ }
+
+ return (0);
+}
+
+static void
+vtblk_free_requests(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+
+ while ((req = vtblk_dequeue_request(sc)) != NULL) {
+ sc->vtblk_request_count--;
+ uma_zfree(vtblk_req_zone, req);
+ }
+
+ KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
+}
+
+static struct vtblk_request *
+vtblk_dequeue_request(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+
+ req = TAILQ_FIRST(&sc->vtblk_req_free);
+ if (req != NULL)
+ TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
+
+ return (req);
+}
+
+static void
+vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+
+ bzero(req, sizeof(struct vtblk_request));
+ TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
+}
+
+static struct vtblk_request *
+vtblk_dequeue_ready(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+
+ req = TAILQ_FIRST(&sc->vtblk_req_ready);
+ if (req != NULL)
+ TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
+
+ return (req);
+}
+
+static void
+vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+
+ TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
+}
+
+static void
+vtblk_bio_error(struct bio *bp, int error)
+{
+
+ biofinish(bp, NULL, error);
+}
diff --git a/sys/dev/virtio/block/virtio_blk.h b/sys/dev/virtio/block/virtio_blk.h
new file mode 100644
index 0000000..4fb32e0
--- /dev/null
+++ b/sys/dev/virtio/block/virtio_blk.h
@@ -0,0 +1,106 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_BLK_H
+#define _VIRTIO_BLK_H
+
+#include <sys/types.h>
+
+/* Feature bits */
+#define VIRTIO_BLK_F_BARRIER 0x0001 /* Does host support barriers? */
+#define VIRTIO_BLK_F_SIZE_MAX 0x0002 /* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX 0x0004 /* Indicates maximum # of segments */
+#define VIRTIO_BLK_F_GEOMETRY 0x0010 /* Legacy geometry available */
+#define VIRTIO_BLK_F_RO 0x0020 /* Disk is read-only */
+#define VIRTIO_BLK_F_BLK_SIZE 0x0040 /* Block size of disk is available*/
+#define VIRTIO_BLK_F_SCSI 0x0080 /* Supports scsi command passthru */
+#define VIRTIO_BLK_F_FLUSH 0x0200 /* Cache flush command support */
+#define VIRTIO_BLK_F_TOPOLOGY 0x0400 /* Topology information is available */
+
+#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */
+
+struct virtio_blk_config {
+ /* The capacity (in 512-byte sectors). */
+ uint64_t capacity;
+ /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */
+ uint32_t size_max;
+ /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
+ uint32_t seg_max;
+ /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */
+ struct virtio_blk_geometry {
+ uint16_t cylinders;
+ uint8_t heads;
+ uint8_t sectors;
+ } geometry;
+
+ /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
+ uint32_t blk_size;
+
+ /* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */
+ /* exponent for physical block per logical block. */
+ uint8_t physical_block_exp;
+ /* alignment offset in logical blocks. */
+ uint8_t alignment_offset;
+ /* minimum I/O size without performance penalty in logical blocks. */
+ uint16_t min_io_size;
+ /* optimal sustained I/O size in logical blocks. */
+ uint32_t opt_io_size;
+} __packed;
+
+/*
+ * Command types
+ *
+ * Usage is a bit tricky as some bits are used as flags and some are not.
+ *
+ * Rules:
+ * VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or
+ * VIRTIO_BLK_T_BARRIER. VIRTIO_BLK_T_FLUSH is a command of its own
+ * and may not be combined with any of the other flags.
+ */
+
+/* These two define direction. */
+#define VIRTIO_BLK_T_IN 0
+#define VIRTIO_BLK_T_OUT 1
+
+/* This bit says it's a scsi command, not an actual read or write. */
+#define VIRTIO_BLK_T_SCSI_CMD 2
+
+/* Cache flush command */
+#define VIRTIO_BLK_T_FLUSH 4
+
+/* Get device ID command */
+#define VIRTIO_BLK_T_GET_ID 8
+
+/* Barrier before this op. */
+#define VIRTIO_BLK_T_BARRIER 0x80000000
+
+/* ID string length */
+#define VIRTIO_BLK_ID_BYTES 20
+
+/* This is the first element of the read scatter-gather list. */
+struct virtio_blk_outhdr {
+ /* VIRTIO_BLK_T* */
+ uint32_t type;
+ /* io priority. */
+ uint32_t ioprio;
+ /* Sector (ie. 512 byte offset) */
+ uint64_t sector;
+};
+
+struct virtio_scsi_inhdr {
+ uint32_t errors;
+ uint32_t data_len;
+ uint32_t sense_len;
+ uint32_t residual;
+};
+
+/* And this is the final byte of the write scatter-gather list. */
+#define VIRTIO_BLK_S_OK 0
+#define VIRTIO_BLK_S_IOERR 1
+#define VIRTIO_BLK_S_UNSUPP 2
+
+#endif /* _VIRTIO_BLK_H */
diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c
new file mode 100644
index 0000000..22becb1
--- /dev/null
+++ b/sys/dev/virtio/network/if_vtnet.c
@@ -0,0 +1,2746 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for VirtIO network devices. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef HAVE_KERNEL_OPTION_HEADERS
+#include "opt_device_polling.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/random.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <vm/uma.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_media.h>
+#include <net/if_vlan_var.h>
+
+#include <net/bpf.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <netinet/sctp.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/network/virtio_net.h>
+#include <dev/virtio/network/if_vtnetvar.h>
+
+#include "virtio_if.h"
+
+static int vtnet_modevent(module_t, int, void *);
+
+static int vtnet_probe(device_t);
+static int vtnet_attach(device_t);
+static int vtnet_detach(device_t);
+static int vtnet_suspend(device_t);
+static int vtnet_resume(device_t);
+static int vtnet_shutdown(device_t);
+static int vtnet_config_change(device_t);
+
+static void vtnet_negotiate_features(struct vtnet_softc *);
+static int vtnet_alloc_virtqueues(struct vtnet_softc *);
+static void vtnet_get_hwaddr(struct vtnet_softc *);
+static void vtnet_set_hwaddr(struct vtnet_softc *);
+static int vtnet_is_link_up(struct vtnet_softc *);
+static void vtnet_update_link_status(struct vtnet_softc *);
+static void vtnet_watchdog(struct vtnet_softc *);
+static void vtnet_config_change_task(void *, int);
+static int vtnet_change_mtu(struct vtnet_softc *, int);
+static int vtnet_ioctl(struct ifnet *, u_long, caddr_t);
+
+static int vtnet_init_rx_vq(struct vtnet_softc *);
+static void vtnet_free_rx_mbufs(struct vtnet_softc *);
+static void vtnet_free_tx_mbufs(struct vtnet_softc *);
+static void vtnet_free_ctrl_vq(struct vtnet_softc *);
+
+#ifdef DEVICE_POLLING
+static poll_handler_t vtnet_poll;
+#endif
+
+static struct mbuf * vtnet_alloc_rxbuf(struct vtnet_softc *, int,
+ struct mbuf **);
+static int vtnet_replace_rxbuf(struct vtnet_softc *,
+ struct mbuf *, int);
+static int vtnet_newbuf(struct vtnet_softc *);
+static void vtnet_discard_merged_rxbuf(struct vtnet_softc *, int);
+static void vtnet_discard_rxbuf(struct vtnet_softc *, struct mbuf *);
+static int vtnet_enqueue_rxbuf(struct vtnet_softc *, struct mbuf *);
+static void vtnet_vlan_tag_remove(struct mbuf *);
+static int vtnet_rx_csum(struct vtnet_softc *, struct mbuf *,
+ struct virtio_net_hdr *);
+static int vtnet_rxeof_merged(struct vtnet_softc *, struct mbuf *, int);
+static int vtnet_rxeof(struct vtnet_softc *, int, int *);
+static void vtnet_rx_intr_task(void *, int);
+static int vtnet_rx_vq_intr(void *);
+
+static void vtnet_txeof(struct vtnet_softc *);
+static struct mbuf * vtnet_tx_offload(struct vtnet_softc *, struct mbuf *,
+ struct virtio_net_hdr *);
+static int vtnet_enqueue_txbuf(struct vtnet_softc *, struct mbuf **,
+ struct vtnet_tx_header *);
+static int vtnet_encap(struct vtnet_softc *, struct mbuf **);
+static void vtnet_start_locked(struct ifnet *);
+static void vtnet_start(struct ifnet *);
+static void vtnet_tick(void *);
+static void vtnet_tx_intr_task(void *, int);
+static int vtnet_tx_vq_intr(void *);
+
+static void vtnet_stop(struct vtnet_softc *);
+static int vtnet_reinit(struct vtnet_softc *);
+static void vtnet_init_locked(struct vtnet_softc *);
+static void vtnet_init(void *);
+
+static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
+ struct sglist *, int, int);
+
+static void vtnet_rx_filter(struct vtnet_softc *sc);
+static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int);
+static int vtnet_set_promisc(struct vtnet_softc *, int);
+static int vtnet_set_allmulti(struct vtnet_softc *, int);
+static void vtnet_rx_filter_mac(struct vtnet_softc *);
+
+static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t);
+static void vtnet_rx_filter_vlan(struct vtnet_softc *);
+static void vtnet_set_vlan_filter(struct vtnet_softc *, int, uint16_t);
+static void vtnet_register_vlan(void *, struct ifnet *, uint16_t);
+static void vtnet_unregister_vlan(void *, struct ifnet *, uint16_t);
+
+static int vtnet_ifmedia_upd(struct ifnet *);
+static void vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *);
+
+static void vtnet_add_statistics(struct vtnet_softc *);
+
+static int vtnet_enable_rx_intr(struct vtnet_softc *);
+static int vtnet_enable_tx_intr(struct vtnet_softc *);
+static void vtnet_disable_rx_intr(struct vtnet_softc *);
+static void vtnet_disable_tx_intr(struct vtnet_softc *);
+
+/* Tunables. */
+static int vtnet_csum_disable = 0;
+TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable);
+static int vtnet_tso_disable = 0;
+TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable);
+static int vtnet_lro_disable = 0;
+TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable);
+
+/*
+ * Reducing the number of transmit completed interrupts can
+ * improve performance. To do so, the define below keeps the
+ * Tx vq interrupt disabled and adds calls to vtnet_txeof()
+ * in the start and watchdog paths. The price to pay for this
+ * is the m_free'ing of transmitted mbufs may be delayed until
+ * the watchdog fires.
+ */
+#define VTNET_TX_INTR_MODERATION
+
+static uma_zone_t vtnet_tx_header_zone;
+
+static struct virtio_feature_desc vtnet_feature_desc[] = {
+ { VIRTIO_NET_F_CSUM, "TxChecksum" },
+ { VIRTIO_NET_F_GUEST_CSUM, "RxChecksum" },
+ { VIRTIO_NET_F_MAC, "MacAddress" },
+ { VIRTIO_NET_F_GSO, "TxAllGSO" },
+ { VIRTIO_NET_F_GUEST_TSO4, "RxTSOv4" },
+ { VIRTIO_NET_F_GUEST_TSO6, "RxTSOv6" },
+ { VIRTIO_NET_F_GUEST_ECN, "RxECN" },
+ { VIRTIO_NET_F_GUEST_UFO, "RxUFO" },
+ { VIRTIO_NET_F_HOST_TSO4, "TxTSOv4" },
+ { VIRTIO_NET_F_HOST_TSO6, "TxTSOv6" },
+ { VIRTIO_NET_F_HOST_ECN, "TxTSOECN" },
+ { VIRTIO_NET_F_HOST_UFO, "TxUFO" },
+ { VIRTIO_NET_F_MRG_RXBUF, "MrgRxBuf" },
+ { VIRTIO_NET_F_STATUS, "Status" },
+ { VIRTIO_NET_F_CTRL_VQ, "ControlVq" },
+ { VIRTIO_NET_F_CTRL_RX, "RxMode" },
+ { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" },
+ { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" },
+
+ { 0, NULL }
+};
+
+static device_method_t vtnet_methods[] = {
+ /* Device methods. */
+ DEVMETHOD(device_probe, vtnet_probe),
+ DEVMETHOD(device_attach, vtnet_attach),
+ DEVMETHOD(device_detach, vtnet_detach),
+ DEVMETHOD(device_suspend, vtnet_suspend),
+ DEVMETHOD(device_resume, vtnet_resume),
+ DEVMETHOD(device_shutdown, vtnet_shutdown),
+
+ /* VirtIO methods. */
+ DEVMETHOD(virtio_config_change, vtnet_config_change),
+
+ { 0, 0 }
+};
+
+static driver_t vtnet_driver = {
+ "vtnet",
+ vtnet_methods,
+ sizeof(struct vtnet_softc)
+};
+static devclass_t vtnet_devclass;
+
+DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass,
+ vtnet_modevent, 0);
+MODULE_VERSION(vtnet, 1);
+MODULE_DEPEND(vtnet, virtio, 1, 1, 1);
+
+static int
+vtnet_modevent(module_t mod, int type, void *unused)
+{
+ int error;
+
+ error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr",
+ sizeof(struct vtnet_tx_header),
+ NULL, NULL, NULL, NULL, 0, 0);
+ break;
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
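+		/*
+		 * Refuse to quiesce or unload while transmit headers are
+		 * still outstanding in the UMA zone.
+		 */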
+ if (uma_zone_get_cur(vtnet_tx_header_zone) > 0)
+ error = EBUSY;
+ else if (type == MOD_UNLOAD) {
+ uma_zdestroy(vtnet_tx_header_zone);
+ vtnet_tx_header_zone = NULL;
+ }
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+vtnet_probe(device_t dev)
+{
+
+ if (virtio_get_device_type(dev) != VIRTIO_ID_NETWORK)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Networking Adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtnet_attach(device_t dev)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+ int tx_size, error;
+
+ sc = device_get_softc(dev);
+ sc->vtnet_dev = dev;
+
+ VTNET_LOCK_INIT(sc);
+ callout_init_mtx(&sc->vtnet_tick_ch, VTNET_MTX(sc), 0);
+
+ ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd,
+ vtnet_ifmedia_sts);
+ ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL);
+ ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE);
+
+ vtnet_add_statistics(sc);
+
+ virtio_set_feature_desc(dev, vtnet_feature_desc);
+ vtnet_negotiate_features(sc);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) {
+ sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS;
+ sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else
+ sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
+
+ sc->vtnet_rx_mbuf_size = MCLBYTES;
+ sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) {
+ sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX))
+ sc->vtnet_flags |= VTNET_FLAG_CTRL_RX;
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN))
+ sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER;
+ }
+
+ vtnet_get_hwaddr(sc);
+
+ error = vtnet_alloc_virtqueues(sc);
+ if (error) {
+ device_printf(dev, "cannot allocate virtqueues\n");
+ goto fail;
+ }
+
+ ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER);
+ if (ifp == NULL) {
+ device_printf(dev, "cannot allocate ifnet structure\n");
+ error = ENOSPC;
+ goto fail;
+ }
+
+ ifp->if_softc = sc;
+ if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_init = vtnet_init;
+ ifp->if_start = vtnet_start;
+ ifp->if_ioctl = vtnet_ioctl;
+
+ sc->vtnet_rx_size = virtqueue_size(sc->vtnet_rx_vq);
+ sc->vtnet_rx_process_limit = sc->vtnet_rx_size;
+
+ tx_size = virtqueue_size(sc->vtnet_tx_vq);
+ sc->vtnet_tx_size = tx_size;
+ IFQ_SET_MAXLEN(&ifp->if_snd, tx_size - 1);
+ ifp->if_snd.ifq_drv_maxlen = tx_size - 1;
+ IFQ_SET_READY(&ifp->if_snd);
+
+ ether_ifattach(ifp, sc->vtnet_hwaddr);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS))
+ ifp->if_capabilities |= IFCAP_LINKSTATE;
+
+ /* Tell the upper layer(s) we support long frames. */
+ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
+ ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) {
+ ifp->if_capabilities |= IFCAP_TXCSUM;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4))
+ ifp->if_capabilities |= IFCAP_TSO4;
+ if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
+ ifp->if_capabilities |= IFCAP_TSO6;
+ if (ifp->if_capabilities & IFCAP_TSO)
+ ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN))
+ sc->vtnet_flags |= VTNET_FLAG_TSO_ECN;
+ }
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) {
+ ifp->if_capabilities |= IFCAP_RXCSUM;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) ||
+ virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6))
+ ifp->if_capabilities |= IFCAP_LRO;
+ }
+
+ if (ifp->if_capabilities & IFCAP_HWCSUM) {
+ /*
+ * VirtIO does not support VLAN tagging, but we can fake
+ * it by inserting and removing the 802.1Q header during
+ * transmit and receive. We are then able to do checksum
+ * offloading of VLAN frames.
+ */
+ ifp->if_capabilities |=
+ IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
+ }
+
+ ifp->if_capenable = ifp->if_capabilities;
+
+ /*
+ * Capabilities after here are not enabled by default.
+ */
+
+ if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
+ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER;
+
+ sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+ vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
+ sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+ vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
+ }
+
+#ifdef DEVICE_POLLING
+ ifp->if_capabilities |= IFCAP_POLLING;
+#endif
+
+ TASK_INIT(&sc->vtnet_rx_intr_task, 0, vtnet_rx_intr_task, sc);
+ TASK_INIT(&sc->vtnet_tx_intr_task, 0, vtnet_tx_intr_task, sc);
+ TASK_INIT(&sc->vtnet_cfgchg_task, 0, vtnet_config_change_task, sc);
+
+ sc->vtnet_tq = taskqueue_create_fast("vtnet_taskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &sc->vtnet_tq);
+ if (sc->vtnet_tq == NULL) {
+ error = ENOMEM;
+ device_printf(dev, "cannot allocate taskqueue\n");
+ ether_ifdetach(ifp);
+ goto fail;
+ }
+ taskqueue_start_threads(&sc->vtnet_tq, 1, PI_NET, "%s taskq",
+ device_get_nameunit(dev));
+
+ error = virtio_setup_intr(dev, INTR_TYPE_NET);
+ if (error) {
+ device_printf(dev, "cannot setup virtqueue interrupts\n");
+ taskqueue_free(sc->vtnet_tq);
+ sc->vtnet_tq = NULL;
+ ether_ifdetach(ifp);
+ goto fail;
+ }
+
+ /*
+ * Device defaults to promiscuous mode for backwards
+ * compatibility. Turn it off if possible.
+ */
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
+ VTNET_LOCK(sc);
+ if (vtnet_set_promisc(sc, 0) != 0) {
+ ifp->if_flags |= IFF_PROMISC;
+ device_printf(dev,
+ "cannot disable promiscuous mode\n");
+ }
+ VTNET_UNLOCK(sc);
+ } else
+ ifp->if_flags |= IFF_PROMISC;
+
+fail:
+ if (error)
+ vtnet_detach(dev);
+
+ return (error);
+}
+
+static int
+vtnet_detach(device_t dev)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+
+ sc = device_get_softc(dev);
+ ifp = sc->vtnet_ifp;
+
+ KASSERT(mtx_initialized(VTNET_MTX(sc)),
+ ("vtnet mutex not initialized"));
+
+#ifdef DEVICE_POLLING
+ if (ifp != NULL && ifp->if_capenable & IFCAP_POLLING)
+ ether_poll_deregister(ifp);
+#endif
+
+ if (device_is_attached(dev)) {
+ VTNET_LOCK(sc);
+ vtnet_stop(sc);
+ VTNET_UNLOCK(sc);
+
+ callout_drain(&sc->vtnet_tick_ch);
+ taskqueue_drain(taskqueue_fast, &sc->vtnet_cfgchg_task);
+
+ ether_ifdetach(ifp);
+ }
+
+ if (sc->vtnet_tq != NULL) {
+ taskqueue_drain(sc->vtnet_tq, &sc->vtnet_rx_intr_task);
+ taskqueue_drain(sc->vtnet_tq, &sc->vtnet_tx_intr_task);
+ taskqueue_free(sc->vtnet_tq);
+ sc->vtnet_tq = NULL;
+ }
+
+ if (sc->vtnet_vlan_attach != NULL) {
+ EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach);
+ sc->vtnet_vlan_attach = NULL;
+ }
+ if (sc->vtnet_vlan_detach != NULL) {
+		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach);
+ sc->vtnet_vlan_detach = NULL;
+ }
+
+ if (ifp) {
+ if_free(ifp);
+ sc->vtnet_ifp = NULL;
+ }
+
+ if (sc->vtnet_rx_vq != NULL)
+ vtnet_free_rx_mbufs(sc);
+ if (sc->vtnet_tx_vq != NULL)
+ vtnet_free_tx_mbufs(sc);
+ if (sc->vtnet_ctrl_vq != NULL)
+ vtnet_free_ctrl_vq(sc);
+
+ ifmedia_removeall(&sc->vtnet_media);
+ VTNET_LOCK_DESTROY(sc);
+
+ return (0);
+}
+
+static int
+vtnet_suspend(device_t dev)
+{
+ struct vtnet_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTNET_LOCK(sc);
+ vtnet_stop(sc);
+ sc->vtnet_flags |= VTNET_FLAG_SUSPENDED;
+ VTNET_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtnet_resume(device_t dev)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+
+ sc = device_get_softc(dev);
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK(sc);
+ if (ifp->if_flags & IFF_UP)
+ vtnet_init_locked(sc);
+ sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED;
+ VTNET_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtnet_shutdown(device_t dev)
+{
+
+ /*
+ * Suspend already does all of what we need to
+ * do here; we just never expect to be resumed.
+ */
+ return (vtnet_suspend(dev));
+}
+
+static int
+vtnet_config_change(device_t dev)
+{
+ struct vtnet_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ taskqueue_enqueue_fast(taskqueue_fast, &sc->vtnet_cfgchg_task);
+
+ return (1);
+}
+
+static void
+vtnet_negotiate_features(struct vtnet_softc *sc)
+{
+ device_t dev;
+ uint64_t mask, features;
+
+ dev = sc->vtnet_dev;
+ mask = 0;
+
+ if (vtnet_csum_disable)
+ mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
+
+ /*
+ * TSO and LRO are only available when their corresponding
+ * checksum offload feature is also negotiated.
+ */
+
+ if (vtnet_csum_disable || vtnet_tso_disable)
+ mask |= VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 |
+ VIRTIO_NET_F_HOST_ECN;
+
+ if (vtnet_csum_disable || vtnet_lro_disable)
+ mask |= VTNET_LRO_FEATURES;
+
+ features = VTNET_FEATURES & ~mask;
+#ifdef VTNET_TX_INTR_MODERATION
+ features |= VIRTIO_F_NOTIFY_ON_EMPTY;
+#endif
+ sc->vtnet_features = virtio_negotiate_features(dev, features);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0 &&
+ virtio_with_feature(dev, VTNET_LRO_FEATURES)) {
+ /*
+ * LRO without mergeable buffers requires special care. This
+ * is not ideal because every receive buffer must be large
+ * enough to hold the maximum TCP packet, the Ethernet header,
+ * and the vtnet_rx_header. This requires up to 34 descriptors
+ * when using MCLBYTES clusters. If we do not have indirect
+ * descriptors, LRO is disabled since the virtqueue will not
+ * be able to contain very many receive buffers.
+ */
+ if (virtio_with_feature(dev,
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ device_printf(dev,
+ "LRO disabled due to lack of both mergeable "
+ "buffers and indirect descriptors\n");
+
+ sc->vtnet_features = virtio_negotiate_features(dev,
+ features & ~VTNET_LRO_FEATURES);
+ } else
+ sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG;
+ }
+}
+
+static int
+vtnet_alloc_virtqueues(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct vq_alloc_info vq_info[3];
+ int nvqs, rxsegs;
+
+ dev = sc->vtnet_dev;
+ nvqs = 2;
+
+ /*
+ * Indirect descriptors are not needed for the Rx
+ * virtqueue when mergeable buffers are negotiated.
+ * The header is placed inline with the data, not
+ * in a separate descriptor, and mbuf clusters are
+ * always physically contiguous.
+ */
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ rxsegs = sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG ?
+ VTNET_MAX_RX_SEGS : VTNET_MIN_RX_SEGS;
+ } else
+ rxsegs = 0;
+
+ VQ_ALLOC_INFO_INIT(&vq_info[0], rxsegs,
+ vtnet_rx_vq_intr, sc, &sc->vtnet_rx_vq,
+ "%s receive", device_get_nameunit(dev));
+
+ VQ_ALLOC_INFO_INIT(&vq_info[1], VTNET_MAX_TX_SEGS,
+ vtnet_tx_vq_intr, sc, &sc->vtnet_tx_vq,
+ "%s transmit", device_get_nameunit(dev));
+
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
+ nvqs++;
+
+ VQ_ALLOC_INFO_INIT(&vq_info[2], 0, NULL, NULL,
+ &sc->vtnet_ctrl_vq, "%s control",
+ device_get_nameunit(dev));
+ }
+
+ return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info));
+}
+
+static void
+vtnet_get_hwaddr(struct vtnet_softc *sc)
+{
+ device_t dev;
+
+ dev = sc->vtnet_dev;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) {
+ virtio_read_device_config(dev,
+ offsetof(struct virtio_net_config, mac),
+ sc->vtnet_hwaddr, ETHER_ADDR_LEN);
+ } else {
+ /* Generate random locally administered unicast address. */
+ sc->vtnet_hwaddr[0] = 0xB2;
+ arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0);
+
+ vtnet_set_hwaddr(sc);
+ }
+}
+
+static void
+vtnet_set_hwaddr(struct vtnet_softc *sc)
+{
+ device_t dev;
+
+ dev = sc->vtnet_dev;
+
+ virtio_write_device_config(dev,
+ offsetof(struct virtio_net_config, mac),
+ sc->vtnet_hwaddr, ETHER_ADDR_LEN);
+}
+
+static int
+vtnet_is_link_up(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+ uint16_t status;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ if ((ifp->if_capenable & IFCAP_LINKSTATE) == 0)
+ return (1);
+
+ status = virtio_read_dev_config_2(dev,
+ offsetof(struct virtio_net_config, status));
+
+ return ((status & VIRTIO_NET_S_LINK_UP) != 0);
+}
+
+static void
+vtnet_update_link_status(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+ int link;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ link = vtnet_is_link_up(sc);
+
+ if (link && ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0)) {
+ sc->vtnet_flags |= VTNET_FLAG_LINK;
+ if (bootverbose)
+ device_printf(dev, "Link is up\n");
+
+ if_link_state_change(ifp, LINK_STATE_UP);
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ vtnet_start_locked(ifp);
+ } else if (!link && (sc->vtnet_flags & VTNET_FLAG_LINK)) {
+ sc->vtnet_flags &= ~VTNET_FLAG_LINK;
+ if (bootverbose)
+ device_printf(dev, "Link is down\n");
+
+ if_link_state_change(ifp, LINK_STATE_DOWN);
+ }
+}
+
+static void
+vtnet_watchdog(struct vtnet_softc *sc)
+{
+ struct ifnet *ifp;
+
+ ifp = sc->vtnet_ifp;
+
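+	/*
+	 * When Tx interrupt moderation is enabled, completed transmits
+	 * are reclaimed here since the Tx virtqueue interrupt stays
+	 * disabled.
+	 */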
+#ifdef VTNET_TX_INTR_MODERATION
+ vtnet_txeof(sc);
+#endif
+
+ if (sc->vtnet_watchdog_timer == 0 || --sc->vtnet_watchdog_timer)
+ return;
+
+ if_printf(ifp, "watchdog timeout -- resetting\n");
+#ifdef VTNET_DEBUG
+ virtqueue_dump(sc->vtnet_tx_vq);
+#endif
+ ifp->if_oerrors++;
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ vtnet_init_locked(sc);
+}
+
+static void
+vtnet_config_change_task(void *arg, int pending)
+{
+ struct vtnet_softc *sc;
+
+ sc = arg;
+
+ VTNET_LOCK(sc);
+ vtnet_update_link_status(sc);
+ VTNET_UNLOCK(sc);
+}
+
+static int
+vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ struct vtnet_softc *sc;
+ struct ifreq *ifr;
+ int reinit, mask, error;
+
+ sc = ifp->if_softc;
+ ifr = (struct ifreq *) data;
+ reinit = 0;
+ error = 0;
+
+ switch (cmd) {
+ case SIOCSIFMTU:
+ if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > VTNET_MAX_MTU)
+ error = EINVAL;
+ else if (ifp->if_mtu != ifr->ifr_mtu) {
+ VTNET_LOCK(sc);
+ error = vtnet_change_mtu(sc, ifr->ifr_mtu);
+ VTNET_UNLOCK(sc);
+ }
+ break;
+
+ case SIOCSIFFLAGS:
+ VTNET_LOCK(sc);
+ if ((ifp->if_flags & IFF_UP) == 0) {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ vtnet_stop(sc);
+ } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ if ((ifp->if_flags ^ sc->vtnet_if_flags) &
+ (IFF_PROMISC | IFF_ALLMULTI)) {
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX)
+ vtnet_rx_filter(sc);
+ else
+ error = ENOTSUP;
+ }
+ } else
+ vtnet_init_locked(sc);
+
+ if (error == 0)
+ sc->vtnet_if_flags = ifp->if_flags;
+ VTNET_UNLOCK(sc);
+ break;
+
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ VTNET_LOCK(sc);
+ if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) &&
+ (ifp->if_drv_flags & IFF_DRV_RUNNING))
+ vtnet_rx_filter_mac(sc);
+ VTNET_UNLOCK(sc);
+ break;
+
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd);
+ break;
+
+ case SIOCSIFCAP:
+ mask = ifr->ifr_reqcap ^ ifp->if_capenable;
+
+#ifdef DEVICE_POLLING
+ if (mask & IFCAP_POLLING) {
+ if (ifr->ifr_reqcap & IFCAP_POLLING) {
+ error = ether_poll_register(vtnet_poll, ifp);
+ if (error)
+ break;
+
+ VTNET_LOCK(sc);
+ vtnet_disable_rx_intr(sc);
+ vtnet_disable_tx_intr(sc);
+ ifp->if_capenable |= IFCAP_POLLING;
+ VTNET_UNLOCK(sc);
+ } else {
+ error = ether_poll_deregister(ifp);
+
+ /* Enable interrupts even in error case. */
+ VTNET_LOCK(sc);
+ vtnet_enable_tx_intr(sc);
+ vtnet_enable_rx_intr(sc);
+ ifp->if_capenable &= ~IFCAP_POLLING;
+ VTNET_UNLOCK(sc);
+ }
+ }
+#endif
+ VTNET_LOCK(sc);
+
+ if (mask & IFCAP_TXCSUM) {
+ ifp->if_capenable ^= IFCAP_TXCSUM;
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= VTNET_CSUM_OFFLOAD;
+ else
+ ifp->if_hwassist &= ~VTNET_CSUM_OFFLOAD;
+ }
+
+ if (mask & IFCAP_TSO4) {
+ ifp->if_capenable ^= IFCAP_TSO4;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= CSUM_TSO;
+ else
+ ifp->if_hwassist &= ~CSUM_TSO;
+ }
+
+ if (mask & IFCAP_RXCSUM) {
+ ifp->if_capenable ^= IFCAP_RXCSUM;
+ reinit = 1;
+ }
+
+ if (mask & IFCAP_LRO) {
+ ifp->if_capenable ^= IFCAP_LRO;
+ reinit = 1;
+ }
+
+ if (mask & IFCAP_VLAN_HWFILTER) {
+ ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
+ reinit = 1;
+ }
+
+ if (mask & IFCAP_VLAN_HWTSO)
+ ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
+
+ if (mask & IFCAP_VLAN_HWTAGGING)
+ ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
+
+ if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ vtnet_init_locked(sc);
+ }
+ VLAN_CAPABILITIES(ifp);
+
+ VTNET_UNLOCK(sc);
+ break;
+
+ default:
+ error = ether_ioctl(ifp, cmd, data);
+ break;
+ }
+
+ VTNET_LOCK_ASSERT_NOTOWNED(sc);
+
+ return (error);
+}
+
+static int
+vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu)
+{
+ struct ifnet *ifp;
+ int new_frame_size, clsize;
+
+ ifp = sc->vtnet_ifp;
+
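+	/*
+	 * Pick a receive cluster size large enough for the new frame:
+	 * a standard cluster when it fits, otherwise a 9K jumbo cluster
+	 * (or a page-sized cluster when mergeable buffers are in use).
+	 */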
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ new_frame_size = sizeof(struct vtnet_rx_header) +
+ sizeof(struct ether_vlan_header) + new_mtu;
+
+ if (new_frame_size > MJUM9BYTES)
+ return (EINVAL);
+
+ if (new_frame_size <= MCLBYTES)
+ clsize = MCLBYTES;
+ else
+ clsize = MJUM9BYTES;
+ } else {
+ new_frame_size = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
+ sizeof(struct ether_vlan_header) + new_mtu;
+
+ if (new_frame_size <= MCLBYTES)
+ clsize = MCLBYTES;
+ else
+ clsize = MJUMPAGESIZE;
+ }
+
+ sc->vtnet_rx_mbuf_size = clsize;
+ sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc);
+ KASSERT(sc->vtnet_rx_mbuf_count < VTNET_MAX_RX_SEGS,
+ ("too many rx mbufs: %d", sc->vtnet_rx_mbuf_count));
+
+ ifp->if_mtu = new_mtu;
+
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ vtnet_init_locked(sc);
+ }
+
+ return (0);
+}
+
+static int
+vtnet_init_rx_vq(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ int nbufs, error;
+
+ vq = sc->vtnet_rx_vq;
+ nbufs = 0;
+ error = ENOSPC;
+
+ while (!virtqueue_full(vq)) {
+ if ((error = vtnet_newbuf(sc)) != 0)
+ break;
+ nbufs++;
+ }
+
+ if (nbufs > 0) {
+ virtqueue_notify(vq);
+
+ /*
+ * EMSGSIZE signifies the virtqueue did not have enough
+ * entries available to hold the last mbuf. This is not
+ * an error. We should not get ENOSPC since we check if
+ * the virtqueue is full before attempting to add a
+ * buffer.
+ */
+ if (error == EMSGSIZE)
+ error = 0;
+ }
+
+ return (error);
+}
+
+static void
+vtnet_free_rx_mbufs(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ struct mbuf *m;
+ int last;
+
+ vq = sc->vtnet_rx_vq;
+ last = 0;
+
+ while ((m = virtqueue_drain(vq, &last)) != NULL)
+ m_freem(m);
+
+ KASSERT(virtqueue_empty(vq), ("mbufs remaining in Rx Vq"));
+}
+
+static void
+vtnet_free_tx_mbufs(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ struct vtnet_tx_header *txhdr;
+ int last;
+
+ vq = sc->vtnet_tx_vq;
+ last = 0;
+
+ while ((txhdr = virtqueue_drain(vq, &last)) != NULL) {
+ m_freem(txhdr->vth_mbuf);
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+ }
+
+ KASSERT(virtqueue_empty(vq), ("mbufs remaining in Tx Vq"));
+}
+
+static void
+vtnet_free_ctrl_vq(struct vtnet_softc *sc)
+{
+
+ /*
+ * The control virtqueue is only polled, therefore
+ * it should already be empty.
+ */
+ KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq),
+ ("Ctrl Vq not empty"));
+}
+
+#ifdef DEVICE_POLLING
+static int
+vtnet_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
+{
+ struct vtnet_softc *sc;
+ int rx_done;
+
+ sc = ifp->if_softc;
+ rx_done = 0;
+
+ VTNET_LOCK(sc);
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ if (cmd == POLL_AND_CHECK_STATUS)
+ vtnet_update_link_status(sc);
+
+ if (virtqueue_nused(sc->vtnet_rx_vq) > 0)
+ vtnet_rxeof(sc, count, &rx_done);
+
+ vtnet_txeof(sc);
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ vtnet_start_locked(ifp);
+ }
+ VTNET_UNLOCK(sc);
+
+ return (rx_done);
+}
+#endif /* DEVICE_POLLING */
+
+static struct mbuf *
+vtnet_alloc_rxbuf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp)
+{
+ struct mbuf *m_head, *m_tail, *m;
+ int i, clsize;
+
+ clsize = sc->vtnet_rx_mbuf_size;
+
+ m_head = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, clsize);
+ if (m_head == NULL)
+ goto fail;
+
+ m_head->m_len = clsize;
+ m_tail = m_head;
+
+ if (nbufs > 1) {
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
+ ("chained Rx mbuf requested without LRO_NOMRG"));
+
+ for (i = 0; i < nbufs - 1; i++) {
+ m = m_getjcl(M_DONTWAIT, MT_DATA, 0, clsize);
+ if (m == NULL)
+ goto fail;
+
+ m->m_len = clsize;
+ m_tail->m_next = m;
+ m_tail = m;
+ }
+ }
+
+ if (m_tailp != NULL)
+ *m_tailp = m_tail;
+
+ return (m_head);
+
+fail:
+ sc->vtnet_stats.mbuf_alloc_failed++;
+ m_freem(m_head);
+
+ return (NULL);
+}
+
+static int
+vtnet_replace_rxbuf(struct vtnet_softc *sc, struct mbuf *m0, int len0)
+{
+ struct mbuf *m, *m_prev;
+ struct mbuf *m_new, *m_tail;
+ int len, clsize, nreplace, error;
+
+ m = m0;
+ m_prev = NULL;
+ len = len0;
+
+ m_tail = NULL;
+ clsize = sc->vtnet_rx_mbuf_size;
+ nreplace = 0;
+
+ if (m->m_next != NULL)
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
+ ("chained Rx mbuf without LRO_NOMRG"));
+
+ /*
+ * Since LRO_NOMRG mbuf chains are so large, we want to avoid
+ * allocating an entire chain for each received frame. When
+ * the received frame's length is less than that of the chain,
+ * the unused mbufs are reassigned to the new chain.
+ */
+ while (len > 0) {
+ /*
+ * Something is seriously wrong if we received
+ * a frame larger than the mbuf chain. Drop it.
+ */
+ if (m == NULL) {
+ sc->vtnet_stats.rx_frame_too_large++;
+ return (EMSGSIZE);
+ }
+
+ KASSERT(m->m_len == clsize,
+ ("mbuf length not expected cluster size: %d",
+ m->m_len));
+
+ m->m_len = MIN(m->m_len, len);
+ len -= m->m_len;
+
+ m_prev = m;
+ m = m->m_next;
+ nreplace++;
+ }
+
+ KASSERT(m_prev != NULL, ("m_prev == NULL"));
+ KASSERT(nreplace <= sc->vtnet_rx_mbuf_count,
+ ("too many replacement mbufs: %d/%d", nreplace,
+ sc->vtnet_rx_mbuf_count));
+
+ m_new = vtnet_alloc_rxbuf(sc, nreplace, &m_tail);
+ if (m_new == NULL) {
+ m_prev->m_len = clsize;
+ return (ENOBUFS);
+ }
+
+ /*
+ * Move unused mbufs, if any, from the original chain
+ * onto the end of the new chain.
+ */
+ if (m_prev->m_next != NULL) {
+ m_tail->m_next = m_prev->m_next;
+ m_prev->m_next = NULL;
+ }
+
+ error = vtnet_enqueue_rxbuf(sc, m_new);
+ if (error) {
+ /*
+ * BAD! We could not enqueue the replacement mbuf chain. We
+ * must restore the m0 chain to the original state if it was
+ * modified so we can subsequently discard it.
+ *
+		 * NOTE: The replacement is supposed to be an identical copy
+		 * of the one just dequeued, so this is an unexpected error.
+ */
+ sc->vtnet_stats.rx_enq_replacement_failed++;
+
+ if (m_tail->m_next != NULL) {
+ m_prev->m_next = m_tail->m_next;
+ m_tail->m_next = NULL;
+ }
+
+ m_prev->m_len = clsize;
+ m_freem(m_new);
+ }
+
+ return (error);
+}
+
+static int
+vtnet_newbuf(struct vtnet_softc *sc)
+{
+ struct mbuf *m;
+ int error;
+
+ m = vtnet_alloc_rxbuf(sc, sc->vtnet_rx_mbuf_count, NULL);
+ if (m == NULL)
+ return (ENOBUFS);
+
+ error = vtnet_enqueue_rxbuf(sc, m);
+ if (error)
+ m_freem(m);
+
+ return (error);
+}
+
+static void
+vtnet_discard_merged_rxbuf(struct vtnet_softc *sc, int nbufs)
+{
+ struct virtqueue *vq;
+ struct mbuf *m;
+
+ vq = sc->vtnet_rx_vq;
+
+ while (--nbufs > 0) {
+ if ((m = virtqueue_dequeue(vq, NULL)) == NULL)
+ break;
+ vtnet_discard_rxbuf(sc, m);
+ }
+}
+
+static void
+vtnet_discard_rxbuf(struct vtnet_softc *sc, struct mbuf *m)
+{
+ int error;
+
+ /*
+ * Requeue the discarded mbuf. This should always be
+ * successful since it was just dequeued.
+ */
+ error = vtnet_enqueue_rxbuf(sc, m);
+ KASSERT(error == 0, ("cannot requeue discarded mbuf"));
+}
+
+static int
+vtnet_enqueue_rxbuf(struct vtnet_softc *sc, struct mbuf *m)
+{
+ struct sglist sg;
+ struct sglist_seg segs[VTNET_MAX_RX_SEGS];
+ struct vtnet_rx_header *rxhdr;
+ struct virtio_net_hdr *hdr;
+ uint8_t *mdata;
+ int offset, error;
+
+ VTNET_LOCK_ASSERT(sc);
+ if ((sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0)
+ KASSERT(m->m_next == NULL, ("chained Rx mbuf"));
+
+ sglist_init(&sg, VTNET_MAX_RX_SEGS, segs);
+
+ mdata = mtod(m, uint8_t *);
+ offset = 0;
+
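+	/*
+	 * Without mergeable buffers, the VirtIO header is kept in the
+	 * vtnet_rx_header at the front of the cluster and appended to
+	 * the sglist as its own entry. With mergeable buffers, the
+	 * header simply occupies the first bytes of the frame data.
+	 */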
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ rxhdr = (struct vtnet_rx_header *) mdata;
+ hdr = &rxhdr->vrh_hdr;
+ offset += sizeof(struct vtnet_rx_header);
+
+ error = sglist_append(&sg, hdr, sc->vtnet_hdr_size);
+ KASSERT(error == 0, ("cannot add header to sglist"));
+ }
+
+ error = sglist_append(&sg, mdata + offset, m->m_len - offset);
+ if (error)
+ return (error);
+
+ if (m->m_next != NULL) {
+ error = sglist_append_mbuf(&sg, m->m_next);
+ if (error)
+ return (error);
+ }
+
+ return (virtqueue_enqueue(sc->vtnet_rx_vq, m, &sg, 0, sg.sg_nseg));
+}
+
+static void
+vtnet_vlan_tag_remove(struct mbuf *m)
+{
+ struct ether_vlan_header *evl;
+
+ evl = mtod(m, struct ether_vlan_header *);
+
+ m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
+ m->m_flags |= M_VLANTAG;
+
+ /* Strip the 802.1Q header. */
+ bcopy((char *) evl, (char *) evl + ETHER_VLAN_ENCAP_LEN,
+ ETHER_HDR_LEN - ETHER_TYPE_LEN);
+ m_adj(m, ETHER_VLAN_ENCAP_LEN);
+}
+
+#ifdef notyet
+static int
+vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m,
+ struct virtio_net_hdr *hdr)
+{
+ struct ether_header *eh;
+ struct ether_vlan_header *evh;
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ struct udphdr *udp;
+ int ip_offset, csum_start, csum_offset, hlen;
+ uint16_t eth_type;
+ uint8_t ip_proto;
+
+ /*
+ * Convert the VirtIO checksum interface to FreeBSD's interface.
+ * The host only provides us with the offset at which to start
+ * checksumming, and the offset from that to place the completed
+ * checksum. While this maps well with how Linux does checksums,
+ * for FreeBSD, we must parse the received packet in order to set
+ * the appropriate CSUM_* flags.
+ */
+
+ /*
+ * Every mbuf added to the receive virtqueue is always at least
+ * MCLBYTES big, so assume something is amiss if the first mbuf
+ * does not contain both the Ethernet and protocol headers.
+ */
+ ip_offset = sizeof(struct ether_header);
+ if (m->m_len < ip_offset)
+ return (1);
+
+ eh = mtod(m, struct ether_header *);
+ eth_type = ntohs(eh->ether_type);
+ if (eth_type == ETHERTYPE_VLAN) {
+ ip_offset = sizeof(struct ether_vlan_header);
+ if (m->m_len < ip_offset)
+ return (1);
+ evh = mtod(m, struct ether_vlan_header *);
+ eth_type = ntohs(evh->evl_proto);
+ }
+
+ switch (eth_type) {
+ case ETHERTYPE_IP:
+ if (m->m_len < ip_offset + sizeof(struct ip))
+ return (1);
+
+ ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
+ /* Sanity check the IP header. */
+ if (ip->ip_v != IPVERSION)
+ return (1);
+ hlen = ip->ip_hl << 2;
+ if (hlen < sizeof(struct ip))
+ return (1);
+ if (ntohs(ip->ip_len) < hlen)
+ return (1);
+ if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset))
+ return (1);
+
+ ip_proto = ip->ip_p;
+ csum_start = ip_offset + hlen;
+ break;
+
+ case ETHERTYPE_IPV6:
+ if (m->m_len < ip_offset + sizeof(struct ip6_hdr))
+ return (1);
+
+ /*
+ * XXX FreeBSD does not handle any IPv6 checksum offloading
+ * at the moment.
+ */
+
+ ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset);
+ /* XXX Assume no extension headers are present. */
+ ip_proto = ip6->ip6_nxt;
+ csum_start = ip_offset + sizeof(struct ip6_hdr);
+ break;
+
+ default:
+ sc->vtnet_stats.rx_csum_bad_ethtype++;
+ return (1);
+ }
+
+ /* Assume checksum begins right after the IP header. */
+ if (hdr->csum_start != csum_start) {
+ sc->vtnet_stats.rx_csum_bad_start++;
+ return (1);
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ csum_offset = offsetof(struct tcphdr, th_sum);
+ break;
+
+ case IPPROTO_UDP:
+ csum_offset = offsetof(struct udphdr, uh_sum);
+ break;
+
+ case IPPROTO_SCTP:
+ csum_offset = offsetof(struct sctphdr, checksum);
+ break;
+
+ default:
+ sc->vtnet_stats.rx_csum_bad_ipproto++;
+ return (1);
+ }
+
+ if (hdr->csum_offset != csum_offset) {
+ sc->vtnet_stats.rx_csum_bad_offset++;
+ return (1);
+ }
+
+ /*
+ * The IP header checksum is almost certainly valid but I'm
+ * uncertain if that is guaranteed.
+ *
+ * m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;
+ */
+
+ switch (ip_proto) {
+ case IPPROTO_UDP:
+ if (m->m_len < csum_start + sizeof(struct udphdr))
+ return (1);
+
+ udp = (struct udphdr *)(mtod(m, uint8_t *) + csum_start);
+ if (udp->uh_sum == 0)
+ return (0);
+
+ /* FALLTHROUGH */
+
+ case IPPROTO_TCP:
+ m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xFFFF;
+ break;
+
+ case IPPROTO_SCTP:
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+ break;
+ }
+
+ sc->vtnet_stats.rx_csum_offloaded++;
+
+ return (0);
+}
+#endif
+
+/*
+ * Alternative method of doing receive checksum offloading. Rather
+ * than parsing the received frame down to the IP header, use the
+ * csum_offset to determine which CSUM_* flags are appropriate. We
+ * can get by with doing this only because the checksum offsets are
+ * unique for the things we care about.
+ */
+static int
+vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m,
+ struct virtio_net_hdr *hdr)
+{
+ struct ether_header *eh;
+ struct ether_vlan_header *evh;
+ struct udphdr *udp;
+ int csum_len;
+ uint16_t eth_type;
+
+ csum_len = hdr->csum_start + hdr->csum_offset;
+
+ if (csum_len < sizeof(struct ether_header) + sizeof(struct ip))
+ return (1);
+ if (m->m_len < csum_len)
+ return (1);
+
+ eh = mtod(m, struct ether_header *);
+ eth_type = ntohs(eh->ether_type);
+ if (eth_type == ETHERTYPE_VLAN) {
+ evh = mtod(m, struct ether_vlan_header *);
+ eth_type = ntohs(evh->evl_proto);
+ }
+
+ if (eth_type != ETHERTYPE_IP && eth_type != ETHERTYPE_IPV6) {
+ sc->vtnet_stats.rx_csum_bad_ethtype++;
+ return (1);
+ }
+
+ /* Use the offset to determine the appropriate CSUM_* flags. */
+ switch (hdr->csum_offset) {
+ case offsetof(struct udphdr, uh_sum):
+ if (m->m_len < hdr->csum_start + sizeof(struct udphdr))
+ return (1);
+ udp = (struct udphdr *)(mtod(m, uint8_t *) + hdr->csum_start);
+ if (udp->uh_sum == 0)
+ return (0);
+
+ /* FALLTHROUGH */
+
+ case offsetof(struct tcphdr, th_sum):
+ m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xFFFF;
+ break;
+
+ case offsetof(struct sctphdr, checksum):
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+ break;
+
+ default:
+ sc->vtnet_stats.rx_csum_bad_offset++;
+ return (1);
+ }
+
+ sc->vtnet_stats.rx_csum_offloaded++;
+
+ return (0);
+}
+
+static int
+vtnet_rxeof_merged(struct vtnet_softc *sc, struct mbuf *m_head, int nbufs)
+{
+ struct ifnet *ifp;
+ struct virtqueue *vq;
+ struct mbuf *m, *m_tail;
+ int len;
+
+ ifp = sc->vtnet_ifp;
+ vq = sc->vtnet_rx_vq;
+ m_tail = m_head;
+
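+	/*
+	 * Dequeue the remaining buffers of this mergeable frame and
+	 * append them to the chain, adding a replacement mbuf to the
+	 * virtqueue for each one.
+	 */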
+ while (--nbufs > 0) {
+ m = virtqueue_dequeue(vq, &len);
+ if (m == NULL) {
+ ifp->if_ierrors++;
+ goto fail;
+ }
+
+ if (vtnet_newbuf(sc) != 0) {
+ ifp->if_iqdrops++;
+ vtnet_discard_rxbuf(sc, m);
+ if (nbufs > 1)
+ vtnet_discard_merged_rxbuf(sc, nbufs);
+ goto fail;
+ }
+
+ if (m->m_len < len)
+ len = m->m_len;
+
+ m->m_len = len;
+ m->m_flags &= ~M_PKTHDR;
+
+ m_head->m_pkthdr.len += len;
+ m_tail->m_next = m;
+ m_tail = m;
+ }
+
+ return (0);
+
+fail:
+ sc->vtnet_stats.rx_mergeable_failed++;
+ m_freem(m_head);
+
+ return (1);
+}
+
+static int
+vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp)
+{
+ struct virtio_net_hdr lhdr;
+ struct ifnet *ifp;
+ struct virtqueue *vq;
+ struct mbuf *m;
+ struct ether_header *eh;
+ struct virtio_net_hdr *hdr;
+ struct virtio_net_hdr_mrg_rxbuf *mhdr;
+ int len, deq, nbufs, adjsz, rx_npkts;
+
+ ifp = sc->vtnet_ifp;
+ vq = sc->vtnet_rx_vq;
+ hdr = &lhdr;
+ deq = 0;
+ rx_npkts = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+
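+	/*
+	 * Process up to count frames: each dequeued mbuf is replaced
+	 * in the virtqueue before the frame is passed up the stack.
+	 */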
+ while (--count >= 0) {
+ m = virtqueue_dequeue(vq, &len);
+ if (m == NULL)
+ break;
+ deq++;
+
+ if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
+ ifp->if_ierrors++;
+ vtnet_discard_rxbuf(sc, m);
+ continue;
+ }
+
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ nbufs = 1;
+ adjsz = sizeof(struct vtnet_rx_header);
+ /*
+ * Account for our pad between the header and
+ * the actual start of the frame.
+ */
+ len += VTNET_RX_HEADER_PAD;
+ } else {
+ mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *);
+ nbufs = mhdr->num_buffers;
+ adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ }
+
+ if (vtnet_replace_rxbuf(sc, m, len) != 0) {
+ ifp->if_iqdrops++;
+ vtnet_discard_rxbuf(sc, m);
+ if (nbufs > 1)
+ vtnet_discard_merged_rxbuf(sc, nbufs);
+ continue;
+ }
+
+ m->m_pkthdr.len = len;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.csum_flags = 0;
+
+ if (nbufs > 1) {
+ if (vtnet_rxeof_merged(sc, m, nbufs) != 0)
+ continue;
+ }
+
+ ifp->if_ipackets++;
+
+ /*
+ * Save copy of header before we strip it. For both mergeable
+ * and non-mergeable, the VirtIO header is placed first in the
+ * mbuf's data. We no longer need num_buffers, so always use a
+ * virtio_net_hdr.
+ */
+ memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr));
+ m_adj(m, adjsz);
+
+ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) {
+ eh = mtod(m, struct ether_header *);
+ if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
+ vtnet_vlan_tag_remove(m);
+
+ /*
+ * With the 802.1Q header removed, update the
+ * checksum starting location accordingly.
+ */
+ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+ hdr->csum_start -=
+ ETHER_VLAN_ENCAP_LEN;
+ }
+ }
+
+ if (ifp->if_capenable & IFCAP_RXCSUM &&
+ hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (vtnet_rx_csum(sc, m, hdr) != 0)
+ sc->vtnet_stats.rx_csum_failed++;
+ }
+
+ VTNET_UNLOCK(sc);
+ rx_npkts++;
+ (*ifp->if_input)(ifp, m);
+ VTNET_LOCK(sc);
+
+ /*
+ * The interface may have been stopped while we were
+ * passing the packet up the network stack.
+ */
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ break;
+ }
+
+ virtqueue_notify(vq);
+
+ if (rx_npktsp != NULL)
+ *rx_npktsp = rx_npkts;
+
+ return (count > 0 ? 0 : EAGAIN);
+}
+
+static void
+vtnet_rx_intr_task(void *arg, int pending)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+ int more;
+
+ sc = arg;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK(sc);
+
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ VTNET_UNLOCK(sc);
+ return;
+ }
+#endif
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ vtnet_enable_rx_intr(sc);
+ VTNET_UNLOCK(sc);
+ return;
+ }
+
+ more = vtnet_rxeof(sc, sc->vtnet_rx_process_limit, NULL);
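+
+	/*
+	 * If completed buffers are still pending after the interrupt is
+	 * re-enabled, disable it again and reschedule the task below.
+	 */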
+ if (!more && vtnet_enable_rx_intr(sc) != 0) {
+ vtnet_disable_rx_intr(sc);
+ more = 1;
+ }
+
+ VTNET_UNLOCK(sc);
+
+ if (more) {
+ sc->vtnet_stats.rx_task_rescheduled++;
+ taskqueue_enqueue_fast(sc->vtnet_tq,
+ &sc->vtnet_rx_intr_task);
+ }
+}
+
+static int
+vtnet_rx_vq_intr(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ vtnet_disable_rx_intr(sc);
+ taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_rx_intr_task);
+
+ return (1);
+}
+
+static void
+vtnet_txeof(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ struct ifnet *ifp;
+ struct vtnet_tx_header *txhdr;
+ int deq;
+
+ vq = sc->vtnet_tx_vq;
+ ifp = sc->vtnet_ifp;
+ deq = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) {
+ deq++;
+ ifp->if_opackets++;
+ m_freem(txhdr->vth_mbuf);
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+ }
+
+ if (deq > 0) {
+ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ if (virtqueue_empty(vq))
+ sc->vtnet_watchdog_timer = 0;
+ }
+}
+
+static struct mbuf *
+vtnet_tx_offload(struct vtnet_softc *sc, struct mbuf *m,
+ struct virtio_net_hdr *hdr)
+{
+ struct ifnet *ifp;
+ struct ether_header *eh;
+ struct ether_vlan_header *evh;
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ struct tcphdr *tcp;
+ int ip_offset;
+ uint16_t eth_type, csum_start;
+ uint8_t ip_proto, gso_type;
+
+ ifp = sc->vtnet_ifp;
+ M_ASSERTPKTHDR(m);
+
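+	/*
+	 * Parse the frame down to the TCP header so the VirtIO header
+	 * can describe where checksumming starts and, for TSO, the
+	 * segment size and full header length.
+	 */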
+ ip_offset = sizeof(struct ether_header);
+ if (m->m_len < ip_offset) {
+ if ((m = m_pullup(m, ip_offset)) == NULL)
+ return (NULL);
+ }
+
+ eh = mtod(m, struct ether_header *);
+ eth_type = ntohs(eh->ether_type);
+ if (eth_type == ETHERTYPE_VLAN) {
+ ip_offset = sizeof(struct ether_vlan_header);
+ if (m->m_len < ip_offset) {
+ if ((m = m_pullup(m, ip_offset)) == NULL)
+ return (NULL);
+ }
+ evh = mtod(m, struct ether_vlan_header *);
+ eth_type = ntohs(evh->evl_proto);
+ }
+
+ switch (eth_type) {
+ case ETHERTYPE_IP:
+ if (m->m_len < ip_offset + sizeof(struct ip)) {
+ m = m_pullup(m, ip_offset + sizeof(struct ip));
+ if (m == NULL)
+ return (NULL);
+ }
+
+ ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
+ ip_proto = ip->ip_p;
+ csum_start = ip_offset + (ip->ip_hl << 2);
+ gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ break;
+
+ case ETHERTYPE_IPV6:
+ if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) {
+ m = m_pullup(m, ip_offset + sizeof(struct ip6_hdr));
+ if (m == NULL)
+ return (NULL);
+ }
+
+ ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset);
+ /*
+ * XXX Assume no extension headers are present. Presently,
+ * this will always be true in the case of TSO, and FreeBSD
+ * does not perform checksum offloading of IPv6 yet.
+ */
+ ip_proto = ip6->ip6_nxt;
+ csum_start = ip_offset + sizeof(struct ip6_hdr);
+ gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ break;
+
+ default:
+ return (m);
+ }
+
+ if (m->m_pkthdr.csum_flags & VTNET_CSUM_OFFLOAD) {
+ hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ hdr->csum_start = csum_start;
+ hdr->csum_offset = m->m_pkthdr.csum_data;
+
+ sc->vtnet_stats.tx_csum_offloaded++;
+ }
+
+ if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+ if (ip_proto != IPPROTO_TCP)
+ return (m);
+
+ if (m->m_len < csum_start + sizeof(struct tcphdr)) {
+ m = m_pullup(m, csum_start + sizeof(struct tcphdr));
+ if (m == NULL)
+ return (NULL);
+ }
+
+ tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start);
+ hdr->gso_type = gso_type;
+ hdr->hdr_len = csum_start + (tcp->th_off << 2);
+ hdr->gso_size = m->m_pkthdr.tso_segsz;
+
+ if (tcp->th_flags & TH_CWR) {
+ /*
+ * Drop if we did not negotiate VIRTIO_NET_F_HOST_ECN.
+ * ECN support is only configurable globally with the
+ * net.inet.tcp.ecn.enable sysctl knob.
+ */
+ if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
+ if_printf(ifp, "TSO with ECN not supported "
+ "by host\n");
+ m_freem(m);
+ return (NULL);
+ }
+
+ hdr->flags |= VIRTIO_NET_HDR_GSO_ECN;
+ }
+
+ sc->vtnet_stats.tx_tso_offloaded++;
+ }
+
+ return (m);
+}
+
+static int
+vtnet_enqueue_txbuf(struct vtnet_softc *sc, struct mbuf **m_head,
+ struct vtnet_tx_header *txhdr)
+{
+ struct sglist sg;
+ struct sglist_seg segs[VTNET_MAX_TX_SEGS];
+ struct virtqueue *vq;
+ struct mbuf *m;
+ int collapsed, error;
+
+ vq = sc->vtnet_tx_vq;
+ m = *m_head;
+ collapsed = 0;
+
+ sglist_init(&sg, VTNET_MAX_TX_SEGS, segs);
+ error = sglist_append(&sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size);
+ KASSERT(error == 0 && sg.sg_nseg == 1,
+ ("cannot add header to sglist"));
+
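+	/*
+	 * Append the mbuf chain to the sglist. If it has too many
+	 * segments for the virtqueue, collapse the chain once and
+	 * retry before giving up.
+	 */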
+again:
+ error = sglist_append_mbuf(&sg, m);
+ if (error) {
+ if (collapsed)
+ goto fail;
+
+ m = m_collapse(m, M_DONTWAIT, VTNET_MAX_TX_SEGS - 1);
+ if (m == NULL)
+ goto fail;
+
+ *m_head = m;
+ collapsed = 1;
+ goto again;
+ }
+
+ txhdr->vth_mbuf = m;
+
+ return (virtqueue_enqueue(vq, txhdr, &sg, sg.sg_nseg, 0));
+
+fail:
+ m_freem(*m_head);
+ *m_head = NULL;
+
+ return (ENOBUFS);
+}
+
+static int
+vtnet_encap(struct vtnet_softc *sc, struct mbuf **m_head)
+{
+ struct vtnet_tx_header *txhdr;
+ struct virtio_net_hdr *hdr;
+ struct mbuf *m;
+ int error;
+
+ txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO);
+ if (txhdr == NULL)
+ return (ENOMEM);
+
+ /*
+ * Always use the non-mergeable header to simplify things. When
+ * the mergeable feature is negotiated, the num_buffers field
+ * must be set to zero. We use vtnet_hdr_size later to enqueue
+ * the correct header size to the host.
+ */
+ hdr = &txhdr->vth_uhdr.hdr;
+ m = *m_head;
+
+ error = ENOBUFS;
+
+ if (m->m_flags & M_VLANTAG) {
+ m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
+ if ((*m_head = m) == NULL)
+ goto fail;
+ m->m_flags &= ~M_VLANTAG;
+ }
+
+ if (m->m_pkthdr.csum_flags != 0) {
+ m = vtnet_tx_offload(sc, m, hdr);
+ if ((*m_head = m) == NULL)
+ goto fail;
+ }
+
+ error = vtnet_enqueue_txbuf(sc, m_head, txhdr);
+fail:
+ if (error)
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+
+ return (error);
+}
+
+static void
+vtnet_start(struct ifnet *ifp)
+{
+ struct vtnet_softc *sc;
+
+ sc = ifp->if_softc;
+
+ VTNET_LOCK(sc);
+ vtnet_start_locked(ifp);
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_start_locked(struct ifnet *ifp)
+{
+ struct vtnet_softc *sc;
+ struct virtqueue *vq;
+ struct mbuf *m0;
+ int enq;
+
+ sc = ifp->if_softc;
+ vq = sc->vtnet_tx_vq;
+ enq = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING || ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0))
+ return;
+
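+	/*
+	 * When Tx interrupt moderation is enabled, opportunistically
+	 * reclaim completed transmits once the virtqueue is at least
+	 * half full.
+	 */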
+#ifdef VTNET_TX_INTR_MODERATION
+ if (virtqueue_nused(vq) >= sc->vtnet_tx_size / 2)
+ vtnet_txeof(sc);
+#endif
+
+ while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ if (virtqueue_full(vq)) {
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ break;
+ }
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m0);
+ if (m0 == NULL)
+ break;
+
+ if (vtnet_encap(sc, &m0) != 0) {
+ if (m0 == NULL)
+ break;
+ IFQ_DRV_PREPEND(&ifp->if_snd, m0);
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ break;
+ }
+
+ enq++;
+ ETHER_BPF_MTAP(ifp, m0);
+ }
+
+ if (enq > 0) {
+ virtqueue_notify(vq);
+ sc->vtnet_watchdog_timer = VTNET_WATCHDOG_TIMEOUT;
+ }
+}
+
+static void
+vtnet_tick(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ VTNET_LOCK_ASSERT(sc);
+#ifdef VTNET_DEBUG
+ virtqueue_dump(sc->vtnet_rx_vq);
+ virtqueue_dump(sc->vtnet_tx_vq);
+#endif
+
+ vtnet_watchdog(sc);
+ callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
+}
+
+static void
+vtnet_tx_intr_task(void *arg, int pending)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+
+ sc = arg;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK(sc);
+
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ VTNET_UNLOCK(sc);
+ return;
+ }
+#endif
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ vtnet_enable_tx_intr(sc);
+ VTNET_UNLOCK(sc);
+ return;
+ }
+
+ vtnet_txeof(sc);
+
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ vtnet_start_locked(ifp);
+
+ if (vtnet_enable_tx_intr(sc) != 0) {
+ vtnet_disable_tx_intr(sc);
+ sc->vtnet_stats.tx_task_rescheduled++;
+ VTNET_UNLOCK(sc);
+ taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_tx_intr_task);
+ return;
+ }
+
+ VTNET_UNLOCK(sc);
+}
+
+static int
+vtnet_tx_vq_intr(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ vtnet_disable_tx_intr(sc);
+ taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_tx_intr_task);
+
+ return (1);
+}
+
+static void
+vtnet_stop(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ sc->vtnet_watchdog_timer = 0;
+ callout_stop(&sc->vtnet_tick_ch);
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ vtnet_disable_rx_intr(sc);
+ vtnet_disable_tx_intr(sc);
+
+ /*
+ * Stop the host VirtIO adapter. Note this will reset the host
+ * adapter's state back to the pre-initialized state, so in
+ * order to make the device usable again, we must drive it
+ * through virtio_reinit() and virtio_reinit_complete().
+ */
+ virtio_stop(dev);
+
+ sc->vtnet_flags &= ~VTNET_FLAG_LINK;
+
+ vtnet_free_rx_mbufs(sc);
+ vtnet_free_tx_mbufs(sc);
+}
+
+static int
+vtnet_reinit(struct vtnet_softc *sc)
+{
+ struct ifnet *ifp;
+ uint64_t features;
+
+ ifp = sc->vtnet_ifp;
+ features = sc->vtnet_features;
+
+ /*
+ * Re-negotiate with the host, removing any disabled receive
+ * features. Transmit features are disabled only on our side
+ * via if_capenable and if_hwassist.
+ */
+
+ if (ifp->if_capabilities & IFCAP_RXCSUM) {
+ if ((ifp->if_capenable & IFCAP_RXCSUM) == 0)
+ features &= ~VIRTIO_NET_F_GUEST_CSUM;
+ }
+
+ if (ifp->if_capabilities & IFCAP_LRO) {
+ if ((ifp->if_capenable & IFCAP_LRO) == 0)
+ features &= ~VTNET_LRO_FEATURES;
+ }
+
+ if (ifp->if_capabilities & IFCAP_VLAN_HWFILTER) {
+ if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)
+ features &= ~VIRTIO_NET_F_CTRL_VLAN;
+ }
+
+ return (virtio_reinit(sc->vtnet_dev, features));
+}
+
+static void
+vtnet_init_locked(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+ int error;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ return;
+
+ /* Stop host's adapter, cancel any pending I/O. */
+ vtnet_stop(sc);
+
+ /* Reinitialize the host device. */
+ error = vtnet_reinit(sc);
+ if (error) {
+ device_printf(dev,
+ "reinitialization failed, stopping device...\n");
+ vtnet_stop(sc);
+ return;
+ }
+
+ /* Update host with assigned MAC address. */
+ bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN);
+ vtnet_set_hwaddr(sc);
+
+ ifp->if_hwassist = 0;
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= VTNET_CSUM_OFFLOAD;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= CSUM_TSO;
+
+ error = vtnet_init_rx_vq(sc);
+ if (error) {
+ device_printf(dev,
+ "cannot allocate mbufs for Rx virtqueue\n");
+ vtnet_stop(sc);
+ return;
+ }
+
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
+ /* Restore promiscuous and all-multicast modes. */
+ vtnet_rx_filter(sc);
+
+ /* Restore filtered MAC addresses. */
+ vtnet_rx_filter_mac(sc);
+ }
+
+ /* Restore VLAN filters. */
+ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
+ vtnet_rx_filter_vlan(sc);
+ }
+
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ vtnet_disable_rx_intr(sc);
+ vtnet_disable_tx_intr(sc);
+ } else
+#endif
+ {
+ vtnet_enable_rx_intr(sc);
+ vtnet_enable_tx_intr(sc);
+ }
+
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+ virtio_reinit_complete(dev);
+
+ vtnet_update_link_status(sc);
+ callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
+}
+
+static void
+vtnet_init(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ VTNET_LOCK(sc);
+ vtnet_init_locked(sc);
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie,
+ struct sglist *sg, int readable, int writable)
+{
+ struct virtqueue *vq;
+ void *c;
+
+ vq = sc->vtnet_ctrl_vq;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ,
+ ("no control virtqueue"));
+ KASSERT(virtqueue_empty(vq),
+ ("control command already enqueued"));
+
+ if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0)
+ return;
+
+ virtqueue_notify(vq);
+
+ /*
+ * Poll until the command is complete. Previously, we would
+ * sleep until the control virtqueue interrupt handler woke
+ * us up, but dropping the VTNET_MTX leads to serialization
+ * difficulties.
+ *
+ * Furthermore, it appears QEMU/KVM only allocates three MSIX
+ * vectors. Two of those vectors are needed for the Rx and Tx
+ * virtqueues. We do not support sharing both a Vq and config
+ * changed notification on the same MSIX vector.
+ */
+ c = virtqueue_poll(vq, NULL);
+ KASSERT(c == cookie, ("unexpected control command response"));
+}
+
+static void
+vtnet_rx_filter(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX,
+ ("CTRL_RX feature not negotiated"));
+
+ if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0)
+ device_printf(dev, "cannot %s promiscuous mode\n",
+ ifp->if_flags & IFF_PROMISC ? "enable" : "disable");
+
+ if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0)
+ device_printf(dev, "cannot %s all-multicast mode\n",
+ ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable");
+}
+
+static int
+vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on)
+{
+ struct virtio_net_ctrl_hdr hdr;
+ struct sglist_seg segs[3];
+ struct sglist sg;
+ uint8_t onoff, ack;
+ int error;
+
+ if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0)
+ return (ENOTSUP);
+
+ error = 0;
+
+ hdr.class = VIRTIO_NET_CTRL_RX;
+ hdr.cmd = cmd;
+ onoff = !!on;
+ ack = VIRTIO_NET_ERR;
+
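+	/*
+	 * The header and on/off flag are readable by the host; the ack
+	 * byte is writable so the host can return its status.
+	 */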
+ sglist_init(&sg, 3, segs);
+ error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
+ error |= sglist_append(&sg, &onoff, sizeof(uint8_t));
+ error |= sglist_append(&sg, &ack, sizeof(uint8_t));
+ KASSERT(error == 0 && sg.sg_nseg == 3,
+ ("error adding Rx filter message to sglist"));
+
+ vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
+
+ return (ack == VIRTIO_NET_OK ? 0 : EIO);
+}
+
+static int
+vtnet_set_promisc(struct vtnet_softc *sc, int on)
+{
+
+ return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on));
+}
+
+static int
+vtnet_set_allmulti(struct vtnet_softc *sc, int on)
+{
+
+ return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on));
+}
+
+static void
+vtnet_rx_filter_mac(struct vtnet_softc *sc)
+{
+ struct virtio_net_ctrl_hdr hdr;
+ struct vtnet_mac_filter *filter;
+ struct sglist_seg segs[4];
+ struct sglist sg;
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+ struct ifmultiaddr *ifma;
+ int ucnt, mcnt, promisc, allmulti, error;
+ uint8_t ack;
+
+ ifp = sc->vtnet_ifp;
+ ucnt = 0;
+ mcnt = 0;
+ promisc = 0;
+ allmulti = 0;
+ error = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX,
+ ("CTRL_RX feature not negotiated"));
+
+ /*
+ * Allocate the MAC filtering table. Note we could do this
+ * at attach time, but it is probably not worth keeping it
+ * around for an infrequent occurrence.
+ */
+ filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (filter == NULL) {
+ device_printf(sc->vtnet_dev,
+ "cannot allocate MAC address filtering table\n");
+ return;
+ }
+
+ /* Unicast MAC addresses: */
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_LINK)
+ continue;
+ else if (ucnt == VTNET_MAX_MAC_ENTRIES)
+ break;
+
+ bcopy(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
+ &filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN);
+ ucnt++;
+ }
+ if_addr_runlock(ifp);
+
+ if (ucnt >= VTNET_MAX_MAC_ENTRIES) {
+ promisc = 1;
+ filter->vmf_unicast.nentries = 0;
+
+ if_printf(ifp, "more than %d MAC addresses assigned, "
+ "falling back to promiscuous mode\n",
+ VTNET_MAX_MAC_ENTRIES);
+ } else
+ filter->vmf_unicast.nentries = ucnt;
+
+ /* Multicast MAC addresses: */
+ if_maddr_rlock(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_LINK)
+ continue;
+ else if (mcnt == VTNET_MAX_MAC_ENTRIES)
+ break;
+
+ bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
+ &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN);
+ mcnt++;
+ }
+ if_maddr_runlock(ifp);
+
+ if (mcnt >= VTNET_MAX_MAC_ENTRIES) {
+ allmulti = 1;
+ filter->vmf_multicast.nentries = 0;
+
+ if_printf(ifp, "more than %d multicast MAC addresses "
+ "assigned, falling back to all-multicast mode\n",
+ VTNET_MAX_MAC_ENTRIES);
+ } else
+ filter->vmf_multicast.nentries = mcnt;
+
+ if (promisc && allmulti)
+ goto out;
+
+ hdr.class = VIRTIO_NET_CTRL_MAC;
+ hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
+ ack = VIRTIO_NET_ERR;
+
+ sglist_init(&sg, 4, segs);
+ error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
+ error |= sglist_append(&sg, &filter->vmf_unicast,
+ sizeof(struct vtnet_mac_table));
+ error |= sglist_append(&sg, &filter->vmf_multicast,
+ sizeof(struct vtnet_mac_table));
+ error |= sglist_append(&sg, &ack, sizeof(uint8_t));
+ KASSERT(error == 0 && sg.sg_nseg == 4,
+ ("error adding MAC filtering message to sglist"));
+
+ vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
+
+ if (ack != VIRTIO_NET_OK)
+ if_printf(ifp, "error setting host MAC filter table\n");
+
+out:
+ free(filter, M_DEVBUF);
+
+ if (promisc)
+ if (vtnet_set_promisc(sc, 1) != 0)
+ if_printf(ifp, "cannot enable promiscuous mode\n");
+ if (allmulti)
+ if (vtnet_set_allmulti(sc, 1) != 0)
+ if_printf(ifp, "cannot enable all-multicast mode\n");
+}
+
+static int
+vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
+{
+ struct virtio_net_ctrl_hdr hdr;
+ struct sglist_seg segs[3];
+ struct sglist sg;
+ uint8_t ack;
+ int error;
+
+ hdr.class = VIRTIO_NET_CTRL_VLAN;
+ hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL;
+ ack = VIRTIO_NET_ERR;
+ error = 0;
+
+ sglist_init(&sg, 3, segs);
+ error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
+ error |= sglist_append(&sg, &tag, sizeof(uint16_t));
+ error |= sglist_append(&sg, &ack, sizeof(uint8_t));
+ KASSERT(error == 0 && sg.sg_nseg == 3,
+ ("error adding VLAN control message to sglist"));
+
+ vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
+
+ return (ack == VIRTIO_NET_OK ? 0 : EIO);
+}
+
+static void
+vtnet_rx_filter_vlan(struct vtnet_softc *sc)
+{
+ device_t dev;
+ uint32_t w, mask;
+ uint16_t tag;
+ int i, nvlans, error;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER,
+ ("VLAN_FILTER feature not negotiated"));
+
+ dev = sc->vtnet_dev;
+ nvlans = sc->vtnet_nvlans;
+ error = 0;
+
+ /* Enable filtering for each configured VLAN. */
+ for (i = 0; i < VTNET_VLAN_SHADOW_SIZE && nvlans > 0; i++) {
+ w = sc->vtnet_vlan_shadow[i];
+ for (mask = 1, tag = i * 32; w != 0; mask <<= 1, tag++) {
+ if ((w & mask) != 0) {
+ w &= ~mask;
+ nvlans--;
+ if (vtnet_exec_vlan_filter(sc, 1, tag) != 0)
+ error++;
+ }
+ }
+ }
+
+ KASSERT(nvlans == 0, ("VLAN count incorrect"));
+ if (error)
+ device_printf(dev, "cannot restore VLAN filter table\n");
+}
+
+static void
+vtnet_set_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
+{
+ struct ifnet *ifp;
+ int idx, bit;
+
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER,
+ ("VLAN_FILTER feature not negotiated"));
+
+ if ((tag == 0) || (tag > 4095))
+ return;
+
+ ifp = sc->vtnet_ifp;
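+
+	/* The shadow table is a bitmap: 32 VLAN IDs per 32-bit word. */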
+ idx = (tag >> 5) & 0x7F;
+ bit = tag & 0x1F;
+
+ VTNET_LOCK(sc);
+
+ /* Update shadow VLAN table. */
+ if (add) {
+ sc->vtnet_nvlans++;
+ sc->vtnet_vlan_shadow[idx] |= (1 << bit);
+ } else {
+ sc->vtnet_nvlans--;
+ sc->vtnet_vlan_shadow[idx] &= ~(1 << bit);
+ }
+
+ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) {
+ if (vtnet_exec_vlan_filter(sc, add, tag) != 0) {
+ device_printf(sc->vtnet_dev,
+ "cannot %s VLAN %d %s the host filter table\n",
+ add ? "add" : "remove", tag,
+ add ? "to" : "from");
+ }
+ }
+
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag)
+{
+
+ if (ifp->if_softc != arg)
+ return;
+
+ vtnet_set_vlan_filter(arg, 1, tag);
+}
+
+static void
+vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag)
+{
+
+ if (ifp->if_softc != arg)
+ return;
+
+ vtnet_set_vlan_filter(arg, 0, tag);
+}
+
+static int
+vtnet_ifmedia_upd(struct ifnet *ifp)
+{
+ struct vtnet_softc *sc;
+ struct ifmedia *ifm;
+
+ sc = ifp->if_softc;
+ ifm = &sc->vtnet_media;
+
+ if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
+ return (EINVAL);
+
+ return (0);
+}
+
+static void
+vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ struct vtnet_softc *sc;
+
+ sc = ifp->if_softc;
+
+ ifmr->ifm_status = IFM_AVALID;
+ ifmr->ifm_active = IFM_ETHER;
+
+ VTNET_LOCK(sc);
+ if (vtnet_is_link_up(sc) != 0) {
+ ifmr->ifm_status |= IFM_ACTIVE;
+ ifmr->ifm_active |= VTNET_MEDIATYPE;
+ } else
+ ifmr->ifm_active |= IFM_NONE;
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_add_statistics(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct vtnet_statistics *stats;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *tree;
+ struct sysctl_oid_list *child;
+
+ dev = sc->vtnet_dev;
+ stats = &sc->vtnet_stats;
+ ctx = device_get_sysctl_ctx(dev);
+ tree = device_get_sysctl_tree(dev);
+ child = SYSCTL_CHILDREN(tree);
+
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_alloc_failed",
+ CTLFLAG_RD, &stats->mbuf_alloc_failed,
+ "Mbuf cluster allocation failures");
+
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_frame_too_large",
+ CTLFLAG_RD, &stats->rx_frame_too_large,
+ "Received frame larger than the mbuf chain");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_enq_replacement_failed",
+ CTLFLAG_RD, &stats->rx_enq_replacement_failed,
+ "Enqueuing the replacement receive mbuf failed");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_mergeable_failed",
+ CTLFLAG_RD, &stats->rx_mergeable_failed,
+ "Mergeable buffers receive failures");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ethtype",
+ CTLFLAG_RD, &stats->rx_csum_bad_ethtype,
+ "Received checksum offloaded buffer with unsupported "
+ "Ethernet type");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_start",
+ CTLFLAG_RD, &stats->rx_csum_bad_start,
+ "Received checksum offloaded buffer with incorrect start offset");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ipproto",
+ CTLFLAG_RD, &stats->rx_csum_bad_ipproto,
+ "Received checksum offloaded buffer with incorrect IP protocol");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_offset",
+ CTLFLAG_RD, &stats->rx_csum_bad_offset,
+ "Received checksum offloaded buffer with incorrect offset");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_failed",
+ CTLFLAG_RD, &stats->rx_csum_failed,
+ "Received buffer checksum offload failed");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_offloaded",
+ CTLFLAG_RD, &stats->rx_csum_offloaded,
+ "Received buffer checksum offload succeeded");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_task_rescheduled",
+ CTLFLAG_RD, &stats->rx_task_rescheduled,
+ "Times the receive interrupt task rescheduled itself");
+
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_offloaded",
+ CTLFLAG_RD, &stats->tx_csum_offloaded,
+ "Offloaded checksum of transmitted buffer");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_offloaded",
+ CTLFLAG_RD, &stats->tx_tso_offloaded,
+ "Segmentation offload of transmitted buffer");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_bad_ethtype",
+ CTLFLAG_RD, &stats->tx_csum_bad_ethtype,
+ "Aborted transmit of checksum offloaded buffer with unknown "
+ "Ethernet type");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_bad_ethtype",
+ CTLFLAG_RD, &stats->tx_tso_bad_ethtype,
+ "Aborted transmit of TSO buffer with unknown Ethernet type");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_task_rescheduled",
+ CTLFLAG_RD, &stats->tx_task_rescheduled,
+ "Times the transmit interrupt task rescheduled itself");
+}
+
+static int
+vtnet_enable_rx_intr(struct vtnet_softc *sc)
+{
+
+ return (virtqueue_enable_intr(sc->vtnet_rx_vq));
+}
+
+static void
+vtnet_disable_rx_intr(struct vtnet_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtnet_rx_vq);
+}
+
+static int
+vtnet_enable_tx_intr(struct vtnet_softc *sc)
+{
+
+#ifdef VTNET_TX_INTR_MODERATION
+ return (0);
+#else
+ return (virtqueue_enable_intr(sc->vtnet_tx_vq));
+#endif
+}
+
+static void
+vtnet_disable_tx_intr(struct vtnet_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtnet_tx_vq);
+}
diff --git a/sys/dev/virtio/network/if_vtnetvar.h b/sys/dev/virtio/network/if_vtnetvar.h
new file mode 100644
index 0000000..613b2b0
--- /dev/null
+++ b/sys/dev/virtio/network/if_vtnetvar.h
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IF_VTNETVAR_H
+#define _IF_VTNETVAR_H
+
+struct vtnet_statistics {
+ unsigned long mbuf_alloc_failed;
+
+ unsigned long rx_frame_too_large;
+ unsigned long rx_enq_replacement_failed;
+ unsigned long rx_mergeable_failed;
+ unsigned long rx_csum_bad_ethtype;
+ unsigned long rx_csum_bad_start;
+ unsigned long rx_csum_bad_ipproto;
+ unsigned long rx_csum_bad_offset;
+ unsigned long rx_csum_failed;
+ unsigned long rx_csum_offloaded;
+ unsigned long rx_task_rescheduled;
+
+ unsigned long tx_csum_offloaded;
+ unsigned long tx_tso_offloaded;
+ unsigned long tx_csum_bad_ethtype;
+ unsigned long tx_tso_bad_ethtype;
+ unsigned long tx_task_rescheduled;
+};
+
+struct vtnet_softc {
+ device_t vtnet_dev;
+ struct ifnet *vtnet_ifp;
+ struct mtx vtnet_mtx;
+
+ uint32_t vtnet_flags;
+#define VTNET_FLAG_LINK 0x0001
+#define VTNET_FLAG_SUSPENDED 0x0002
+#define VTNET_FLAG_CTRL_VQ 0x0004
+#define VTNET_FLAG_CTRL_RX 0x0008
+#define VTNET_FLAG_VLAN_FILTER 0x0010
+#define VTNET_FLAG_TSO_ECN 0x0020
+#define VTNET_FLAG_MRG_RXBUFS 0x0040
+#define VTNET_FLAG_LRO_NOMRG 0x0080
+
+ struct virtqueue *vtnet_rx_vq;
+ struct virtqueue *vtnet_tx_vq;
+ struct virtqueue *vtnet_ctrl_vq;
+
+ int vtnet_hdr_size;
+ int vtnet_tx_size;
+ int vtnet_rx_size;
+ int vtnet_rx_process_limit;
+ int vtnet_rx_mbuf_size;
+ int vtnet_rx_mbuf_count;
+ int vtnet_if_flags;
+ int vtnet_watchdog_timer;
+ uint64_t vtnet_features;
+
+ struct taskqueue *vtnet_tq;
+ struct task vtnet_rx_intr_task;
+ struct task vtnet_tx_intr_task;
+ struct task vtnet_cfgchg_task;
+
+ struct vtnet_statistics vtnet_stats;
+
+ struct callout vtnet_tick_ch;
+
+ eventhandler_tag vtnet_vlan_attach;
+ eventhandler_tag vtnet_vlan_detach;
+
+ struct ifmedia vtnet_media;
+ /*
+ * Fake media type; the host does not provide us with
+ * any real media information.
+ */
+#define VTNET_MEDIATYPE (IFM_ETHER | IFM_1000_T | IFM_FDX)
+ char vtnet_hwaddr[ETHER_ADDR_LEN];
+
+ /*
+ * During reset, the host's VLAN filtering table is lost. The
+ * array below is used to restore all the VLANs configured on
+ * this interface after a reset.
+ */
+#define VTNET_VLAN_SHADOW_SIZE (4096 / 32)
+ int vtnet_nvlans;
+ uint32_t vtnet_vlan_shadow[VTNET_VLAN_SHADOW_SIZE];
+
+ char vtnet_mtx_name[16];
+};
+
+/*
+ * When mergeable buffers are not negotiated, the vtnet_rx_header structure
+ * below is placed at the beginning of the mbuf data. Use 4 bytes of pad
+ * both to keep the VirtIO header and the data non-contiguous and to keep
+ * the frame's payload 4 byte aligned.
+ *
+ * When mergeable buffers are negotiated, the host puts the VirtIO header in
+ * the beginning of the first mbuf's data.
+ */
+#define VTNET_RX_HEADER_PAD 4
+struct vtnet_rx_header {
+ struct virtio_net_hdr vrh_hdr;
+ char vrh_pad[VTNET_RX_HEADER_PAD];
+} __packed;
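+
+/*
+ * Illustration (not part of the driver): sizeof(struct virtio_net_hdr) is
+ * 10 bytes, so with the 4 pad bytes the Ethernet header can begin 14 bytes
+ * into the mbuf data, and the IP header that follows it lands on a 4 byte
+ * boundary (10 + 4 + ETHER_HDR_LEN = 28), assuming the frame immediately
+ * follows the pad.
+ */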
+
+/*
+ * For each outgoing frame, the vtnet_tx_header below is allocated from
+ * the vtnet_tx_header_zone.
+ */
+struct vtnet_tx_header {
+ union {
+ struct virtio_net_hdr hdr;
+ struct virtio_net_hdr_mrg_rxbuf mhdr;
+ } vth_uhdr;
+
+ struct mbuf *vth_mbuf;
+};
+
+/*
+ * The VirtIO specification does not place a limit on the number of MAC
+ * addresses the guest driver may request to be filtered. In practice,
+ * the host is constrained by available resources. To simplify this driver,
+ * impose a reasonably high limit on the MAC addresses we will filter
+ * before falling back to promiscuous or all-multicast modes.
+ */
+#define VTNET_MAX_MAC_ENTRIES 128
+
+struct vtnet_mac_table {
+ uint32_t nentries;
+ uint8_t macs[VTNET_MAX_MAC_ENTRIES][ETHER_ADDR_LEN];
+} __packed;
+
+struct vtnet_mac_filter {
+ struct vtnet_mac_table vmf_unicast;
+ uint32_t vmf_pad; /* Make tables non-contiguous. */
+ struct vtnet_mac_table vmf_multicast;
+};
+
+/*
+ * The MAC filter table is malloc(9)'d when needed. Ensure it will
+ * always fit in one segment.
+ */
+CTASSERT(sizeof(struct vtnet_mac_filter) <= PAGE_SIZE);
+
+#define VTNET_WATCHDOG_TIMEOUT 5
+#define VTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP | CSUM_SCTP)
+
+/* Features desired/implemented by this driver. */
+#define VTNET_FEATURES \
+ (VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_STATUS | \
+ VIRTIO_NET_F_CTRL_VQ | \
+ VIRTIO_NET_F_CTRL_RX | \
+ VIRTIO_NET_F_CTRL_VLAN | \
+ VIRTIO_NET_F_CSUM | \
+ VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_TSO6 | \
+ VIRTIO_NET_F_HOST_ECN | \
+ VIRTIO_NET_F_GUEST_CSUM | \
+ VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | \
+ VIRTIO_NET_F_GUEST_ECN | \
+ VIRTIO_NET_F_MRG_RXBUF | \
+ VIRTIO_RING_F_INDIRECT_DESC)
+
+/*
+ * The VIRTIO_NET_F_GUEST_TSO[46] features permit the host to send us
+ * frames larger than 1514 bytes. We do not yet support software LRO
+ * via tcp_lro_rx().
+ */
+#define VTNET_LRO_FEATURES (VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN)
+
+#define VTNET_MAX_MTU 65536
+#define VTNET_MAX_RX_SIZE 65550
+
+/*
+ * Used to preallocate the virtqueue indirect descriptors. The first
+ * segment is reserved for the header.
+ */
+#define VTNET_MIN_RX_SEGS 2
+#define VTNET_MAX_RX_SEGS 34
+#define VTNET_MAX_TX_SEGS 34
+
+/*
+ * Assert we can receive and transmit the maximum with regular
+ * size clusters.
+ */
+CTASSERT(((VTNET_MAX_RX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_RX_SIZE);
+CTASSERT(((VTNET_MAX_TX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_MTU);
+
+/*
+ * Determine how many mbufs are in each receive buffer. For LRO without
+ * mergeable descriptors, we must allocate an mbuf chain large enough to
+ * hold both the vtnet_rx_header and the maximum receivable data.
+ */
+#define VTNET_NEEDED_RX_MBUFS(_sc) \
+ ((_sc)->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0 ? 1 : \
+ howmany(sizeof(struct vtnet_rx_header) + VTNET_MAX_RX_SIZE, \
+ (_sc)->vtnet_rx_mbuf_size)
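+
+/*
+ * Worked example (illustration only): with VTNET_FLAG_LRO_NOMRG set and
+ * vtnet_rx_mbuf_size equal to MCLBYTES (2048), the macro above evaluates
+ * to howmany(14 + 65550, 2048) = howmany(65564, 2048) = 33 clusters per
+ * receive buffer, consistent with the VTNET_MAX_RX_SEGS assertion above.
+ */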
+
+#define VTNET_MTX(_sc) &(_sc)->vtnet_mtx
+#define VTNET_LOCK(_sc) mtx_lock(VTNET_MTX((_sc)))
+#define VTNET_UNLOCK(_sc) mtx_unlock(VTNET_MTX((_sc)))
+#define VTNET_LOCK_DESTROY(_sc) mtx_destroy(VTNET_MTX((_sc)))
+#define VTNET_LOCK_ASSERT(_sc) mtx_assert(VTNET_MTX((_sc)), MA_OWNED)
+#define VTNET_LOCK_ASSERT_NOTOWNED(_sc) \
+ mtx_assert(VTNET_MTX((_sc)), MA_NOTOWNED)
+
+#define VTNET_LOCK_INIT(_sc) do { \
+ snprintf((_sc)->vtnet_mtx_name, sizeof((_sc)->vtnet_mtx_name), \
+ "%s", device_get_nameunit((_sc)->vtnet_dev)); \
+ mtx_init(VTNET_MTX((_sc)), (_sc)->vtnet_mtx_name, \
+ "VTNET Core Lock", MTX_DEF); \
+} while (0)
+
+#endif /* _IF_VTNETVAR_H */
diff --git a/sys/dev/virtio/network/virtio_net.h b/sys/dev/virtio/network/virtio_net.h
new file mode 100644
index 0000000..7361aa1
--- /dev/null
+++ b/sys/dev/virtio/network/virtio_net.h
@@ -0,0 +1,138 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_NET_H
+#define _VIRTIO_NET_H
+
+#include <sys/types.h>
+
+/* The feature bitmap for virtio net */
+#define VIRTIO_NET_F_CSUM 0x00001 /* Host handles pkts w/ partial csum */
+#define VIRTIO_NET_F_GUEST_CSUM 0x00002 /* Guest handles pkts w/ partial csum*/
+#define VIRTIO_NET_F_MAC 0x00020 /* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO 0x00040 /* Host handles pkts w/ any GSO type */
+#define VIRTIO_NET_F_GUEST_TSO4 0x00080 /* Guest can handle TSOv4 in. */
+#define VIRTIO_NET_F_GUEST_TSO6 0x00100 /* Guest can handle TSOv6 in. */
+#define VIRTIO_NET_F_GUEST_ECN 0x00200 /* Guest can handle TSO[6] w/ ECN in.*/
+#define VIRTIO_NET_F_GUEST_UFO 0x00400 /* Guest can handle UFO in. */
+#define VIRTIO_NET_F_HOST_TSO4 0x00800 /* Host can handle TSOv4 in. */
+#define VIRTIO_NET_F_HOST_TSO6 0x01000 /* Host can handle TSOv6 in. */
+#define VIRTIO_NET_F_HOST_ECN 0x02000 /* Host can handle TSO[6] w/ ECN in. */
+#define VIRTIO_NET_F_HOST_UFO 0x04000 /* Host can handle UFO in. */
+#define VIRTIO_NET_F_MRG_RXBUF 0x08000 /* Host can merge receive buffers. */
+#define VIRTIO_NET_F_STATUS 0x10000 /* virtio_net_config.status available*/
+#define VIRTIO_NET_F_CTRL_VQ 0x20000 /* Control channel available */
+#define VIRTIO_NET_F_CTRL_RX 0x40000 /* Control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN 0x80000 /* Control channel VLAN filtering */
+#define VIRTIO_NET_F_CTRL_RX_EXTRA 0x100000 /* Extra RX mode control support */
+
+#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */
+
+struct virtio_net_config {
+ /* The config defining mac address (if VIRTIO_NET_F_MAC) */
+ uint8_t mac[ETHER_ADDR_LEN];
+ /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
+ uint16_t status;
+} __packed;
+
+/*
+ * This is the first element of the scatter-gather list. If you don't
+ * specify GSO or CSUM features, you can simply ignore the header.
+ */
+struct virtio_net_hdr {
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start,csum_offset*/
+ uint8_t flags;
+#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */
+#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */
+#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */
+#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */
+#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */
+ uint8_t gso_type;
+ uint16_t hdr_len; /* Ethernet + IP + tcp/udp hdrs */
+ uint16_t gso_size; /* Bytes to append to hdr_len per frame */
+ uint16_t csum_start; /* Position to start checksumming from */
+ uint16_t csum_offset; /* Offset after that to place checksum */
+};
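+
+/*
+ * Example (sketch only, not part of this header): filling the header for a
+ * frame whose TCP checksum is to be computed by the host. The offsets shown
+ * assume an untagged IPv4 frame without IP options.
+ */
+#if 0
+	struct virtio_net_hdr hdr;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+	hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+	hdr.csum_start = ETHER_HDR_LEN + sizeof(struct ip);	/* 14 + 20 */
+	hdr.csum_offset = offsetof(struct tcphdr, th_sum);	/* 16 */
+#endif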
+
+/*
+ * This is the version of the header to use when the MRG_RXBUF
+ * feature has been negotiated.
+ */
+struct virtio_net_hdr_mrg_rxbuf {
+ struct virtio_net_hdr hdr;
+ uint16_t num_buffers; /* Number of merged rx buffers */
+};
+
+/*
+ * Control virtqueue data structures
+ *
+ * The control virtqueue expects a header in the first sg entry
+ * and an ack/status response in the last entry. Data for the
+ * command goes in between.
+ */
+struct virtio_net_ctrl_hdr {
+ uint8_t class;
+ uint8_t cmd;
+} __packed;
+
+typedef uint8_t virtio_net_ctrl_ack;
+
+#define VIRTIO_NET_OK 0
+#define VIRTIO_NET_ERR 1
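+
+/*
+ * Example (sketch only): laying out a control command as described above
+ * with sglist(9). `data' and `data_len' stand in for the command specific
+ * payload; error handling is omitted.
+ */
+#if 0
+	struct virtio_net_ctrl_hdr hdr;
+	virtio_net_ctrl_ack ack;
+	struct sglist_seg segs[3];
+	struct sglist sg;
+
+	sglist_init(&sg, 3, segs);
+	sglist_append(&sg, &hdr, sizeof(hdr));	/* read by the host */
+	sglist_append(&sg, data, data_len);	/* command payload */
+	sglist_append(&sg, &ack, sizeof(ack));	/* written by the host */
+#endif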
+
+/*
+ * Control the RX mode, i.e. promiscuous, allmulti, etc.
+ * All commands require an "out" sg entry containing a 1 byte
+ * state value, zero = disable, non-zero = enable. Commands
+ * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature.
+ * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA.
+ */
+#define VIRTIO_NET_CTRL_RX 0
+#define VIRTIO_NET_CTRL_RX_PROMISC 0
+#define VIRTIO_NET_CTRL_RX_ALLMULTI 1
+#define VIRTIO_NET_CTRL_RX_ALLUNI 2
+#define VIRTIO_NET_CTRL_RX_NOMULTI 3
+#define VIRTIO_NET_CTRL_RX_NOUNI 4
+#define VIRTIO_NET_CTRL_RX_NOBCAST 5
+
+/*
+ * Control the MAC filter table.
+ *
+ * The MAC filter table is managed by the hypervisor; the guest should
+ * assume its size is infinite. Filtering should be considered
+ * non-perfect, i.e. depending on hypervisor resources, the guest may
+ * receive packets from sources not specified in the filter list.
+ *
+ * In addition to the class/cmd header, the TABLE_SET command requires
+ * two out scatterlists. Each contains a 4 byte count of entries followed
+ * by a concatenated byte stream of the ETH_ALEN MAC addresses. The
+ * first sg list contains unicast addresses, the second is for multicast.
+ * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
+ * is available.
+ */
+struct virtio_net_ctrl_mac {
+ uint32_t entries;
+ uint8_t macs[][ETHER_ADDR_LEN];
+} __packed;
+
+#define VIRTIO_NET_CTRL_MAC 1
+#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
+
+/*
+ * Control VLAN filtering
+ *
+ * The VLAN filter table is controlled via a simple ADD/DEL interface.
+ * VLAN IDs not added may be filtered by the hypervisor. Del is the
+ * opposite of add. Both commands expect an out entry containing a 2
+ * byte VLAN ID. VLAN filtering is available with the
+ * VIRTIO_NET_F_CTRL_VLAN feature bit.
+ */
+#define VIRTIO_NET_CTRL_VLAN 2
+#define VIRTIO_NET_CTRL_VLAN_ADD 0
+#define VIRTIO_NET_CTRL_VLAN_DEL 1
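+
+/*
+ * Example (sketch only): the out entries for adding VLAN 100 to the host
+ * filter are the class/cmd header followed by the 2 byte VLAN ID.
+ */
+#if 0
+	struct virtio_net_ctrl_hdr hdr = {
+		.class = VIRTIO_NET_CTRL_VLAN,
+		.cmd = VIRTIO_NET_CTRL_VLAN_ADD
+	};
+	uint16_t tag = 100;	/* VLAN ID to add */
+#endif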
+
+#endif /* _VIRTIO_NET_H */
diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c
new file mode 100644
index 0000000..dd348a5
--- /dev/null
+++ b/sys/dev/virtio/pci/virtio_pci.c
@@ -0,0 +1,1081 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for the VirtIO PCI interface. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/pci/virtio_pci.h>
+
+#include "virtio_bus_if.h"
+#include "virtio_if.h"
+
+struct vtpci_softc {
+ device_t vtpci_dev;
+ struct resource *vtpci_res;
+ struct resource *vtpci_msix_res;
+ uint64_t vtpci_features;
+ uint32_t vtpci_flags;
+#define VIRTIO_PCI_FLAG_NO_MSI 0x0001
+#define VIRTIO_PCI_FLAG_MSI 0x0002
+#define VIRTIO_PCI_FLAG_NO_MSIX 0x0010
+#define VIRTIO_PCI_FLAG_MSIX 0x0020
+#define VIRTIO_PCI_FLAG_SHARED_MSIX 0x0040
+
+ device_t vtpci_child_dev;
+ struct virtio_feature_desc *vtpci_child_feat_desc;
+
+ /*
+ * Ideally, each virtqueue that the driver provides a callback for
+ * will receive its own MSIX vector. If there are not sufficient
+ * vectors available, we will then attempt to have all the VQs
+ * share one vector. Note that when using MSIX, the configuration
+ * change notifications must be on their own vector.
+ *
+ * If MSIX is not available, we will attempt to have the whole
+ * device share one MSI vector, and then, finally, one legacy
+ * interrupt.
+ */
+ int vtpci_nvqs;
+ struct vtpci_virtqueue {
+ struct virtqueue *vq;
+
+ /* Index into vtpci_intr_res[] below; -1 if unused. */
+ int ires_idx;
+ } vtpci_vqx[VIRTIO_MAX_VIRTQUEUES];
+
+ /*
+ * When using MSIX interrupts, the first element of vtpci_intr_res[]
+ * is always used for the configuration change notifications. The remaining
+ * element(s) are used for the virtqueues.
+ *
+ * With MSI and legacy interrupts, only the first element of
+ * vtpci_intr_res[] is used.
+ */
+ int vtpci_nintr_res;
+ struct vtpci_intr_resource {
+ struct resource *irq;
+ int rid;
+ void *intrhand;
+ } vtpci_intr_res[1 + VIRTIO_MAX_VIRTQUEUES];
+};
+
+static int vtpci_probe(device_t);
+static int vtpci_attach(device_t);
+static int vtpci_detach(device_t);
+static int vtpci_suspend(device_t);
+static int vtpci_resume(device_t);
+static int vtpci_shutdown(device_t);
+static void vtpci_driver_added(device_t, driver_t *);
+static void vtpci_child_detached(device_t, device_t);
+static int vtpci_read_ivar(device_t, device_t, int, uintptr_t *);
+static int vtpci_write_ivar(device_t, device_t, int, uintptr_t);
+
+static uint64_t vtpci_negotiate_features(device_t, uint64_t);
+static int vtpci_with_feature(device_t, uint64_t);
+static int vtpci_alloc_virtqueues(device_t, int, int,
+ struct vq_alloc_info *);
+static int vtpci_setup_intr(device_t, enum intr_type);
+static void vtpci_stop(device_t);
+static int vtpci_reinit(device_t, uint64_t);
+static void vtpci_reinit_complete(device_t);
+static void vtpci_notify_virtqueue(device_t, uint16_t);
+static uint8_t vtpci_get_status(device_t);
+static void vtpci_set_status(device_t, uint8_t);
+static void vtpci_read_dev_config(device_t, bus_size_t, void *, int);
+static void vtpci_write_dev_config(device_t, bus_size_t, void *, int);
+
+static void vtpci_describe_features(struct vtpci_softc *, const char *,
+ uint64_t);
+static void vtpci_probe_and_attach_child(struct vtpci_softc *);
+
+static int vtpci_alloc_interrupts(struct vtpci_softc *, int, int,
+ struct vq_alloc_info *);
+static int vtpci_alloc_intr_resources(struct vtpci_softc *, int,
+ struct vq_alloc_info *);
+static int vtpci_alloc_msi(struct vtpci_softc *);
+static int vtpci_alloc_msix(struct vtpci_softc *, int);
+static int vtpci_register_msix_vector(struct vtpci_softc *, int, int);
+
+static void vtpci_free_interrupts(struct vtpci_softc *);
+static void vtpci_free_virtqueues(struct vtpci_softc *);
+static void vtpci_release_child_resources(struct vtpci_softc *);
+static void vtpci_reset(struct vtpci_softc *);
+
+static int vtpci_legacy_intr(void *);
+static int vtpci_vq_shared_intr(void *);
+static int vtpci_vq_intr(void *);
+static int vtpci_config_intr(void *);
+
+/*
+ * I/O port read/write wrappers.
+ */
+#define vtpci_read_config_1(sc, o) bus_read_1((sc)->vtpci_res, (o))
+#define vtpci_read_config_2(sc, o) bus_read_2((sc)->vtpci_res, (o))
+#define vtpci_read_config_4(sc, o) bus_read_4((sc)->vtpci_res, (o))
+#define vtpci_write_config_1(sc, o, v) bus_write_1((sc)->vtpci_res, (o), (v))
+#define vtpci_write_config_2(sc, o, v) bus_write_2((sc)->vtpci_res, (o), (v))
+#define vtpci_write_config_4(sc, o, v) bus_write_4((sc)->vtpci_res, (o), (v))
+
+/* Tunables. */
+static int vtpci_disable_msix = 0;
+TUNABLE_INT("hw.virtio.pci.disable_msix", &vtpci_disable_msix);
+
+static device_method_t vtpci_methods[] = {
+ /* Device interface. */
+ DEVMETHOD(device_probe, vtpci_probe),
+ DEVMETHOD(device_attach, vtpci_attach),
+ DEVMETHOD(device_detach, vtpci_detach),
+ DEVMETHOD(device_suspend, vtpci_suspend),
+ DEVMETHOD(device_resume, vtpci_resume),
+ DEVMETHOD(device_shutdown, vtpci_shutdown),
+
+ /* Bus interface. */
+ DEVMETHOD(bus_driver_added, vtpci_driver_added),
+ DEVMETHOD(bus_child_detached, vtpci_child_detached),
+ DEVMETHOD(bus_read_ivar, vtpci_read_ivar),
+ DEVMETHOD(bus_write_ivar, vtpci_write_ivar),
+
+ /* VirtIO bus interface. */
+ DEVMETHOD(virtio_bus_negotiate_features, vtpci_negotiate_features),
+ DEVMETHOD(virtio_bus_with_feature, vtpci_with_feature),
+ DEVMETHOD(virtio_bus_alloc_virtqueues, vtpci_alloc_virtqueues),
+ DEVMETHOD(virtio_bus_setup_intr, vtpci_setup_intr),
+ DEVMETHOD(virtio_bus_stop, vtpci_stop),
+ DEVMETHOD(virtio_bus_reinit, vtpci_reinit),
+ DEVMETHOD(virtio_bus_reinit_complete, vtpci_reinit_complete),
+ DEVMETHOD(virtio_bus_notify_vq, vtpci_notify_virtqueue),
+ DEVMETHOD(virtio_bus_read_device_config, vtpci_read_dev_config),
+ DEVMETHOD(virtio_bus_write_device_config, vtpci_write_dev_config),
+
+ { 0, 0 }
+};
+
+static driver_t vtpci_driver = {
+ "virtio_pci",
+ vtpci_methods,
+ sizeof(struct vtpci_softc)
+};
+
+devclass_t vtpci_devclass;
+
+DRIVER_MODULE(virtio_pci, pci, vtpci_driver, vtpci_devclass, 0, 0);
+MODULE_VERSION(virtio_pci, 1);
+MODULE_DEPEND(virtio_pci, pci, 1, 1, 1);
+MODULE_DEPEND(virtio_pci, virtio, 1, 1, 1);
+
+static int
+vtpci_probe(device_t dev)
+{
+ char desc[36];
+ const char *name;
+
+ if (pci_get_vendor(dev) != VIRTIO_PCI_VENDORID)
+ return (ENXIO);
+
+ if (pci_get_device(dev) < VIRTIO_PCI_DEVICEID_MIN ||
+ pci_get_device(dev) > VIRTIO_PCI_DEVICEID_MAX)
+ return (ENXIO);
+
+ if (pci_get_revid(dev) != VIRTIO_PCI_ABI_VERSION)
+ return (ENXIO);
+
+ name = virtio_device_name(pci_get_subdevice(dev));
+ if (name == NULL)
+ name = "Unknown";
+
+ snprintf(desc, sizeof(desc), "VirtIO PCI %s adapter", name);
+ device_set_desc_copy(dev, desc);
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtpci_attach(device_t dev)
+{
+ struct vtpci_softc *sc;
+ device_t child;
+ int rid;
+
+ sc = device_get_softc(dev);
+ sc->vtpci_dev = dev;
+
+ pci_enable_busmaster(dev);
+
+ rid = PCIR_BAR(0);
+ sc->vtpci_res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
+ RF_ACTIVE);
+ if (sc->vtpci_res == NULL) {
+ device_printf(dev, "cannot map I/O space\n");
+ return (ENXIO);
+ }
+
+ if (pci_find_extcap(dev, PCIY_MSI, NULL) != 0)
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSI;
+
+ if (pci_find_extcap(dev, PCIY_MSIX, NULL) == 0) {
+ rid = PCIR_BAR(1);
+ sc->vtpci_msix_res = bus_alloc_resource_any(dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+ }
+
+ if (sc->vtpci_msix_res == NULL)
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSIX;
+
+ vtpci_reset(sc);
+
+ /* Tell the host we've noticed this device. */
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+
+ if ((child = device_add_child(dev, NULL, -1)) == NULL) {
+ device_printf(dev, "cannot create child device\n");
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+ vtpci_detach(dev);
+ return (ENOMEM);
+ }
+
+ sc->vtpci_child_dev = child;
+ vtpci_probe_and_attach_child(sc);
+
+ return (0);
+}
+
+static int
+vtpci_detach(device_t dev)
+{
+ struct vtpci_softc *sc;
+ device_t child;
+ int error;
+
+ sc = device_get_softc(dev);
+
+ if ((child = sc->vtpci_child_dev) != NULL) {
+ error = device_delete_child(dev, child);
+ if (error)
+ return (error);
+ sc->vtpci_child_dev = NULL;
+ }
+
+ vtpci_reset(sc);
+
+ if (sc->vtpci_msix_res != NULL) {
+ bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(1),
+ sc->vtpci_msix_res);
+ sc->vtpci_msix_res = NULL;
+ }
+
+ if (sc->vtpci_res != NULL) {
+ bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(0),
+ sc->vtpci_res);
+ sc->vtpci_res = NULL;
+ }
+
+ return (0);
+}
+
+static int
+vtpci_suspend(device_t dev)
+{
+
+ return (bus_generic_suspend(dev));
+}
+
+static int
+vtpci_resume(device_t dev)
+{
+
+ return (bus_generic_resume(dev));
+}
+
+static int
+vtpci_shutdown(device_t dev)
+{
+
+ (void) bus_generic_shutdown(dev);
+ /* Forcibly stop the host device. */
+ vtpci_stop(dev);
+
+ return (0);
+}
+
+static void
+vtpci_driver_added(device_t dev, driver_t *driver)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ vtpci_probe_and_attach_child(sc);
+}
+
+static void
+vtpci_child_detached(device_t dev, device_t child)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ vtpci_reset(sc);
+ vtpci_release_child_resources(sc);
+}
+
+static int
+vtpci_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtpci_child_dev != child)
+ return (ENOENT);
+
+ switch (index) {
+ case VIRTIO_IVAR_DEVTYPE:
+ *result = pci_get_subdevice(dev);
+ break;
+ default:
+ return (ENOENT);
+ }
+
+ return (0);
+}
+
+static int
+vtpci_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtpci_child_dev != child)
+ return (ENOENT);
+
+ switch (index) {
+ case VIRTIO_IVAR_FEATURE_DESC:
+ sc->vtpci_child_feat_desc = (void *) value;
+ break;
+ default:
+ return (ENOENT);
+ }
+
+ return (0);
+}
+
+static uint64_t
+vtpci_negotiate_features(device_t dev, uint64_t child_features)
+{
+ struct vtpci_softc *sc;
+ uint64_t host_features, features;
+
+ sc = device_get_softc(dev);
+
+ host_features = vtpci_read_config_4(sc, VIRTIO_PCI_HOST_FEATURES);
+ vtpci_describe_features(sc, "host", host_features);
+
+ /*
+ * Limit negotiated features to what the driver, virtqueue, and
+ * host all support.
+ */
+ features = host_features & child_features;
+ features = virtqueue_filter_features(features);
+ sc->vtpci_features = features;
+
+ vtpci_describe_features(sc, "negotiated", features);
+ vtpci_write_config_4(sc, VIRTIO_PCI_GUEST_FEATURES, features);
+
+ return (features);
+}
+
+static int
+vtpci_with_feature(device_t dev, uint64_t feature)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ return ((sc->vtpci_features & feature) != 0);
+}
+
+static int
+vtpci_alloc_virtqueues(device_t dev, int flags, int nvqs,
+ struct vq_alloc_info *vq_info)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ struct vq_alloc_info *info;
+ int queue, error;
+ uint16_t vq_size;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtpci_nvqs != 0 || nvqs <= 0 ||
+ nvqs > VIRTIO_MAX_VIRTQUEUES)
+ return (EINVAL);
+
+ error = vtpci_alloc_interrupts(sc, flags, nvqs, vq_info);
+ if (error) {
+ device_printf(dev, "cannot allocate interrupts\n");
+ return (error);
+ }
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_CONFIG_VECTOR, 0);
+ if (error)
+ return (error);
+ }
+
+ for (queue = 0; queue < nvqs; queue++) {
+ vqx = &sc->vtpci_vqx[queue];
+ info = &vq_info[queue];
+
+ vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue);
+
+ vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
+ error = virtqueue_alloc(dev, queue, vq_size,
+ VIRTIO_PCI_VRING_ALIGN, 0xFFFFFFFFUL, info, &vqx->vq);
+ if (error)
+ return (error);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx);
+ if (error)
+ return (error);
+ }
+
+ vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
+ virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+
+ *info->vqai_vq = vqx->vq;
+ sc->vtpci_nvqs++;
+ }
+
+ return (0);
+}
+
+static int
+vtpci_setup_intr(device_t dev, enum intr_type type)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_intr_resource *ires;
+ struct vtpci_virtqueue *vqx;
+ int i, flags, error;
+
+ sc = device_get_softc(dev);
+ flags = type | INTR_MPSAFE;
+ ires = &sc->vtpci_intr_res[0];
+
+ if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) == 0) {
+ error = bus_setup_intr(dev, ires->irq, flags,
+ vtpci_legacy_intr, NULL, sc, &ires->intrhand);
+
+ return (error);
+ }
+
+ error = bus_setup_intr(dev, ires->irq, flags, vtpci_config_intr,
+ NULL, sc, &ires->intrhand);
+ if (error)
+ return (error);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX) {
+ ires = &sc->vtpci_intr_res[1];
+ error = bus_setup_intr(dev, ires->irq, flags,
+ vtpci_vq_shared_intr, NULL, sc, &ires->intrhand);
+
+ return (error);
+ }
+
+ /* Setup an interrupt handler for each virtqueue. */
+ for (i = 0; i < sc->vtpci_nvqs; i++) {
+ vqx = &sc->vtpci_vqx[i];
+ if (vqx->ires_idx < 1)
+ continue;
+
+ ires = &sc->vtpci_intr_res[vqx->ires_idx];
+ error = bus_setup_intr(dev, ires->irq, flags,
+ vtpci_vq_intr, NULL, vqx->vq, &ires->intrhand);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+vtpci_stop(device_t dev)
+{
+
+ vtpci_reset(device_get_softc(dev));
+}
+
+static int
+vtpci_reinit(device_t dev, uint64_t features)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ struct virtqueue *vq;
+ int queue, error;
+ uint16_t vq_size;
+
+ sc = device_get_softc(dev);
+
+ /*
+ * Redrive the device initialization. This is a bit of an abuse
+ * of the specification, but both VirtualBox and QEMU/KVM seem
+ * to play nice. We expect the host device not to deviate from what
+ * was originally negotiated beyond what the guest driver itself
+ * changed (the MSIX state should not change, the number of virtqueues
+ * and their sizes remain the same, etc).
+ */
+
+ if (vtpci_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET)
+ vtpci_stop(dev);
+
+ /*
+ * Quickly drive the status through ACK and DRIVER. The device
+ * does not become usable again until vtpci_reinit_complete().
+ */
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
+
+ vtpci_negotiate_features(dev, features);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_CONFIG_VECTOR, 0);
+ if (error)
+ return (error);
+ }
+
+ for (queue = 0; queue < sc->vtpci_nvqs; queue++) {
+ vqx = &sc->vtpci_vqx[queue];
+ vq = vqx->vq;
+
+ KASSERT(vq != NULL, ("vq %d not allocated", queue));
+ vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue);
+
+ vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
+ error = virtqueue_reinit(vq, vq_size);
+ if (error)
+ return (error);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx);
+ if (error)
+ return (error);
+ }
+
+ vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
+ virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+ }
+
+ return (0);
+}
+
+static void
+vtpci_reinit_complete(device_t dev)
+{
+
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+static void
+vtpci_notify_virtqueue(device_t dev, uint16_t queue)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_NOTIFY, queue);
+}
+
+static uint8_t
+vtpci_get_status(device_t dev)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ return (vtpci_read_config_1(sc, VIRTIO_PCI_STATUS));
+}
+
+static void
+vtpci_set_status(device_t dev, uint8_t status)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (status != VIRTIO_CONFIG_STATUS_RESET)
+ status |= vtpci_get_status(dev);
+
+ vtpci_write_config_1(sc, VIRTIO_PCI_STATUS, status);
+}
+
+static void
+vtpci_read_dev_config(device_t dev, bus_size_t offset,
+ void *dst, int length)
+{
+ struct vtpci_softc *sc;
+ bus_size_t off;
+ uint8_t *d;
+ int size;
+
+ sc = device_get_softc(dev);
+ off = VIRTIO_PCI_CONFIG(sc) + offset;
+
+ for (d = dst; length > 0; d += size, off += size, length -= size) {
+ if (length >= 4) {
+ size = 4;
+ *(uint32_t *)d = vtpci_read_config_4(sc, off);
+ } else if (length >= 2) {
+ size = 2;
+ *(uint16_t *)d = vtpci_read_config_2(sc, off);
+ } else {
+ size = 1;
+ *d = vtpci_read_config_1(sc, off);
+ }
+ }
+}
+
+static void
+vtpci_write_dev_config(device_t dev, bus_size_t offset,
+ void *src, int length)
+{
+ struct vtpci_softc *sc;
+ bus_size_t off;
+ uint8_t *s;
+ int size;
+
+ sc = device_get_softc(dev);
+ off = VIRTIO_PCI_CONFIG(sc) + offset;
+
+ for (s = src; length > 0; s += size, off += size, length -= size) {
+ if (length >= 4) {
+ size = 4;
+ vtpci_write_config_4(sc, off, *(uint32_t *)s);
+ } else if (length >= 2) {
+ size = 2;
+ vtpci_write_config_2(sc, off, *(uint16_t *)s);
+ } else {
+ size = 1;
+ vtpci_write_config_1(sc, off, *s);
+ }
+ }
+}
+
+static void
+vtpci_describe_features(struct vtpci_softc *sc, const char *msg,
+ uint64_t features)
+{
+ device_t dev, child;
+
+ dev = sc->vtpci_dev;
+ child = sc->vtpci_child_dev;
+
+ if (device_is_attached(child) && bootverbose == 0)
+ return;
+
+ virtio_describe(dev, msg, features, sc->vtpci_child_feat_desc);
+}
+
+static void
+vtpci_probe_and_attach_child(struct vtpci_softc *sc)
+{
+ device_t dev, child;
+
+ dev = sc->vtpci_dev;
+ child = sc->vtpci_child_dev;
+
+ if (child == NULL)
+ return;
+
+ if (device_get_state(child) != DS_NOTPRESENT)
+ return;
+
+ if (device_probe(child) != 0)
+ return;
+
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
+ if (device_attach(child) != 0) {
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+ vtpci_reset(sc);
+ vtpci_release_child_resources(sc);
+
+ /* Reset status for future attempt. */
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+ } else
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+static int
+vtpci_alloc_interrupts(struct vtpci_softc *sc, int flags, int nvqs,
+ struct vq_alloc_info *vq_info)
+{
+ int i, nvectors, error;
+
+ /*
+ * Only allocate a vector for virtqueues that are actually
+ * expecting an interrupt.
+ */
+ for (nvectors = 0, i = 0; i < nvqs; i++)
+ if (vq_info[i].vqai_intr != NULL)
+ nvectors++;
+
+ if (vtpci_disable_msix != 0 ||
+ sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSIX ||
+ flags & VIRTIO_ALLOC_VQS_DISABLE_MSIX ||
+ vtpci_alloc_msix(sc, nvectors) != 0) {
+ /*
+ * Use MSI interrupts if available. Otherwise, fall back
+ * to legacy interrupts.
+ */
+ if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSI) == 0 &&
+ vtpci_alloc_msi(sc) == 0)
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSI;
+
+ sc->vtpci_nintr_res = 1;
+ }
+
+ error = vtpci_alloc_intr_resources(sc, nvqs, vq_info);
+
+ return (error);
+}
+
+static int
+vtpci_alloc_intr_resources(struct vtpci_softc *sc, int nvqs,
+ struct vq_alloc_info *vq_info)
+{
+ device_t dev;
+ struct resource *irq;
+ struct vtpci_virtqueue *vqx;
+ int i, rid, flags, res_idx;
+
+ dev = sc->vtpci_dev;
+ flags = RF_ACTIVE;
+
+ if ((sc->vtpci_flags &
+ (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) == 0) {
+ rid = 0;
+ flags |= RF_SHAREABLE;
+ } else
+ rid = 1;
+
+ for (i = 0; i < sc->vtpci_nintr_res; i++) {
+ irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, flags);
+ if (irq == NULL)
+ return (ENXIO);
+
+ sc->vtpci_intr_res[i].irq = irq;
+ sc->vtpci_intr_res[i].rid = rid++;
+ }
+
+ /*
+ * Map each virtqueue to the correct index in vtpci_intr_res[]. Note the
+ * first index is reserved for configuration change notifications.
+ */
+ for (i = 0, res_idx = 1; i < nvqs; i++) {
+ vqx = &sc->vtpci_vqx[i];
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ if (vq_info[i].vqai_intr == NULL)
+ vqx->ires_idx = -1;
+ else if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX)
+ vqx->ires_idx = res_idx;
+ else
+ vqx->ires_idx = res_idx++;
+ } else
+ vqx->ires_idx = -1;
+ }
+
+ return (0);
+}
+
+static int
+vtpci_alloc_msi(struct vtpci_softc *sc)
+{
+ device_t dev;
+ int nmsi, cnt;
+
+ dev = sc->vtpci_dev;
+ nmsi = pci_msi_count(dev);
+
+ if (nmsi < 1)
+ return (1);
+
+ cnt = 1;
+ if (pci_alloc_msi(dev, &cnt) == 0 && cnt == 1)
+ return (0);
+
+ return (1);
+}
+
+static int
+vtpci_alloc_msix(struct vtpci_softc *sc, int nvectors)
+{
+ device_t dev;
+ int nmsix, cnt, required;
+
+ dev = sc->vtpci_dev;
+
+ nmsix = pci_msix_count(dev);
+ if (nmsix < 1)
+ return (1);
+
+ /* An additional vector is needed for the config changes. */
+ required = nvectors + 1;
+ if (nmsix >= required) {
+ cnt = required;
+ if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required)
+ goto out;
+
+ pci_release_msi(dev);
+ }
+
+ /* Attempt shared MSIX configuration. */
+ required = 2;
+ if (nmsix >= required) {
+ cnt = required;
+ if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) {
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_SHARED_MSIX;
+ goto out;
+ }
+
+ pci_release_msi(dev);
+ }
+
+ return (1);
+
+out:
+ sc->vtpci_nintr_res = required;
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSIX;
+
+ if (bootverbose) {
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX)
+ device_printf(dev, "using shared virtqueue MSIX\n");
+ else
+ device_printf(dev, "using per virtqueue MSIX\n");
+ }
+
+ return (0);
+}
+
+static int
+vtpci_register_msix_vector(struct vtpci_softc *sc, int offset, int res_idx)
+{
+ device_t dev;
+ uint16_t vector;
+
+ dev = sc->vtpci_dev;
+
+ if (offset != VIRTIO_MSI_CONFIG_VECTOR &&
+ offset != VIRTIO_MSI_QUEUE_VECTOR)
+ return (EINVAL);
+
+ if (res_idx != -1) {
+ /* Map from rid to host vector. */
+ vector = sc->vtpci_intr_res[res_idx].rid - 1;
+ } else
+ vector = VIRTIO_MSI_NO_VECTOR;
+
+ /* The first resource is special; make sure it is used correctly. */
+ if (res_idx == 0) {
+ KASSERT(vector == 0, ("unexpected config vector"));
+ KASSERT(offset == VIRTIO_MSI_CONFIG_VECTOR,
+ ("unexpected config offset"));
+ }
+
+ vtpci_write_config_2(sc, offset, vector);
+
+ if (vtpci_read_config_2(sc, offset) != vector) {
+ device_printf(dev, "insufficient host resources for "
+ "MSIX interrupts\n");
+ return (ENODEV);
+ }
+
+ return (0);
+}
+
+static void
+vtpci_free_interrupts(struct vtpci_softc *sc)
+{
+ device_t dev;
+ struct vtpci_intr_resource *ires;
+ int i;
+
+ dev = sc->vtpci_dev;
+ sc->vtpci_nintr_res = 0;
+
+ if (sc->vtpci_flags & (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) {
+ pci_release_msi(dev);
+ sc->vtpci_flags &= ~(VIRTIO_PCI_FLAG_MSI |
+ VIRTIO_PCI_FLAG_MSIX | VIRTIO_PCI_FLAG_SHARED_MSIX);
+ }
+
+ for (i = 0; i < 1 + VIRTIO_MAX_VIRTQUEUES; i++) {
+ ires = &sc->vtpci_intr_res[i];
+
+ if (ires->intrhand != NULL) {
+ bus_teardown_intr(dev, ires->irq, ires->intrhand);
+ ires->intrhand = NULL;
+ }
+
+ if (ires->irq != NULL) {
+ bus_release_resource(dev, SYS_RES_IRQ, ires->rid,
+ ires->irq);
+ ires->irq = NULL;
+ }
+
+ ires->rid = -1;
+ }
+}
+
+static void
+vtpci_free_virtqueues(struct vtpci_softc *sc)
+{
+ struct vtpci_virtqueue *vqx;
+ int i;
+
+ sc->vtpci_nvqs = 0;
+
+ for (i = 0; i < VIRTIO_MAX_VIRTQUEUES; i++) {
+ vqx = &sc->vtpci_vqx[i];
+
+ if (vqx->vq != NULL) {
+ virtqueue_free(vqx->vq);
+ vqx->vq = NULL;
+ }
+ }
+}
+
+static void
+vtpci_release_child_resources(struct vtpci_softc *sc)
+{
+
+ vtpci_free_interrupts(sc);
+ vtpci_free_virtqueues(sc);
+}
+
+static void
+vtpci_reset(struct vtpci_softc *sc)
+{
+
+ /*
+ * Writing a status of RESET returns the host device to its
+ * original, uninitialized state.
+ */
+ vtpci_set_status(sc->vtpci_dev, VIRTIO_CONFIG_STATUS_RESET);
+}
+
+static int
+vtpci_legacy_intr(void *xsc)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ int i;
+ uint8_t isr;
+
+ sc = xsc;
+ vqx = &sc->vtpci_vqx[0];
+
+ /* Reading the ISR also clears it. */
+ isr = vtpci_read_config_1(sc, VIRTIO_PCI_ISR);
+
+ if (isr & VIRTIO_PCI_ISR_CONFIG)
+ vtpci_config_intr(sc);
+
+ if (isr & VIRTIO_PCI_ISR_INTR)
+ for (i = 0; i < sc->vtpci_nvqs; i++, vqx++)
+ virtqueue_intr(vqx->vq);
+
+ return (isr ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+static int
+vtpci_vq_shared_intr(void *xsc)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ int i, rc;
+
+ rc = 0;
+ sc = xsc;
+ vqx = &sc->vtpci_vqx[0];
+
+ for (i = 0; i < sc->vtpci_nvqs; i++, vqx++)
+ rc |= virtqueue_intr(vqx->vq);
+
+ return (rc ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+static int
+vtpci_vq_intr(void *xvq)
+{
+ struct virtqueue *vq;
+ int rc;
+
+ vq = xvq;
+ rc = virtqueue_intr(vq);
+
+ return (rc ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+static int
+vtpci_config_intr(void *xsc)
+{
+ struct vtpci_softc *sc;
+ device_t child;
+ int rc;
+
+ rc = 0;
+ sc = xsc;
+ child = sc->vtpci_child_dev;
+
+ if (child != NULL)
+ rc = VIRTIO_CONFIG_CHANGE(child);
+
+ return (rc ? FILTER_HANDLED : FILTER_STRAY);
+}
diff --git a/sys/dev/virtio/pci/virtio_pci.h b/sys/dev/virtio/pci/virtio_pci.h
new file mode 100644
index 0000000..6ebfdd5
--- /dev/null
+++ b/sys/dev/virtio/pci/virtio_pci.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright IBM Corp. 2007
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_PCI_H
+#define _VIRTIO_PCI_H
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_PCI_VENDORID 0x1AF4
+#define VIRTIO_PCI_DEVICEID_MIN 0x1000
+#define VIRTIO_PCI_DEVICEID_MAX 0x103F
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES 0 /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4 /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN 8 /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM 12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL 14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY 16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS 18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR 19 /* interrupt status register, reading
+ * also clears the register (8, RO) */
+/* Only if MSIX is enabled: */
+#define VIRTIO_MSI_CONFIG_VECTOR 20 /* configuration change vector (16, RW) */
+#define VIRTIO_MSI_QUEUE_VECTOR 22 /* vector for selected VQ notifications
+ (16, RW) */
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR 0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(sc) \
+ (((sc)->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) ? 24 : 20)
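+
+/*
+ * Example (illustration only): with MSIX enabled, a device's configuration
+ * space begins at I/O register 24, otherwise at 20. A network device's MAC
+ * address, at offset 0 of its configuration space, is therefore read
+ * starting at register 24 or 20; vtpci_read_dev_config() adds this base
+ * before issuing the bus_read_*() accesses.
+ */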
+
+/*
+ * How many bits to shift the physical queue address written to QUEUE_PFN.
+ * 12 is historical, due to the x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+#endif /* _VIRTIO_PCI_H */
diff --git a/sys/dev/virtio/virtio.c b/sys/dev/virtio/virtio.c
new file mode 100644
index 0000000..e385575
--- /dev/null
+++ b/sys/dev/virtio/virtio.c
@@ -0,0 +1,283 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <machine/_inttypes.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+
+#include "virtio_bus_if.h"
+
+static int virtio_modevent(module_t, int, void *);
+static const char *virtio_feature_name(uint64_t, struct virtio_feature_desc *);
+
+static struct virtio_ident {
+ uint16_t devid;
+ char *name;
+} virtio_ident_table[] = {
+ { VIRTIO_ID_NETWORK, "Network" },
+ { VIRTIO_ID_BLOCK, "Block" },
+ { VIRTIO_ID_CONSOLE, "Console" },
+ { VIRTIO_ID_ENTROPY, "Entropy" },
+ { VIRTIO_ID_BALLOON, "Balloon" },
+ { VIRTIO_ID_IOMEMORY, "IOMemory" },
+ { VIRTIO_ID_9P, "9P Transport" },
+
+ { 0, NULL }
+};
+
+/* Device independent features. */
+static struct virtio_feature_desc virtio_common_feature_desc[] = {
+ { VIRTIO_F_NOTIFY_ON_EMPTY, "NotifyOnEmpty" },
+ { VIRTIO_RING_F_INDIRECT_DESC, "RingIndirect" },
+ { VIRTIO_RING_F_EVENT_IDX, "EventIdx" },
+ { VIRTIO_F_BAD_FEATURE, "BadFeature" },
+
+ { 0, NULL }
+};
+
+const char *
+virtio_device_name(uint16_t devid)
+{
+ struct virtio_ident *ident;
+
+ for (ident = virtio_ident_table; ident->name != NULL; ident++) {
+ if (ident->devid == devid)
+ return (ident->name);
+ }
+
+ return (NULL);
+}
+
+int
+virtio_get_device_type(device_t dev)
+{
+ uintptr_t devtype;
+
+ devtype = -1;
+
+ BUS_READ_IVAR(device_get_parent(dev), dev,
+ VIRTIO_IVAR_DEVTYPE, &devtype);
+
+ return ((int) devtype);
+}
+
+void
+virtio_set_feature_desc(device_t dev,
+ struct virtio_feature_desc *feature_desc)
+{
+
+ BUS_WRITE_IVAR(device_get_parent(dev), dev,
+ VIRTIO_IVAR_FEATURE_DESC, (uintptr_t) feature_desc);
+}
+
+void
+virtio_describe(device_t dev, const char *msg,
+ uint64_t features, struct virtio_feature_desc *feature_desc)
+{
+ struct sbuf sb;
+ uint64_t val;
+ char *buf;
+ const char *name;
+ int n;
+
+ if ((buf = malloc(512, M_TEMP, M_NOWAIT)) == NULL) {
+ device_printf(dev, "%s features: 0x%"PRIx64"\n", msg,
+ features);
+ return;
+ }
+
+ sbuf_new(&sb, buf, 512, SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "%s features: 0x%"PRIx64, msg, features);
+
+ for (n = 0, val = 1ULL << 63; val != 0; val >>= 1) {
+ /*
+ * BAD_FEATURE is used to detect broken Linux clients
+ * and therefore is not applicable to FreeBSD.
+ */
+ if (((features & val) == 0) || val == VIRTIO_F_BAD_FEATURE)
+ continue;
+
+ if (n++ == 0)
+ sbuf_cat(&sb, " <");
+ else
+ sbuf_cat(&sb, ",");
+
+ name = NULL;
+ if (feature_desc != NULL)
+ name = virtio_feature_name(val, feature_desc);
+ if (name == NULL)
+ name = virtio_feature_name(val,
+ virtio_common_feature_desc);
+
+ if (name == NULL)
+ sbuf_printf(&sb, "0x%"PRIx64, val);
+ else
+ sbuf_cat(&sb, name);
+ }
+
+ if (n > 0)
+ sbuf_cat(&sb, ">");
+
+#if __FreeBSD_version < 900020
+ sbuf_finish(&sb);
+ if (sbuf_overflowed(&sb) == 0)
+#else
+ if (sbuf_finish(&sb) == 0)
+#endif
+ device_printf(dev, "%s\n", sbuf_data(&sb));
+
+ sbuf_delete(&sb);
+ free(buf, M_TEMP);
+}
+
+static const char *
+virtio_feature_name(uint64_t val, struct virtio_feature_desc *feature_desc)
+{
+ int i;
+
+ for (i = 0; feature_desc[i].vfd_val != 0; i++)
+ if (val == feature_desc[i].vfd_val)
+ return (feature_desc[i].vfd_str);
+
+ return (NULL);
+}
+
+/*
+ * VirtIO bus method wrappers.
+ */
+
+uint64_t
+virtio_negotiate_features(device_t dev, uint64_t child_features)
+{
+
+ return (VIRTIO_BUS_NEGOTIATE_FEATURES(device_get_parent(dev),
+ child_features));
+}
+
+int
+virtio_alloc_virtqueues(device_t dev, int flags, int nvqs,
+ struct vq_alloc_info *info)
+{
+
+ return (VIRTIO_BUS_ALLOC_VIRTQUEUES(device_get_parent(dev), flags,
+ nvqs, info));
+}
+
+int
+virtio_setup_intr(device_t dev, enum intr_type type)
+{
+
+ return (VIRTIO_BUS_SETUP_INTR(device_get_parent(dev), type));
+}
+
+int
+virtio_with_feature(device_t dev, uint64_t feature)
+{
+
+ return (VIRTIO_BUS_WITH_FEATURE(device_get_parent(dev), feature));
+}
+
+void
+virtio_stop(device_t dev)
+{
+
+ VIRTIO_BUS_STOP(device_get_parent(dev));
+}
+
+int
+virtio_reinit(device_t dev, uint64_t features)
+{
+
+ return (VIRTIO_BUS_REINIT(device_get_parent(dev), features));
+}
+
+void
+virtio_reinit_complete(device_t dev)
+{
+
+ VIRTIO_BUS_REINIT_COMPLETE(device_get_parent(dev));
+}
+
+void
+virtio_read_device_config(device_t dev, bus_size_t offset, void *dst, int len)
+{
+
+ VIRTIO_BUS_READ_DEVICE_CONFIG(device_get_parent(dev),
+ offset, dst, len);
+}
+
+void
+virtio_write_device_config(device_t dev, bus_size_t offset, void *dst, int len)
+{
+
+ VIRTIO_BUS_WRITE_DEVICE_CONFIG(device_get_parent(dev),
+ offset, dst, len);
+}
+
+static int
+virtio_modevent(module_t mod, int type, void *unused)
+{
+ int error;
+
+ error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t virtio_mod = {
+ "virtio",
+ virtio_modevent,
+ 0
+};
+
+DECLARE_MODULE(virtio, virtio_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(virtio, 1);
diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h
new file mode 100644
index 0000000..ebd3c74
--- /dev/null
+++ b/sys/dev/virtio/virtio.h
@@ -0,0 +1,130 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#include <sys/types.h>
+
+struct vq_alloc_info;
+
+/* VirtIO device IDs. */
+#define VIRTIO_ID_NETWORK 0x01
+#define VIRTIO_ID_BLOCK 0x02
+#define VIRTIO_ID_CONSOLE 0x03
+#define VIRTIO_ID_ENTROPY 0x04
+#define VIRTIO_ID_BALLOON 0x05
+#define VIRTIO_ID_IOMEMORY 0x06
+#define VIRTIO_ID_9P 0x09
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET 0x00
+#define VIRTIO_CONFIG_STATUS_ACK 0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER 0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FAILED 0x80
+
+/*
+ * Generate interrupt when the virtqueue ring is
+ * completely used, even if we've suppressed them.
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
+
+/*
+ * The guest should never negotiate this feature; it
+ * is used to detect faulty drivers.
+ */
+#define VIRTIO_F_BAD_FEATURE (1 << 30)
+
+/*
+ * Some VirtIO feature bits (currently bits 28 through 31) are
+ * reserved for the transport being used (e.g. virtio_ring); the
+ * rest are per-device feature bits.
+ */
+#define VIRTIO_TRANSPORT_F_START 28
+#define VIRTIO_TRANSPORT_F_END 32
+
+/*
+ * Maximum number of virtqueues per device.
+ */
+#define VIRTIO_MAX_VIRTQUEUES 8
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/*
+ * VirtIO instance variables indices.
+ */
+#define VIRTIO_IVAR_DEVTYPE 1
+#define VIRTIO_IVAR_FEATURE_DESC 2
+
+struct virtio_feature_desc {
+ uint64_t vfd_val;
+ char *vfd_str;
+};
+
+const char *virtio_device_name(uint16_t devid);
+int virtio_get_device_type(device_t dev);
+void virtio_set_feature_desc(device_t dev,
+ struct virtio_feature_desc *feature_desc);
+void virtio_describe(device_t dev, const char *msg,
+ uint64_t features, struct virtio_feature_desc *feature_desc);
+
+/*
+ * VirtIO Bus Methods.
+ */
+uint64_t virtio_negotiate_features(device_t dev, uint64_t child_features);
+int virtio_alloc_virtqueues(device_t dev, int flags, int nvqs,
+ struct vq_alloc_info *info);
+int virtio_setup_intr(device_t dev, enum intr_type type);
+int virtio_with_feature(device_t dev, uint64_t feature);
+void virtio_stop(device_t dev);
+int virtio_reinit(device_t dev, uint64_t features);
+void virtio_reinit_complete(device_t dev);
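+
+/*
+ * Example (sketch only): the usual order in which a device driver uses the
+ * wrappers above during attach. DRIVER_FEATURES, nvqs, and vq_info are
+ * placeholders for driver specific values; the PCI front-end moves the
+ * device to DRIVER_OK once the child attach succeeds.
+ */
+#if 0
+	features = virtio_negotiate_features(dev, DRIVER_FEATURES);
+	error = virtio_alloc_virtqueues(dev, 0, nvqs, vq_info);
+	if (error == 0)
+		error = virtio_setup_intr(dev, INTR_TYPE_MISC);
+#endif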
+
+/*
+ * Read/write a variable amount from the device specific (e.g., network)
+ * configuration region. This region is encoded with the same endianness
+ * as the guest.
+ */
+void virtio_read_device_config(device_t dev, bus_size_t offset,
+ void *dst, int length);
+void virtio_write_device_config(device_t dev, bus_size_t offset,
+ void *src, int length);
+
+/* Inlined device specific read/write functions for common lengths. */
+#define VIRTIO_RDWR_DEVICE_CONFIG(size, type) \
+static inline type \
+__CONCAT(virtio_read_dev_config_,size)(device_t dev, \
+ bus_size_t offset) \
+{ \
+ type val; \
+ virtio_read_device_config(dev, offset, &val, sizeof(type)); \
+ return (val); \
+} \
+ \
+static inline void \
+__CONCAT(virtio_write_dev_config_,size)(device_t dev, \
+ bus_size_t offset, type val) \
+{ \
+ virtio_write_device_config(dev, offset, &val, sizeof(type)); \
+}
+
+VIRTIO_RDWR_DEVICE_CONFIG(1, uint8_t);
+VIRTIO_RDWR_DEVICE_CONFIG(2, uint16_t);
+VIRTIO_RDWR_DEVICE_CONFIG(4, uint32_t);
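+
+/*
+ * Example (sketch only): the macro above generates helpers such as
+ * virtio_read_dev_config_2(), so a network driver could fetch its 16 bit
+ * status word (mirroring struct virtio_net_config) with:
+ *
+ *	status = virtio_read_dev_config_2(dev,
+ *	    offsetof(struct virtio_net_config, status));
+ */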
+
+#endif /* _VIRTIO_H_ */
diff --git a/sys/dev/virtio/virtio_bus_if.m b/sys/dev/virtio/virtio_bus_if.m
new file mode 100644
index 0000000..ec2029d
--- /dev/null
+++ b/sys/dev/virtio/virtio_bus_if.m
@@ -0,0 +1,92 @@
+#-
+# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+INTERFACE virtio_bus;
+
+HEADER {
+struct vq_alloc_info;
+};
+
+METHOD uint64_t negotiate_features {
+ device_t dev;
+ uint64_t child_features;
+};
+
+METHOD int with_feature {
+ device_t dev;
+ uint64_t feature;
+};
+
+METHOD int alloc_virtqueues {
+ device_t dev;
+ int flags;
+ int nvqs;
+ struct vq_alloc_info *info;
+};
+HEADER {
+#define VIRTIO_ALLOC_VQS_DISABLE_MSIX 0x1
+};
+
+METHOD int setup_intr {
+ device_t dev;
+ enum intr_type type;
+};
+
+METHOD void stop {
+ device_t dev;
+};
+
+METHOD int reinit {
+ device_t dev;
+ uint64_t features;
+};
+
+METHOD void reinit_complete {
+ device_t dev;
+};
+
+METHOD void notify_vq {
+ device_t dev;
+ uint16_t queue;
+};
+
+METHOD void read_device_config {
+ device_t dev;
+ bus_size_t offset;
+ void *dst;
+ int len;
+};
+
+METHOD void write_device_config {
+ device_t dev;
+ bus_size_t offset;
+ void *src;
+ int len;
+};
diff --git a/sys/dev/virtio/virtio_if.m b/sys/dev/virtio/virtio_if.m
new file mode 100644
index 0000000..701678c
--- /dev/null
+++ b/sys/dev/virtio/virtio_if.m
@@ -0,0 +1,43 @@
+#-
+# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+
+INTERFACE virtio;
+
+CODE {
+ static int
+ virtio_default_config_change(device_t dev)
+ {
+ /* Return that we've handled the change. */
+ return (1);
+ }
+};
+
+METHOD int config_change {
+ device_t dev;
+} DEFAULT virtio_default_config_change;
diff --git a/sys/dev/virtio/virtio_ring.h b/sys/dev/virtio/virtio_ring.h
new file mode 100644
index 0000000..124cb4d
--- /dev/null
+++ b/sys/dev/virtio/virtio_ring.h
@@ -0,0 +1,119 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions
+ * to implement compatible drivers/servers.
+ *
+ * Copyright Rusty Russell IBM Corporation 2007.
+ */
+/* $FreeBSD$ */
+
+#ifndef VIRTIO_RING_H
+#define VIRTIO_RING_H
+
+#include <sys/types.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT 1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE 2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT 4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer. It's unreliable, so it's simply an
+ * optimization. Guest will still kick if it's out of buffers. */
+#define VRING_USED_F_NO_NOTIFY 1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer. It's unreliable, so it's
+ * simply an optimization. */
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next". */
+struct vring_desc {
+ /* Address (guest-physical). */
+ uint64_t addr;
+ /* Length. */
+ uint32_t len;
+ /* The flags as indicated above. */
+ uint16_t flags;
+ /* We chain unused descriptors via this, too. */
+ uint16_t next;
+};
+
+struct vring_avail {
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[0];
+};
+
+/* uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+ /* Index of start of used descriptor chain. */
+ uint32_t id;
+ /* Total length of the descriptor chain which was written to. */
+ uint32_t len;
+};
+
+struct vring_used {
+ uint16_t flags;
+ uint16_t idx;
+ struct vring_used_elem ring[0];
+};
+
+struct vring {
+ unsigned int num;
+
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this. We assume num is a power of 2.
+ *
+ * struct vring {
+ * // The actual descriptors (16 bytes each)
+ * struct vring_desc desc[num];
+ *
+ * // A ring of available descriptor heads with free-running index.
+ * __u16 avail_flags;
+ * __u16 avail_idx;
+ * __u16 available[num];
+ *
+ * // Padding to the next align boundary.
+ * char pad[];
+ *
+ * // A ring of used descriptor heads with free-running index.
+ * __u16 used_flags;
+ * __u16 used_idx;
+ * struct vring_used_elem used[num];
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
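+/* Total ring size in bytes: the descriptor table and avail ring, padded
+ * up to 'align', followed by the used ring. */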
+static inline int
+vring_size(unsigned int num, unsigned long align)
+{
+ int size;
+
+ size = num * sizeof(struct vring_desc);
+ size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+ size = (size + align - 1) & ~(align - 1);
+ size += sizeof(struct vring_used) +
+ (num * sizeof(struct vring_used_elem));
+ return (size);
+}
+
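+/* Carve the descriptor table, avail ring, and used ring out of the
+ * contiguous buffer 'p'; the used ring starts at the next 'align' boundary. */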
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+ unsigned long align)
+{
+ vr->num = num;
+ vr->desc = (struct vring_desc *) p;
+ vr->avail = (struct vring_avail *) (p +
+ num * sizeof(struct vring_desc));
+ vr->used = (void *)
+ (((unsigned long) &vr->avail->ring[num] + align-1) & ~(align-1));
+}
+#endif /* VIRTIO_RING_H */
diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c
new file mode 100644
index 0000000..1fb182e
--- /dev/null
+++ b/sys/dev/virtio/virtqueue.c
@@ -0,0 +1,755 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Implements the virtqueue interface as basically described
+ * in the original VirtIO paper.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/sglist.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/cpu.h>
+#include <machine/bus.h>
+#include <machine/atomic.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/virtio_ring.h>
+
+#include "virtio_bus_if.h"
+
+struct virtqueue {
+ device_t vq_dev;
+ char vq_name[VIRTQUEUE_MAX_NAME_SZ];
+ uint16_t vq_queue_index;
+ uint16_t vq_nentries;
+ uint32_t vq_flags;
+#define VIRTQUEUE_FLAG_INDIRECT 0x0001
+
+ int vq_alignment;
+ int vq_ring_size;
+ void *vq_ring_mem;
+ int vq_max_indirect_size;
+ int vq_indirect_mem_size;
+ virtqueue_intr_t *vq_intrhand;
+ void *vq_intrhand_arg;
+
+ struct vring vq_ring;
+ uint16_t vq_free_cnt;
+ uint16_t vq_queued_cnt;
+ /*
+ * Head of the free chain in the descriptor table. If
+ * there are no free descriptors, this will be set to
+ * VQ_RING_DESC_CHAIN_END.
+ */
+ uint16_t vq_desc_head_idx;
+ /*
+ * Last consumed descriptor in the used table,
+ * trails vq_ring.used->idx.
+ */
+ uint16_t vq_used_cons_idx;
+
+ struct vq_desc_extra {
+ void *cookie;
+ struct vring_desc *indirect;
+ vm_paddr_t indirect_paddr;
+ uint16_t ndescs;
+ } vq_descx[0];
+};
+
+/*
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+#define VQASSERT(_vq, _exp, _msg, ...) \
+ KASSERT((_exp),("%s: %s - "_msg, __func__, (_vq)->vq_name, \
+ ##__VA_ARGS__))
+
+#define VQ_RING_ASSERT_VALID_IDX(_vq, _idx) \
+ VQASSERT((_vq), (_idx) < (_vq)->vq_nentries, \
+ "invalid ring index: %d, max: %d", (_idx), \
+ (_vq)->vq_nentries)
+
+#define VQ_RING_ASSERT_CHAIN_TERM(_vq) \
+ VQASSERT((_vq), (_vq)->vq_desc_head_idx == \
+ VQ_RING_DESC_CHAIN_END, "full ring terminated " \
+ "incorrectly: head idx: %d", (_vq)->vq_desc_head_idx)
+
+static int virtqueue_init_indirect(struct virtqueue *vq, int);
+static void virtqueue_free_indirect(struct virtqueue *vq);
+static void virtqueue_init_indirect_list(struct virtqueue *,
+ struct vring_desc *);
+
+static void vq_ring_init(struct virtqueue *);
+static void vq_ring_update_avail(struct virtqueue *, uint16_t);
+static uint16_t vq_ring_enqueue_segments(struct virtqueue *,
+ struct vring_desc *, uint16_t, struct sglist *, int, int);
+static int vq_ring_use_indirect(struct virtqueue *, int);
+static void vq_ring_enqueue_indirect(struct virtqueue *, void *,
+ struct sglist *, int, int);
+static void vq_ring_notify_host(struct virtqueue *, int);
+static void vq_ring_free_chain(struct virtqueue *, uint16_t);
+
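+/*
+ * Mask a driver's feature bits down to those below the transport feature
+ * range, plus indirect descriptor support, which is the only ring feature
+ * the virtqueue code itself handles.
+ */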
+uint64_t
+virtqueue_filter_features(uint64_t features)
+{
+ uint64_t mask;
+
+ mask = (1 << VIRTIO_TRANSPORT_F_START) - 1;
+ mask |= VIRTIO_RING_F_INDIRECT_DESC;
+
+ return (features & mask);
+}
+
+int
+virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, int align,
+ vm_paddr_t highaddr, struct vq_alloc_info *info, struct virtqueue **vqp)
+{
+ struct virtqueue *vq;
+ int error;
+
+ *vqp = NULL;
+ error = 0;
+
+ if (size == 0) {
+ device_printf(dev,
+ "virtqueue %d (%s) does not exist (size is zero)\n",
+ queue, info->vqai_name);
+ return (ENODEV);
+ } else if (!powerof2(size)) {
+ device_printf(dev,
+ "virtqueue %d (%s) size is not a power of 2: %d\n",
+ queue, info->vqai_name, size);
+ return (ENXIO);
+ } else if (info->vqai_maxindirsz > VIRTIO_MAX_INDIRECT) {
+ device_printf(dev, "virtqueue %d (%s) requested too many "
+ "indirect descriptors: %d, max %d\n",
+ queue, info->vqai_name, info->vqai_maxindirsz,
+ VIRTIO_MAX_INDIRECT);
+ return (EINVAL);
+ }
+
+ vq = malloc(sizeof(struct virtqueue) +
+ size * sizeof(struct vq_desc_extra), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (vq == NULL) {
+ device_printf(dev, "cannot allocate virtqueue\n");
+ return (ENOMEM);
+ }
+
+ vq->vq_dev = dev;
+ strlcpy(vq->vq_name, info->vqai_name, sizeof(vq->vq_name));
+ vq->vq_queue_index = queue;
+ vq->vq_alignment = align;
+ vq->vq_nentries = size;
+ vq->vq_free_cnt = size;
+ vq->vq_intrhand = info->vqai_intr;
+ vq->vq_intrhand_arg = info->vqai_intr_arg;
+
+ if (info->vqai_maxindirsz > 1) {
+ error = virtqueue_init_indirect(vq, info->vqai_maxindirsz);
+ if (error)
+ goto fail;
+ }
+
+ vq->vq_ring_size = round_page(vring_size(size, align));
+ vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
+ M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ if (vq->vq_ring_mem == NULL) {
+ device_printf(dev,
+ "cannot allocate memory for virtqueue ring\n");
+ error = ENOMEM;
+ goto fail;
+ }
+
+ vq_ring_init(vq);
+ virtqueue_disable_intr(vq);
+
+ *vqp = vq;
+
+fail:
+ if (error)
+ virtqueue_free(vq);
+
+ return (error);
+}
+
+static int
+virtqueue_init_indirect(struct virtqueue *vq, int indirect_size)
+{
+ device_t dev;
+ struct vq_desc_extra *dxp;
+ int i, size;
+
+ dev = vq->vq_dev;
+
+ if (VIRTIO_BUS_WITH_FEATURE(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ /*
+ * Indirect descriptors requested by the driver but not
+ * negotiated. Return zero to keep the initialization
+ * going: we'll run fine without.
+ */
+ if (bootverbose)
+ device_printf(dev, "virtqueue %d (%s) requested "
+ "indirect descriptors but not negotiated\n",
+ vq->vq_queue_index, vq->vq_name);
+ return (0);
+ }
+
+ size = indirect_size * sizeof(struct vring_desc);
+ vq->vq_max_indirect_size = indirect_size;
+ vq->vq_indirect_mem_size = size;
+ vq->vq_flags |= VIRTQUEUE_FLAG_INDIRECT;
+
+ for (i = 0; i < vq->vq_nentries; i++) {
+ dxp = &vq->vq_descx[i];
+
+ dxp->indirect = malloc(size, M_DEVBUF, M_NOWAIT);
+ if (dxp->indirect == NULL) {
+ device_printf(dev, "cannot allocate indirect list\n");
+ return (ENOMEM);
+ }
+
+ dxp->indirect_paddr = vtophys(dxp->indirect);
+ virtqueue_init_indirect_list(vq, dxp->indirect);
+ }
+
+ return (0);
+}
+
+static void
+virtqueue_free_indirect(struct virtqueue *vq)
+{
+ struct vq_desc_extra *dxp;
+ int i;
+
+ for (i = 0; i < vq->vq_nentries; i++) {
+ dxp = &vq->vq_descx[i];
+
+ if (dxp->indirect == NULL)
+ break;
+
+ free(dxp->indirect, M_DEVBUF);
+ dxp->indirect = NULL;
+ dxp->indirect_paddr = 0;
+ }
+
+ vq->vq_flags &= ~VIRTQUEUE_FLAG_INDIRECT;
+ vq->vq_indirect_mem_size = 0;
+}
+
+static void
+virtqueue_init_indirect_list(struct virtqueue *vq,
+ struct vring_desc *indirect)
+{
+ int i;
+
+ bzero(indirect, vq->vq_indirect_mem_size);
+
+ for (i = 0; i < vq->vq_max_indirect_size - 1; i++)
+ indirect[i].next = i + 1;
+ indirect[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
+int
+virtqueue_reinit(struct virtqueue *vq, uint16_t size)
+{
+ struct vq_desc_extra *dxp;
+ int i;
+
+ if (vq->vq_nentries != size) {
+ device_printf(vq->vq_dev,
+ "%s: '%s' changed size; old=%hu, new=%hu\n",
+ __func__, vq->vq_name, vq->vq_nentries, size);
+ return (EINVAL);
+ }
+
+ /* Warn if the virtqueue was not properly cleaned up. */
+ if (vq->vq_free_cnt != vq->vq_nentries) {
+ device_printf(vq->vq_dev,
+ "%s: warning, '%s' virtqueue not empty, "
+ "leaking %d entries\n", __func__, vq->vq_name,
+ vq->vq_nentries - vq->vq_free_cnt);
+ }
+
+ vq->vq_desc_head_idx = 0;
+ vq->vq_used_cons_idx = 0;
+ vq->vq_queued_cnt = 0;
+ vq->vq_free_cnt = vq->vq_nentries;
+
+ /* To be safe, reset all our allocated memory. */
+ bzero(vq->vq_ring_mem, vq->vq_ring_size);
+ for (i = 0; i < vq->vq_nentries; i++) {
+ dxp = &vq->vq_descx[i];
+ dxp->cookie = NULL;
+ dxp->ndescs = 0;
+ if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT)
+ virtqueue_init_indirect_list(vq, dxp->indirect);
+ }
+
+ vq_ring_init(vq);
+ virtqueue_disable_intr(vq);
+
+ return (0);
+}
+
+void
+virtqueue_free(struct virtqueue *vq)
+{
+
+ if (vq->vq_free_cnt != vq->vq_nentries) {
+ device_printf(vq->vq_dev, "%s: freeing non-empty virtqueue, "
+ "leaking %d entries\n", vq->vq_name,
+ vq->vq_nentries - vq->vq_free_cnt);
+ }
+
+ if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT)
+ virtqueue_free_indirect(vq);
+
+ if (vq->vq_ring_mem != NULL) {
+ contigfree(vq->vq_ring_mem, vq->vq_ring_size, M_DEVBUF);
+ vq->vq_ring_size = 0;
+ vq->vq_ring_mem = NULL;
+ }
+
+ free(vq, M_DEVBUF);
+}
+
+vm_paddr_t
+virtqueue_paddr(struct virtqueue *vq)
+{
+
+ return (vtophys(vq->vq_ring_mem));
+}
+
+int
+virtqueue_size(struct virtqueue *vq)
+{
+
+ return (vq->vq_nentries);
+}
+
+int
+virtqueue_empty(struct virtqueue *vq)
+{
+
+ return (vq->vq_nentries == vq->vq_free_cnt);
+}
+
+int
+virtqueue_full(struct virtqueue *vq)
+{
+
+ return (vq->vq_free_cnt == 0);
+}
+
+void
+virtqueue_notify(struct virtqueue *vq)
+{
+
+ vq->vq_queued_cnt = 0;
+ vq_ring_notify_host(vq, 0);
+}
+
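+/*
+ * Return how many used entries the host has posted that we have not yet
+ * dequeued, accounting for the free-running index wrapping at UINT16_MAX.
+ */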
+int
+virtqueue_nused(struct virtqueue *vq)
+{
+ uint16_t used_idx, nused;
+
+ used_idx = vq->vq_ring.used->idx;
+ if (used_idx >= vq->vq_used_cons_idx)
+ nused = used_idx - vq->vq_used_cons_idx;
+ else
+ nused = UINT16_MAX - vq->vq_used_cons_idx +
+ used_idx + 1;
+ VQASSERT(vq, nused <= vq->vq_nentries, "used more than available");
+
+ return (nused);
+}
+
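+/*
+ * Invoke the queue's interrupt handler if one is set and there are
+ * unconsumed used entries; returns 1 if the handler was run.
+ */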
+int
+virtqueue_intr(struct virtqueue *vq)
+{
+
+ if (vq->vq_intrhand == NULL ||
+ vq->vq_used_cons_idx == vq->vq_ring.used->idx)
+ return (0);
+
+ vq->vq_intrhand(vq->vq_intrhand_arg);
+
+ return (1);
+}
+
+int
+virtqueue_enable_intr(struct virtqueue *vq)
+{
+
+ /*
+ * Enable interrupts, making sure we get the latest
+ * index of what's already been consumed.
+ */
+ vq->vq_ring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+
+ mb();
+
+ /*
+ * Additional items may have been consumed in the time since we
+ * last checked and enabled interrupts above. Let our caller know
+ * so it processes the new entries.
+ */
+ if (vq->vq_used_cons_idx != vq->vq_ring.used->idx)
+ return (1);
+
+ return (0);
+}
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+
+ /*
+ * Note this is only considered a hint to the host.
+ */
+ vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+int
+virtqueue_enqueue(struct virtqueue *vq, void *cookie, struct sglist *sg,
+ int readable, int writable)
+{
+ struct vq_desc_extra *dxp;
+ int needed;
+ uint16_t head_idx, idx;
+
+ needed = readable + writable;
+
+ VQASSERT(vq, cookie != NULL, "enqueuing with no cookie");
+ VQASSERT(vq, needed == sg->sg_nseg,
+ "segment count mismatch, %d, %d", needed, sg->sg_nseg);
+ VQASSERT(vq,
+ needed <= vq->vq_nentries || needed <= vq->vq_max_indirect_size,
+ "too many segments to enqueue: %d, %d/%d", needed,
+ vq->vq_nentries, vq->vq_max_indirect_size);
+
+ if (needed < 1)
+ return (EINVAL);
+ if (vq->vq_free_cnt == 0)
+ return (ENOSPC);
+
+ if (vq_ring_use_indirect(vq, needed)) {
+ vq_ring_enqueue_indirect(vq, cookie, sg, readable, writable);
+ return (0);
+ } else if (vq->vq_free_cnt < needed)
+ return (EMSGSIZE);
+
+ head_idx = vq->vq_desc_head_idx;
+ VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
+ dxp = &vq->vq_descx[head_idx];
+
+ VQASSERT(vq, dxp->cookie == NULL,
+ "cookie already exists for index %d", head_idx);
+ dxp->cookie = cookie;
+ dxp->ndescs = needed;
+
+ idx = vq_ring_enqueue_segments(vq, vq->vq_ring.desc, head_idx,
+ sg, readable, writable);
+
+ vq->vq_desc_head_idx = idx;
+ vq->vq_free_cnt -= needed;
+ if (vq->vq_free_cnt == 0)
+ VQ_RING_ASSERT_CHAIN_TERM(vq);
+ else
+ VQ_RING_ASSERT_VALID_IDX(vq, idx);
+
+ vq_ring_update_avail(vq, head_idx);
+
+ return (0);
+}
+
+void *
+virtqueue_dequeue(struct virtqueue *vq, uint32_t *len)
+{
+ struct vring_used_elem *uep;
+ void *cookie;
+ uint16_t used_idx, desc_idx;
+
+ if (vq->vq_used_cons_idx == vq->vq_ring.used->idx)
+ return (NULL);
+
+ used_idx = vq->vq_used_cons_idx++ & (vq->vq_nentries - 1);
+ uep = &vq->vq_ring.used->ring[used_idx];
+
+ mb();
+ desc_idx = (uint16_t) uep->id;
+ if (len != NULL)
+ *len = uep->len;
+
+ vq_ring_free_chain(vq, desc_idx);
+
+ cookie = vq->vq_descx[desc_idx].cookie;
+ VQASSERT(vq, cookie != NULL, "no cookie for index %d", desc_idx);
+ vq->vq_descx[desc_idx].cookie = NULL;
+
+ return (cookie);
+}
+
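+/* Busy-wait until the host returns a used buffer. */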
+void *
+virtqueue_poll(struct virtqueue *vq, uint32_t *len)
+{
+ void *cookie;
+
+ while ((cookie = virtqueue_dequeue(vq, len)) == NULL)
+ cpu_spinwait();
+
+ return (cookie);
+}
+
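+/*
+ * Walk the descriptor array starting at *last and return the next
+ * outstanding cookie, freeing its chain; used when tearing down a queue.
+ */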
+void *
+virtqueue_drain(struct virtqueue *vq, int *last)
+{
+ void *cookie;
+ int idx;
+
+ cookie = NULL;
+ idx = *last;
+
+ while (idx < vq->vq_nentries && cookie == NULL) {
+ if ((cookie = vq->vq_descx[idx].cookie) != NULL) {
+ vq->vq_descx[idx].cookie = NULL;
+ /* Free chain to keep free count consistent. */
+ vq_ring_free_chain(vq, idx);
+ }
+ idx++;
+ }
+
+ *last = idx;
+
+ return (cookie);
+}
+
+void
+virtqueue_dump(struct virtqueue *vq)
+{
+
+ if (vq == NULL)
+ return;
+
+ printf("VQ: %s - size=%d; free=%d; used=%d; queued=%d; "
+ "desc_head_idx=%d; avail.idx=%d; used_cons_idx=%d; "
+ "used.idx=%d; avail.flags=0x%x; used.flags=0x%x\n",
+ vq->vq_name, vq->vq_nentries, vq->vq_free_cnt,
+ virtqueue_nused(vq), vq->vq_queued_cnt, vq->vq_desc_head_idx,
+ vq->vq_ring.avail->idx, vq->vq_used_cons_idx,
+ vq->vq_ring.used->idx, vq->vq_ring.avail->flags,
+ vq->vq_ring.used->flags);
+}
+
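+/* Lay out the vring in the allocated ring memory and link every
+ * descriptor into the initial free chain. */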
+static void
+vq_ring_init(struct virtqueue *vq)
+{
+ struct vring *vr;
+ char *ring_mem;
+ int i, size;
+
+ ring_mem = vq->vq_ring_mem;
+ size = vq->vq_nentries;
+ vr = &vq->vq_ring;
+
+ vring_init(vr, size, ring_mem, vq->vq_alignment);
+
+ for (i = 0; i < size - 1; i++)
+ vr->desc[i].next = i + 1;
+ vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
+static void
+vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx)
+{
+ uint16_t avail_idx;
+
+ /*
+ * Place the head of the descriptor chain into the next slot and make
+ * it usable to the host. The chain is made available now rather than
+ * deferring to virtqueue_notify() in the hopes that if the host is
+ * currently running on another CPU, we can keep it processing the new
+ * descriptor.
+ */
+ avail_idx = vq->vq_ring.avail->idx & (vq->vq_nentries - 1);
+ vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+
+ mb();
+ vq->vq_ring.avail->idx++;
+
+ /* Keep pending count until virtqueue_notify() for debugging. */
+ vq->vq_queued_cnt++;
+}
+
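+/*
+ * Fill a chain of descriptors from the scatter/gather list starting at
+ * head_idx. The first 'readable' segments are device-readable and the
+ * remainder device-writable. Returns the index following the last
+ * descriptor written.
+ */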
+static uint16_t
+vq_ring_enqueue_segments(struct virtqueue *vq, struct vring_desc *desc,
+ uint16_t head_idx, struct sglist *sg, int readable, int writable)
+{
+ struct sglist_seg *seg;
+ struct vring_desc *dp;
+ int i, needed;
+ uint16_t idx;
+
+ needed = readable + writable;
+
+ for (i = 0, idx = head_idx, seg = sg->sg_segs;
+ i < needed;
+ i++, idx = dp->next, seg++) {
+ VQASSERT(vq, idx != VQ_RING_DESC_CHAIN_END,
+ "premature end of free desc chain");
+
+ dp = &desc[idx];
+ dp->addr = seg->ss_paddr;
+ dp->len = seg->ss_len;
+ dp->flags = 0;
+
+ if (i < needed - 1)
+ dp->flags |= VRING_DESC_F_NEXT;
+ if (i >= readable)
+ dp->flags |= VRING_DESC_F_WRITE;
+ }
+
+ return (idx);
+}
+
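+/*
+ * Use an indirect descriptor only when the feature was negotiated, the
+ * chain fits within the preallocated indirect table, and more than one
+ * segment is being enqueued.
+ */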
+static int
+vq_ring_use_indirect(struct virtqueue *vq, int needed)
+{
+
+ if ((vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT) == 0)
+ return (0);
+
+ if (vq->vq_max_indirect_size < needed)
+ return (0);
+
+ if (needed < 2)
+ return (0);
+
+ return (1);
+}
+
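+/*
+ * Enqueue the entire scatter/gather list through a single indirect
+ * descriptor, consuming only one slot in the main descriptor table.
+ */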
+static void
+vq_ring_enqueue_indirect(struct virtqueue *vq, void *cookie,
+ struct sglist *sg, int readable, int writable)
+{
+ struct vring_desc *dp;
+ struct vq_desc_extra *dxp;
+ int needed;
+ uint16_t head_idx;
+
+ needed = readable + writable;
+ VQASSERT(vq, needed <= vq->vq_max_indirect_size,
+ "enqueuing too many indirect descriptors");
+
+ head_idx = vq->vq_desc_head_idx;
+ VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
+ dp = &vq->vq_ring.desc[head_idx];
+ dxp = &vq->vq_descx[head_idx];
+
+ VQASSERT(vq, dxp->cookie == NULL,
+ "cookie already exists for index %d", head_idx);
+ dxp->cookie = cookie;
+ dxp->ndescs = 1;
+
+ dp->addr = dxp->indirect_paddr;
+ dp->len = needed * sizeof(struct vring_desc);
+ dp->flags = VRING_DESC_F_INDIRECT;
+
+ vq_ring_enqueue_segments(vq, dxp->indirect, 0,
+ sg, readable, writable);
+
+ vq->vq_desc_head_idx = dp->next;
+ vq->vq_free_cnt--;
+ if (vq->vq_free_cnt == 0)
+ VQ_RING_ASSERT_CHAIN_TERM(vq);
+ else
+ VQ_RING_ASSERT_VALID_IDX(vq, vq->vq_desc_head_idx);
+
+ vq_ring_update_avail(vq, head_idx);
+}
+
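+/* Kick the host unless it has set VRING_USED_F_NO_NOTIFY, or 'force' is set. */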
+static void
+vq_ring_notify_host(struct virtqueue *vq, int force)
+{
+
+ mb();
+
+ if (force ||
+ (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) == 0)
+ VIRTIO_BUS_NOTIFY_VQ(vq->vq_dev, vq->vq_queue_index);
+}
+
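+/*
+ * Return a descriptor chain to the free list, walking any linked
+ * descriptors, and splice the old free chain onto its tail.
+ */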
+static void
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+ struct vring_desc *dp;
+ struct vq_desc_extra *dxp;
+
+ VQ_RING_ASSERT_VALID_IDX(vq, desc_idx);
+ dp = &vq->vq_ring.desc[desc_idx];
+ dxp = &vq->vq_descx[desc_idx];
+
+ if (vq->vq_free_cnt == 0)
+ VQ_RING_ASSERT_CHAIN_TERM(vq);
+
+ vq->vq_free_cnt += dxp->ndescs;
+ dxp->ndescs--;
+
+ if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
+ while (dp->flags & VRING_DESC_F_NEXT) {
+ VQ_RING_ASSERT_VALID_IDX(vq, dp->next);
+ dp = &vq->vq_ring.desc[dp->next];
+ dxp->ndescs--;
+ }
+ }
+ VQASSERT(vq, dxp->ndescs == 0, "failed to free entire desc chain");
+
+ /*
+ * We must append the existing free chain, if any, to the end of the
+ * newly freed chain. If the virtqueue was completely used, then
+ * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
+ */
+ dp->next = vq->vq_desc_head_idx;
+ vq->vq_desc_head_idx = desc_idx;
+}
diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h
new file mode 100644
index 0000000..e790e65
--- /dev/null
+++ b/sys/dev/virtio/virtqueue.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_VIRTQUEUE_H
+#define _VIRTIO_VIRTQUEUE_H
+
+#include <sys/types.h>
+
+struct virtqueue;
+struct sglist;
+
+/* Support for indirect buffer descriptors. */
+#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
+
+/* The guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ * The host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
+
+/* Device callback for a virtqueue interrupt. */
+typedef int virtqueue_intr_t(void *);
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+/* One for each virtqueue the device wishes to allocate. */
+struct vq_alloc_info {
+ char vqai_name[VIRTQUEUE_MAX_NAME_SZ];
+ int vqai_maxindirsz;
+ virtqueue_intr_t *vqai_intr;
+ void *vqai_intr_arg;
+ struct virtqueue **vqai_vq;
+};
+
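+/*
+ * Initialize a vq_alloc_info entry; an array of these, one per virtqueue
+ * the device requires, is handed to the bus alloc_virtqueues method.
+ */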
+#define VQ_ALLOC_INFO_INIT(_i,_nsegs,_intr,_arg,_vqp,_str,...) do { \
+ snprintf((_i)->vqai_name, VIRTQUEUE_MAX_NAME_SZ, _str, \
+ ##__VA_ARGS__); \
+ (_i)->vqai_maxindirsz = (_nsegs); \
+ (_i)->vqai_intr = (_intr); \
+ (_i)->vqai_intr_arg = (_arg); \
+ (_i)->vqai_vq = (_vqp); \
+} while (0)
+
+uint64_t virtqueue_filter_features(uint64_t features);
+
+int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
+ int align, vm_paddr_t highaddr, struct vq_alloc_info *info,
+ struct virtqueue **vqp);
+void *virtqueue_drain(struct virtqueue *vq, int *last);
+void virtqueue_free(struct virtqueue *vq);
+int virtqueue_reinit(struct virtqueue *vq, uint16_t size);
+
+int virtqueue_intr(struct virtqueue *vq);
+int virtqueue_enable_intr(struct virtqueue *vq);
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/* Get physical address of the virtqueue ring. */
+vm_paddr_t virtqueue_paddr(struct virtqueue *vq);
+
+int virtqueue_full(struct virtqueue *vq);
+int virtqueue_empty(struct virtqueue *vq);
+int virtqueue_size(struct virtqueue *vq);
+int virtqueue_nused(struct virtqueue *vq);
+void virtqueue_notify(struct virtqueue *vq);
+void virtqueue_dump(struct virtqueue *vq);
+
+int virtqueue_enqueue(struct virtqueue *vq, void *cookie,
+ struct sglist *sg, int readable, int writable);
+void *virtqueue_dequeue(struct virtqueue *vq, uint32_t *len);
+void *virtqueue_poll(struct virtqueue *vq, uint32_t *len);
+
+#endif /* _VIRTIO_VIRTQUEUE_H */