path: root/sys/dev/virtio
author      grehan <grehan@FreeBSD.org>    2011-11-18 05:43:43 +0000
committer   grehan <grehan@FreeBSD.org>    2011-11-18 05:43:43 +0000
commit      1a42b19ed0cb934631927da4b71fde34c8afdb34 (patch)
tree        4d3e53fb89135e392dab5a548569b236b9ba213e /sys/dev/virtio
parent      7b8778fe5a226bee40cd88fc68926e898e9bd8f7 (diff)
download    FreeBSD-src-1a42b19ed0cb934631927da4b71fde34c8afdb34.zip
            FreeBSD-src-1a42b19ed0cb934631927da4b71fde34c8afdb34.tar.gz
Import virtio base, PCI front-end, and net/block/balloon drivers.
Tested on Qemu/KVM, VirtualBox, and BHyVe. Currently built as modules-only
on i386/amd64. Man pages not yet hooked up, pending review.

Submitted by:   Bryan Venteicher  bryanv at daemoninthecloset dot org
Reviewed by:    bz
MFC after:      4 weeks or so
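Since the drivers are modules-only for now, a minimal usage sketch follows; it assumes the module build (whose Makefiles live outside sys/dev/virtio) installs the drivers under their conventional file names, which are not part of this diff:

    # Hypothetical usage sketch -- module file names are assumed, not confirmed by this commit.
    # Loading the PCI front-end pulls in the virtio core via its MODULE_DEPEND() entry.
    kldload virtio_pci
    kldload virtio_blk if_vtnet virtio_balloon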
Diffstat (limited to 'sys/dev/virtio')
-rw-r--r--  sys/dev/virtio/balloon/virtio_balloon.c    569
-rw-r--r--  sys/dev/virtio/balloon/virtio_balloon.h     41
-rw-r--r--  sys/dev/virtio/block/virtio_blk.c         1149
-rw-r--r--  sys/dev/virtio/block/virtio_blk.h          106
-rw-r--r--  sys/dev/virtio/network/if_vtnet.c         2746
-rw-r--r--  sys/dev/virtio/network/if_vtnetvar.h       240
-rw-r--r--  sys/dev/virtio/network/virtio_net.h        138
-rw-r--r--  sys/dev/virtio/pci/virtio_pci.c           1081
-rw-r--r--  sys/dev/virtio/pci/virtio_pci.h             64
-rw-r--r--  sys/dev/virtio/virtio.c                    283
-rw-r--r--  sys/dev/virtio/virtio.h                    130
-rw-r--r--  sys/dev/virtio/virtio_bus_if.m              92
-rw-r--r--  sys/dev/virtio/virtio_if.m                  43
-rw-r--r--  sys/dev/virtio/virtio_ring.h               119
-rw-r--r--  sys/dev/virtio/virtqueue.c                 755
-rw-r--r--  sys/dev/virtio/virtqueue.h                  98
16 files changed, 7654 insertions, 0 deletions
diff --git a/sys/dev/virtio/balloon/virtio_balloon.c b/sys/dev/virtio/balloon/virtio_balloon.c
new file mode 100644
index 0000000..ef7aca9
--- /dev/null
+++ b/sys/dev/virtio/balloon/virtio_balloon.c
@@ -0,0 +1,569 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for VirtIO memory balloon devices. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/endian.h>
+#include <sys/kthread.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sglist.h>
+#include <sys/sysctl.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/balloon/virtio_balloon.h>
+
+#include "virtio_if.h"
+
+struct vtballoon_softc {
+ device_t vtballoon_dev;
+ struct mtx vtballoon_mtx;
+ uint64_t vtballoon_features;
+ uint32_t vtballoon_flags;
+#define VTBALLOON_FLAG_DETACH 0x01
+
+ struct virtqueue *vtballoon_inflate_vq;
+ struct virtqueue *vtballoon_deflate_vq;
+
+ uint32_t vtballoon_desired_npages;
+ uint32_t vtballoon_current_npages;
+ TAILQ_HEAD(,vm_page) vtballoon_pages;
+
+ struct proc *vtballoon_kproc;
+ uint32_t *vtballoon_page_frames;
+ int vtballoon_timeout;
+};
+
+static struct virtio_feature_desc vtballoon_feature_desc[] = {
+ { VIRTIO_BALLOON_F_MUST_TELL_HOST, "MustTellHost" },
+ { VIRTIO_BALLOON_F_STATS_VQ, "StatsVq" },
+
+ { 0, NULL }
+};
+
+static int vtballoon_probe(device_t);
+static int vtballoon_attach(device_t);
+static int vtballoon_detach(device_t);
+static int vtballoon_config_change(device_t);
+
+static void vtballoon_negotiate_features(struct vtballoon_softc *);
+static int vtballoon_alloc_virtqueues(struct vtballoon_softc *);
+
+static int vtballoon_vq_intr(void *);
+
+static void vtballoon_inflate(struct vtballoon_softc *, int);
+static void vtballoon_deflate(struct vtballoon_softc *, int);
+
+static void vtballoon_send_page_frames(struct vtballoon_softc *,
+ struct virtqueue *, int);
+
+static void vtballoon_pop(struct vtballoon_softc *);
+static void vtballoon_stop(struct vtballoon_softc *);
+
+static vm_page_t
+ vtballoon_alloc_page(struct vtballoon_softc *);
+static void vtballoon_free_page(struct vtballoon_softc *, vm_page_t);
+
+static int vtballoon_sleep(struct vtballoon_softc *);
+static void vtballoon_thread(void *);
+static void vtballoon_add_sysctl(struct vtballoon_softc *);
+
+/* Features desired/implemented by this driver. */
+#define VTBALLOON_FEATURES 0
+
+/* Timeout between retries when the balloon needs inflating. */
+#define VTBALLOON_LOWMEM_TIMEOUT hz
+
+/*
+ * Maximum number of pages we'll request to inflate or deflate
+ * the balloon in one virtqueue request. Both Linux and NetBSD
+ * have settled on 256, doing up to 1MB at a time.
+ */
+#define VTBALLOON_PAGES_PER_REQUEST 256
+
+#define VTBALLOON_MTX(_sc) &(_sc)->vtballoon_mtx
+#define VTBALLOON_LOCK_INIT(_sc, _name) mtx_init(VTBALLOON_MTX((_sc)), _name, \
+ "VirtIO Balloon Lock", MTX_SPIN)
+#define VTBALLOON_LOCK(_sc) mtx_lock_spin(VTBALLOON_MTX((_sc)))
+#define VTBALLOON_UNLOCK(_sc) mtx_unlock_spin(VTBALLOON_MTX((_sc)))
+#define VTBALLOON_LOCK_DESTROY(_sc) mtx_destroy(VTBALLOON_MTX((_sc)))
+
+static device_method_t vtballoon_methods[] = {
+ /* Device methods. */
+ DEVMETHOD(device_probe, vtballoon_probe),
+ DEVMETHOD(device_attach, vtballoon_attach),
+ DEVMETHOD(device_detach, vtballoon_detach),
+
+ /* VirtIO methods. */
+ DEVMETHOD(virtio_config_change, vtballoon_config_change),
+
+ { 0, 0 }
+};
+
+static driver_t vtballoon_driver = {
+ "vtballoon",
+ vtballoon_methods,
+ sizeof(struct vtballoon_softc)
+};
+static devclass_t vtballoon_devclass;
+
+DRIVER_MODULE(virtio_balloon, virtio_pci, vtballoon_driver,
+ vtballoon_devclass, 0, 0);
+MODULE_VERSION(virtio_balloon, 1);
+MODULE_DEPEND(virtio_balloon, virtio, 1, 1, 1);
+
+static int
+vtballoon_probe(device_t dev)
+{
+
+ if (virtio_get_device_type(dev) != VIRTIO_ID_BALLOON)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Balloon Adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtballoon_attach(device_t dev)
+{
+ struct vtballoon_softc *sc;
+ int error;
+
+ sc = device_get_softc(dev);
+ sc->vtballoon_dev = dev;
+
+ VTBALLOON_LOCK_INIT(sc, device_get_nameunit(dev));
+ TAILQ_INIT(&sc->vtballoon_pages);
+
+ vtballoon_add_sysctl(sc);
+
+ virtio_set_feature_desc(dev, vtballoon_feature_desc);
+ vtballoon_negotiate_features(sc);
+
+ sc->vtballoon_page_frames = malloc(VTBALLOON_PAGES_PER_REQUEST *
+ sizeof(uint32_t), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (sc->vtballoon_page_frames == NULL) {
+ error = ENOMEM;
+ device_printf(dev,
+ "cannot allocate page frame request array\n");
+ goto fail;
+ }
+
+ error = vtballoon_alloc_virtqueues(sc);
+ if (error) {
+ device_printf(dev, "cannot allocate virtqueues\n");
+ goto fail;
+ }
+
+ error = virtio_setup_intr(dev, INTR_TYPE_MISC);
+ if (error) {
+ device_printf(dev, "cannot setup virtqueue interrupts\n");
+ goto fail;
+ }
+
+ error = kproc_create(vtballoon_thread, sc, &sc->vtballoon_kproc,
+ 0, 0, "virtio_balloon");
+ if (error) {
+ device_printf(dev, "cannot create balloon kproc\n");
+ goto fail;
+ }
+
+ virtqueue_enable_intr(sc->vtballoon_inflate_vq);
+ virtqueue_enable_intr(sc->vtballoon_deflate_vq);
+
+fail:
+ if (error)
+ vtballoon_detach(dev);
+
+ return (error);
+}
+
+static int
+vtballoon_detach(device_t dev)
+{
+ struct vtballoon_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtballoon_kproc != NULL) {
+ VTBALLOON_LOCK(sc);
+ sc->vtballoon_flags |= VTBALLOON_FLAG_DETACH;
+ wakeup_one(sc);
+ msleep_spin(sc->vtballoon_kproc, VTBALLOON_MTX(sc),
+ "vtbdth", 0);
+ VTBALLOON_UNLOCK(sc);
+
+ sc->vtballoon_kproc = NULL;
+ }
+
+ if (device_is_attached(dev)) {
+ vtballoon_pop(sc);
+ vtballoon_stop(sc);
+ }
+
+ if (sc->vtballoon_page_frames != NULL) {
+ free(sc->vtballoon_page_frames, M_DEVBUF);
+ sc->vtballoon_page_frames = NULL;
+ }
+
+ VTBALLOON_LOCK_DESTROY(sc);
+
+ return (0);
+}
+
+static int
+vtballoon_config_change(device_t dev)
+{
+ struct vtballoon_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBALLOON_LOCK(sc);
+ wakeup_one(sc);
+ VTBALLOON_UNLOCK(sc);
+
+ return (1);
+}
+
+static void
+vtballoon_negotiate_features(struct vtballoon_softc *sc)
+{
+ device_t dev;
+ uint64_t features;
+
+ dev = sc->vtballoon_dev;
+ features = virtio_negotiate_features(dev, VTBALLOON_FEATURES);
+ sc->vtballoon_features = features;
+}
+
+static int
+vtballoon_alloc_virtqueues(struct vtballoon_softc *sc)
+{
+ device_t dev;
+ struct vq_alloc_info vq_info[2];
+ int nvqs;
+
+ dev = sc->vtballoon_dev;
+ nvqs = 2;
+
+ VQ_ALLOC_INFO_INIT(&vq_info[0], 0, vtballoon_vq_intr, sc,
+ &sc->vtballoon_inflate_vq, "%s inflate", device_get_nameunit(dev));
+
+ VQ_ALLOC_INFO_INIT(&vq_info[1], 0, vtballoon_vq_intr, sc,
+ &sc->vtballoon_deflate_vq, "%s deflate", device_get_nameunit(dev));
+
+ return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info));
+}
+
+static int
+vtballoon_vq_intr(void *xsc)
+{
+ struct vtballoon_softc *sc;
+
+ sc = xsc;
+
+ VTBALLOON_LOCK(sc);
+ wakeup_one(sc);
+ VTBALLOON_UNLOCK(sc);
+
+ return (1);
+}
+
+static void
+vtballoon_inflate(struct vtballoon_softc *sc, int npages)
+{
+ struct virtqueue *vq;
+ vm_page_t m;
+ int i;
+
+ vq = sc->vtballoon_inflate_vq;
+ m = NULL;
+
+ if (npages > VTBALLOON_PAGES_PER_REQUEST)
+ npages = VTBALLOON_PAGES_PER_REQUEST;
+ KASSERT(npages > 0, ("balloon doesn't need inflating?"));
+
+ for (i = 0; i < npages; i++) {
+ if ((m = vtballoon_alloc_page(sc)) == NULL)
+ break;
+
+ sc->vtballoon_page_frames[i] =
+ VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT;
+
+ KASSERT(m->queue == PQ_NONE, ("allocated page on queue"));
+ TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, pageq);
+ }
+
+ if (i > 0)
+ vtballoon_send_page_frames(sc, vq, i);
+
+ if (m == NULL)
+ sc->vtballoon_timeout = VTBALLOON_LOWMEM_TIMEOUT;
+}
+
+static void
+vtballoon_deflate(struct vtballoon_softc *sc, int npages)
+{
+ TAILQ_HEAD(, vm_page) free_pages;
+ struct virtqueue *vq;
+ vm_page_t m;
+ int i;
+
+ vq = sc->vtballoon_deflate_vq;
+ TAILQ_INIT(&free_pages);
+
+ if (npages > VTBALLOON_PAGES_PER_REQUEST)
+ npages = VTBALLOON_PAGES_PER_REQUEST;
+ KASSERT(npages > 0, ("balloon doesn't need deflating?"));
+
+ for (i = 0; i < npages; i++) {
+ m = TAILQ_FIRST(&sc->vtballoon_pages);
+ KASSERT(m != NULL, ("no more pages to deflate"));
+
+ sc->vtballoon_page_frames[i] =
+ VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT;
+
+ TAILQ_REMOVE(&sc->vtballoon_pages, m, pageq);
+ TAILQ_INSERT_TAIL(&free_pages, m, pageq);
+ }
+
+ if (i > 0) {
+ /* Always tell host first before freeing the pages. */
+ vtballoon_send_page_frames(sc, vq, i);
+
+ while ((m = TAILQ_FIRST(&free_pages)) != NULL) {
+ TAILQ_REMOVE(&free_pages, m, pageq);
+ vtballoon_free_page(sc, m);
+ }
+ }
+
+ KASSERT((TAILQ_EMPTY(&sc->vtballoon_pages) &&
+ sc->vtballoon_current_npages == 0) ||
+ (!TAILQ_EMPTY(&sc->vtballoon_pages) &&
+ sc->vtballoon_current_npages != 0), ("balloon empty?"));
+}
+
+static void
+vtballoon_send_page_frames(struct vtballoon_softc *sc, struct virtqueue *vq,
+ int npages)
+{
+ struct sglist sg;
+ struct sglist_seg segs[1];
+ void *c;
+ int error;
+
+ sglist_init(&sg, 1, segs);
+
+ error = sglist_append(&sg, sc->vtballoon_page_frames,
+ npages * sizeof(uint32_t));
+ KASSERT(error == 0, ("error adding page frames to sglist"));
+
+ error = virtqueue_enqueue(vq, vq, &sg, 1, 0);
+ KASSERT(error == 0, ("error enqueuing page frames to virtqueue"));
+
+ /*
+ * Inflate and deflate operations are done synchronously. The
+ * interrupt handler will wake us up.
+ */
+ VTBALLOON_LOCK(sc);
+ virtqueue_notify(vq);
+
+ while ((c = virtqueue_dequeue(vq, NULL)) == NULL)
+ msleep_spin(sc, VTBALLOON_MTX(sc), "vtbspf", 0);
+ VTBALLOON_UNLOCK(sc);
+
+ KASSERT(c == vq, ("unexpected balloon operation response"));
+}
+
+static void
+vtballoon_pop(struct vtballoon_softc *sc)
+{
+
+ while (!TAILQ_EMPTY(&sc->vtballoon_pages))
+ vtballoon_deflate(sc, sc->vtballoon_current_npages);
+}
+
+static void
+vtballoon_stop(struct vtballoon_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtballoon_inflate_vq);
+ virtqueue_disable_intr(sc->vtballoon_deflate_vq);
+
+ virtio_stop(sc->vtballoon_dev);
+}
+
+static vm_page_t
+vtballoon_alloc_page(struct vtballoon_softc *sc)
+{
+ vm_page_t m;
+
+ m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED |
+ VM_ALLOC_NOOBJ);
+ if (m != NULL)
+ sc->vtballoon_current_npages++;
+
+ return (m);
+}
+
+static void
+vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m)
+{
+
+ vm_page_unwire(m, 0);
+ vm_page_free(m);
+ sc->vtballoon_current_npages--;
+}
+
+static uint32_t
+vtballoon_desired_size(struct vtballoon_softc *sc)
+{
+ uint32_t desired;
+
+ desired = virtio_read_dev_config_4(sc->vtballoon_dev,
+ offsetof(struct virtio_balloon_config, num_pages));
+
+ return (le32toh(desired));
+}
+
+static void
+vtballoon_update_size(struct vtballoon_softc *sc)
+{
+
+ virtio_write_dev_config_4(sc->vtballoon_dev,
+ offsetof(struct virtio_balloon_config, actual),
+ htole32(sc->vtballoon_current_npages));
+
+}
+
+static int
+vtballoon_sleep(struct vtballoon_softc *sc)
+{
+ int rc, timeout;
+ uint32_t current, desired;
+
+ rc = 0;
+ current = sc->vtballoon_current_npages;
+
+ VTBALLOON_LOCK(sc);
+ for (;;) {
+ if (sc->vtballoon_flags & VTBALLOON_FLAG_DETACH) {
+ rc = 1;
+ break;
+ }
+
+ desired = vtballoon_desired_size(sc);
+ sc->vtballoon_desired_npages = desired;
+
+ /*
+ * If given, use non-zero timeout on the first time through
+ * the loop. On subsequent times, timeout will be zero so
+ * we will reevaluate the desired size of the balloon and
+ * break out to retry if needed.
+ */
+ timeout = sc->vtballoon_timeout;
+ sc->vtballoon_timeout = 0;
+
+ if (current > desired)
+ break;
+ if (current < desired && timeout == 0)
+ break;
+
+ msleep_spin(sc, VTBALLOON_MTX(sc), "vtbslp", timeout);
+ }
+ VTBALLOON_UNLOCK(sc);
+
+ return (rc);
+}
+
+static void
+vtballoon_thread(void *xsc)
+{
+ struct vtballoon_softc *sc;
+ uint32_t current, desired;
+
+ sc = xsc;
+
+ for (;;) {
+ if (vtballoon_sleep(sc) != 0)
+ break;
+
+ current = sc->vtballoon_current_npages;
+ desired = sc->vtballoon_desired_npages;
+
+ if (desired != current) {
+ if (desired > current)
+ vtballoon_inflate(sc, desired - current);
+ else
+ vtballoon_deflate(sc, current - desired);
+
+ vtballoon_update_size(sc);
+ }
+ }
+
+ kproc_exit(0);
+}
+
+static void
+vtballoon_add_sysctl(struct vtballoon_softc *sc)
+{
+ device_t dev;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *tree;
+ struct sysctl_oid_list *child;
+
+ dev = sc->vtballoon_dev;
+ ctx = device_get_sysctl_ctx(dev);
+ tree = device_get_sysctl_tree(dev);
+ child = SYSCTL_CHILDREN(tree);
+
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "desired",
+ CTLFLAG_RD, &sc->vtballoon_desired_npages, sizeof(uint32_t),
+ "Desired balloon size in pages");
+
+ SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "current",
+ CTLFLAG_RD, &sc->vtballoon_current_npages, sizeof(uint32_t),
+ "Current balloon size in pages");
+}
diff --git a/sys/dev/virtio/balloon/virtio_balloon.h b/sys/dev/virtio/balloon/virtio_balloon.h
new file mode 100644
index 0000000..cea84ba
--- /dev/null
+++ b/sys/dev/virtio/balloon/virtio_balloon.h
@@ -0,0 +1,41 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_BALLOON_H
+#define _VIRTIO_BALLOON_H
+
+#include <sys/types.h>
+
+/* Feature bits. */
+#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0x1 /* Tell before reclaiming pages */
+#define VIRTIO_BALLOON_F_STATS_VQ 0x2 /* Memory stats virtqueue */
+
+/* Size of a PFN in the balloon interface. */
+#define VIRTIO_BALLOON_PFN_SHIFT 12
+
+struct virtio_balloon_config {
+ /* Number of pages host wants Guest to give up. */
+ uint32_t num_pages;
+
+ /* Number of pages we've actually got in balloon. */
+ uint32_t actual;
+};
+
+#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */
+#define VIRTIO_BALLOON_S_SWAP_OUT 1 /* Amount of memory swapped out */
+#define VIRTIO_BALLOON_S_MAJFLT 2 /* Number of major faults */
+#define VIRTIO_BALLOON_S_MINFLT 3 /* Number of minor faults */
+#define VIRTIO_BALLOON_S_MEMFREE 4 /* Total amount of free memory */
+#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */
+#define VIRTIO_BALLOON_S_NR 6
+
+struct virtio_balloon_stat {
+ uint16_t tag;
+ uint64_t val;
+} __packed;
+
+#endif /* _VIRTIO_BALLOON_H */
diff --git a/sys/dev/virtio/block/virtio_blk.c b/sys/dev/virtio/block/virtio_blk.c
new file mode 100644
index 0000000..09783a8
--- /dev/null
+++ b/sys/dev/virtio/block/virtio_blk.c
@@ -0,0 +1,1149 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for VirtIO block devices. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bio.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/taskqueue.h>
+
+#include <geom/geom_disk.h>
+#include <vm/uma.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/block/virtio_blk.h>
+
+#include "virtio_if.h"
+
+struct vtblk_request {
+ struct virtio_blk_outhdr vbr_hdr;
+ struct bio *vbr_bp;
+ uint8_t vbr_ack;
+
+ TAILQ_ENTRY(vtblk_request) vbr_link;
+};
+
+struct vtblk_softc {
+ device_t vtblk_dev;
+ struct mtx vtblk_mtx;
+ uint64_t vtblk_features;
+ uint32_t vtblk_flags;
+#define VTBLK_FLAG_INDIRECT 0x0001
+#define VTBLK_FLAG_READONLY 0x0002
+#define VTBLK_FLAG_DETACHING 0x0004
+#define VTBLK_FLAG_SUSPENDED 0x0008
+#define VTBLK_FLAG_DUMPING 0x0010
+
+ struct virtqueue *vtblk_vq;
+ struct sglist *vtblk_sglist;
+ struct disk *vtblk_disk;
+
+ struct bio_queue_head vtblk_bioq;
+ TAILQ_HEAD(, vtblk_request)
+ vtblk_req_free;
+ TAILQ_HEAD(, vtblk_request)
+ vtblk_req_ready;
+
+ struct taskqueue *vtblk_tq;
+ struct task vtblk_intr_task;
+
+ int vtblk_sector_size;
+ int vtblk_max_nsegs;
+ int vtblk_unit;
+ int vtblk_request_count;
+
+ struct vtblk_request vtblk_dump_request;
+};
+
+static struct virtio_feature_desc vtblk_feature_desc[] = {
+ { VIRTIO_BLK_F_BARRIER, "HostBarrier" },
+ { VIRTIO_BLK_F_SIZE_MAX, "MaxSegSize" },
+ { VIRTIO_BLK_F_SEG_MAX, "MaxNumSegs" },
+ { VIRTIO_BLK_F_GEOMETRY, "DiskGeometry" },
+ { VIRTIO_BLK_F_RO, "ReadOnly" },
+ { VIRTIO_BLK_F_BLK_SIZE, "BlockSize" },
+ { VIRTIO_BLK_F_SCSI, "SCSICmds" },
+ { VIRTIO_BLK_F_FLUSH, "FlushCmd" },
+ { VIRTIO_BLK_F_TOPOLOGY, "Topology" },
+
+ { 0, NULL }
+};
+
+static int vtblk_modevent(module_t, int, void *);
+
+static int vtblk_probe(device_t);
+static int vtblk_attach(device_t);
+static int vtblk_detach(device_t);
+static int vtblk_suspend(device_t);
+static int vtblk_resume(device_t);
+static int vtblk_shutdown(device_t);
+
+static void vtblk_negotiate_features(struct vtblk_softc *);
+static int vtblk_maximum_segments(struct vtblk_softc *,
+ struct virtio_blk_config *);
+static int vtblk_alloc_virtqueue(struct vtblk_softc *);
+static void vtblk_alloc_disk(struct vtblk_softc *,
+ struct virtio_blk_config *);
+static void vtblk_create_disk(struct vtblk_softc *);
+
+static int vtblk_open(struct disk *);
+static int vtblk_close(struct disk *);
+static int vtblk_ioctl(struct disk *, u_long, void *, int,
+ struct thread *);
+static int vtblk_dump(void *, void *, vm_offset_t, off_t, size_t);
+static void vtblk_strategy(struct bio *);
+
+static void vtblk_startio(struct vtblk_softc *);
+static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *);
+static int vtblk_execute_request(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static int vtblk_vq_intr(void *);
+static void vtblk_intr_task(void *, int);
+
+static void vtblk_stop(struct vtblk_softc *);
+
+static void vtblk_get_ident(struct vtblk_softc *);
+static void vtblk_prepare_dump(struct vtblk_softc *);
+static int vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t);
+static int vtblk_flush_dump(struct vtblk_softc *);
+static int vtblk_poll_request(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static void vtblk_drain_vq(struct vtblk_softc *, int);
+static void vtblk_drain(struct vtblk_softc *);
+
+static int vtblk_alloc_requests(struct vtblk_softc *);
+static void vtblk_free_requests(struct vtblk_softc *);
+static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *);
+static void vtblk_enqueue_request(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *);
+static void vtblk_enqueue_ready(struct vtblk_softc *,
+ struct vtblk_request *);
+
+static void vtblk_bio_error(struct bio *, int);
+
+/* Tunables. */
+static int vtblk_no_ident = 0;
+TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident);
+
+/* Features desired/implemented by this driver. */
+#define VTBLK_FEATURES \
+ (VIRTIO_BLK_F_BARRIER | \
+ VIRTIO_BLK_F_SIZE_MAX | \
+ VIRTIO_BLK_F_SEG_MAX | \
+ VIRTIO_BLK_F_GEOMETRY | \
+ VIRTIO_BLK_F_RO | \
+ VIRTIO_BLK_F_BLK_SIZE | \
+ VIRTIO_BLK_F_FLUSH | \
+ VIRTIO_RING_F_INDIRECT_DESC)
+
+#define VTBLK_MTX(_sc) &(_sc)->vtblk_mtx
+#define VTBLK_LOCK_INIT(_sc, _name) \
+ mtx_init(VTBLK_MTX((_sc)), (_name), \
+ "VTBLK Lock", MTX_DEF)
+#define VTBLK_LOCK(_sc) mtx_lock(VTBLK_MTX((_sc)))
+#define VTBLK_TRYLOCK(_sc) mtx_trylock(VTBLK_MTX((_sc)))
+#define VTBLK_UNLOCK(_sc) mtx_unlock(VTBLK_MTX((_sc)))
+#define VTBLK_LOCK_DESTROY(_sc) mtx_destroy(VTBLK_MTX((_sc)))
+#define VTBLK_LOCK_ASSERT(_sc) mtx_assert(VTBLK_MTX((_sc)), MA_OWNED)
+#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \
+ mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED)
+
+#define VTBLK_BIO_SEGMENTS(_bp) sglist_count((_bp)->bio_data, (_bp)->bio_bcount)
+
+#define VTBLK_DISK_NAME "vtbd"
+
+/*
+ * Each block request uses at least two segments - one for the header
+ * and one for the status.
+ */
+#define VTBLK_MIN_SEGMENTS 2
+
+static uma_zone_t vtblk_req_zone;
+
+static device_method_t vtblk_methods[] = {
+ /* Device methods. */
+ DEVMETHOD(device_probe, vtblk_probe),
+ DEVMETHOD(device_attach, vtblk_attach),
+ DEVMETHOD(device_detach, vtblk_detach),
+ DEVMETHOD(device_suspend, vtblk_suspend),
+ DEVMETHOD(device_resume, vtblk_resume),
+ DEVMETHOD(device_shutdown, vtblk_shutdown),
+
+ { 0, 0 }
+};
+
+static driver_t vtblk_driver = {
+ "vtblk",
+ vtblk_methods,
+ sizeof(struct vtblk_softc)
+};
+static devclass_t vtblk_devclass;
+
+DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass,
+ vtblk_modevent, 0);
+MODULE_VERSION(virtio_blk, 1);
+MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1);
+
+static int
+vtblk_modevent(module_t mod, int type, void *unused)
+{
+ int error;
+
+ error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ vtblk_req_zone = uma_zcreate("vtblk_request",
+ sizeof(struct vtblk_request),
+ NULL, NULL, NULL, NULL, 0, 0);
+ break;
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
+ if (uma_zone_get_cur(vtblk_req_zone) > 0)
+ error = EBUSY;
+ else if (type == MOD_UNLOAD) {
+ uma_zdestroy(vtblk_req_zone);
+ vtblk_req_zone = NULL;
+ }
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+vtblk_probe(device_t dev)
+{
+
+ if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Block Adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtblk_attach(device_t dev)
+{
+ struct vtblk_softc *sc;
+ struct virtio_blk_config blkcfg;
+ int error;
+
+ sc = device_get_softc(dev);
+ sc->vtblk_dev = dev;
+ sc->vtblk_unit = device_get_unit(dev);
+
+ VTBLK_LOCK_INIT(sc, device_get_nameunit(dev));
+
+ bioq_init(&sc->vtblk_bioq);
+ TAILQ_INIT(&sc->vtblk_req_free);
+ TAILQ_INIT(&sc->vtblk_req_ready);
+
+ virtio_set_feature_desc(dev, vtblk_feature_desc);
+ vtblk_negotiate_features(sc);
+
+ if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC))
+ sc->vtblk_flags |= VTBLK_FLAG_INDIRECT;
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_RO))
+ sc->vtblk_flags |= VTBLK_FLAG_READONLY;
+
+ /* Get local copy of config. */
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) == 0) {
+ bzero(&blkcfg, sizeof(struct virtio_blk_config));
+ virtio_read_device_config(dev, 0, &blkcfg,
+ offsetof(struct virtio_blk_config, physical_block_exp));
+ } else
+ virtio_read_device_config(dev, 0, &blkcfg,
+ sizeof(struct virtio_blk_config));
+
+ /*
+ * With the current sglist(9) implementation, it is not easy
+ * for us to support a maximum segment size as adjacent
+ * segments are coalesced. For now, just make sure it's larger
+ * than the maximum supported transfer size.
+ */
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) {
+ if (blkcfg.size_max < MAXPHYS) {
+ error = ENOTSUP;
+ device_printf(dev, "host requires unsupported "
+ "maximum segment size feature\n");
+ goto fail;
+ }
+ }
+
+ sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg);
+
+ /*
+ * Allocate working sglist. The number of segments may be too
+ * large to safely store on the stack.
+ */
+ sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT);
+ if (sc->vtblk_sglist == NULL) {
+ error = ENOMEM;
+ device_printf(dev, "cannot allocate sglist\n");
+ goto fail;
+ }
+
+ error = vtblk_alloc_virtqueue(sc);
+ if (error) {
+ device_printf(dev, "cannot allocate virtqueue\n");
+ goto fail;
+ }
+
+ error = vtblk_alloc_requests(sc);
+ if (error) {
+ device_printf(dev, "cannot preallocate requests\n");
+ goto fail;
+ }
+
+ vtblk_alloc_disk(sc, &blkcfg);
+
+ TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc);
+ sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &sc->vtblk_tq);
+ if (sc->vtblk_tq == NULL) {
+ error = ENOMEM;
+ device_printf(dev, "cannot allocate taskqueue\n");
+ goto fail;
+ }
+ taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq",
+ device_get_nameunit(dev));
+
+ error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY);
+ if (error) {
+ device_printf(dev, "cannot setup virtqueue interrupt\n");
+ goto fail;
+ }
+
+ vtblk_create_disk(sc);
+
+ virtqueue_enable_intr(sc->vtblk_vq);
+
+fail:
+ if (error)
+ vtblk_detach(dev);
+
+ return (error);
+}
+
+static int
+vtblk_detach(device_t dev)
+{
+ struct vtblk_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBLK_LOCK(sc);
+ sc->vtblk_flags |= VTBLK_FLAG_DETACHING;
+ if (device_is_attached(dev))
+ vtblk_stop(sc);
+ VTBLK_UNLOCK(sc);
+
+ if (sc->vtblk_tq != NULL) {
+ taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task);
+ taskqueue_free(sc->vtblk_tq);
+ sc->vtblk_tq = NULL;
+ }
+
+ vtblk_drain(sc);
+
+ if (sc->vtblk_disk != NULL) {
+ disk_destroy(sc->vtblk_disk);
+ sc->vtblk_disk = NULL;
+ }
+
+ if (sc->vtblk_sglist != NULL) {
+ sglist_free(sc->vtblk_sglist);
+ sc->vtblk_sglist = NULL;
+ }
+
+ VTBLK_LOCK_DESTROY(sc);
+
+ return (0);
+}
+
+static int
+vtblk_suspend(device_t dev)
+{
+ struct vtblk_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBLK_LOCK(sc);
+ sc->vtblk_flags |= VTBLK_FLAG_SUSPENDED;
+ /* TODO Wait for any inflight IO to complete? */
+ VTBLK_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtblk_resume(device_t dev)
+{
+ struct vtblk_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTBLK_LOCK(sc);
+ sc->vtblk_flags &= ~VTBLK_FLAG_SUSPENDED;
+ /* TODO Resume IO? */
+ VTBLK_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtblk_shutdown(device_t dev)
+{
+
+ return (0);
+}
+
+static int
+vtblk_open(struct disk *dp)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ return (sc->vtblk_flags & VTBLK_FLAG_DETACHING ? ENXIO : 0);
+}
+
+static int
+vtblk_close(struct disk *dp)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ return (0);
+}
+
+static int
+vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag,
+ struct thread *td)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ return (ENOTTY);
+}
+
+static int
+vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset,
+ size_t length)
+{
+ struct disk *dp;
+ struct vtblk_softc *sc;
+ int error;
+
+ dp = arg;
+ error = 0;
+
+ if ((sc = dp->d_drv1) == NULL)
+ return (ENXIO);
+
+ if (VTBLK_TRYLOCK(sc) == 0) {
+ device_printf(sc->vtblk_dev,
+ "softc already locked, cannot dump...\n");
+ return (EBUSY);
+ }
+
+ if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) {
+ vtblk_prepare_dump(sc);
+ sc->vtblk_flags |= VTBLK_FLAG_DUMPING;
+ }
+
+ if (length > 0)
+ error = vtblk_write_dump(sc, virtual, offset, length);
+ else if (virtual == NULL && offset == 0)
+ error = vtblk_flush_dump(sc);
+
+ VTBLK_UNLOCK(sc);
+
+ return (error);
+}
+
+static void
+vtblk_strategy(struct bio *bp)
+{
+ struct vtblk_softc *sc;
+
+ if ((sc = bp->bio_disk->d_drv1) == NULL) {
+ vtblk_bio_error(bp, EINVAL);
+ return;
+ }
+
+ /*
+ * Fail any write if RO. Unfortunately, there does not seem to
+ * be a better way to report our readonly'ness to GEOM above.
+ */
+ if (sc->vtblk_flags & VTBLK_FLAG_READONLY &&
+ (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) {
+ vtblk_bio_error(bp, EROFS);
+ return;
+ }
+
+ /*
+ * Prevent read/write buffers spanning too many segments from
+ * getting into the queue. This should only trip if d_maxsize
+ * was incorrectly set.
+ */
+ if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
+ KASSERT(VTBLK_BIO_SEGMENTS(bp) <= sc->vtblk_max_nsegs -
+ VTBLK_MIN_SEGMENTS,
+ ("bio spanned too many segments: %d, max: %d",
+ VTBLK_BIO_SEGMENTS(bp),
+ sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS));
+ }
+
+ VTBLK_LOCK(sc);
+ if ((sc->vtblk_flags & VTBLK_FLAG_DETACHING) == 0) {
+ bioq_disksort(&sc->vtblk_bioq, bp);
+ vtblk_startio(sc);
+ } else
+ vtblk_bio_error(bp, ENXIO);
+ VTBLK_UNLOCK(sc);
+}
+
+static void
+vtblk_negotiate_features(struct vtblk_softc *sc)
+{
+ device_t dev;
+ uint64_t features;
+
+ dev = sc->vtblk_dev;
+ features = VTBLK_FEATURES;
+
+ sc->vtblk_features = virtio_negotiate_features(dev, features);
+}
+
+static int
+vtblk_maximum_segments(struct vtblk_softc *sc,
+ struct virtio_blk_config *blkcfg)
+{
+ device_t dev;
+ int nsegs;
+
+ dev = sc->vtblk_dev;
+ nsegs = VTBLK_MIN_SEGMENTS;
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) {
+ nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1);
+ if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT)
+ nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT);
+ } else
+ nsegs += 1;
+
+ return (nsegs);
+}
+
+static int
+vtblk_alloc_virtqueue(struct vtblk_softc *sc)
+{
+ device_t dev;
+ struct vq_alloc_info vq_info;
+
+ dev = sc->vtblk_dev;
+
+ VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs,
+ vtblk_vq_intr, sc, &sc->vtblk_vq,
+ "%s request", device_get_nameunit(dev));
+
+ return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
+}
+
+static void
+vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg)
+{
+ device_t dev;
+ struct disk *dp;
+
+ dev = sc->vtblk_dev;
+
+ sc->vtblk_disk = dp = disk_alloc();
+ dp->d_open = vtblk_open;
+ dp->d_close = vtblk_close;
+ dp->d_ioctl = vtblk_ioctl;
+ dp->d_strategy = vtblk_strategy;
+ dp->d_name = VTBLK_DISK_NAME;
+ dp->d_unit = sc->vtblk_unit;
+ dp->d_drv1 = sc;
+
+ if ((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0)
+ dp->d_dump = vtblk_dump;
+
+ /* Capacity is always in 512-byte units. */
+ dp->d_mediasize = blkcfg->capacity * 512;
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE))
+ sc->vtblk_sector_size = blkcfg->blk_size;
+ else
+ sc->vtblk_sector_size = 512;
+ dp->d_sectorsize = sc->vtblk_sector_size;
+
+ /*
+ * The VirtIO maximum I/O size is given in terms of segments.
+ * However, FreeBSD limits I/O size by logical buffer size, not
+ * by physically contiguous pages. Therefore, we have to assume
+ * no pages are contiguous. This may impose an artificially low
+ * maximum I/O size. But in practice, since QEMU advertises 128
+ * segments, this gives us a maximum IO size of 125 * PAGE_SIZE,
+ * which is typically greater than MAXPHYS. Eventually we should
+ * just advertise MAXPHYS and split buffers that are too big.
+ *
+ * Note we must subtract one additional segment in case of non
+ * page aligned buffers.
+ */
+ dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) *
+ PAGE_SIZE;
+ if (dp->d_maxsize < PAGE_SIZE)
+ dp->d_maxsize = PAGE_SIZE; /* XXX */
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) {
+ dp->d_fwsectors = blkcfg->geometry.sectors;
+ dp->d_fwheads = blkcfg->geometry.heads;
+ }
+
+ if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH))
+ dp->d_flags |= DISKFLAG_CANFLUSHCACHE;
+}
+
+static void
+vtblk_create_disk(struct vtblk_softc *sc)
+{
+ struct disk *dp;
+
+ dp = sc->vtblk_disk;
+
+ /*
+ * Retrieving the identification string must be done after
+ * the virtqueue interrupt is setup otherwise it will hang.
+ */
+ vtblk_get_ident(sc);
+
+ device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n",
+ (uintmax_t) dp->d_mediasize >> 20,
+ (uintmax_t) dp->d_mediasize / dp->d_sectorsize,
+ dp->d_sectorsize);
+
+ disk_create(dp, DISK_VERSION);
+}
+
+static void
+vtblk_startio(struct vtblk_softc *sc)
+{
+ struct virtqueue *vq;
+ struct vtblk_request *req;
+ int enq;
+
+ vq = sc->vtblk_vq;
+ enq = 0;
+
+ VTBLK_LOCK_ASSERT(sc);
+
+ if (sc->vtblk_flags & VTBLK_FLAG_SUSPENDED)
+ return;
+
+ while (!virtqueue_full(vq)) {
+ if ((req = vtblk_dequeue_ready(sc)) == NULL)
+ req = vtblk_bio_request(sc);
+ if (req == NULL)
+ break;
+
+ if (vtblk_execute_request(sc, req) != 0) {
+ vtblk_enqueue_ready(sc, req);
+ break;
+ }
+
+ enq++;
+ }
+
+ if (enq > 0)
+ virtqueue_notify(vq);
+}
+
+static struct vtblk_request *
+vtblk_bio_request(struct vtblk_softc *sc)
+{
+ struct bio_queue_head *bioq;
+ struct vtblk_request *req;
+ struct bio *bp;
+
+ bioq = &sc->vtblk_bioq;
+
+ if (bioq_first(bioq) == NULL)
+ return (NULL);
+
+ req = vtblk_dequeue_request(sc);
+ if (req == NULL)
+ return (NULL);
+
+ bp = bioq_takefirst(bioq);
+ req->vbr_bp = bp;
+ req->vbr_ack = -1;
+ req->vbr_hdr.ioprio = 1;
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
+ break;
+ case BIO_READ:
+ req->vbr_hdr.type = VIRTIO_BLK_T_IN;
+ req->vbr_hdr.sector = bp->bio_offset / 512;
+ break;
+ case BIO_WRITE:
+ req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
+ req->vbr_hdr.sector = bp->bio_offset / 512;
+ break;
+ default:
+ KASSERT(0, ("bio with unhandled cmd: %d", bp->bio_cmd));
+ req->vbr_hdr.type = -1;
+ break;
+ }
+
+ if (bp->bio_flags & BIO_ORDERED)
+ req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER;
+
+ return (req);
+}
+
+static int
+vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+ struct sglist *sg;
+ struct bio *bp;
+ int writable, error;
+
+ sg = sc->vtblk_sglist;
+ bp = req->vbr_bp;
+ writable = 0;
+
+ VTBLK_LOCK_ASSERT(sc);
+
+ sglist_reset(sg);
+ error = sglist_append(sg, &req->vbr_hdr,
+ sizeof(struct virtio_blk_outhdr));
+ KASSERT(error == 0, ("error adding header to sglist"));
+ KASSERT(sg->sg_nseg == 1,
+ ("header spanned multiple segments: %d", sg->sg_nseg));
+
+ if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) {
+ error = sglist_append(sg, bp->bio_data, bp->bio_bcount);
+ KASSERT(error == 0, ("error adding buffer to sglist"));
+
+ /* BIO_READ means the host writes into our buffer. */
+ if (bp->bio_cmd == BIO_READ)
+ writable += sg->sg_nseg - 1;
+ }
+
+ error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t));
+ KASSERT(error == 0, ("error adding ack to sglist"));
+ writable++;
+
+ KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS,
+ ("fewer than min segments: %d", sg->sg_nseg));
+
+ error = virtqueue_enqueue(sc->vtblk_vq, req, sg,
+ sg->sg_nseg - writable, writable);
+
+ return (error);
+}
+
+static int
+vtblk_vq_intr(void *xsc)
+{
+ struct vtblk_softc *sc;
+
+ sc = xsc;
+
+ virtqueue_disable_intr(sc->vtblk_vq);
+ taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task);
+
+ return (1);
+}
+
+static void
+vtblk_intr_task(void *arg, int pending)
+{
+ struct vtblk_softc *sc;
+ struct vtblk_request *req;
+ struct virtqueue *vq;
+ struct bio *bp;
+
+ sc = arg;
+ vq = sc->vtblk_vq;
+
+ VTBLK_LOCK(sc);
+ if (sc->vtblk_flags & VTBLK_FLAG_DETACHING) {
+ VTBLK_UNLOCK(sc);
+ return;
+ }
+
+ while ((req = virtqueue_dequeue(vq, NULL)) != NULL) {
+ bp = req->vbr_bp;
+
+ if (req->vbr_ack == VIRTIO_BLK_S_OK)
+ bp->bio_resid = 0;
+ else {
+ bp->bio_flags |= BIO_ERROR;
+ if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP)
+ bp->bio_error = ENOTSUP;
+ else
+ bp->bio_error = EIO;
+ }
+
+ biodone(bp);
+ vtblk_enqueue_request(sc, req);
+ }
+
+ vtblk_startio(sc);
+
+ if (virtqueue_enable_intr(vq) != 0) {
+ virtqueue_disable_intr(vq);
+ VTBLK_UNLOCK(sc);
+ taskqueue_enqueue_fast(sc->vtblk_tq,
+ &sc->vtblk_intr_task);
+ return;
+ }
+
+ VTBLK_UNLOCK(sc);
+}
+
+static void
+vtblk_stop(struct vtblk_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtblk_vq);
+ virtio_stop(sc->vtblk_dev);
+}
+
+static void
+vtblk_get_ident(struct vtblk_softc *sc)
+{
+ struct bio buf;
+ struct disk *dp;
+ struct vtblk_request *req;
+ int len, error;
+
+ dp = sc->vtblk_disk;
+ len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE);
+
+ if (vtblk_no_ident != 0)
+ return;
+
+ req = vtblk_dequeue_request(sc);
+ if (req == NULL)
+ return;
+
+ req->vbr_ack = -1;
+ req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID;
+ req->vbr_hdr.ioprio = 1;
+ req->vbr_hdr.sector = 0;
+
+ req->vbr_bp = &buf;
+ bzero(&buf, sizeof(struct bio));
+
+ buf.bio_cmd = BIO_READ;
+ buf.bio_data = dp->d_ident;
+ buf.bio_bcount = len;
+
+ VTBLK_LOCK(sc);
+ error = vtblk_poll_request(sc, req);
+ vtblk_enqueue_request(sc, req);
+ VTBLK_UNLOCK(sc);
+
+ if (error) {
+ device_printf(sc->vtblk_dev,
+ "error getting device identifier: %d\n", error);
+ }
+}
+
+static void
+vtblk_prepare_dump(struct vtblk_softc *sc)
+{
+ device_t dev;
+ struct virtqueue *vq;
+
+ dev = sc->vtblk_dev;
+ vq = sc->vtblk_vq;
+
+ vtblk_stop(sc);
+
+ /*
+ * Drain all requests caught in-flight in the virtqueue,
+ * skipping biodone(). When dumping, only one request is
+ * outstanding at a time, and we just poll the virtqueue
+ * for the response.
+ */
+ vtblk_drain_vq(sc, 1);
+
+ if (virtio_reinit(dev, sc->vtblk_features) != 0)
+ panic("cannot reinit VirtIO block device during dump");
+
+ virtqueue_disable_intr(vq);
+ virtio_reinit_complete(dev);
+}
+
+static int
+vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset,
+ size_t length)
+{
+ struct bio buf;
+ struct vtblk_request *req;
+
+ req = &sc->vtblk_dump_request;
+ req->vbr_ack = -1;
+ req->vbr_hdr.type = VIRTIO_BLK_T_OUT;
+ req->vbr_hdr.ioprio = 1;
+ req->vbr_hdr.sector = offset / 512;
+
+ req->vbr_bp = &buf;
+ bzero(&buf, sizeof(struct bio));
+
+ buf.bio_cmd = BIO_WRITE;
+ buf.bio_data = virtual;
+ buf.bio_bcount = length;
+
+ return (vtblk_poll_request(sc, req));
+}
+
+static int
+vtblk_flush_dump(struct vtblk_softc *sc)
+{
+ struct bio buf;
+ struct vtblk_request *req;
+
+ req = &sc->vtblk_dump_request;
+ req->vbr_ack = -1;
+ req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH;
+ req->vbr_hdr.ioprio = 1;
+ req->vbr_hdr.sector = 0;
+
+ req->vbr_bp = &buf;
+ bzero(&buf, sizeof(struct bio));
+
+ buf.bio_cmd = BIO_FLUSH;
+
+ return (vtblk_poll_request(sc, req));
+}
+
+static int
+vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+ device_t dev;
+ struct virtqueue *vq;
+ struct vtblk_request *r;
+ int error;
+
+ dev = sc->vtblk_dev;
+ vq = sc->vtblk_vq;
+
+ if (!virtqueue_empty(vq))
+ return (EBUSY);
+
+ error = vtblk_execute_request(sc, req);
+ if (error)
+ return (error);
+
+ virtqueue_notify(vq);
+
+ r = virtqueue_poll(vq, NULL);
+ KASSERT(r == req, ("unexpected request response"));
+
+ if (req->vbr_ack != VIRTIO_BLK_S_OK) {
+ error = req->vbr_ack == VIRTIO_BLK_S_UNSUPP ? ENOTSUP : EIO;
+ if (bootverbose)
+ device_printf(dev,
+ "vtblk_poll_request: IO error: %d\n", error);
+ }
+
+ return (error);
+}
+
+static void
+vtblk_drain_vq(struct vtblk_softc *sc, int skip_done)
+{
+ struct virtqueue *vq;
+ struct vtblk_request *req;
+ int last;
+
+ vq = sc->vtblk_vq;
+ last = 0;
+
+ while ((req = virtqueue_drain(vq, &last)) != NULL) {
+ if (!skip_done)
+ vtblk_bio_error(req->vbr_bp, ENXIO);
+
+ vtblk_enqueue_request(sc, req);
+ }
+
+ KASSERT(virtqueue_empty(vq), ("virtqueue not empty"));
+}
+
+static void
+vtblk_drain(struct vtblk_softc *sc)
+{
+ struct bio_queue_head *bioq;
+ struct vtblk_request *req;
+ struct bio *bp;
+
+ bioq = &sc->vtblk_bioq;
+
+ if (sc->vtblk_vq != NULL)
+ vtblk_drain_vq(sc, 0);
+
+ while ((req = vtblk_dequeue_ready(sc)) != NULL) {
+ vtblk_bio_error(req->vbr_bp, ENXIO);
+ vtblk_enqueue_request(sc, req);
+ }
+
+ while (bioq_first(bioq) != NULL) {
+ bp = bioq_takefirst(bioq);
+ vtblk_bio_error(bp, ENXIO);
+ }
+
+ vtblk_free_requests(sc);
+}
+
+static int
+vtblk_alloc_requests(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+ int i, size;
+
+ size = virtqueue_size(sc->vtblk_vq);
+
+ /*
+ * Preallocate sufficient requests to keep the virtqueue full. Each
+ * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce
+ * the number allocated when indirect descriptors are not available.
+ */
+ if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0)
+ size /= VTBLK_MIN_SEGMENTS;
+
+ for (i = 0; i < size; i++) {
+ req = uma_zalloc(vtblk_req_zone, M_NOWAIT);
+ if (req == NULL)
+ return (ENOMEM);
+
+ sc->vtblk_request_count++;
+ vtblk_enqueue_request(sc, req);
+ }
+
+ return (0);
+}
+
+static void
+vtblk_free_requests(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+
+ while ((req = vtblk_dequeue_request(sc)) != NULL) {
+ sc->vtblk_request_count--;
+ uma_zfree(vtblk_req_zone, req);
+ }
+
+ KASSERT(sc->vtblk_request_count == 0, ("leaked requests"));
+}
+
+static struct vtblk_request *
+vtblk_dequeue_request(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+
+ req = TAILQ_FIRST(&sc->vtblk_req_free);
+ if (req != NULL)
+ TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link);
+
+ return (req);
+}
+
+static void
+vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+
+ bzero(req, sizeof(struct vtblk_request));
+ TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link);
+}
+
+static struct vtblk_request *
+vtblk_dequeue_ready(struct vtblk_softc *sc)
+{
+ struct vtblk_request *req;
+
+ req = TAILQ_FIRST(&sc->vtblk_req_ready);
+ if (req != NULL)
+ TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link);
+
+ return (req);
+}
+
+static void
+vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req)
+{
+
+ TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link);
+}
+
+static void
+vtblk_bio_error(struct bio *bp, int error)
+{
+
+ biofinish(bp, NULL, error);
+}
diff --git a/sys/dev/virtio/block/virtio_blk.h b/sys/dev/virtio/block/virtio_blk.h
new file mode 100644
index 0000000..4fb32e0
--- /dev/null
+++ b/sys/dev/virtio/block/virtio_blk.h
@@ -0,0 +1,106 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_BLK_H
+#define _VIRTIO_BLK_H
+
+#include <sys/types.h>
+
+/* Feature bits */
+#define VIRTIO_BLK_F_BARRIER 0x0001 /* Does host support barriers? */
+#define VIRTIO_BLK_F_SIZE_MAX 0x0002 /* Indicates maximum segment size */
+#define VIRTIO_BLK_F_SEG_MAX 0x0004 /* Indicates maximum # of segments */
+#define VIRTIO_BLK_F_GEOMETRY 0x0010 /* Legacy geometry available */
+#define VIRTIO_BLK_F_RO 0x0020 /* Disk is read-only */
+#define VIRTIO_BLK_F_BLK_SIZE 0x0040 /* Block size of disk is available*/
+#define VIRTIO_BLK_F_SCSI 0x0080 /* Supports scsi command passthru */
+#define VIRTIO_BLK_F_FLUSH 0x0200 /* Cache flush command support */
+#define VIRTIO_BLK_F_TOPOLOGY 0x0400 /* Topology information is available */
+
+#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */
+
+struct virtio_blk_config {
+ /* The capacity (in 512-byte sectors). */
+ uint64_t capacity;
+ /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */
+ uint32_t size_max;
+ /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */
+ uint32_t seg_max;
+ /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */
+ struct virtio_blk_geometry {
+ uint16_t cylinders;
+ uint8_t heads;
+ uint8_t sectors;
+ } geometry;
+
+ /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
+ uint32_t blk_size;
+
+ /* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */
+ /* exponent for physical block per logical block. */
+ uint8_t physical_block_exp;
+ /* alignment offset in logical blocks. */
+ uint8_t alignment_offset;
+ /* minimum I/O size without performance penalty in logical blocks. */
+ uint16_t min_io_size;
+ /* optimal sustained I/O size in logical blocks. */
+ uint32_t opt_io_size;
+} __packed;
+
+/*
+ * Command types
+ *
+ * Usage is a bit tricky as some bits are used as flags and some are not.
+ *
+ * Rules:
+ * VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or
+ * VIRTIO_BLK_T_BARRIER. VIRTIO_BLK_T_FLUSH is a command of its own
+ * and may not be combined with any of the other flags.
+ */
+
+/* These two define direction. */
+#define VIRTIO_BLK_T_IN 0
+#define VIRTIO_BLK_T_OUT 1
+
+/* This bit says it's a scsi command, not an actual read or write. */
+#define VIRTIO_BLK_T_SCSI_CMD 2
+
+/* Cache flush command */
+#define VIRTIO_BLK_T_FLUSH 4
+
+/* Get device ID command */
+#define VIRTIO_BLK_T_GET_ID 8
+
+/* Barrier before this op. */
+#define VIRTIO_BLK_T_BARRIER 0x80000000
+
+/* ID string length */
+#define VIRTIO_BLK_ID_BYTES 20
+
+/* This is the first element of the read scatter-gather list. */
+struct virtio_blk_outhdr {
+ /* VIRTIO_BLK_T* */
+ uint32_t type;
+ /* io priority. */
+ uint32_t ioprio;
+ /* Sector (ie. 512 byte offset) */
+ uint64_t sector;
+};
+
+struct virtio_scsi_inhdr {
+ uint32_t errors;
+ uint32_t data_len;
+ uint32_t sense_len;
+ uint32_t residual;
+};
+
+/* And this is the final byte of the write scatter-gather list. */
+#define VIRTIO_BLK_S_OK 0
+#define VIRTIO_BLK_S_IOERR 1
+#define VIRTIO_BLK_S_UNSUPP 2
+
+#endif /* _VIRTIO_BLK_H */
diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c
new file mode 100644
index 0000000..22becb1
--- /dev/null
+++ b/sys/dev/virtio/network/if_vtnet.c
@@ -0,0 +1,2746 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for VirtIO network devices. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#ifdef HAVE_KERNEL_OPTION_HEADERS
+#include "opt_device_polling.h"
+#endif
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sockio.h>
+#include <sys/mbuf.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/random.h>
+#include <sys/sglist.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+
+#include <vm/uma.h>
+
+#include <net/ethernet.h>
+#include <net/if.h>
+#include <net/if_arp.h>
+#include <net/if_dl.h>
+#include <net/if_types.h>
+#include <net/if_media.h>
+#include <net/if_vlan_var.h>
+
+#include <net/bpf.h>
+
+#include <netinet/in_systm.h>
+#include <netinet/in.h>
+#include <netinet/ip.h>
+#include <netinet/ip6.h>
+#include <netinet/udp.h>
+#include <netinet/tcp.h>
+#include <netinet/sctp.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/network/virtio_net.h>
+#include <dev/virtio/network/if_vtnetvar.h>
+
+#include "virtio_if.h"
+
+static int vtnet_modevent(module_t, int, void *);
+
+static int vtnet_probe(device_t);
+static int vtnet_attach(device_t);
+static int vtnet_detach(device_t);
+static int vtnet_suspend(device_t);
+static int vtnet_resume(device_t);
+static int vtnet_shutdown(device_t);
+static int vtnet_config_change(device_t);
+
+static void vtnet_negotiate_features(struct vtnet_softc *);
+static int vtnet_alloc_virtqueues(struct vtnet_softc *);
+static void vtnet_get_hwaddr(struct vtnet_softc *);
+static void vtnet_set_hwaddr(struct vtnet_softc *);
+static int vtnet_is_link_up(struct vtnet_softc *);
+static void vtnet_update_link_status(struct vtnet_softc *);
+static void vtnet_watchdog(struct vtnet_softc *);
+static void vtnet_config_change_task(void *, int);
+static int vtnet_change_mtu(struct vtnet_softc *, int);
+static int vtnet_ioctl(struct ifnet *, u_long, caddr_t);
+
+static int vtnet_init_rx_vq(struct vtnet_softc *);
+static void vtnet_free_rx_mbufs(struct vtnet_softc *);
+static void vtnet_free_tx_mbufs(struct vtnet_softc *);
+static void vtnet_free_ctrl_vq(struct vtnet_softc *);
+
+#ifdef DEVICE_POLLING
+static poll_handler_t vtnet_poll;
+#endif
+
+static struct mbuf * vtnet_alloc_rxbuf(struct vtnet_softc *, int,
+ struct mbuf **);
+static int vtnet_replace_rxbuf(struct vtnet_softc *,
+ struct mbuf *, int);
+static int vtnet_newbuf(struct vtnet_softc *);
+static void vtnet_discard_merged_rxbuf(struct vtnet_softc *, int);
+static void vtnet_discard_rxbuf(struct vtnet_softc *, struct mbuf *);
+static int vtnet_enqueue_rxbuf(struct vtnet_softc *, struct mbuf *);
+static void vtnet_vlan_tag_remove(struct mbuf *);
+static int vtnet_rx_csum(struct vtnet_softc *, struct mbuf *,
+ struct virtio_net_hdr *);
+static int vtnet_rxeof_merged(struct vtnet_softc *, struct mbuf *, int);
+static int vtnet_rxeof(struct vtnet_softc *, int, int *);
+static void vtnet_rx_intr_task(void *, int);
+static int vtnet_rx_vq_intr(void *);
+
+static void vtnet_txeof(struct vtnet_softc *);
+static struct mbuf * vtnet_tx_offload(struct vtnet_softc *, struct mbuf *,
+ struct virtio_net_hdr *);
+static int vtnet_enqueue_txbuf(struct vtnet_softc *, struct mbuf **,
+ struct vtnet_tx_header *);
+static int vtnet_encap(struct vtnet_softc *, struct mbuf **);
+static void vtnet_start_locked(struct ifnet *);
+static void vtnet_start(struct ifnet *);
+static void vtnet_tick(void *);
+static void vtnet_tx_intr_task(void *, int);
+static int vtnet_tx_vq_intr(void *);
+
+static void vtnet_stop(struct vtnet_softc *);
+static int vtnet_reinit(struct vtnet_softc *);
+static void vtnet_init_locked(struct vtnet_softc *);
+static void vtnet_init(void *);
+
+static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *,
+ struct sglist *, int, int);
+
+static void vtnet_rx_filter(struct vtnet_softc *sc);
+static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int);
+static int vtnet_set_promisc(struct vtnet_softc *, int);
+static int vtnet_set_allmulti(struct vtnet_softc *, int);
+static void vtnet_rx_filter_mac(struct vtnet_softc *);
+
+static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t);
+static void vtnet_rx_filter_vlan(struct vtnet_softc *);
+static void vtnet_set_vlan_filter(struct vtnet_softc *, int, uint16_t);
+static void vtnet_register_vlan(void *, struct ifnet *, uint16_t);
+static void vtnet_unregister_vlan(void *, struct ifnet *, uint16_t);
+
+static int vtnet_ifmedia_upd(struct ifnet *);
+static void vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *);
+
+static void vtnet_add_statistics(struct vtnet_softc *);
+
+static int vtnet_enable_rx_intr(struct vtnet_softc *);
+static int vtnet_enable_tx_intr(struct vtnet_softc *);
+static void vtnet_disable_rx_intr(struct vtnet_softc *);
+static void vtnet_disable_tx_intr(struct vtnet_softc *);
+
+/* Tunables. */
+static int vtnet_csum_disable = 0;
+TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable);
+static int vtnet_tso_disable = 0;
+TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable);
+static int vtnet_lro_disable = 0;
+TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable);
+
+/*
+ * Reducing the number of transmit completed interrupts can
+ * improve performance. To do so, the define below keeps the
+ * Tx vq interrupt disabled and adds calls to vtnet_txeof()
+ * in the start and watchdog paths. The price to pay for this
+ * is the m_free'ing of transmitted mbufs may be delayed until
+ * the watchdog fires.
+ */
+#define VTNET_TX_INTR_MODERATION
+
+static uma_zone_t vtnet_tx_header_zone;
+
+static struct virtio_feature_desc vtnet_feature_desc[] = {
+ { VIRTIO_NET_F_CSUM, "TxChecksum" },
+ { VIRTIO_NET_F_GUEST_CSUM, "RxChecksum" },
+ { VIRTIO_NET_F_MAC, "MacAddress" },
+ { VIRTIO_NET_F_GSO, "TxAllGSO" },
+ { VIRTIO_NET_F_GUEST_TSO4, "RxTSOv4" },
+ { VIRTIO_NET_F_GUEST_TSO6, "RxTSOv6" },
+ { VIRTIO_NET_F_GUEST_ECN, "RxECN" },
+ { VIRTIO_NET_F_GUEST_UFO, "RxUFO" },
+ { VIRTIO_NET_F_HOST_TSO4, "TxTSOv4" },
+ { VIRTIO_NET_F_HOST_TSO6, "TxTSOv6" },
+ { VIRTIO_NET_F_HOST_ECN, "TxTSOECN" },
+ { VIRTIO_NET_F_HOST_UFO, "TxUFO" },
+ { VIRTIO_NET_F_MRG_RXBUF, "MrgRxBuf" },
+ { VIRTIO_NET_F_STATUS, "Status" },
+ { VIRTIO_NET_F_CTRL_VQ, "ControlVq" },
+ { VIRTIO_NET_F_CTRL_RX, "RxMode" },
+ { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" },
+ { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" },
+
+ { 0, NULL }
+};
+
+static device_method_t vtnet_methods[] = {
+ /* Device methods. */
+ DEVMETHOD(device_probe, vtnet_probe),
+ DEVMETHOD(device_attach, vtnet_attach),
+ DEVMETHOD(device_detach, vtnet_detach),
+ DEVMETHOD(device_suspend, vtnet_suspend),
+ DEVMETHOD(device_resume, vtnet_resume),
+ DEVMETHOD(device_shutdown, vtnet_shutdown),
+
+ /* VirtIO methods. */
+ DEVMETHOD(virtio_config_change, vtnet_config_change),
+
+ { 0, 0 }
+};
+
+static driver_t vtnet_driver = {
+ "vtnet",
+ vtnet_methods,
+ sizeof(struct vtnet_softc)
+};
+static devclass_t vtnet_devclass;
+
+DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass,
+ vtnet_modevent, 0);
+MODULE_VERSION(vtnet, 1);
+MODULE_DEPEND(vtnet, virtio, 1, 1, 1);
+
+static int
+vtnet_modevent(module_t mod, int type, void *unused)
+{
+ int error;
+
+ error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr",
+ sizeof(struct vtnet_tx_header),
+ NULL, NULL, NULL, NULL, 0, 0);
+ break;
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
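+		/*
+		 * Refuse to quiesce or unload while transmit headers are
+		 * still outstanding in the UMA zone.
+		 */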
+ if (uma_zone_get_cur(vtnet_tx_header_zone) > 0)
+ error = EBUSY;
+ else if (type == MOD_UNLOAD) {
+ uma_zdestroy(vtnet_tx_header_zone);
+ vtnet_tx_header_zone = NULL;
+ }
+ break;
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+static int
+vtnet_probe(device_t dev)
+{
+
+ if (virtio_get_device_type(dev) != VIRTIO_ID_NETWORK)
+ return (ENXIO);
+
+ device_set_desc(dev, "VirtIO Networking Adapter");
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtnet_attach(device_t dev)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+ int tx_size, error;
+
+ sc = device_get_softc(dev);
+ sc->vtnet_dev = dev;
+
+ VTNET_LOCK_INIT(sc);
+ callout_init_mtx(&sc->vtnet_tick_ch, VTNET_MTX(sc), 0);
+
+ ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd,
+ vtnet_ifmedia_sts);
+ ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL);
+ ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE);
+
+ vtnet_add_statistics(sc);
+
+ virtio_set_feature_desc(dev, vtnet_feature_desc);
+ vtnet_negotiate_features(sc);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) {
+ sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS;
+ sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ } else
+ sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr);
+
+ sc->vtnet_rx_mbuf_size = MCLBYTES;
+ sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) {
+ sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX))
+ sc->vtnet_flags |= VTNET_FLAG_CTRL_RX;
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN))
+ sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER;
+ }
+
+ vtnet_get_hwaddr(sc);
+
+ error = vtnet_alloc_virtqueues(sc);
+ if (error) {
+ device_printf(dev, "cannot allocate virtqueues\n");
+ goto fail;
+ }
+
+ ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER);
+ if (ifp == NULL) {
+ device_printf(dev, "cannot allocate ifnet structure\n");
+ error = ENOSPC;
+ goto fail;
+ }
+
+ ifp->if_softc = sc;
+ if_initname(ifp, device_get_name(dev), device_get_unit(dev));
+ ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
+ ifp->if_init = vtnet_init;
+ ifp->if_start = vtnet_start;
+ ifp->if_ioctl = vtnet_ioctl;
+
+ sc->vtnet_rx_size = virtqueue_size(sc->vtnet_rx_vq);
+ sc->vtnet_rx_process_limit = sc->vtnet_rx_size;
+
+ tx_size = virtqueue_size(sc->vtnet_tx_vq);
+ sc->vtnet_tx_size = tx_size;
+ IFQ_SET_MAXLEN(&ifp->if_snd, tx_size - 1);
+ ifp->if_snd.ifq_drv_maxlen = tx_size - 1;
+ IFQ_SET_READY(&ifp->if_snd);
+
+ ether_ifattach(ifp, sc->vtnet_hwaddr);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS))
+ ifp->if_capabilities |= IFCAP_LINKSTATE;
+
+ /* Tell the upper layer(s) we support long frames. */
+ ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header);
+ ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) {
+ ifp->if_capabilities |= IFCAP_TXCSUM;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4))
+ ifp->if_capabilities |= IFCAP_TSO4;
+ if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6))
+ ifp->if_capabilities |= IFCAP_TSO6;
+ if (ifp->if_capabilities & IFCAP_TSO)
+ ifp->if_capabilities |= IFCAP_VLAN_HWTSO;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN))
+ sc->vtnet_flags |= VTNET_FLAG_TSO_ECN;
+ }
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) {
+ ifp->if_capabilities |= IFCAP_RXCSUM;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) ||
+ virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6))
+ ifp->if_capabilities |= IFCAP_LRO;
+ }
+
+ if (ifp->if_capabilities & IFCAP_HWCSUM) {
+ /*
+ * VirtIO does not support VLAN tagging, but we can fake
+ * it by inserting and removing the 802.1Q header during
+ * transmit and receive. We are then able to do checksum
+ * offloading of VLAN frames.
+ */
+ ifp->if_capabilities |=
+ IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM;
+ }
+
+ ifp->if_capenable = ifp->if_capabilities;
+
+ /*
+ * Capabilities after here are not enabled by default.
+ */
+
+ if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) {
+ ifp->if_capabilities |= IFCAP_VLAN_HWFILTER;
+
+ sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config,
+ vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST);
+ sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig,
+ vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST);
+ }
+
+#ifdef DEVICE_POLLING
+ ifp->if_capabilities |= IFCAP_POLLING;
+#endif
+
+ TASK_INIT(&sc->vtnet_rx_intr_task, 0, vtnet_rx_intr_task, sc);
+ TASK_INIT(&sc->vtnet_tx_intr_task, 0, vtnet_tx_intr_task, sc);
+ TASK_INIT(&sc->vtnet_cfgchg_task, 0, vtnet_config_change_task, sc);
+
+ sc->vtnet_tq = taskqueue_create_fast("vtnet_taskq", M_NOWAIT,
+ taskqueue_thread_enqueue, &sc->vtnet_tq);
+ if (sc->vtnet_tq == NULL) {
+ error = ENOMEM;
+ device_printf(dev, "cannot allocate taskqueue\n");
+ ether_ifdetach(ifp);
+ goto fail;
+ }
+ taskqueue_start_threads(&sc->vtnet_tq, 1, PI_NET, "%s taskq",
+ device_get_nameunit(dev));
+
+ error = virtio_setup_intr(dev, INTR_TYPE_NET);
+ if (error) {
+ device_printf(dev, "cannot setup virtqueue interrupts\n");
+ taskqueue_free(sc->vtnet_tq);
+ sc->vtnet_tq = NULL;
+ ether_ifdetach(ifp);
+ goto fail;
+ }
+
+ /*
+ * Device defaults to promiscuous mode for backwards
+ * compatibility. Turn it off if possible.
+ */
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
+ VTNET_LOCK(sc);
+ if (vtnet_set_promisc(sc, 0) != 0) {
+ ifp->if_flags |= IFF_PROMISC;
+ device_printf(dev,
+ "cannot disable promiscuous mode\n");
+ }
+ VTNET_UNLOCK(sc);
+ } else
+ ifp->if_flags |= IFF_PROMISC;
+
+fail:
+ if (error)
+ vtnet_detach(dev);
+
+ return (error);
+}
+
+static int
+vtnet_detach(device_t dev)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+
+ sc = device_get_softc(dev);
+ ifp = sc->vtnet_ifp;
+
+ KASSERT(mtx_initialized(VTNET_MTX(sc)),
+ ("vtnet mutex not initialized"));
+
+#ifdef DEVICE_POLLING
+ if (ifp != NULL && ifp->if_capenable & IFCAP_POLLING)
+ ether_poll_deregister(ifp);
+#endif
+
+ if (device_is_attached(dev)) {
+ VTNET_LOCK(sc);
+ vtnet_stop(sc);
+ VTNET_UNLOCK(sc);
+
+ callout_drain(&sc->vtnet_tick_ch);
+ taskqueue_drain(taskqueue_fast, &sc->vtnet_cfgchg_task);
+
+ ether_ifdetach(ifp);
+ }
+
+ if (sc->vtnet_tq != NULL) {
+ taskqueue_drain(sc->vtnet_tq, &sc->vtnet_rx_intr_task);
+ taskqueue_drain(sc->vtnet_tq, &sc->vtnet_tx_intr_task);
+ taskqueue_free(sc->vtnet_tq);
+ sc->vtnet_tq = NULL;
+ }
+
+ if (sc->vtnet_vlan_attach != NULL) {
+ EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach);
+ sc->vtnet_vlan_attach = NULL;
+ }
+ if (sc->vtnet_vlan_detach != NULL) {
+		EVENTHANDLER_DEREGISTER(vlan_unconfig, sc->vtnet_vlan_detach);
+ sc->vtnet_vlan_detach = NULL;
+ }
+
+ if (ifp) {
+ if_free(ifp);
+ sc->vtnet_ifp = NULL;
+ }
+
+ if (sc->vtnet_rx_vq != NULL)
+ vtnet_free_rx_mbufs(sc);
+ if (sc->vtnet_tx_vq != NULL)
+ vtnet_free_tx_mbufs(sc);
+ if (sc->vtnet_ctrl_vq != NULL)
+ vtnet_free_ctrl_vq(sc);
+
+ ifmedia_removeall(&sc->vtnet_media);
+ VTNET_LOCK_DESTROY(sc);
+
+ return (0);
+}
+
+static int
+vtnet_suspend(device_t dev)
+{
+ struct vtnet_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ VTNET_LOCK(sc);
+ vtnet_stop(sc);
+ sc->vtnet_flags |= VTNET_FLAG_SUSPENDED;
+ VTNET_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtnet_resume(device_t dev)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+
+ sc = device_get_softc(dev);
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK(sc);
+ if (ifp->if_flags & IFF_UP)
+ vtnet_init_locked(sc);
+ sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED;
+ VTNET_UNLOCK(sc);
+
+ return (0);
+}
+
+static int
+vtnet_shutdown(device_t dev)
+{
+
+ /*
+ * Suspend already does all of what we need to
+ * do here; we just never expect to be resumed.
+ */
+ return (vtnet_suspend(dev));
+}
+
+static int
+vtnet_config_change(device_t dev)
+{
+ struct vtnet_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ taskqueue_enqueue_fast(taskqueue_fast, &sc->vtnet_cfgchg_task);
+
+ return (1);
+}
+
+static void
+vtnet_negotiate_features(struct vtnet_softc *sc)
+{
+ device_t dev;
+ uint64_t mask, features;
+
+ dev = sc->vtnet_dev;
+ mask = 0;
+
+ if (vtnet_csum_disable)
+ mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM;
+
+ /*
+ * TSO and LRO are only available when their corresponding
+ * checksum offload feature is also negotiated.
+ */
+
+ if (vtnet_csum_disable || vtnet_tso_disable)
+ mask |= VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 |
+ VIRTIO_NET_F_HOST_ECN;
+
+ if (vtnet_csum_disable || vtnet_lro_disable)
+ mask |= VTNET_LRO_FEATURES;
+
+ features = VTNET_FEATURES & ~mask;
+#ifdef VTNET_TX_INTR_MODERATION
+ features |= VIRTIO_F_NOTIFY_ON_EMPTY;
+#endif
+ sc->vtnet_features = virtio_negotiate_features(dev, features);
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0 &&
+ virtio_with_feature(dev, VTNET_LRO_FEATURES)) {
+ /*
+ * LRO without mergeable buffers requires special care. This
+ * is not ideal because every receive buffer must be large
+ * enough to hold the maximum TCP packet, the Ethernet header,
+ * and the vtnet_rx_header. This requires up to 34 descriptors
+ * when using MCLBYTES clusters. If we do not have indirect
+ * descriptors, LRO is disabled since the virtqueue will not
+ * be able to contain very many receive buffers.
+ */
+ if (virtio_with_feature(dev,
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ device_printf(dev,
+ "LRO disabled due to lack of both mergeable "
+ "buffers and indirect descriptors\n");
+
+ sc->vtnet_features = virtio_negotiate_features(dev,
+ features & ~VTNET_LRO_FEATURES);
+ } else
+ sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG;
+ }
+}
+
+static int
+vtnet_alloc_virtqueues(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct vq_alloc_info vq_info[3];
+ int nvqs, rxsegs;
+
+ dev = sc->vtnet_dev;
+ nvqs = 2;
+
+ /*
+ * Indirect descriptors are not needed for the Rx
+ * virtqueue when mergeable buffers are negotiated.
+ * The header is placed inline with the data, not
+ * in a separate descriptor, and mbuf clusters are
+ * always physically contiguous.
+ */
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ rxsegs = sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG ?
+ VTNET_MAX_RX_SEGS : VTNET_MIN_RX_SEGS;
+ } else
+ rxsegs = 0;
+
+ VQ_ALLOC_INFO_INIT(&vq_info[0], rxsegs,
+ vtnet_rx_vq_intr, sc, &sc->vtnet_rx_vq,
+ "%s receive", device_get_nameunit(dev));
+
+ VQ_ALLOC_INFO_INIT(&vq_info[1], VTNET_MAX_TX_SEGS,
+ vtnet_tx_vq_intr, sc, &sc->vtnet_tx_vq,
+ "%s transmit", device_get_nameunit(dev));
+
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
+ nvqs++;
+
+ VQ_ALLOC_INFO_INIT(&vq_info[2], 0, NULL, NULL,
+ &sc->vtnet_ctrl_vq, "%s control",
+ device_get_nameunit(dev));
+ }
+
+ return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info));
+}
+
+static void
+vtnet_get_hwaddr(struct vtnet_softc *sc)
+{
+ device_t dev;
+
+ dev = sc->vtnet_dev;
+
+ if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) {
+ virtio_read_device_config(dev,
+ offsetof(struct virtio_net_config, mac),
+ sc->vtnet_hwaddr, ETHER_ADDR_LEN);
+ } else {
+ /* Generate random locally administered unicast address. */
+ sc->vtnet_hwaddr[0] = 0xB2;
+ arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0);
+
+ vtnet_set_hwaddr(sc);
+ }
+}
+
+static void
+vtnet_set_hwaddr(struct vtnet_softc *sc)
+{
+ device_t dev;
+
+ dev = sc->vtnet_dev;
+
+ virtio_write_device_config(dev,
+ offsetof(struct virtio_net_config, mac),
+ sc->vtnet_hwaddr, ETHER_ADDR_LEN);
+}
+
+static int
+vtnet_is_link_up(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+ uint16_t status;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ if ((ifp->if_capenable & IFCAP_LINKSTATE) == 0)
+ return (1);
+
+ status = virtio_read_dev_config_2(dev,
+ offsetof(struct virtio_net_config, status));
+
+ return ((status & VIRTIO_NET_S_LINK_UP) != 0);
+}
+
+static void
+vtnet_update_link_status(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+ int link;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ link = vtnet_is_link_up(sc);
+
+ if (link && ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0)) {
+ sc->vtnet_flags |= VTNET_FLAG_LINK;
+ if (bootverbose)
+ device_printf(dev, "Link is up\n");
+
+ if_link_state_change(ifp, LINK_STATE_UP);
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ vtnet_start_locked(ifp);
+ } else if (!link && (sc->vtnet_flags & VTNET_FLAG_LINK)) {
+ sc->vtnet_flags &= ~VTNET_FLAG_LINK;
+ if (bootverbose)
+ device_printf(dev, "Link is down\n");
+
+ if_link_state_change(ifp, LINK_STATE_DOWN);
+ }
+}
+
+static void
+vtnet_watchdog(struct vtnet_softc *sc)
+{
+ struct ifnet *ifp;
+
+ ifp = sc->vtnet_ifp;
+
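+	/*
+	 * When Tx interrupt moderation is enabled, completed transmits
+	 * are reclaimed here since the Tx virtqueue interrupt stays
+	 * disabled.
+	 */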
+#ifdef VTNET_TX_INTR_MODERATION
+ vtnet_txeof(sc);
+#endif
+
+ if (sc->vtnet_watchdog_timer == 0 || --sc->vtnet_watchdog_timer)
+ return;
+
+ if_printf(ifp, "watchdog timeout -- resetting\n");
+#ifdef VTNET_DEBUG
+ virtqueue_dump(sc->vtnet_tx_vq);
+#endif
+ ifp->if_oerrors++;
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ vtnet_init_locked(sc);
+}
+
+static void
+vtnet_config_change_task(void *arg, int pending)
+{
+ struct vtnet_softc *sc;
+
+ sc = arg;
+
+ VTNET_LOCK(sc);
+ vtnet_update_link_status(sc);
+ VTNET_UNLOCK(sc);
+}
+
+static int
+vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
+{
+ struct vtnet_softc *sc;
+ struct ifreq *ifr;
+ int reinit, mask, error;
+
+ sc = ifp->if_softc;
+ ifr = (struct ifreq *) data;
+ reinit = 0;
+ error = 0;
+
+ switch (cmd) {
+ case SIOCSIFMTU:
+ if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > VTNET_MAX_MTU)
+ error = EINVAL;
+ else if (ifp->if_mtu != ifr->ifr_mtu) {
+ VTNET_LOCK(sc);
+ error = vtnet_change_mtu(sc, ifr->ifr_mtu);
+ VTNET_UNLOCK(sc);
+ }
+ break;
+
+ case SIOCSIFFLAGS:
+ VTNET_LOCK(sc);
+ if ((ifp->if_flags & IFF_UP) == 0) {
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ vtnet_stop(sc);
+ } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ if ((ifp->if_flags ^ sc->vtnet_if_flags) &
+ (IFF_PROMISC | IFF_ALLMULTI)) {
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX)
+ vtnet_rx_filter(sc);
+ else
+ error = ENOTSUP;
+ }
+ } else
+ vtnet_init_locked(sc);
+
+ if (error == 0)
+ sc->vtnet_if_flags = ifp->if_flags;
+ VTNET_UNLOCK(sc);
+ break;
+
+ case SIOCADDMULTI:
+ case SIOCDELMULTI:
+ VTNET_LOCK(sc);
+ if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) &&
+ (ifp->if_drv_flags & IFF_DRV_RUNNING))
+ vtnet_rx_filter_mac(sc);
+ VTNET_UNLOCK(sc);
+ break;
+
+ case SIOCSIFMEDIA:
+ case SIOCGIFMEDIA:
+ error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd);
+ break;
+
+ case SIOCSIFCAP:
+ mask = ifr->ifr_reqcap ^ ifp->if_capenable;
+
+#ifdef DEVICE_POLLING
+ if (mask & IFCAP_POLLING) {
+ if (ifr->ifr_reqcap & IFCAP_POLLING) {
+ error = ether_poll_register(vtnet_poll, ifp);
+ if (error)
+ break;
+
+ VTNET_LOCK(sc);
+ vtnet_disable_rx_intr(sc);
+ vtnet_disable_tx_intr(sc);
+ ifp->if_capenable |= IFCAP_POLLING;
+ VTNET_UNLOCK(sc);
+ } else {
+ error = ether_poll_deregister(ifp);
+
+ /* Enable interrupts even in error case. */
+ VTNET_LOCK(sc);
+ vtnet_enable_tx_intr(sc);
+ vtnet_enable_rx_intr(sc);
+ ifp->if_capenable &= ~IFCAP_POLLING;
+ VTNET_UNLOCK(sc);
+ }
+ }
+#endif
+ VTNET_LOCK(sc);
+
+ if (mask & IFCAP_TXCSUM) {
+ ifp->if_capenable ^= IFCAP_TXCSUM;
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= VTNET_CSUM_OFFLOAD;
+ else
+ ifp->if_hwassist &= ~VTNET_CSUM_OFFLOAD;
+ }
+
+ if (mask & IFCAP_TSO4) {
+ ifp->if_capenable ^= IFCAP_TSO4;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= CSUM_TSO;
+ else
+ ifp->if_hwassist &= ~CSUM_TSO;
+ }
+
+ if (mask & IFCAP_RXCSUM) {
+ ifp->if_capenable ^= IFCAP_RXCSUM;
+ reinit = 1;
+ }
+
+ if (mask & IFCAP_LRO) {
+ ifp->if_capenable ^= IFCAP_LRO;
+ reinit = 1;
+ }
+
+ if (mask & IFCAP_VLAN_HWFILTER) {
+ ifp->if_capenable ^= IFCAP_VLAN_HWFILTER;
+ reinit = 1;
+ }
+
+ if (mask & IFCAP_VLAN_HWTSO)
+ ifp->if_capenable ^= IFCAP_VLAN_HWTSO;
+
+ if (mask & IFCAP_VLAN_HWTAGGING)
+ ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING;
+
+ if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ vtnet_init_locked(sc);
+ }
+ VLAN_CAPABILITIES(ifp);
+
+ VTNET_UNLOCK(sc);
+ break;
+
+ default:
+ error = ether_ioctl(ifp, cmd, data);
+ break;
+ }
+
+ VTNET_LOCK_ASSERT_NOTOWNED(sc);
+
+ return (error);
+}
+
+static int
+vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu)
+{
+ struct ifnet *ifp;
+ int new_frame_size, clsize;
+
+ ifp = sc->vtnet_ifp;
+
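+	/*
+	 * Pick a receive cluster size large enough for the new frame:
+	 * a standard cluster when it fits, otherwise a 9K jumbo cluster
+	 * (or a page-sized cluster when mergeable buffers are in use).
+	 */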
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ new_frame_size = sizeof(struct vtnet_rx_header) +
+ sizeof(struct ether_vlan_header) + new_mtu;
+
+ if (new_frame_size > MJUM9BYTES)
+ return (EINVAL);
+
+ if (new_frame_size <= MCLBYTES)
+ clsize = MCLBYTES;
+ else
+ clsize = MJUM9BYTES;
+ } else {
+ new_frame_size = sizeof(struct virtio_net_hdr_mrg_rxbuf) +
+ sizeof(struct ether_vlan_header) + new_mtu;
+
+ if (new_frame_size <= MCLBYTES)
+ clsize = MCLBYTES;
+ else
+ clsize = MJUMPAGESIZE;
+ }
+
+ sc->vtnet_rx_mbuf_size = clsize;
+ sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc);
+ KASSERT(sc->vtnet_rx_mbuf_count < VTNET_MAX_RX_SEGS,
+ ("too many rx mbufs: %d", sc->vtnet_rx_mbuf_count));
+
+ ifp->if_mtu = new_mtu;
+
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
+ vtnet_init_locked(sc);
+ }
+
+ return (0);
+}
+
+static int
+vtnet_init_rx_vq(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ int nbufs, error;
+
+ vq = sc->vtnet_rx_vq;
+ nbufs = 0;
+ error = ENOSPC;
+
+ while (!virtqueue_full(vq)) {
+ if ((error = vtnet_newbuf(sc)) != 0)
+ break;
+ nbufs++;
+ }
+
+ if (nbufs > 0) {
+ virtqueue_notify(vq);
+
+ /*
+ * EMSGSIZE signifies the virtqueue did not have enough
+ * entries available to hold the last mbuf. This is not
+ * an error. We should not get ENOSPC since we check if
+ * the virtqueue is full before attempting to add a
+ * buffer.
+ */
+ if (error == EMSGSIZE)
+ error = 0;
+ }
+
+ return (error);
+}
+
+static void
+vtnet_free_rx_mbufs(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ struct mbuf *m;
+ int last;
+
+ vq = sc->vtnet_rx_vq;
+ last = 0;
+
+ while ((m = virtqueue_drain(vq, &last)) != NULL)
+ m_freem(m);
+
+ KASSERT(virtqueue_empty(vq), ("mbufs remaining in Rx Vq"));
+}
+
+static void
+vtnet_free_tx_mbufs(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ struct vtnet_tx_header *txhdr;
+ int last;
+
+ vq = sc->vtnet_tx_vq;
+ last = 0;
+
+ while ((txhdr = virtqueue_drain(vq, &last)) != NULL) {
+ m_freem(txhdr->vth_mbuf);
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+ }
+
+ KASSERT(virtqueue_empty(vq), ("mbufs remaining in Tx Vq"));
+}
+
+static void
+vtnet_free_ctrl_vq(struct vtnet_softc *sc)
+{
+
+ /*
+ * The control virtqueue is only polled, therefore
+ * it should already be empty.
+ */
+ KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq),
+ ("Ctrl Vq not empty"));
+}
+
+#ifdef DEVICE_POLLING
+static int
+vtnet_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
+{
+ struct vtnet_softc *sc;
+ int rx_done;
+
+ sc = ifp->if_softc;
+ rx_done = 0;
+
+ VTNET_LOCK(sc);
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
+ if (cmd == POLL_AND_CHECK_STATUS)
+ vtnet_update_link_status(sc);
+
+ if (virtqueue_nused(sc->vtnet_rx_vq) > 0)
+ vtnet_rxeof(sc, count, &rx_done);
+
+ vtnet_txeof(sc);
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ vtnet_start_locked(ifp);
+ }
+ VTNET_UNLOCK(sc);
+
+ return (rx_done);
+}
+#endif /* DEVICE_POLLING */
+
+static struct mbuf *
+vtnet_alloc_rxbuf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp)
+{
+ struct mbuf *m_head, *m_tail, *m;
+ int i, clsize;
+
+ clsize = sc->vtnet_rx_mbuf_size;
+
+ m_head = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, clsize);
+ if (m_head == NULL)
+ goto fail;
+
+ m_head->m_len = clsize;
+ m_tail = m_head;
+
+ if (nbufs > 1) {
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
+ ("chained Rx mbuf requested without LRO_NOMRG"));
+
+ for (i = 0; i < nbufs - 1; i++) {
+ m = m_getjcl(M_DONTWAIT, MT_DATA, 0, clsize);
+ if (m == NULL)
+ goto fail;
+
+ m->m_len = clsize;
+ m_tail->m_next = m;
+ m_tail = m;
+ }
+ }
+
+ if (m_tailp != NULL)
+ *m_tailp = m_tail;
+
+ return (m_head);
+
+fail:
+ sc->vtnet_stats.mbuf_alloc_failed++;
+ m_freem(m_head);
+
+ return (NULL);
+}
+
+static int
+vtnet_replace_rxbuf(struct vtnet_softc *sc, struct mbuf *m0, int len0)
+{
+ struct mbuf *m, *m_prev;
+ struct mbuf *m_new, *m_tail;
+ int len, clsize, nreplace, error;
+
+ m = m0;
+ m_prev = NULL;
+ len = len0;
+
+ m_tail = NULL;
+ clsize = sc->vtnet_rx_mbuf_size;
+ nreplace = 0;
+
+ if (m->m_next != NULL)
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG,
+ ("chained Rx mbuf without LRO_NOMRG"));
+
+ /*
+ * Since LRO_NOMRG mbuf chains are so large, we want to avoid
+ * allocating an entire chain for each received frame. When
+ * the received frame's length is less than that of the chain,
+ * the unused mbufs are reassigned to the new chain.
+ */
+ while (len > 0) {
+ /*
+ * Something is seriously wrong if we received
+ * a frame larger than the mbuf chain. Drop it.
+ */
+ if (m == NULL) {
+ sc->vtnet_stats.rx_frame_too_large++;
+ return (EMSGSIZE);
+ }
+
+ KASSERT(m->m_len == clsize,
+ ("mbuf length not expected cluster size: %d",
+ m->m_len));
+
+ m->m_len = MIN(m->m_len, len);
+ len -= m->m_len;
+
+ m_prev = m;
+ m = m->m_next;
+ nreplace++;
+ }
+
+ KASSERT(m_prev != NULL, ("m_prev == NULL"));
+ KASSERT(nreplace <= sc->vtnet_rx_mbuf_count,
+ ("too many replacement mbufs: %d/%d", nreplace,
+ sc->vtnet_rx_mbuf_count));
+
+ m_new = vtnet_alloc_rxbuf(sc, nreplace, &m_tail);
+ if (m_new == NULL) {
+ m_prev->m_len = clsize;
+ return (ENOBUFS);
+ }
+
+ /*
+ * Move unused mbufs, if any, from the original chain
+ * onto the end of the new chain.
+ */
+ if (m_prev->m_next != NULL) {
+ m_tail->m_next = m_prev->m_next;
+ m_prev->m_next = NULL;
+ }
+
+ error = vtnet_enqueue_rxbuf(sc, m_new);
+ if (error) {
+ /*
+ * BAD! We could not enqueue the replacement mbuf chain. We
+ * must restore the m0 chain to the original state if it was
+ * modified so we can subsequently discard it.
+ *
+		 * NOTE: The replacement is supposed to be an identical copy
+		 * of the one just dequeued, so this is an unexpected error.
+ */
+ sc->vtnet_stats.rx_enq_replacement_failed++;
+
+ if (m_tail->m_next != NULL) {
+ m_prev->m_next = m_tail->m_next;
+ m_tail->m_next = NULL;
+ }
+
+ m_prev->m_len = clsize;
+ m_freem(m_new);
+ }
+
+ return (error);
+}
+
+static int
+vtnet_newbuf(struct vtnet_softc *sc)
+{
+ struct mbuf *m;
+ int error;
+
+ m = vtnet_alloc_rxbuf(sc, sc->vtnet_rx_mbuf_count, NULL);
+ if (m == NULL)
+ return (ENOBUFS);
+
+ error = vtnet_enqueue_rxbuf(sc, m);
+ if (error)
+ m_freem(m);
+
+ return (error);
+}
+
+static void
+vtnet_discard_merged_rxbuf(struct vtnet_softc *sc, int nbufs)
+{
+ struct virtqueue *vq;
+ struct mbuf *m;
+
+ vq = sc->vtnet_rx_vq;
+
+ while (--nbufs > 0) {
+ if ((m = virtqueue_dequeue(vq, NULL)) == NULL)
+ break;
+ vtnet_discard_rxbuf(sc, m);
+ }
+}
+
+static void
+vtnet_discard_rxbuf(struct vtnet_softc *sc, struct mbuf *m)
+{
+ int error;
+
+ /*
+ * Requeue the discarded mbuf. This should always be
+ * successful since it was just dequeued.
+ */
+ error = vtnet_enqueue_rxbuf(sc, m);
+ KASSERT(error == 0, ("cannot requeue discarded mbuf"));
+}
+
+static int
+vtnet_enqueue_rxbuf(struct vtnet_softc *sc, struct mbuf *m)
+{
+ struct sglist sg;
+ struct sglist_seg segs[VTNET_MAX_RX_SEGS];
+ struct vtnet_rx_header *rxhdr;
+ struct virtio_net_hdr *hdr;
+ uint8_t *mdata;
+ int offset, error;
+
+ VTNET_LOCK_ASSERT(sc);
+ if ((sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0)
+ KASSERT(m->m_next == NULL, ("chained Rx mbuf"));
+
+ sglist_init(&sg, VTNET_MAX_RX_SEGS, segs);
+
+ mdata = mtod(m, uint8_t *);
+ offset = 0;
+
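+	/*
+	 * Without mergeable buffers, the VirtIO header is kept in the
+	 * vtnet_rx_header at the front of the cluster and appended to
+	 * the sglist as its own entry. With mergeable buffers, the
+	 * header simply occupies the first bytes of the frame data.
+	 */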
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ rxhdr = (struct vtnet_rx_header *) mdata;
+ hdr = &rxhdr->vrh_hdr;
+ offset += sizeof(struct vtnet_rx_header);
+
+ error = sglist_append(&sg, hdr, sc->vtnet_hdr_size);
+ KASSERT(error == 0, ("cannot add header to sglist"));
+ }
+
+ error = sglist_append(&sg, mdata + offset, m->m_len - offset);
+ if (error)
+ return (error);
+
+ if (m->m_next != NULL) {
+ error = sglist_append_mbuf(&sg, m->m_next);
+ if (error)
+ return (error);
+ }
+
+ return (virtqueue_enqueue(sc->vtnet_rx_vq, m, &sg, 0, sg.sg_nseg));
+}
+
+static void
+vtnet_vlan_tag_remove(struct mbuf *m)
+{
+ struct ether_vlan_header *evl;
+
+ evl = mtod(m, struct ether_vlan_header *);
+
+ m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag);
+ m->m_flags |= M_VLANTAG;
+
+ /* Strip the 802.1Q header. */
+ bcopy((char *) evl, (char *) evl + ETHER_VLAN_ENCAP_LEN,
+ ETHER_HDR_LEN - ETHER_TYPE_LEN);
+ m_adj(m, ETHER_VLAN_ENCAP_LEN);
+}
+
+#ifdef notyet
+static int
+vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m,
+ struct virtio_net_hdr *hdr)
+{
+ struct ether_header *eh;
+ struct ether_vlan_header *evh;
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ struct udphdr *udp;
+ int ip_offset, csum_start, csum_offset, hlen;
+ uint16_t eth_type;
+ uint8_t ip_proto;
+
+ /*
+ * Convert the VirtIO checksum interface to FreeBSD's interface.
+ * The host only provides us with the offset at which to start
+ * checksumming, and the offset from that to place the completed
+ * checksum. While this maps well with how Linux does checksums,
+ * for FreeBSD, we must parse the received packet in order to set
+ * the appropriate CSUM_* flags.
+ */
+
+ /*
+ * Every mbuf added to the receive virtqueue is always at least
+ * MCLBYTES big, so assume something is amiss if the first mbuf
+ * does not contain both the Ethernet and protocol headers.
+ */
+ ip_offset = sizeof(struct ether_header);
+ if (m->m_len < ip_offset)
+ return (1);
+
+ eh = mtod(m, struct ether_header *);
+ eth_type = ntohs(eh->ether_type);
+ if (eth_type == ETHERTYPE_VLAN) {
+ ip_offset = sizeof(struct ether_vlan_header);
+ if (m->m_len < ip_offset)
+ return (1);
+ evh = mtod(m, struct ether_vlan_header *);
+ eth_type = ntohs(evh->evl_proto);
+ }
+
+ switch (eth_type) {
+ case ETHERTYPE_IP:
+ if (m->m_len < ip_offset + sizeof(struct ip))
+ return (1);
+
+ ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
+ /* Sanity check the IP header. */
+ if (ip->ip_v != IPVERSION)
+ return (1);
+ hlen = ip->ip_hl << 2;
+ if (hlen < sizeof(struct ip))
+ return (1);
+ if (ntohs(ip->ip_len) < hlen)
+ return (1);
+ if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset))
+ return (1);
+
+ ip_proto = ip->ip_p;
+ csum_start = ip_offset + hlen;
+ break;
+
+ case ETHERTYPE_IPV6:
+ if (m->m_len < ip_offset + sizeof(struct ip6_hdr))
+ return (1);
+
+ /*
+ * XXX FreeBSD does not handle any IPv6 checksum offloading
+ * at the moment.
+ */
+
+ ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset);
+ /* XXX Assume no extension headers are present. */
+ ip_proto = ip6->ip6_nxt;
+ csum_start = ip_offset + sizeof(struct ip6_hdr);
+ break;
+
+ default:
+ sc->vtnet_stats.rx_csum_bad_ethtype++;
+ return (1);
+ }
+
+ /* Assume checksum begins right after the IP header. */
+ if (hdr->csum_start != csum_start) {
+ sc->vtnet_stats.rx_csum_bad_start++;
+ return (1);
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ csum_offset = offsetof(struct tcphdr, th_sum);
+ break;
+
+ case IPPROTO_UDP:
+ csum_offset = offsetof(struct udphdr, uh_sum);
+ break;
+
+ case IPPROTO_SCTP:
+ csum_offset = offsetof(struct sctphdr, checksum);
+ break;
+
+ default:
+ sc->vtnet_stats.rx_csum_bad_ipproto++;
+ return (1);
+ }
+
+ if (hdr->csum_offset != csum_offset) {
+ sc->vtnet_stats.rx_csum_bad_offset++;
+ return (1);
+ }
+
+ /*
+ * The IP header checksum is almost certainly valid but I'm
+ * uncertain if that is guaranteed.
+ *
+ * m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID;
+ */
+
+ switch (ip_proto) {
+ case IPPROTO_UDP:
+ if (m->m_len < csum_start + sizeof(struct udphdr))
+ return (1);
+
+ udp = (struct udphdr *)(mtod(m, uint8_t *) + csum_start);
+ if (udp->uh_sum == 0)
+ return (0);
+
+ /* FALLTHROUGH */
+
+ case IPPROTO_TCP:
+ m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xFFFF;
+ break;
+
+ case IPPROTO_SCTP:
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+ break;
+ }
+
+ sc->vtnet_stats.rx_csum_offloaded++;
+
+ return (0);
+}
+#endif
+
+/*
+ * Alternative method of doing receive checksum offloading. Rather
+ * than parsing the received frame down to the IP header, use the
+ * csum_offset to determine which CSUM_* flags are appropriate. We
+ * can get by with doing this only because the checksum offsets are
+ * unique for the things we care about.
+ */
+static int
+vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m,
+ struct virtio_net_hdr *hdr)
+{
+ struct ether_header *eh;
+ struct ether_vlan_header *evh;
+ struct udphdr *udp;
+ int csum_len;
+ uint16_t eth_type;
+
+ csum_len = hdr->csum_start + hdr->csum_offset;
+
+ if (csum_len < sizeof(struct ether_header) + sizeof(struct ip))
+ return (1);
+ if (m->m_len < csum_len)
+ return (1);
+
+ eh = mtod(m, struct ether_header *);
+ eth_type = ntohs(eh->ether_type);
+ if (eth_type == ETHERTYPE_VLAN) {
+ evh = mtod(m, struct ether_vlan_header *);
+ eth_type = ntohs(evh->evl_proto);
+ }
+
+ if (eth_type != ETHERTYPE_IP && eth_type != ETHERTYPE_IPV6) {
+ sc->vtnet_stats.rx_csum_bad_ethtype++;
+ return (1);
+ }
+
+ /* Use the offset to determine the appropriate CSUM_* flags. */
+ switch (hdr->csum_offset) {
+ case offsetof(struct udphdr, uh_sum):
+ if (m->m_len < hdr->csum_start + sizeof(struct udphdr))
+ return (1);
+ udp = (struct udphdr *)(mtod(m, uint8_t *) + hdr->csum_start);
+ if (udp->uh_sum == 0)
+ return (0);
+
+ /* FALLTHROUGH */
+
+ case offsetof(struct tcphdr, th_sum):
+ m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
+ m->m_pkthdr.csum_data = 0xFFFF;
+ break;
+
+ case offsetof(struct sctphdr, checksum):
+ m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
+ break;
+
+ default:
+ sc->vtnet_stats.rx_csum_bad_offset++;
+ return (1);
+ }
+
+ sc->vtnet_stats.rx_csum_offloaded++;
+
+ return (0);
+}
+
+static int
+vtnet_rxeof_merged(struct vtnet_softc *sc, struct mbuf *m_head, int nbufs)
+{
+ struct ifnet *ifp;
+ struct virtqueue *vq;
+ struct mbuf *m, *m_tail;
+ int len;
+
+ ifp = sc->vtnet_ifp;
+ vq = sc->vtnet_rx_vq;
+ m_tail = m_head;
+
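+	/*
+	 * Dequeue the remaining buffers of this mergeable frame and
+	 * append them to the chain, adding a replacement mbuf to the
+	 * virtqueue for each one.
+	 */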
+ while (--nbufs > 0) {
+ m = virtqueue_dequeue(vq, &len);
+ if (m == NULL) {
+ ifp->if_ierrors++;
+ goto fail;
+ }
+
+ if (vtnet_newbuf(sc) != 0) {
+ ifp->if_iqdrops++;
+ vtnet_discard_rxbuf(sc, m);
+ if (nbufs > 1)
+ vtnet_discard_merged_rxbuf(sc, nbufs);
+ goto fail;
+ }
+
+ if (m->m_len < len)
+ len = m->m_len;
+
+ m->m_len = len;
+ m->m_flags &= ~M_PKTHDR;
+
+ m_head->m_pkthdr.len += len;
+ m_tail->m_next = m;
+ m_tail = m;
+ }
+
+ return (0);
+
+fail:
+ sc->vtnet_stats.rx_mergeable_failed++;
+ m_freem(m_head);
+
+ return (1);
+}
+
+static int
+vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp)
+{
+ struct virtio_net_hdr lhdr;
+ struct ifnet *ifp;
+ struct virtqueue *vq;
+ struct mbuf *m;
+ struct ether_header *eh;
+ struct virtio_net_hdr *hdr;
+ struct virtio_net_hdr_mrg_rxbuf *mhdr;
+ int len, deq, nbufs, adjsz, rx_npkts;
+
+ ifp = sc->vtnet_ifp;
+ vq = sc->vtnet_rx_vq;
+ hdr = &lhdr;
+ deq = 0;
+ rx_npkts = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+
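+	/*
+	 * Process up to count frames: each dequeued mbuf is replaced
+	 * in the virtqueue before the frame is passed up the stack.
+	 */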
+ while (--count >= 0) {
+ m = virtqueue_dequeue(vq, &len);
+ if (m == NULL)
+ break;
+ deq++;
+
+ if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) {
+ ifp->if_ierrors++;
+ vtnet_discard_rxbuf(sc, m);
+ continue;
+ }
+
+ if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) {
+ nbufs = 1;
+ adjsz = sizeof(struct vtnet_rx_header);
+ /*
+ * Account for our pad between the header and
+ * the actual start of the frame.
+ */
+ len += VTNET_RX_HEADER_PAD;
+ } else {
+ mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *);
+ nbufs = mhdr->num_buffers;
+ adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+ }
+
+ if (vtnet_replace_rxbuf(sc, m, len) != 0) {
+ ifp->if_iqdrops++;
+ vtnet_discard_rxbuf(sc, m);
+ if (nbufs > 1)
+ vtnet_discard_merged_rxbuf(sc, nbufs);
+ continue;
+ }
+
+ m->m_pkthdr.len = len;
+ m->m_pkthdr.rcvif = ifp;
+ m->m_pkthdr.csum_flags = 0;
+
+ if (nbufs > 1) {
+ if (vtnet_rxeof_merged(sc, m, nbufs) != 0)
+ continue;
+ }
+
+ ifp->if_ipackets++;
+
+ /*
+ * Save copy of header before we strip it. For both mergeable
+ * and non-mergeable, the VirtIO header is placed first in the
+ * mbuf's data. We no longer need num_buffers, so always use a
+ * virtio_net_hdr.
+ */
+ memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr));
+ m_adj(m, adjsz);
+
+ if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) {
+ eh = mtod(m, struct ether_header *);
+ if (eh->ether_type == htons(ETHERTYPE_VLAN)) {
+ vtnet_vlan_tag_remove(m);
+
+ /*
+ * With the 802.1Q header removed, update the
+ * checksum starting location accordingly.
+ */
+ if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM)
+ hdr->csum_start -=
+ ETHER_VLAN_ENCAP_LEN;
+ }
+ }
+
+ if (ifp->if_capenable & IFCAP_RXCSUM &&
+ hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ if (vtnet_rx_csum(sc, m, hdr) != 0)
+ sc->vtnet_stats.rx_csum_failed++;
+ }
+
+ VTNET_UNLOCK(sc);
+ rx_npkts++;
+ (*ifp->if_input)(ifp, m);
+ VTNET_LOCK(sc);
+
+ /*
+ * The interface may have been stopped while we were
+ * passing the packet up the network stack.
+ */
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0)
+ break;
+ }
+
+ virtqueue_notify(vq);
+
+ if (rx_npktsp != NULL)
+ *rx_npktsp = rx_npkts;
+
+ return (count > 0 ? 0 : EAGAIN);
+}
+
+static void
+vtnet_rx_intr_task(void *arg, int pending)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+ int more;
+
+ sc = arg;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK(sc);
+
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ VTNET_UNLOCK(sc);
+ return;
+ }
+#endif
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ vtnet_enable_rx_intr(sc);
+ VTNET_UNLOCK(sc);
+ return;
+ }
+
+ more = vtnet_rxeof(sc, sc->vtnet_rx_process_limit, NULL);
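+
+	/*
+	 * If completed buffers are still pending after the interrupt is
+	 * re-enabled, disable it again and reschedule the task below.
+	 */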
+ if (!more && vtnet_enable_rx_intr(sc) != 0) {
+ vtnet_disable_rx_intr(sc);
+ more = 1;
+ }
+
+ VTNET_UNLOCK(sc);
+
+ if (more) {
+ sc->vtnet_stats.rx_task_rescheduled++;
+ taskqueue_enqueue_fast(sc->vtnet_tq,
+ &sc->vtnet_rx_intr_task);
+ }
+}
+
+static int
+vtnet_rx_vq_intr(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ vtnet_disable_rx_intr(sc);
+ taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_rx_intr_task);
+
+ return (1);
+}
+
+static void
+vtnet_txeof(struct vtnet_softc *sc)
+{
+ struct virtqueue *vq;
+ struct ifnet *ifp;
+ struct vtnet_tx_header *txhdr;
+ int deq;
+
+ vq = sc->vtnet_tx_vq;
+ ifp = sc->vtnet_ifp;
+ deq = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) {
+ deq++;
+ ifp->if_opackets++;
+ m_freem(txhdr->vth_mbuf);
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+ }
+
+ if (deq > 0) {
+ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+ if (virtqueue_empty(vq))
+ sc->vtnet_watchdog_timer = 0;
+ }
+}
+
+static struct mbuf *
+vtnet_tx_offload(struct vtnet_softc *sc, struct mbuf *m,
+ struct virtio_net_hdr *hdr)
+{
+ struct ifnet *ifp;
+ struct ether_header *eh;
+ struct ether_vlan_header *evh;
+ struct ip *ip;
+ struct ip6_hdr *ip6;
+ struct tcphdr *tcp;
+ int ip_offset;
+ uint16_t eth_type, csum_start;
+ uint8_t ip_proto, gso_type;
+
+ ifp = sc->vtnet_ifp;
+ M_ASSERTPKTHDR(m);
+
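+	/*
+	 * Parse the frame down to the TCP header so the VirtIO header
+	 * can describe where checksumming starts and, for TSO, the
+	 * segment size and full header length.
+	 */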
+ ip_offset = sizeof(struct ether_header);
+ if (m->m_len < ip_offset) {
+ if ((m = m_pullup(m, ip_offset)) == NULL)
+ return (NULL);
+ }
+
+ eh = mtod(m, struct ether_header *);
+ eth_type = ntohs(eh->ether_type);
+ if (eth_type == ETHERTYPE_VLAN) {
+ ip_offset = sizeof(struct ether_vlan_header);
+ if (m->m_len < ip_offset) {
+ if ((m = m_pullup(m, ip_offset)) == NULL)
+ return (NULL);
+ }
+ evh = mtod(m, struct ether_vlan_header *);
+ eth_type = ntohs(evh->evl_proto);
+ }
+
+ switch (eth_type) {
+ case ETHERTYPE_IP:
+ if (m->m_len < ip_offset + sizeof(struct ip)) {
+ m = m_pullup(m, ip_offset + sizeof(struct ip));
+ if (m == NULL)
+ return (NULL);
+ }
+
+ ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset);
+ ip_proto = ip->ip_p;
+ csum_start = ip_offset + (ip->ip_hl << 2);
+ gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ break;
+
+ case ETHERTYPE_IPV6:
+ if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) {
+ m = m_pullup(m, ip_offset + sizeof(struct ip6_hdr));
+ if (m == NULL)
+ return (NULL);
+ }
+
+ ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset);
+ /*
+ * XXX Assume no extension headers are present. Presently,
+ * this will always be true in the case of TSO, and FreeBSD
+ * does not perform checksum offloading of IPv6 yet.
+ */
+ ip_proto = ip6->ip6_nxt;
+ csum_start = ip_offset + sizeof(struct ip6_hdr);
+ gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ break;
+
+ default:
+ return (m);
+ }
+
+ if (m->m_pkthdr.csum_flags & VTNET_CSUM_OFFLOAD) {
+ hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ hdr->csum_start = csum_start;
+ hdr->csum_offset = m->m_pkthdr.csum_data;
+
+ sc->vtnet_stats.tx_csum_offloaded++;
+ }
+
+ if (m->m_pkthdr.csum_flags & CSUM_TSO) {
+ if (ip_proto != IPPROTO_TCP)
+ return (m);
+
+ if (m->m_len < csum_start + sizeof(struct tcphdr)) {
+ m = m_pullup(m, csum_start + sizeof(struct tcphdr));
+ if (m == NULL)
+ return (NULL);
+ }
+
+ tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start);
+ hdr->gso_type = gso_type;
+ hdr->hdr_len = csum_start + (tcp->th_off << 2);
+ hdr->gso_size = m->m_pkthdr.tso_segsz;
+
+ if (tcp->th_flags & TH_CWR) {
+ /*
+ * Drop if we did not negotiate VIRTIO_NET_F_HOST_ECN.
+ * ECN support is only configurable globally with the
+ * net.inet.tcp.ecn.enable sysctl knob.
+ */
+ if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) {
+ if_printf(ifp, "TSO with ECN not supported "
+ "by host\n");
+ m_freem(m);
+ return (NULL);
+ }
+
+ hdr->flags |= VIRTIO_NET_HDR_GSO_ECN;
+ }
+
+ sc->vtnet_stats.tx_tso_offloaded++;
+ }
+
+ return (m);
+}
+
+static int
+vtnet_enqueue_txbuf(struct vtnet_softc *sc, struct mbuf **m_head,
+ struct vtnet_tx_header *txhdr)
+{
+ struct sglist sg;
+ struct sglist_seg segs[VTNET_MAX_TX_SEGS];
+ struct virtqueue *vq;
+ struct mbuf *m;
+ int collapsed, error;
+
+ vq = sc->vtnet_tx_vq;
+ m = *m_head;
+ collapsed = 0;
+
+ sglist_init(&sg, VTNET_MAX_TX_SEGS, segs);
+ error = sglist_append(&sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size);
+ KASSERT(error == 0 && sg.sg_nseg == 1,
+ ("cannot add header to sglist"));
+
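+	/*
+	 * Append the mbuf chain to the sglist. If it has too many
+	 * segments for the virtqueue, collapse the chain once and
+	 * retry before giving up.
+	 */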
+again:
+ error = sglist_append_mbuf(&sg, m);
+ if (error) {
+ if (collapsed)
+ goto fail;
+
+ m = m_collapse(m, M_DONTWAIT, VTNET_MAX_TX_SEGS - 1);
+ if (m == NULL)
+ goto fail;
+
+ *m_head = m;
+ collapsed = 1;
+ goto again;
+ }
+
+ txhdr->vth_mbuf = m;
+
+ return (virtqueue_enqueue(vq, txhdr, &sg, sg.sg_nseg, 0));
+
+fail:
+ m_freem(*m_head);
+ *m_head = NULL;
+
+ return (ENOBUFS);
+}
+
+static int
+vtnet_encap(struct vtnet_softc *sc, struct mbuf **m_head)
+{
+ struct vtnet_tx_header *txhdr;
+ struct virtio_net_hdr *hdr;
+ struct mbuf *m;
+ int error;
+
+ txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO);
+ if (txhdr == NULL)
+ return (ENOMEM);
+
+ /*
+ * Always use the non-mergeable header to simplify things. When
+ * the mergeable feature is negotiated, the num_buffers field
+ * must be set to zero. We use vtnet_hdr_size later to enqueue
+ * the correct header size to the host.
+ */
+ hdr = &txhdr->vth_uhdr.hdr;
+ m = *m_head;
+
+ error = ENOBUFS;
+
+ if (m->m_flags & M_VLANTAG) {
+ m = ether_vlanencap(m, m->m_pkthdr.ether_vtag);
+ if ((*m_head = m) == NULL)
+ goto fail;
+ m->m_flags &= ~M_VLANTAG;
+ }
+
+ if (m->m_pkthdr.csum_flags != 0) {
+ m = vtnet_tx_offload(sc, m, hdr);
+ if ((*m_head = m) == NULL)
+ goto fail;
+ }
+
+ error = vtnet_enqueue_txbuf(sc, m_head, txhdr);
+fail:
+ if (error)
+ uma_zfree(vtnet_tx_header_zone, txhdr);
+
+ return (error);
+}
+
+static void
+vtnet_start(struct ifnet *ifp)
+{
+ struct vtnet_softc *sc;
+
+ sc = ifp->if_softc;
+
+ VTNET_LOCK(sc);
+ vtnet_start_locked(ifp);
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_start_locked(struct ifnet *ifp)
+{
+ struct vtnet_softc *sc;
+ struct virtqueue *vq;
+ struct mbuf *m0;
+ int enq;
+
+ sc = ifp->if_softc;
+ vq = sc->vtnet_tx_vq;
+ enq = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) !=
+ IFF_DRV_RUNNING || ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0))
+ return;
+
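+	/*
+	 * When Tx interrupt moderation is enabled, opportunistically
+	 * reclaim completed transmits once the virtqueue is at least
+	 * half full.
+	 */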
+#ifdef VTNET_TX_INTR_MODERATION
+ if (virtqueue_nused(vq) >= sc->vtnet_tx_size / 2)
+ vtnet_txeof(sc);
+#endif
+
+ while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) {
+ if (virtqueue_full(vq)) {
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ break;
+ }
+
+ IFQ_DRV_DEQUEUE(&ifp->if_snd, m0);
+ if (m0 == NULL)
+ break;
+
+ if (vtnet_encap(sc, &m0) != 0) {
+ if (m0 == NULL)
+ break;
+ IFQ_DRV_PREPEND(&ifp->if_snd, m0);
+ ifp->if_drv_flags |= IFF_DRV_OACTIVE;
+ break;
+ }
+
+ enq++;
+ ETHER_BPF_MTAP(ifp, m0);
+ }
+
+ if (enq > 0) {
+ virtqueue_notify(vq);
+ sc->vtnet_watchdog_timer = VTNET_WATCHDOG_TIMEOUT;
+ }
+}
+
+static void
+vtnet_tick(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ VTNET_LOCK_ASSERT(sc);
+#ifdef VTNET_DEBUG
+ virtqueue_dump(sc->vtnet_rx_vq);
+ virtqueue_dump(sc->vtnet_tx_vq);
+#endif
+
+ vtnet_watchdog(sc);
+ callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
+}
+
+static void
+vtnet_tx_intr_task(void *arg, int pending)
+{
+ struct vtnet_softc *sc;
+ struct ifnet *ifp;
+
+ sc = arg;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK(sc);
+
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ VTNET_UNLOCK(sc);
+ return;
+ }
+#endif
+
+ if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) {
+ vtnet_enable_tx_intr(sc);
+ VTNET_UNLOCK(sc);
+ return;
+ }
+
+ vtnet_txeof(sc);
+
+ if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
+ vtnet_start_locked(ifp);
+
+ if (vtnet_enable_tx_intr(sc) != 0) {
+ vtnet_disable_tx_intr(sc);
+ sc->vtnet_stats.tx_task_rescheduled++;
+ VTNET_UNLOCK(sc);
+ taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_tx_intr_task);
+ return;
+ }
+
+ VTNET_UNLOCK(sc);
+}
+
+static int
+vtnet_tx_vq_intr(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ vtnet_disable_tx_intr(sc);
+ taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_tx_intr_task);
+
+ return (1);
+}
+
+static void
+vtnet_stop(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ sc->vtnet_watchdog_timer = 0;
+ callout_stop(&sc->vtnet_tick_ch);
+ ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE);
+
+ vtnet_disable_rx_intr(sc);
+ vtnet_disable_tx_intr(sc);
+
+ /*
+ * Stop the host VirtIO adapter. Note this will reset the host
+ * adapter's state back to the pre-initialized state, so in
+ * order to make the device usable again, we must drive it
+ * through virtio_reinit() and virtio_reinit_complete().
+ */
+ virtio_stop(dev);
+
+ sc->vtnet_flags &= ~VTNET_FLAG_LINK;
+
+ vtnet_free_rx_mbufs(sc);
+ vtnet_free_tx_mbufs(sc);
+}
+
+static int
+vtnet_reinit(struct vtnet_softc *sc)
+{
+ struct ifnet *ifp;
+ uint64_t features;
+
+ ifp = sc->vtnet_ifp;
+ features = sc->vtnet_features;
+
+ /*
+ * Re-negotiate with the host, removing any disabled receive
+ * features. Transmit features are disabled only on our side
+ * via if_capenable and if_hwassist.
+ */
+
+ if (ifp->if_capabilities & IFCAP_RXCSUM) {
+ if ((ifp->if_capenable & IFCAP_RXCSUM) == 0)
+ features &= ~VIRTIO_NET_F_GUEST_CSUM;
+ }
+
+ if (ifp->if_capabilities & IFCAP_LRO) {
+ if ((ifp->if_capenable & IFCAP_LRO) == 0)
+ features &= ~VTNET_LRO_FEATURES;
+ }
+
+ if (ifp->if_capabilities & IFCAP_VLAN_HWFILTER) {
+ if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0)
+ features &= ~VIRTIO_NET_F_CTRL_VLAN;
+ }
+
+ return (virtio_reinit(sc->vtnet_dev, features));
+}
+
+static void
+vtnet_init_locked(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+ int error;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+
+ if (ifp->if_drv_flags & IFF_DRV_RUNNING)
+ return;
+
+ /* Stop host's adapter, cancel any pending I/O. */
+ vtnet_stop(sc);
+
+ /* Reinitialize the host device. */
+ error = vtnet_reinit(sc);
+ if (error) {
+ device_printf(dev,
+ "reinitialization failed, stopping device...\n");
+ vtnet_stop(sc);
+ return;
+ }
+
+ /* Update host with assigned MAC address. */
+ bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN);
+ vtnet_set_hwaddr(sc);
+
+ ifp->if_hwassist = 0;
+ if (ifp->if_capenable & IFCAP_TXCSUM)
+ ifp->if_hwassist |= VTNET_CSUM_OFFLOAD;
+ if (ifp->if_capenable & IFCAP_TSO4)
+ ifp->if_hwassist |= CSUM_TSO;
+
+ error = vtnet_init_rx_vq(sc);
+ if (error) {
+ device_printf(dev,
+ "cannot allocate mbufs for Rx virtqueue\n");
+ vtnet_stop(sc);
+ return;
+ }
+
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) {
+ if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) {
+ /* Restore promiscuous and all-multicast modes. */
+ vtnet_rx_filter(sc);
+
+ /* Restore filtered MAC addresses. */
+ vtnet_rx_filter_mac(sc);
+ }
+
+ /* Restore VLAN filters. */
+ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER)
+ vtnet_rx_filter_vlan(sc);
+ }
+
+#ifdef DEVICE_POLLING
+ if (ifp->if_capenable & IFCAP_POLLING) {
+ vtnet_disable_rx_intr(sc);
+ vtnet_disable_tx_intr(sc);
+ } else
+#endif
+ {
+ vtnet_enable_rx_intr(sc);
+ vtnet_enable_tx_intr(sc);
+ }
+
+ ifp->if_drv_flags |= IFF_DRV_RUNNING;
+ ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
+
+ virtio_reinit_complete(dev);
+
+ vtnet_update_link_status(sc);
+ callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc);
+}
+
+static void
+vtnet_init(void *xsc)
+{
+ struct vtnet_softc *sc;
+
+ sc = xsc;
+
+ VTNET_LOCK(sc);
+ vtnet_init_locked(sc);
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie,
+ struct sglist *sg, int readable, int writable)
+{
+ struct virtqueue *vq;
+ void *c;
+
+ vq = sc->vtnet_ctrl_vq;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ,
+ ("no control virtqueue"));
+ KASSERT(virtqueue_empty(vq),
+ ("control command already enqueued"));
+
+ if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0)
+ return;
+
+ virtqueue_notify(vq);
+
+ /*
+ * Poll until the command is complete. Previously, we would
+ * sleep until the control virtqueue interrupt handler woke
+ * us up, but dropping the VTNET_MTX leads to serialization
+ * difficulties.
+ *
+ * Furthermore, it appears QEMU/KVM only allocates three MSIX
+ * vectors. Two of those vectors are needed for the Rx and Tx
+ * virtqueues. We do not support sharing both a Vq and config
+ * changed notification on the same MSIX vector.
+ */
+ c = virtqueue_poll(vq, NULL);
+ KASSERT(c == cookie, ("unexpected control command response"));
+}
+
+static void
+vtnet_rx_filter(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct ifnet *ifp;
+
+ dev = sc->vtnet_dev;
+ ifp = sc->vtnet_ifp;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX,
+ ("CTRL_RX feature not negotiated"));
+
+ if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0)
+ device_printf(dev, "cannot %s promiscuous mode\n",
+ ifp->if_flags & IFF_PROMISC ? "enable" : "disable");
+
+ if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0)
+ device_printf(dev, "cannot %s all-multicast mode\n",
+ ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable");
+}
+
+static int
+vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on)
+{
+ struct virtio_net_ctrl_hdr hdr;
+ struct sglist_seg segs[3];
+ struct sglist sg;
+ uint8_t onoff, ack;
+ int error;
+
+ if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0)
+ return (ENOTSUP);
+
+ error = 0;
+
+ hdr.class = VIRTIO_NET_CTRL_RX;
+ hdr.cmd = cmd;
+ onoff = !!on;
+ ack = VIRTIO_NET_ERR;
+
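+	/*
+	 * The header and on/off flag are readable by the host; the ack
+	 * byte is writable so the host can return its status.
+	 */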
+ sglist_init(&sg, 3, segs);
+ error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
+ error |= sglist_append(&sg, &onoff, sizeof(uint8_t));
+ error |= sglist_append(&sg, &ack, sizeof(uint8_t));
+ KASSERT(error == 0 && sg.sg_nseg == 3,
+ ("error adding Rx filter message to sglist"));
+
+ vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
+
+ return (ack == VIRTIO_NET_OK ? 0 : EIO);
+}
+
+static int
+vtnet_set_promisc(struct vtnet_softc *sc, int on)
+{
+
+ return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on));
+}
+
+static int
+vtnet_set_allmulti(struct vtnet_softc *sc, int on)
+{
+
+ return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on));
+}
+
+static void
+vtnet_rx_filter_mac(struct vtnet_softc *sc)
+{
+ struct virtio_net_ctrl_hdr hdr;
+ struct vtnet_mac_filter *filter;
+ struct sglist_seg segs[4];
+ struct sglist sg;
+ struct ifnet *ifp;
+ struct ifaddr *ifa;
+ struct ifmultiaddr *ifma;
+ int ucnt, mcnt, promisc, allmulti, error;
+ uint8_t ack;
+
+ ifp = sc->vtnet_ifp;
+ ucnt = 0;
+ mcnt = 0;
+ promisc = 0;
+ allmulti = 0;
+ error = 0;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX,
+ ("CTRL_RX feature not negotiated"));
+
+ /*
+ * Allocate the MAC filtering table. Note we could do this
+ * at attach time, but it is probably not worth keeping it
+ * around for an infrequent occurrence.
+ */
+ filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF,
+ M_NOWAIT | M_ZERO);
+ if (filter == NULL) {
+ device_printf(sc->vtnet_dev,
+ "cannot allocate MAC address filtering table\n");
+ return;
+ }
+
+ /* Unicast MAC addresses: */
+ if_addr_rlock(ifp);
+ TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
+ if (ifa->ifa_addr->sa_family != AF_LINK)
+ continue;
+ else if (ucnt == VTNET_MAX_MAC_ENTRIES)
+ break;
+
+ bcopy(LLADDR((struct sockaddr_dl *)ifa->ifa_addr),
+ &filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN);
+ ucnt++;
+ }
+ if_addr_runlock(ifp);
+
+ if (ucnt >= VTNET_MAX_MAC_ENTRIES) {
+ promisc = 1;
+ filter->vmf_unicast.nentries = 0;
+
+ if_printf(ifp, "more than %d MAC addresses assigned, "
+ "falling back to promiscuous mode\n",
+ VTNET_MAX_MAC_ENTRIES);
+ } else
+ filter->vmf_unicast.nentries = ucnt;
+
+ /* Multicast MAC addresses: */
+ if_maddr_rlock(ifp);
+ TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
+ if (ifma->ifma_addr->sa_family != AF_LINK)
+ continue;
+ else if (mcnt == VTNET_MAX_MAC_ENTRIES)
+ break;
+
+ bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr),
+ &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN);
+ mcnt++;
+ }
+ if_maddr_runlock(ifp);
+
+ if (mcnt >= VTNET_MAX_MAC_ENTRIES) {
+ allmulti = 1;
+ filter->vmf_multicast.nentries = 0;
+
+ if_printf(ifp, "more than %d multicast MAC addresses "
+ "assigned, falling back to all-multicast mode\n",
+ VTNET_MAX_MAC_ENTRIES);
+ } else
+ filter->vmf_multicast.nentries = mcnt;
+
+ if (promisc && allmulti)
+ goto out;
+
+ hdr.class = VIRTIO_NET_CTRL_MAC;
+ hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET;
+ ack = VIRTIO_NET_ERR;
+
+ sglist_init(&sg, 4, segs);
+ error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
+ error |= sglist_append(&sg, &filter->vmf_unicast,
+ sizeof(struct vtnet_mac_table));
+ error |= sglist_append(&sg, &filter->vmf_multicast,
+ sizeof(struct vtnet_mac_table));
+ error |= sglist_append(&sg, &ack, sizeof(uint8_t));
+ KASSERT(error == 0 && sg.sg_nseg == 4,
+ ("error adding MAC filtering message to sglist"));
+
+ vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
+
+ if (ack != VIRTIO_NET_OK)
+ if_printf(ifp, "error setting host MAC filter table\n");
+
+out:
+ free(filter, M_DEVBUF);
+
+ if (promisc)
+ if (vtnet_set_promisc(sc, 1) != 0)
+ if_printf(ifp, "cannot enable promiscuous mode\n");
+ if (allmulti)
+ if (vtnet_set_allmulti(sc, 1) != 0)
+ if_printf(ifp, "cannot enable all-multicast mode\n");
+}
+
+static int
+vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
+{
+ struct virtio_net_ctrl_hdr hdr;
+ struct sglist_seg segs[3];
+ struct sglist sg;
+ uint8_t ack;
+ int error;
+
+ hdr.class = VIRTIO_NET_CTRL_VLAN;
+ hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL;
+ ack = VIRTIO_NET_ERR;
+ error = 0;
+
+ sglist_init(&sg, 3, segs);
+ error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr));
+ error |= sglist_append(&sg, &tag, sizeof(uint16_t));
+ error |= sglist_append(&sg, &ack, sizeof(uint8_t));
+ KASSERT(error == 0 && sg.sg_nseg == 3,
+ ("error adding VLAN control message to sglist"));
+
+ vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1);
+
+ return (ack == VIRTIO_NET_OK ? 0 : EIO);
+}
+
+static void
+vtnet_rx_filter_vlan(struct vtnet_softc *sc)
+{
+ device_t dev;
+ uint32_t w, mask;
+ uint16_t tag;
+ int i, nvlans, error;
+
+ VTNET_LOCK_ASSERT(sc);
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER,
+ ("VLAN_FILTER feature not negotiated"));
+
+ dev = sc->vtnet_dev;
+ nvlans = sc->vtnet_nvlans;
+ error = 0;
+
+ /* Enable filtering for each configured VLAN. */
+ for (i = 0; i < VTNET_VLAN_SHADOW_SIZE && nvlans > 0; i++) {
+ w = sc->vtnet_vlan_shadow[i];
+ for (mask = 1, tag = i * 32; w != 0; mask <<= 1, tag++) {
+ if ((w & mask) != 0) {
+ w &= ~mask;
+ nvlans--;
+ if (vtnet_exec_vlan_filter(sc, 1, tag) != 0)
+ error++;
+ }
+ }
+ }
+
+ KASSERT(nvlans == 0, ("VLAN count incorrect"));
+ if (error)
+ device_printf(dev, "cannot restore VLAN filter table\n");
+}
+
+static void
+vtnet_set_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag)
+{
+ struct ifnet *ifp;
+ int idx, bit;
+
+ KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER,
+ ("VLAN_FILTER feature not negotiated"));
+
+ if ((tag == 0) || (tag > 4095))
+ return;
+
+ ifp = sc->vtnet_ifp;
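+
+	/* The shadow table is a bitmap: 32 VLAN IDs per 32-bit word. */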
+ idx = (tag >> 5) & 0x7F;
+ bit = tag & 0x1F;
+
+ VTNET_LOCK(sc);
+
+ /* Update shadow VLAN table. */
+ if (add) {
+ sc->vtnet_nvlans++;
+ sc->vtnet_vlan_shadow[idx] |= (1 << bit);
+ } else {
+ sc->vtnet_nvlans--;
+ sc->vtnet_vlan_shadow[idx] &= ~(1 << bit);
+ }
+
+ if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) {
+ if (vtnet_exec_vlan_filter(sc, add, tag) != 0) {
+ device_printf(sc->vtnet_dev,
+ "cannot %s VLAN %d %s the host filter table\n",
+ add ? "add" : "remove", tag,
+ add ? "to" : "from");
+ }
+ }
+
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag)
+{
+
+ if (ifp->if_softc != arg)
+ return;
+
+ vtnet_set_vlan_filter(arg, 1, tag);
+}
+
+static void
+vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag)
+{
+
+ if (ifp->if_softc != arg)
+ return;
+
+ vtnet_set_vlan_filter(arg, 0, tag);
+}
+
+static int
+vtnet_ifmedia_upd(struct ifnet *ifp)
+{
+ struct vtnet_softc *sc;
+ struct ifmedia *ifm;
+
+ sc = ifp->if_softc;
+ ifm = &sc->vtnet_media;
+
+ if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
+ return (EINVAL);
+
+ return (0);
+}
+
+static void
+vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr)
+{
+ struct vtnet_softc *sc;
+
+ sc = ifp->if_softc;
+
+ ifmr->ifm_status = IFM_AVALID;
+ ifmr->ifm_active = IFM_ETHER;
+
+ VTNET_LOCK(sc);
+ if (vtnet_is_link_up(sc) != 0) {
+ ifmr->ifm_status |= IFM_ACTIVE;
+ ifmr->ifm_active |= VTNET_MEDIATYPE;
+ } else
+ ifmr->ifm_active |= IFM_NONE;
+ VTNET_UNLOCK(sc);
+}
+
+static void
+vtnet_add_statistics(struct vtnet_softc *sc)
+{
+ device_t dev;
+ struct vtnet_statistics *stats;
+ struct sysctl_ctx_list *ctx;
+ struct sysctl_oid *tree;
+ struct sysctl_oid_list *child;
+
+ dev = sc->vtnet_dev;
+ stats = &sc->vtnet_stats;
+ ctx = device_get_sysctl_ctx(dev);
+ tree = device_get_sysctl_tree(dev);
+ child = SYSCTL_CHILDREN(tree);
+
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_alloc_failed",
+ CTLFLAG_RD, &stats->mbuf_alloc_failed,
+ "Mbuf cluster allocation failures");
+
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_frame_too_large",
+ CTLFLAG_RD, &stats->rx_frame_too_large,
+ "Received frame larger than the mbuf chain");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_enq_replacement_failed",
+ CTLFLAG_RD, &stats->rx_enq_replacement_failed,
+ "Enqueuing the replacement receive mbuf failed");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_mergeable_failed",
+ CTLFLAG_RD, &stats->rx_mergeable_failed,
+ "Mergeable buffers receive failures");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ethtype",
+ CTLFLAG_RD, &stats->rx_csum_bad_ethtype,
+ "Received checksum offloaded buffer with unsupported "
+ "Ethernet type");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_start",
+ CTLFLAG_RD, &stats->rx_csum_bad_start,
+ "Received checksum offloaded buffer with incorrect start offset");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ipproto",
+ CTLFLAG_RD, &stats->rx_csum_bad_ipproto,
+ "Received checksum offloaded buffer with incorrect IP protocol");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_offset",
+ CTLFLAG_RD, &stats->rx_csum_bad_offset,
+ "Received checksum offloaded buffer with incorrect offset");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_failed",
+ CTLFLAG_RD, &stats->rx_csum_failed,
+ "Received buffer checksum offload failed");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_offloaded",
+ CTLFLAG_RD, &stats->rx_csum_offloaded,
+ "Received buffer checksum offload succeeded");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_task_rescheduled",
+ CTLFLAG_RD, &stats->rx_task_rescheduled,
+ "Times the receive interrupt task rescheduled itself");
+
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_offloaded",
+ CTLFLAG_RD, &stats->tx_csum_offloaded,
+ "Offloaded checksum of transmitted buffer");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_offloaded",
+ CTLFLAG_RD, &stats->tx_tso_offloaded,
+ "Segmentation offload of transmitted buffer");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_bad_ethtype",
+ CTLFLAG_RD, &stats->tx_csum_bad_ethtype,
+ "Aborted transmit of checksum offloaded buffer with unknown "
+ "Ethernet type");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_bad_ethtype",
+ CTLFLAG_RD, &stats->tx_tso_bad_ethtype,
+ "Aborted transmit of TSO buffer with unknown Ethernet type");
+ SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_task_rescheduled",
+ CTLFLAG_RD, &stats->tx_task_rescheduled,
+ "Times the transmit interrupt task rescheduled itself");
+}
+
+static int
+vtnet_enable_rx_intr(struct vtnet_softc *sc)
+{
+
+ return (virtqueue_enable_intr(sc->vtnet_rx_vq));
+}
+
+static void
+vtnet_disable_rx_intr(struct vtnet_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtnet_rx_vq);
+}
+
+static int
+vtnet_enable_tx_intr(struct vtnet_softc *sc)
+{
+
+#ifdef VTNET_TX_INTR_MODERATION
+ return (0);
+#else
+ return (virtqueue_enable_intr(sc->vtnet_tx_vq));
+#endif
+}
+
+static void
+vtnet_disable_tx_intr(struct vtnet_softc *sc)
+{
+
+ virtqueue_disable_intr(sc->vtnet_tx_vq);
+}
diff --git a/sys/dev/virtio/network/if_vtnetvar.h b/sys/dev/virtio/network/if_vtnetvar.h
new file mode 100644
index 0000000..613b2b0
--- /dev/null
+++ b/sys/dev/virtio/network/if_vtnetvar.h
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _IF_VTNETVAR_H
+#define _IF_VTNETVAR_H
+
+struct vtnet_statistics {
+ unsigned long mbuf_alloc_failed;
+
+ unsigned long rx_frame_too_large;
+ unsigned long rx_enq_replacement_failed;
+ unsigned long rx_mergeable_failed;
+ unsigned long rx_csum_bad_ethtype;
+ unsigned long rx_csum_bad_start;
+ unsigned long rx_csum_bad_ipproto;
+ unsigned long rx_csum_bad_offset;
+ unsigned long rx_csum_failed;
+ unsigned long rx_csum_offloaded;
+ unsigned long rx_task_rescheduled;
+
+ unsigned long tx_csum_offloaded;
+ unsigned long tx_tso_offloaded;
+ unsigned long tx_csum_bad_ethtype;
+ unsigned long tx_tso_bad_ethtype;
+ unsigned long tx_task_rescheduled;
+};
+
+struct vtnet_softc {
+ device_t vtnet_dev;
+ struct ifnet *vtnet_ifp;
+ struct mtx vtnet_mtx;
+
+ uint32_t vtnet_flags;
+#define VTNET_FLAG_LINK 0x0001
+#define VTNET_FLAG_SUSPENDED 0x0002
+#define VTNET_FLAG_CTRL_VQ 0x0004
+#define VTNET_FLAG_CTRL_RX 0x0008
+#define VTNET_FLAG_VLAN_FILTER 0x0010
+#define VTNET_FLAG_TSO_ECN 0x0020
+#define VTNET_FLAG_MRG_RXBUFS 0x0040
+#define VTNET_FLAG_LRO_NOMRG 0x0080
+
+ struct virtqueue *vtnet_rx_vq;
+ struct virtqueue *vtnet_tx_vq;
+ struct virtqueue *vtnet_ctrl_vq;
+
+ int vtnet_hdr_size;
+ int vtnet_tx_size;
+ int vtnet_rx_size;
+ int vtnet_rx_process_limit;
+ int vtnet_rx_mbuf_size;
+ int vtnet_rx_mbuf_count;
+ int vtnet_if_flags;
+ int vtnet_watchdog_timer;
+ uint64_t vtnet_features;
+
+ struct taskqueue *vtnet_tq;
+ struct task vtnet_rx_intr_task;
+ struct task vtnet_tx_intr_task;
+ struct task vtnet_cfgchg_task;
+
+ struct vtnet_statistics vtnet_stats;
+
+ struct callout vtnet_tick_ch;
+
+ eventhandler_tag vtnet_vlan_attach;
+ eventhandler_tag vtnet_vlan_detach;
+
+ struct ifmedia vtnet_media;
+ /*
+ * Fake media type; the host does not provide us with
+ * any real media information.
+ */
+#define VTNET_MEDIATYPE (IFM_ETHER | IFM_1000_T | IFM_FDX)
+ char vtnet_hwaddr[ETHER_ADDR_LEN];
+
+ /*
+ * During reset, the host's VLAN filtering table is lost. The
+ * array below is used to restore all the VLANs configured on
+ * this interface after a reset.
+ */
+#define VTNET_VLAN_SHADOW_SIZE (4096 / 32)
+ int vtnet_nvlans;
+ uint32_t vtnet_vlan_shadow[VTNET_VLAN_SHADOW_SIZE];
+
+ char vtnet_mtx_name[16];
+};
+
+/*
+ * When mergeable buffers are not negotiated, the vtnet_rx_header structure
+ * below is placed at the beginning of the mbuf data. Use 4 bytes of pad
+ * both to keep the VirtIO header and the data non-contiguous and to keep
+ * the frame's payload 4 byte aligned.
+ *
+ * When mergeable buffers are negotiated, the host puts the VirtIO header in
+ * the beginning of the first mbuf's data.
+ */
+#define VTNET_RX_HEADER_PAD 4
+struct vtnet_rx_header {
+ struct virtio_net_hdr vrh_hdr;
+ char vrh_pad[VTNET_RX_HEADER_PAD];
+} __packed;
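+
+/*
+ * Illustration (not part of the driver): sizeof(struct virtio_net_hdr) is
+ * 10 bytes, so with the 4 pad bytes the Ethernet header can begin 14 bytes
+ * into the mbuf data, and the IP header that follows it lands on a 4 byte
+ * boundary (10 + 4 + ETHER_HDR_LEN = 28), assuming the frame immediately
+ * follows the pad.
+ */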
+
+/*
+ * For each outgoing frame, the vtnet_tx_header below is allocated from
+ * the vtnet_tx_header_zone.
+ */
+struct vtnet_tx_header {
+ union {
+ struct virtio_net_hdr hdr;
+ struct virtio_net_hdr_mrg_rxbuf mhdr;
+ } vth_uhdr;
+
+ struct mbuf *vth_mbuf;
+};
+
+/*
+ * The VirtIO specification does not place a limit on the number of MAC
+ * addresses the guest driver may request to be filtered. In practice,
+ * the host is constrained by available resources. To simplify this driver,
+ * impose a reasonably high limit on the MAC addresses we will filter
+ * before falling back to promiscuous or all-multicast modes.
+ */
+#define VTNET_MAX_MAC_ENTRIES 128
+
+struct vtnet_mac_table {
+ uint32_t nentries;
+ uint8_t macs[VTNET_MAX_MAC_ENTRIES][ETHER_ADDR_LEN];
+} __packed;
+
+struct vtnet_mac_filter {
+ struct vtnet_mac_table vmf_unicast;
+ uint32_t vmf_pad; /* Make tables non-contiguous. */
+ struct vtnet_mac_table vmf_multicast;
+};
+
+/*
+ * The MAC filter table is malloc(9)'d when needed. Ensure it will
+ * always fit in one segment.
+ */
+CTASSERT(sizeof(struct vtnet_mac_filter) <= PAGE_SIZE);
+
+#define VTNET_WATCHDOG_TIMEOUT 5
+#define VTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP | CSUM_SCTP)
+
+/* Features desired/implemented by this driver. */
+#define VTNET_FEATURES \
+ (VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_STATUS | \
+ VIRTIO_NET_F_CTRL_VQ | \
+ VIRTIO_NET_F_CTRL_RX | \
+ VIRTIO_NET_F_CTRL_VLAN | \
+ VIRTIO_NET_F_CSUM | \
+ VIRTIO_NET_F_HOST_TSO4 | \
+ VIRTIO_NET_F_HOST_TSO6 | \
+ VIRTIO_NET_F_HOST_ECN | \
+ VIRTIO_NET_F_GUEST_CSUM | \
+ VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | \
+ VIRTIO_NET_F_GUEST_ECN | \
+ VIRTIO_NET_F_MRG_RXBUF | \
+ VIRTIO_RING_F_INDIRECT_DESC)
+
+/*
+ * The VIRTIO_NET_F_GUEST_TSO[46] features permit the host to send us
+ * frames larger than 1514 bytes. We do not yet support software LRO
+ * via tcp_lro_rx().
+ */
+#define VTNET_LRO_FEATURES (VIRTIO_NET_F_GUEST_TSO4 | \
+ VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN)
+
+#define VTNET_MAX_MTU 65536
+#define VTNET_MAX_RX_SIZE 65550
+
+/*
+ * Used to preallocate the virtqueue indirect descriptors. The first
+ * segment is reserved for the header.
+ */
+#define VTNET_MIN_RX_SEGS 2
+#define VTNET_MAX_RX_SEGS 34
+#define VTNET_MAX_TX_SEGS 34
+
+/*
+ * Assert we can receive and transmit the maximum with regular
+ * size clusters.
+ */
+CTASSERT(((VTNET_MAX_RX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_RX_SIZE);
+CTASSERT(((VTNET_MAX_TX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_MTU);
+
+/*
+ * Determine how many mbufs are in each receive buffer. For LRO without
+ * mergeable descriptors, we must allocate an mbuf chain large enough to
+ * hold both the vtnet_rx_header and the maximum receivable data.
+ */
+#define VTNET_NEEDED_RX_MBUFS(_sc) \
+ ((_sc)->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0 ? 1 : \
+ howmany(sizeof(struct vtnet_rx_header) + VTNET_MAX_RX_SIZE, \
+ (_sc)->vtnet_rx_mbuf_size)
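+
+/*
+ * Worked example (illustration only): with VTNET_FLAG_LRO_NOMRG set and
+ * vtnet_rx_mbuf_size equal to MCLBYTES (2048), the macro above evaluates
+ * to howmany(14 + 65550, 2048) = howmany(65564, 2048) = 33 clusters per
+ * receive buffer, consistent with the VTNET_MAX_RX_SEGS assertion above.
+ */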
+
+#define VTNET_MTX(_sc) &(_sc)->vtnet_mtx
+#define VTNET_LOCK(_sc) mtx_lock(VTNET_MTX((_sc)))
+#define VTNET_UNLOCK(_sc) mtx_unlock(VTNET_MTX((_sc)))
+#define VTNET_LOCK_DESTROY(_sc) mtx_destroy(VTNET_MTX((_sc)))
+#define VTNET_LOCK_ASSERT(_sc) mtx_assert(VTNET_MTX((_sc)), MA_OWNED)
+#define VTNET_LOCK_ASSERT_NOTOWNED(_sc) \
+ mtx_assert(VTNET_MTX((_sc)), MA_NOTOWNED)
+
+#define VTNET_LOCK_INIT(_sc) do { \
+ snprintf((_sc)->vtnet_mtx_name, sizeof((_sc)->vtnet_mtx_name), \
+ "%s", device_get_nameunit((_sc)->vtnet_dev)); \
+ mtx_init(VTNET_MTX((_sc)), (_sc)->vtnet_mtx_name, \
+ "VTNET Core Lock", MTX_DEF); \
+} while (0)
+
+#endif /* _IF_VTNETVAR_H */
diff --git a/sys/dev/virtio/network/virtio_net.h b/sys/dev/virtio/network/virtio_net.h
new file mode 100644
index 0000000..7361aa1
--- /dev/null
+++ b/sys/dev/virtio/network/virtio_net.h
@@ -0,0 +1,138 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_NET_H
+#define _VIRTIO_NET_H
+
+#include <sys/types.h>
+
+/* The feature bitmap for virtio net */
+#define VIRTIO_NET_F_CSUM 0x00001 /* Host handles pkts w/ partial csum */
+#define VIRTIO_NET_F_GUEST_CSUM 0x00002 /* Guest handles pkts w/ partial csum*/
+#define VIRTIO_NET_F_MAC 0x00020 /* Host has given MAC address. */
+#define VIRTIO_NET_F_GSO 0x00040 /* Host handles pkts w/ any GSO type */
+#define VIRTIO_NET_F_GUEST_TSO4 0x00080 /* Guest can handle TSOv4 in. */
+#define VIRTIO_NET_F_GUEST_TSO6 0x00100 /* Guest can handle TSOv6 in. */
+#define VIRTIO_NET_F_GUEST_ECN 0x00200 /* Guest can handle TSO[6] w/ ECN in.*/
+#define VIRTIO_NET_F_GUEST_UFO 0x00400 /* Guest can handle UFO in. */
+#define VIRTIO_NET_F_HOST_TSO4 0x00800 /* Host can handle TSOv4 in. */
+#define VIRTIO_NET_F_HOST_TSO6 0x01000 /* Host can handle TSOv6 in. */
+#define VIRTIO_NET_F_HOST_ECN 0x02000 /* Host can handle TSO[6] w/ ECN in. */
+#define VIRTIO_NET_F_HOST_UFO 0x04000 /* Host can handle UFO in. */
+#define VIRTIO_NET_F_MRG_RXBUF 0x08000 /* Host can merge receive buffers. */
+#define VIRTIO_NET_F_STATUS 0x10000 /* virtio_net_config.status available*/
+#define VIRTIO_NET_F_CTRL_VQ 0x20000 /* Control channel available */
+#define VIRTIO_NET_F_CTRL_RX 0x40000 /* Control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN 0x80000 /* Control channel VLAN filtering */
+#define VIRTIO_NET_F_CTRL_RX_EXTRA 0x100000 /* Extra RX mode control support */
+
+#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */
+
+struct virtio_net_config {
+ /* The config defining mac address (if VIRTIO_NET_F_MAC) */
+ uint8_t mac[ETHER_ADDR_LEN];
+ /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */
+ uint16_t status;
+} __packed;
+
+/*
+ * This is the first element of the scatter-gather list. If you don't
+ * specify GSO or CSUM features, you can simply ignore the header.
+ */
+struct virtio_net_hdr {
+#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start,csum_offset*/
+ uint8_t flags;
+#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */
+#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */
+#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */
+#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */
+#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */
+ uint8_t gso_type;
+ uint16_t hdr_len; /* Ethernet + IP + tcp/udp hdrs */
+ uint16_t gso_size; /* Bytes to append to hdr_len per frame */
+ uint16_t csum_start; /* Position to start checksumming from */
+ uint16_t csum_offset; /* Offset after that to place checksum */
+};
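+
+/*
+ * Example (sketch only, not part of this header): filling the header for a
+ * frame whose TCP checksum is to be computed by the host. The offsets shown
+ * assume an untagged IPv4 frame without IP options.
+ */
+#if 0
+	struct virtio_net_hdr hdr;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+	hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
+	hdr.csum_start = ETHER_HDR_LEN + sizeof(struct ip);	/* 14 + 20 */
+	hdr.csum_offset = offsetof(struct tcphdr, th_sum);	/* 16 */
+#endif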
+
+/*
+ * This is the version of the header to use when the MRG_RXBUF
+ * feature has been negotiated.
+ */
+struct virtio_net_hdr_mrg_rxbuf {
+ struct virtio_net_hdr hdr;
+ uint16_t num_buffers; /* Number of merged rx buffers */
+};
+
+/*
+ * Control virtqueue data structures
+ *
+ * The control virtqueue expects a header in the first sg entry
+ * and an ack/status response in the last entry. Data for the
+ * command goes in between.
+ */
+struct virtio_net_ctrl_hdr {
+ uint8_t class;
+ uint8_t cmd;
+} __packed;
+
+typedef uint8_t virtio_net_ctrl_ack;
+
+#define VIRTIO_NET_OK 0
+#define VIRTIO_NET_ERR 1
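+
+/*
+ * Example (sketch only): laying out a control command as described above
+ * with sglist(9). `data' and `data_len' stand in for the command specific
+ * payload; error handling is omitted.
+ */
+#if 0
+	struct virtio_net_ctrl_hdr hdr;
+	virtio_net_ctrl_ack ack;
+	struct sglist_seg segs[3];
+	struct sglist sg;
+
+	sglist_init(&sg, 3, segs);
+	sglist_append(&sg, &hdr, sizeof(hdr));	/* read by the host */
+	sglist_append(&sg, data, data_len);	/* command payload */
+	sglist_append(&sg, &ack, sizeof(ack));	/* written by the host */
+#endif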
+
+/*
+ * Control the RX mode, i.e. promiscuous, allmulti, etc.
+ * All commands require an "out" sg entry containing a 1 byte
+ * state value, zero = disable, non-zero = enable. Commands
+ * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature.
+ * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA.
+ */
+#define VIRTIO_NET_CTRL_RX 0
+#define VIRTIO_NET_CTRL_RX_PROMISC 0
+#define VIRTIO_NET_CTRL_RX_ALLMULTI 1
+#define VIRTIO_NET_CTRL_RX_ALLUNI 2
+#define VIRTIO_NET_CTRL_RX_NOMULTI 3
+#define VIRTIO_NET_CTRL_RX_NOUNI 4
+#define VIRTIO_NET_CTRL_RX_NOBCAST 5
+
+/*
+ * Control the MAC filter table.
+ *
+ * The MAC filter table is managed by the hypervisor; the guest should
+ * assume its size is infinite. Filtering should be considered
+ * non-perfect, i.e. depending on hypervisor resources, the guest may
+ * receive packets from sources not specified in the filter list.
+ *
+ * In addition to the class/cmd header, the TABLE_SET command requires
+ * two out scatterlists. Each contains a 4 byte count of entries followed
+ * by a concatenated byte stream of the ETH_ALEN MAC addresses. The
+ * first sg list contains unicast addresses, the second is for multicast.
+ * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature
+ * is available.
+ */
+struct virtio_net_ctrl_mac {
+ uint32_t entries;
+ uint8_t macs[][ETHER_ADDR_LEN];
+} __packed;
+
+#define VIRTIO_NET_CTRL_MAC 1
+#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0
+
+/*
+ * Control VLAN filtering
+ *
+ * The VLAN filter table is controlled via a simple ADD/DEL interface.
+ * VLAN IDs not added may be filtered by the hypervisor. Del is the
+ * opposite of add. Both commands expect an out entry containing a 2
+ * byte VLAN ID. VLAN filtering is available with the
+ * VIRTIO_NET_F_CTRL_VLAN feature bit.
+ */
+#define VIRTIO_NET_CTRL_VLAN 2
+#define VIRTIO_NET_CTRL_VLAN_ADD 0
+#define VIRTIO_NET_CTRL_VLAN_DEL 1
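+
+/*
+ * Example (sketch only): the out entries for adding VLAN 100 to the host
+ * filter are the class/cmd header followed by the 2 byte VLAN ID.
+ */
+#if 0
+	struct virtio_net_ctrl_hdr hdr = {
+		.class = VIRTIO_NET_CTRL_VLAN,
+		.cmd = VIRTIO_NET_CTRL_VLAN_ADD
+	};
+	uint16_t tag = 100;	/* VLAN ID to add */
+#endif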
+
+#endif /* _VIRTIO_NET_H */
diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c
new file mode 100644
index 0000000..dd348a5
--- /dev/null
+++ b/sys/dev/virtio/pci/virtio_pci.c
@@ -0,0 +1,1081 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/* Driver for the VirtIO PCI interface. */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/malloc.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/pci/pcivar.h>
+#include <dev/pci/pcireg.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/pci/virtio_pci.h>
+
+#include "virtio_bus_if.h"
+#include "virtio_if.h"
+
+struct vtpci_softc {
+ device_t vtpci_dev;
+ struct resource *vtpci_res;
+ struct resource *vtpci_msix_res;
+ uint64_t vtpci_features;
+ uint32_t vtpci_flags;
+#define VIRTIO_PCI_FLAG_NO_MSI 0x0001
+#define VIRTIO_PCI_FLAG_MSI 0x0002
+#define VIRTIO_PCI_FLAG_NO_MSIX 0x0010
+#define VIRTIO_PCI_FLAG_MSIX 0x0020
+#define VIRTIO_PCI_FLAG_SHARED_MSIX 0x0040
+
+ device_t vtpci_child_dev;
+ struct virtio_feature_desc *vtpci_child_feat_desc;
+
+ /*
+ * Ideally, each virtqueue that the driver provides a callback for
+ * will receive its own MSIX vector. If there are not sufficient
+ * vectors available, we will then attempt to have all the VQs
+ * share one vector. Note that when using MSIX, the configuration
+ * change notifications must be on their own vector.
+ *
+ * If MSIX is not available, we will attempt to have the whole
+ * device share one MSI vector, and then, finally, one legacy
+ * interrupt.
+ */
+ int vtpci_nvqs;
+ struct vtpci_virtqueue {
+ struct virtqueue *vq;
+
+ /* Index into vtpci_intr_res[] below; -1 if unused. */
+ int ires_idx;
+ } vtpci_vqx[VIRTIO_MAX_VIRTQUEUES];
+
+ /*
+ * When using MSIX interrupts, the first element of vtpci_intr_res[]
+ * is always used for the configuration change notifications. The remaining
+ * element(s) are used for the virtqueues.
+ *
+ * With MSI and legacy interrupts, only the first element of
+ * vtpci_intr_res[] is used.
+ */
+ int vtpci_nintr_res;
+ struct vtpci_intr_resource {
+ struct resource *irq;
+ int rid;
+ void *intrhand;
+ } vtpci_intr_res[1 + VIRTIO_MAX_VIRTQUEUES];
+};
+
+static int vtpci_probe(device_t);
+static int vtpci_attach(device_t);
+static int vtpci_detach(device_t);
+static int vtpci_suspend(device_t);
+static int vtpci_resume(device_t);
+static int vtpci_shutdown(device_t);
+static void vtpci_driver_added(device_t, driver_t *);
+static void vtpci_child_detached(device_t, device_t);
+static int vtpci_read_ivar(device_t, device_t, int, uintptr_t *);
+static int vtpci_write_ivar(device_t, device_t, int, uintptr_t);
+
+static uint64_t vtpci_negotiate_features(device_t, uint64_t);
+static int vtpci_with_feature(device_t, uint64_t);
+static int vtpci_alloc_virtqueues(device_t, int, int,
+ struct vq_alloc_info *);
+static int vtpci_setup_intr(device_t, enum intr_type);
+static void vtpci_stop(device_t);
+static int vtpci_reinit(device_t, uint64_t);
+static void vtpci_reinit_complete(device_t);
+static void vtpci_notify_virtqueue(device_t, uint16_t);
+static uint8_t vtpci_get_status(device_t);
+static void vtpci_set_status(device_t, uint8_t);
+static void vtpci_read_dev_config(device_t, bus_size_t, void *, int);
+static void vtpci_write_dev_config(device_t, bus_size_t, void *, int);
+
+static void vtpci_describe_features(struct vtpci_softc *, const char *,
+ uint64_t);
+static void vtpci_probe_and_attach_child(struct vtpci_softc *);
+
+static int vtpci_alloc_interrupts(struct vtpci_softc *, int, int,
+ struct vq_alloc_info *);
+static int vtpci_alloc_intr_resources(struct vtpci_softc *, int,
+ struct vq_alloc_info *);
+static int vtpci_alloc_msi(struct vtpci_softc *);
+static int vtpci_alloc_msix(struct vtpci_softc *, int);
+static int vtpci_register_msix_vector(struct vtpci_softc *, int, int);
+
+static void vtpci_free_interrupts(struct vtpci_softc *);
+static void vtpci_free_virtqueues(struct vtpci_softc *);
+static void vtpci_release_child_resources(struct vtpci_softc *);
+static void vtpci_reset(struct vtpci_softc *);
+
+static int vtpci_legacy_intr(void *);
+static int vtpci_vq_shared_intr(void *);
+static int vtpci_vq_intr(void *);
+static int vtpci_config_intr(void *);
+
+/*
+ * I/O port read/write wrappers.
+ */
+#define vtpci_read_config_1(sc, o) bus_read_1((sc)->vtpci_res, (o))
+#define vtpci_read_config_2(sc, o) bus_read_2((sc)->vtpci_res, (o))
+#define vtpci_read_config_4(sc, o) bus_read_4((sc)->vtpci_res, (o))
+#define vtpci_write_config_1(sc, o, v) bus_write_1((sc)->vtpci_res, (o), (v))
+#define vtpci_write_config_2(sc, o, v) bus_write_2((sc)->vtpci_res, (o), (v))
+#define vtpci_write_config_4(sc, o, v) bus_write_4((sc)->vtpci_res, (o), (v))
+
+/* Tunables. */
+static int vtpci_disable_msix = 0;
+TUNABLE_INT("hw.virtio.pci.disable_msix", &vtpci_disable_msix);
+
+static device_method_t vtpci_methods[] = {
+ /* Device interface. */
+ DEVMETHOD(device_probe, vtpci_probe),
+ DEVMETHOD(device_attach, vtpci_attach),
+ DEVMETHOD(device_detach, vtpci_detach),
+ DEVMETHOD(device_suspend, vtpci_suspend),
+ DEVMETHOD(device_resume, vtpci_resume),
+ DEVMETHOD(device_shutdown, vtpci_shutdown),
+
+ /* Bus interface. */
+ DEVMETHOD(bus_driver_added, vtpci_driver_added),
+ DEVMETHOD(bus_child_detached, vtpci_child_detached),
+ DEVMETHOD(bus_read_ivar, vtpci_read_ivar),
+ DEVMETHOD(bus_write_ivar, vtpci_write_ivar),
+
+ /* VirtIO bus interface. */
+ DEVMETHOD(virtio_bus_negotiate_features, vtpci_negotiate_features),
+ DEVMETHOD(virtio_bus_with_feature, vtpci_with_feature),
+ DEVMETHOD(virtio_bus_alloc_virtqueues, vtpci_alloc_virtqueues),
+ DEVMETHOD(virtio_bus_setup_intr, vtpci_setup_intr),
+ DEVMETHOD(virtio_bus_stop, vtpci_stop),
+ DEVMETHOD(virtio_bus_reinit, vtpci_reinit),
+ DEVMETHOD(virtio_bus_reinit_complete, vtpci_reinit_complete),
+ DEVMETHOD(virtio_bus_notify_vq, vtpci_notify_virtqueue),
+ DEVMETHOD(virtio_bus_read_device_config, vtpci_read_dev_config),
+ DEVMETHOD(virtio_bus_write_device_config, vtpci_write_dev_config),
+
+ { 0, 0 }
+};
+
+static driver_t vtpci_driver = {
+ "virtio_pci",
+ vtpci_methods,
+ sizeof(struct vtpci_softc)
+};
+
+devclass_t vtpci_devclass;
+
+DRIVER_MODULE(virtio_pci, pci, vtpci_driver, vtpci_devclass, 0, 0);
+MODULE_VERSION(virtio_pci, 1);
+MODULE_DEPEND(virtio_pci, pci, 1, 1, 1);
+MODULE_DEPEND(virtio_pci, virtio, 1, 1, 1);
+
+static int
+vtpci_probe(device_t dev)
+{
+ char desc[36];
+ const char *name;
+
+ if (pci_get_vendor(dev) != VIRTIO_PCI_VENDORID)
+ return (ENXIO);
+
+ if (pci_get_device(dev) < VIRTIO_PCI_DEVICEID_MIN ||
+ pci_get_device(dev) > VIRTIO_PCI_DEVICEID_MAX)
+ return (ENXIO);
+
+ if (pci_get_revid(dev) != VIRTIO_PCI_ABI_VERSION)
+ return (ENXIO);
+
+ name = virtio_device_name(pci_get_subdevice(dev));
+ if (name == NULL)
+ name = "Unknown";
+
+ snprintf(desc, sizeof(desc), "VirtIO PCI %s adapter", name);
+ device_set_desc_copy(dev, desc);
+
+ return (BUS_PROBE_DEFAULT);
+}
+
+static int
+vtpci_attach(device_t dev)
+{
+ struct vtpci_softc *sc;
+ device_t child;
+ int rid;
+
+ sc = device_get_softc(dev);
+ sc->vtpci_dev = dev;
+
+ pci_enable_busmaster(dev);
+
+ rid = PCIR_BAR(0);
+ sc->vtpci_res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
+ RF_ACTIVE);
+ if (sc->vtpci_res == NULL) {
+ device_printf(dev, "cannot map I/O space\n");
+ return (ENXIO);
+ }
+
+ if (pci_find_extcap(dev, PCIY_MSI, NULL) != 0)
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSI;
+
+ if (pci_find_extcap(dev, PCIY_MSIX, NULL) == 0) {
+ rid = PCIR_BAR(1);
+ sc->vtpci_msix_res = bus_alloc_resource_any(dev,
+ SYS_RES_MEMORY, &rid, RF_ACTIVE);
+ }
+
+ if (sc->vtpci_msix_res == NULL)
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSIX;
+
+ vtpci_reset(sc);
+
+ /* Tell the host we've noticed this device. */
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+
+ if ((child = device_add_child(dev, NULL, -1)) == NULL) {
+ device_printf(dev, "cannot create child device\n");
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+ vtpci_detach(dev);
+ return (ENOMEM);
+ }
+
+ sc->vtpci_child_dev = child;
+ vtpci_probe_and_attach_child(sc);
+
+ return (0);
+}
+
+static int
+vtpci_detach(device_t dev)
+{
+ struct vtpci_softc *sc;
+ device_t child;
+ int error;
+
+ sc = device_get_softc(dev);
+
+ if ((child = sc->vtpci_child_dev) != NULL) {
+ error = device_delete_child(dev, child);
+ if (error)
+ return (error);
+ sc->vtpci_child_dev = NULL;
+ }
+
+ vtpci_reset(sc);
+
+ if (sc->vtpci_msix_res != NULL) {
+ bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(1),
+ sc->vtpci_msix_res);
+ sc->vtpci_msix_res = NULL;
+ }
+
+ if (sc->vtpci_res != NULL) {
+ bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(0),
+ sc->vtpci_res);
+ sc->vtpci_res = NULL;
+ }
+
+ return (0);
+}
+
+static int
+vtpci_suspend(device_t dev)
+{
+
+ return (bus_generic_suspend(dev));
+}
+
+static int
+vtpci_resume(device_t dev)
+{
+
+ return (bus_generic_resume(dev));
+}
+
+static int
+vtpci_shutdown(device_t dev)
+{
+
+ (void) bus_generic_shutdown(dev);
+ /* Forcibly stop the host device. */
+ vtpci_stop(dev);
+
+ return (0);
+}
+
+static void
+vtpci_driver_added(device_t dev, driver_t *driver)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ vtpci_probe_and_attach_child(sc);
+}
+
+static void
+vtpci_child_detached(device_t dev, device_t child)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ vtpci_reset(sc);
+ vtpci_release_child_resources(sc);
+}
+
+static int
+vtpci_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtpci_child_dev != child)
+ return (ENOENT);
+
+ switch (index) {
+ case VIRTIO_IVAR_DEVTYPE:
+ *result = pci_get_subdevice(dev);
+ break;
+ default:
+ return (ENOENT);
+ }
+
+ return (0);
+}
+
+static int
+vtpci_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtpci_child_dev != child)
+ return (ENOENT);
+
+ switch (index) {
+ case VIRTIO_IVAR_FEATURE_DESC:
+ sc->vtpci_child_feat_desc = (void *) value;
+ break;
+ default:
+ return (ENOENT);
+ }
+
+ return (0);
+}
+
+static uint64_t
+vtpci_negotiate_features(device_t dev, uint64_t child_features)
+{
+ struct vtpci_softc *sc;
+ uint64_t host_features, features;
+
+ sc = device_get_softc(dev);
+
+ host_features = vtpci_read_config_4(sc, VIRTIO_PCI_HOST_FEATURES);
+ vtpci_describe_features(sc, "host", host_features);
+
+ /*
+ * Limit negotiated features to what the driver, virtqueue, and
+ * host all support.
+ */
+ features = host_features & child_features;
+ features = virtqueue_filter_features(features);
+ sc->vtpci_features = features;
+
+ vtpci_describe_features(sc, "negotiated", features);
+ vtpci_write_config_4(sc, VIRTIO_PCI_GUEST_FEATURES, features);
+
+ return (features);
+}
+
+static int
+vtpci_with_feature(device_t dev, uint64_t feature)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ return ((sc->vtpci_features & feature) != 0);
+}
+
+static int
+vtpci_alloc_virtqueues(device_t dev, int flags, int nvqs,
+ struct vq_alloc_info *vq_info)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ struct vq_alloc_info *info;
+ int queue, error;
+ uint16_t vq_size;
+
+ sc = device_get_softc(dev);
+
+ if (sc->vtpci_nvqs != 0 || nvqs <= 0 ||
+ nvqs > VIRTIO_MAX_VIRTQUEUES)
+ return (EINVAL);
+
+ error = vtpci_alloc_interrupts(sc, flags, nvqs, vq_info);
+ if (error) {
+ device_printf(dev, "cannot allocate interrupts\n");
+ return (error);
+ }
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_CONFIG_VECTOR, 0);
+ if (error)
+ return (error);
+ }
+
+ for (queue = 0; queue < nvqs; queue++) {
+ vqx = &sc->vtpci_vqx[queue];
+ info = &vq_info[queue];
+
+ vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue);
+
+ vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
+ error = virtqueue_alloc(dev, queue, vq_size,
+ VIRTIO_PCI_VRING_ALIGN, 0xFFFFFFFFUL, info, &vqx->vq);
+ if (error)
+ return (error);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx);
+ if (error)
+ return (error);
+ }
+
+ vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
+ virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+
+ *info->vqai_vq = vqx->vq;
+ sc->vtpci_nvqs++;
+ }
+
+ return (0);
+}
+
+static int
+vtpci_setup_intr(device_t dev, enum intr_type type)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_intr_resource *ires;
+ struct vtpci_virtqueue *vqx;
+ int i, flags, error;
+
+ sc = device_get_softc(dev);
+ flags = type | INTR_MPSAFE;
+ ires = &sc->vtpci_intr_res[0];
+
+ if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) == 0) {
+ error = bus_setup_intr(dev, ires->irq, flags,
+ vtpci_legacy_intr, NULL, sc, &ires->intrhand);
+
+ return (error);
+ }
+
+ error = bus_setup_intr(dev, ires->irq, flags, vtpci_config_intr,
+ NULL, sc, &ires->intrhand);
+ if (error)
+ return (error);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX) {
+ ires = &sc->vtpci_intr_res[1];
+ error = bus_setup_intr(dev, ires->irq, flags,
+ vtpci_vq_shared_intr, NULL, sc, &ires->intrhand);
+
+ return (error);
+ }
+
+ /* Setup an interrupt handler for each virtqueue. */
+ for (i = 0; i < sc->vtpci_nvqs; i++) {
+ vqx = &sc->vtpci_vqx[i];
+ if (vqx->ires_idx < 1)
+ continue;
+
+ ires = &sc->vtpci_intr_res[vqx->ires_idx];
+ error = bus_setup_intr(dev, ires->irq, flags,
+ vtpci_vq_intr, NULL, vqx->vq, &ires->intrhand);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+static void
+vtpci_stop(device_t dev)
+{
+
+ vtpci_reset(device_get_softc(dev));
+}
+
+static int
+vtpci_reinit(device_t dev, uint64_t features)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ struct virtqueue *vq;
+ int queue, error;
+ uint16_t vq_size;
+
+ sc = device_get_softc(dev);
+
+ /*
+ * Redrive the device initialization. This is a bit of an abuse
+ * of the specification, but both VirtualBox and QEMU/KVM seem
+ * to play nice. We expect the host device not to deviate from what
+ * was originally negotiated beyond what the guest driver itself
+ * changed (the MSIX state should not change, the number of virtqueues
+ * and their sizes remain the same, etc).
+ */
+
+ if (vtpci_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET)
+ vtpci_stop(dev);
+
+ /*
+ * Quickly drive the status through ACK and DRIVER. The device
+ * does not become usable again until vtpci_reinit_complete().
+ */
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
+
+ vtpci_negotiate_features(dev, features);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_CONFIG_VECTOR, 0);
+ if (error)
+ return (error);
+ }
+
+ for (queue = 0; queue < sc->vtpci_nvqs; queue++) {
+ vqx = &sc->vtpci_vqx[queue];
+ vq = vqx->vq;
+
+ KASSERT(vq != NULL, ("vq %d not allocated", queue));
+ vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue);
+
+ vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
+ error = virtqueue_reinit(vq, vq_size);
+ if (error)
+ return (error);
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ error = vtpci_register_msix_vector(sc,
+ VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx);
+ if (error)
+ return (error);
+ }
+
+ vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
+ virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
+ }
+
+ return (0);
+}
+
+static void
+vtpci_reinit_complete(device_t dev)
+{
+
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+static void
+vtpci_notify_virtqueue(device_t dev, uint16_t queue)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_NOTIFY, queue);
+}
+
+static uint8_t
+vtpci_get_status(device_t dev)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ return (vtpci_read_config_1(sc, VIRTIO_PCI_STATUS));
+}
+
+static void
+vtpci_set_status(device_t dev, uint8_t status)
+{
+ struct vtpci_softc *sc;
+
+ sc = device_get_softc(dev);
+
+ if (status != VIRTIO_CONFIG_STATUS_RESET)
+ status |= vtpci_get_status(dev);
+
+ vtpci_write_config_1(sc, VIRTIO_PCI_STATUS, status);
+}
+
+static void
+vtpci_read_dev_config(device_t dev, bus_size_t offset,
+ void *dst, int length)
+{
+ struct vtpci_softc *sc;
+ bus_size_t off;
+ uint8_t *d;
+ int size;
+
+ sc = device_get_softc(dev);
+ off = VIRTIO_PCI_CONFIG(sc) + offset;
+
+ for (d = dst; length > 0; d += size, off += size, length -= size) {
+ if (length >= 4) {
+ size = 4;
+ *(uint32_t *)d = vtpci_read_config_4(sc, off);
+ } else if (length >= 2) {
+ size = 2;
+ *(uint16_t *)d = vtpci_read_config_2(sc, off);
+ } else {
+ size = 1;
+ *d = vtpci_read_config_1(sc, off);
+ }
+ }
+}
+
+static void
+vtpci_write_dev_config(device_t dev, bus_size_t offset,
+ void *src, int length)
+{
+ struct vtpci_softc *sc;
+ bus_size_t off;
+ uint8_t *s;
+ int size;
+
+ sc = device_get_softc(dev);
+ off = VIRTIO_PCI_CONFIG(sc) + offset;
+
+ for (s = src; length > 0; s += size, off += size, length -= size) {
+ if (length >= 4) {
+ size = 4;
+ vtpci_write_config_4(sc, off, *(uint32_t *)s);
+ } else if (length >= 2) {
+ size = 2;
+ vtpci_write_config_2(sc, off, *(uint16_t *)s);
+ } else {
+ size = 1;
+ vtpci_write_config_1(sc, off, *s);
+ }
+ }
+}
+
+static void
+vtpci_describe_features(struct vtpci_softc *sc, const char *msg,
+ uint64_t features)
+{
+ device_t dev, child;
+
+ dev = sc->vtpci_dev;
+ child = sc->vtpci_child_dev;
+
+ if (device_is_attached(child) && bootverbose == 0)
+ return;
+
+ virtio_describe(dev, msg, features, sc->vtpci_child_feat_desc);
+}
+
+static void
+vtpci_probe_and_attach_child(struct vtpci_softc *sc)
+{
+ device_t dev, child;
+
+ dev = sc->vtpci_dev;
+ child = sc->vtpci_child_dev;
+
+ if (child == NULL)
+ return;
+
+ if (device_get_state(child) != DS_NOTPRESENT)
+ return;
+
+ if (device_probe(child) != 0)
+ return;
+
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
+ if (device_attach(child) != 0) {
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
+ vtpci_reset(sc);
+ vtpci_release_child_resources(sc);
+
+ /* Reset status for future attempt. */
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
+ } else
+ vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+static int
+vtpci_alloc_interrupts(struct vtpci_softc *sc, int flags, int nvqs,
+ struct vq_alloc_info *vq_info)
+{
+ int i, nvectors, error;
+
+ /*
+ * Only allocate a vector for virtqueues that are actually
+ * expecting an interrupt.
+ */
+ for (nvectors = 0, i = 0; i < nvqs; i++)
+ if (vq_info[i].vqai_intr != NULL)
+ nvectors++;
+
+ if (vtpci_disable_msix != 0 ||
+ sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSIX ||
+ flags & VIRTIO_ALLOC_VQS_DISABLE_MSIX ||
+ vtpci_alloc_msix(sc, nvectors) != 0) {
+ /*
+ * Use MSI interrupts if available. Otherwise, fall back
+ * to legacy interrupts.
+ */
+ if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSI) == 0 &&
+ vtpci_alloc_msi(sc) == 0)
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSI;
+
+ sc->vtpci_nintr_res = 1;
+ }
+
+ error = vtpci_alloc_intr_resources(sc, nvqs, vq_info);
+
+ return (error);
+}
+
+static int
+vtpci_alloc_intr_resources(struct vtpci_softc *sc, int nvqs,
+ struct vq_alloc_info *vq_info)
+{
+ device_t dev;
+ struct resource *irq;
+ struct vtpci_virtqueue *vqx;
+ int i, rid, flags, res_idx;
+
+ dev = sc->vtpci_dev;
+ flags = RF_ACTIVE;
+
+ if ((sc->vtpci_flags &
+ (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) == 0) {
+ rid = 0;
+ flags |= RF_SHAREABLE;
+ } else
+ rid = 1;
+
+ for (i = 0; i < sc->vtpci_nintr_res; i++) {
+ irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, flags);
+ if (irq == NULL)
+ return (ENXIO);
+
+ sc->vtpci_intr_res[i].irq = irq;
+ sc->vtpci_intr_res[i].rid = rid++;
+ }
+
+ /*
+ * Map each virtqueue to the correct index in vtpci_intr_res[]. Note the
+ * first index is reserved for configuration change notifications.
+ */
+ for (i = 0, res_idx = 1; i < nvqs; i++) {
+ vqx = &sc->vtpci_vqx[i];
+
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) {
+ if (vq_info[i].vqai_intr == NULL)
+ vqx->ires_idx = -1;
+ else if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX)
+ vqx->ires_idx = res_idx;
+ else
+ vqx->ires_idx = res_idx++;
+ } else
+ vqx->ires_idx = -1;
+ }
+
+ return (0);
+}
+
+static int
+vtpci_alloc_msi(struct vtpci_softc *sc)
+{
+ device_t dev;
+ int nmsi, cnt;
+
+ dev = sc->vtpci_dev;
+ nmsi = pci_msi_count(dev);
+
+ if (nmsi < 1)
+ return (1);
+
+ cnt = 1;
+ if (pci_alloc_msi(dev, &cnt) == 0 && cnt == 1)
+ return (0);
+
+ return (1);
+}
+
+static int
+vtpci_alloc_msix(struct vtpci_softc *sc, int nvectors)
+{
+ device_t dev;
+ int nmsix, cnt, required;
+
+ dev = sc->vtpci_dev;
+
+ nmsix = pci_msix_count(dev);
+ if (nmsix < 1)
+ return (1);
+
+ /* An additional vector is needed for the config changes. */
+ required = nvectors + 1;
+ if (nmsix >= required) {
+ cnt = required;
+ if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required)
+ goto out;
+
+ pci_release_msi(dev);
+ }
+
+ /* Attempt shared MSIX configuration. */
+ required = 2;
+ if (nmsix >= required) {
+ cnt = required;
+ if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) {
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_SHARED_MSIX;
+ goto out;
+ }
+
+ pci_release_msi(dev);
+ }
+
+ return (1);
+
+out:
+ sc->vtpci_nintr_res = required;
+ sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSIX;
+
+ if (bootverbose) {
+ if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX)
+ device_printf(dev, "using shared virtqueue MSIX\n");
+ else
+ device_printf(dev, "using per virtqueue MSIX\n");
+ }
+
+ return (0);
+}
+
+static int
+vtpci_register_msix_vector(struct vtpci_softc *sc, int offset, int res_idx)
+{
+ device_t dev;
+ uint16_t vector;
+
+ dev = sc->vtpci_dev;
+
+ if (offset != VIRTIO_MSI_CONFIG_VECTOR &&
+ offset != VIRTIO_MSI_QUEUE_VECTOR)
+ return (EINVAL);
+
+ if (res_idx != -1) {
+ /* Map from rid to host vector. */
+ vector = sc->vtpci_intr_res[res_idx].rid - 1;
+ } else
+ vector = VIRTIO_MSI_NO_VECTOR;
+
+ /* The first resource is special; make sure it is used correctly. */
+ if (res_idx == 0) {
+ KASSERT(vector == 0, ("unexpected config vector"));
+ KASSERT(offset == VIRTIO_MSI_CONFIG_VECTOR,
+ ("unexpected config offset"));
+ }
+
+ vtpci_write_config_2(sc, offset, vector);
+
+ if (vtpci_read_config_2(sc, offset) != vector) {
+ device_printf(dev, "insufficient host resources for "
+ "MSIX interrupts\n");
+ return (ENODEV);
+ }
+
+ return (0);
+}
+
+static void
+vtpci_free_interrupts(struct vtpci_softc *sc)
+{
+ device_t dev;
+ struct vtpci_intr_resource *ires;
+ int i;
+
+ dev = sc->vtpci_dev;
+ sc->vtpci_nintr_res = 0;
+
+ if (sc->vtpci_flags & (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) {
+ pci_release_msi(dev);
+ sc->vtpci_flags &= ~(VIRTIO_PCI_FLAG_MSI |
+ VIRTIO_PCI_FLAG_MSIX | VIRTIO_PCI_FLAG_SHARED_MSIX);
+ }
+
+ for (i = 0; i < 1 + VIRTIO_MAX_VIRTQUEUES; i++) {
+ ires = &sc->vtpci_intr_res[i];
+
+ if (ires->intrhand != NULL) {
+ bus_teardown_intr(dev, ires->irq, ires->intrhand);
+ ires->intrhand = NULL;
+ }
+
+ if (ires->irq != NULL) {
+ bus_release_resource(dev, SYS_RES_IRQ, ires->rid,
+ ires->irq);
+ ires->irq = NULL;
+ }
+
+ ires->rid = -1;
+ }
+}
+
+static void
+vtpci_free_virtqueues(struct vtpci_softc *sc)
+{
+ struct vtpci_virtqueue *vqx;
+ int i;
+
+ sc->vtpci_nvqs = 0;
+
+ for (i = 0; i < VIRTIO_MAX_VIRTQUEUES; i++) {
+ vqx = &sc->vtpci_vqx[i];
+
+ if (vqx->vq != NULL) {
+ virtqueue_free(vqx->vq);
+ vqx->vq = NULL;
+ }
+ }
+}
+
+static void
+vtpci_release_child_resources(struct vtpci_softc *sc)
+{
+
+ vtpci_free_interrupts(sc);
+ vtpci_free_virtqueues(sc);
+}
+
+static void
+vtpci_reset(struct vtpci_softc *sc)
+{
+
+ /*
+ * Writing a status of RESET returns the host device to its
+ * original, uninitialized state.
+ */
+ vtpci_set_status(sc->vtpci_dev, VIRTIO_CONFIG_STATUS_RESET);
+}
+
+static int
+vtpci_legacy_intr(void *xsc)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ int i;
+ uint8_t isr;
+
+ sc = xsc;
+ vqx = &sc->vtpci_vqx[0];
+
+ /* Reading the ISR also clears it. */
+ isr = vtpci_read_config_1(sc, VIRTIO_PCI_ISR);
+
+ if (isr & VIRTIO_PCI_ISR_CONFIG)
+ vtpci_config_intr(sc);
+
+ if (isr & VIRTIO_PCI_ISR_INTR)
+ for (i = 0; i < sc->vtpci_nvqs; i++, vqx++)
+ virtqueue_intr(vqx->vq);
+
+ return (isr ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+static int
+vtpci_vq_shared_intr(void *xsc)
+{
+ struct vtpci_softc *sc;
+ struct vtpci_virtqueue *vqx;
+ int i, rc;
+
+ rc = 0;
+ sc = xsc;
+ vqx = &sc->vtpci_vqx[0];
+
+ for (i = 0; i < sc->vtpci_nvqs; i++, vqx++)
+ rc |= virtqueue_intr(vqx->vq);
+
+ return (rc ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+static int
+vtpci_vq_intr(void *xvq)
+{
+ struct virtqueue *vq;
+ int rc;
+
+ vq = xvq;
+ rc = virtqueue_intr(vq);
+
+ return (rc ? FILTER_HANDLED : FILTER_STRAY);
+}
+
+static int
+vtpci_config_intr(void *xsc)
+{
+ struct vtpci_softc *sc;
+ device_t child;
+ int rc;
+
+ rc = 0;
+ sc = xsc;
+ child = sc->vtpci_child_dev;
+
+ if (child != NULL)
+ rc = VIRTIO_CONFIG_CHANGE(child);
+
+ return (rc ? FILTER_HANDLED : FILTER_STRAY);
+}
diff --git a/sys/dev/virtio/pci/virtio_pci.h b/sys/dev/virtio/pci/virtio_pci.h
new file mode 100644
index 0000000..6ebfdd5
--- /dev/null
+++ b/sys/dev/virtio/pci/virtio_pci.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright IBM Corp. 2007
+ *
+ * Authors:
+ * Anthony Liguori <aliguori@us.ibm.com>
+ *
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_PCI_H
+#define _VIRTIO_PCI_H
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_PCI_VENDORID 0x1AF4
+#define VIRTIO_PCI_DEVICEID_MIN 0x1000
+#define VIRTIO_PCI_DEVICEID_MAX 0x103F
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES 0 /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4 /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN 8 /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM 12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL 14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY 16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS 18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR 19 /* interrupt status register, reading
+ * also clears the register (8, RO) */
+/* Only if MSIX is enabled: */
+#define VIRTIO_MSI_CONFIG_VECTOR 20 /* configuration change vector (16, RW) */
+#define VIRTIO_MSI_QUEUE_VECTOR 22 /* vector for selected VQ notifications
+ (16, RW) */
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR 0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(sc) \
+ (((sc)->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) ? 24 : 20)
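+
+/*
+ * Example (illustration only): with MSIX enabled, a device's configuration
+ * space begins at I/O register 24, otherwise at 20. A network device's MAC
+ * address, at offset 0 of its configuration space, is therefore read
+ * starting at register 24 or 20; vtpci_read_dev_config() adds this base
+ * before issuing the bus_read_*() accesses.
+ */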
+
+/*
+ * How many bits to shift the physical queue address written to QUEUE_PFN.
+ * 12 is historical, due to the x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+#endif /* _VIRTIO_PCI_H */
diff --git a/sys/dev/virtio/virtio.c b/sys/dev/virtio/virtio.c
new file mode 100644
index 0000000..e385575
--- /dev/null
+++ b/sys/dev/virtio/virtio.c
@@ -0,0 +1,283 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/sbuf.h>
+
+#include <machine/bus.h>
+#include <machine/resource.h>
+#include <machine/_inttypes.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+
+#include "virtio_bus_if.h"
+
+static int virtio_modevent(module_t, int, void *);
+static const char *virtio_feature_name(uint64_t, struct virtio_feature_desc *);
+
+static struct virtio_ident {
+ uint16_t devid;
+ char *name;
+} virtio_ident_table[] = {
+ { VIRTIO_ID_NETWORK, "Network" },
+ { VIRTIO_ID_BLOCK, "Block" },
+ { VIRTIO_ID_CONSOLE, "Console" },
+ { VIRTIO_ID_ENTROPY, "Entropy" },
+ { VIRTIO_ID_BALLOON, "Balloon" },
+ { VIRTIO_ID_IOMEMORY, "IOMemory" },
+ { VIRTIO_ID_9P, "9P Transport" },
+
+ { 0, NULL }
+};
+
+/* Device independent features. */
+static struct virtio_feature_desc virtio_common_feature_desc[] = {
+ { VIRTIO_F_NOTIFY_ON_EMPTY, "NotifyOnEmpty" },
+ { VIRTIO_RING_F_INDIRECT_DESC, "RingIndirect" },
+ { VIRTIO_RING_F_EVENT_IDX, "EventIdx" },
+ { VIRTIO_F_BAD_FEATURE, "BadFeature" },
+
+ { 0, NULL }
+};
+
+const char *
+virtio_device_name(uint16_t devid)
+{
+ struct virtio_ident *ident;
+
+ for (ident = virtio_ident_table; ident->name != NULL; ident++) {
+ if (ident->devid == devid)
+ return (ident->name);
+ }
+
+ return (NULL);
+}
+
+int
+virtio_get_device_type(device_t dev)
+{
+ uintptr_t devtype;
+
+ devtype = -1;
+
+ BUS_READ_IVAR(device_get_parent(dev), dev,
+ VIRTIO_IVAR_DEVTYPE, &devtype);
+
+ return ((int) devtype);
+}
+
+void
+virtio_set_feature_desc(device_t dev,
+ struct virtio_feature_desc *feature_desc)
+{
+
+ BUS_WRITE_IVAR(device_get_parent(dev), dev,
+ VIRTIO_IVAR_FEATURE_DESC, (uintptr_t) feature_desc);
+}
+
+void
+virtio_describe(device_t dev, const char *msg,
+ uint64_t features, struct virtio_feature_desc *feature_desc)
+{
+ struct sbuf sb;
+ uint64_t val;
+ char *buf;
+ const char *name;
+ int n;
+
+ if ((buf = malloc(512, M_TEMP, M_NOWAIT)) == NULL) {
+ device_printf(dev, "%s features: 0x%"PRIx64"\n", msg,
+ features);
+ return;
+ }
+
+ sbuf_new(&sb, buf, 512, SBUF_FIXEDLEN);
+ sbuf_printf(&sb, "%s features: 0x%"PRIx64, msg, features);
+
+ for (n = 0, val = 1ULL << 63; val != 0; val >>= 1) {
+ /*
+ * BAD_FEATURE is used to detect broken Linux clients
+ * and therefore is not applicable to FreeBSD.
+ */
+ if (((features & val) == 0) || val == VIRTIO_F_BAD_FEATURE)
+ continue;
+
+ if (n++ == 0)
+ sbuf_cat(&sb, " <");
+ else
+ sbuf_cat(&sb, ",");
+
+ name = NULL;
+ if (feature_desc != NULL)
+ name = virtio_feature_name(val, feature_desc);
+ if (name == NULL)
+ name = virtio_feature_name(val,
+ virtio_common_feature_desc);
+
+ if (name == NULL)
+ sbuf_printf(&sb, "0x%"PRIx64, val);
+ else
+ sbuf_cat(&sb, name);
+ }
+
+ if (n > 0)
+ sbuf_cat(&sb, ">");
+
+#if __FreeBSD_version < 900020
+ sbuf_finish(&sb);
+ if (sbuf_overflowed(&sb) == 0)
+#else
+ if (sbuf_finish(&sb) == 0)
+#endif
+ device_printf(dev, "%s\n", sbuf_data(&sb));
+
+ sbuf_delete(&sb);
+ free(buf, M_TEMP);
+}
+
+static const char *
+virtio_feature_name(uint64_t val, struct virtio_feature_desc *feature_desc)
+{
+ int i;
+
+ for (i = 0; feature_desc[i].vfd_val != 0; i++)
+ if (val == feature_desc[i].vfd_val)
+ return (feature_desc[i].vfd_str);
+
+ return (NULL);
+}
+
+/*
+ * VirtIO bus method wrappers.
+ */
+
+uint64_t
+virtio_negotiate_features(device_t dev, uint64_t child_features)
+{
+
+ return (VIRTIO_BUS_NEGOTIATE_FEATURES(device_get_parent(dev),
+ child_features));
+}
+
+int
+virtio_alloc_virtqueues(device_t dev, int flags, int nvqs,
+ struct vq_alloc_info *info)
+{
+
+ return (VIRTIO_BUS_ALLOC_VIRTQUEUES(device_get_parent(dev), flags,
+ nvqs, info));
+}
+
+int
+virtio_setup_intr(device_t dev, enum intr_type type)
+{
+
+ return (VIRTIO_BUS_SETUP_INTR(device_get_parent(dev), type));
+}
+
+int
+virtio_with_feature(device_t dev, uint64_t feature)
+{
+
+ return (VIRTIO_BUS_WITH_FEATURE(device_get_parent(dev), feature));
+}
+
+void
+virtio_stop(device_t dev)
+{
+
+ VIRTIO_BUS_STOP(device_get_parent(dev));
+}
+
+int
+virtio_reinit(device_t dev, uint64_t features)
+{
+
+ return (VIRTIO_BUS_REINIT(device_get_parent(dev), features));
+}
+
+void
+virtio_reinit_complete(device_t dev)
+{
+
+ VIRTIO_BUS_REINIT_COMPLETE(device_get_parent(dev));
+}
+
+void
+virtio_read_device_config(device_t dev, bus_size_t offset, void *dst, int len)
+{
+
+ VIRTIO_BUS_READ_DEVICE_CONFIG(device_get_parent(dev),
+ offset, dst, len);
+}
+
+void
+virtio_write_device_config(device_t dev, bus_size_t offset, void *dst, int len)
+{
+
+ VIRTIO_BUS_WRITE_DEVICE_CONFIG(device_get_parent(dev),
+ offset, dst, len);
+}
+
+static int
+virtio_modevent(module_t mod, int type, void *unused)
+{
+ int error;
+
+ error = 0;
+
+ switch (type) {
+ case MOD_LOAD:
+ case MOD_QUIESCE:
+ case MOD_UNLOAD:
+ case MOD_SHUTDOWN:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ break;
+ }
+
+ return (error);
+}
+
+static moduledata_t virtio_mod = {
+ "virtio",
+ virtio_modevent,
+ 0
+};
+
+DECLARE_MODULE(virtio, virtio_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST);
+MODULE_VERSION(virtio, 1);
diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h
new file mode 100644
index 0000000..ebd3c74
--- /dev/null
+++ b/sys/dev/virtio/virtio.h
@@ -0,0 +1,130 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions to implement
+ * compatible drivers/servers.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+#include <sys/types.h>
+
+struct vq_alloc_info;
+
+/* VirtIO device IDs. */
+#define VIRTIO_ID_NETWORK 0x01
+#define VIRTIO_ID_BLOCK 0x02
+#define VIRTIO_ID_CONSOLE 0x03
+#define VIRTIO_ID_ENTROPY 0x04
+#define VIRTIO_ID_BALLOON 0x05
+#define VIRTIO_ID_IOMEMORY 0x06
+#define VIRTIO_ID_9P 0x09
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET 0x00
+#define VIRTIO_CONFIG_STATUS_ACK 0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER 0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FAILED 0x80
+
+/*
+ * Generate interrupt when the virtqueue ring is
+ * completely used, even if we've suppressed them.
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
+
+/*
+ * The guest should never negotiate this feature; it
+ * is used to detect faulty drivers.
+ */
+#define VIRTIO_F_BAD_FEATURE (1 << 30)
+
+/*
+ * Some VirtIO feature bits (currently bits 28 through 31) are
+ * reserved for the transport being used (e.g. virtio_ring); the
+ * rest are per-device feature bits.
+ */
+#define VIRTIO_TRANSPORT_F_START 28
+#define VIRTIO_TRANSPORT_F_END 32
+
+/*
+ * Maximum number of virtqueues per device.
+ */
+#define VIRTIO_MAX_VIRTQUEUES 8
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/*
+ * VirtIO instance variables indices.
+ */
+#define VIRTIO_IVAR_DEVTYPE 1
+#define VIRTIO_IVAR_FEATURE_DESC 2
+
+struct virtio_feature_desc {
+ uint64_t vfd_val;
+ char *vfd_str;
+};
+
+const char *virtio_device_name(uint16_t devid);
+int virtio_get_device_type(device_t dev);
+void virtio_set_feature_desc(device_t dev,
+ struct virtio_feature_desc *feature_desc);
+void virtio_describe(device_t dev, const char *msg,
+ uint64_t features, struct virtio_feature_desc *feature_desc);
+
+/*
+ * VirtIO Bus Methods.
+ */
+uint64_t virtio_negotiate_features(device_t dev, uint64_t child_features);
+int virtio_alloc_virtqueues(device_t dev, int flags, int nvqs,
+ struct vq_alloc_info *info);
+int virtio_setup_intr(device_t dev, enum intr_type type);
+int virtio_with_feature(device_t dev, uint64_t feature);
+void virtio_stop(device_t dev);
+int virtio_reinit(device_t dev, uint64_t features);
+void virtio_reinit_complete(device_t dev);
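+
+/*
+ * Example (sketch only): the usual order in which a device driver uses the
+ * wrappers above during attach. DRIVER_FEATURES, nvqs, and vq_info are
+ * placeholders for driver specific values; the PCI front-end moves the
+ * device to DRIVER_OK once the child attach succeeds.
+ */
+#if 0
+	features = virtio_negotiate_features(dev, DRIVER_FEATURES);
+	error = virtio_alloc_virtqueues(dev, 0, nvqs, vq_info);
+	if (error == 0)
+		error = virtio_setup_intr(dev, INTR_TYPE_MISC);
+#endif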
+
+/*
+ * Read/write a variable amount from the device specific (e.g., network)
+ * configuration region. This region is encoded with the same endianness
+ * as the guest.
+ */
+void virtio_read_device_config(device_t dev, bus_size_t offset,
+ void *dst, int length);
+void virtio_write_device_config(device_t dev, bus_size_t offset,
+ void *src, int length);
+
+/* Inlined device specific read/write functions for common lengths. */
+#define VIRTIO_RDWR_DEVICE_CONFIG(size, type) \
+static inline type \
+__CONCAT(virtio_read_dev_config_,size)(device_t dev, \
+ bus_size_t offset) \
+{ \
+ type val; \
+ virtio_read_device_config(dev, offset, &val, sizeof(type)); \
+ return (val); \
+} \
+ \
+static inline void \
+__CONCAT(virtio_write_dev_config_,size)(device_t dev, \
+ bus_size_t offset, type val) \
+{ \
+ virtio_write_device_config(dev, offset, &val, sizeof(type)); \
+}
+
+VIRTIO_RDWR_DEVICE_CONFIG(1, uint8_t);
+VIRTIO_RDWR_DEVICE_CONFIG(2, uint16_t);
+VIRTIO_RDWR_DEVICE_CONFIG(4, uint32_t);
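+
+/*
+ * Example (sketch only): the macro above generates helpers such as
+ * virtio_read_dev_config_2(), so a network driver could fetch its 16 bit
+ * status word (mirroring struct virtio_net_config) with:
+ *
+ *	status = virtio_read_dev_config_2(dev,
+ *	    offsetof(struct virtio_net_config, status));
+ */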
+
+#endif /* _VIRTIO_H_ */
diff --git a/sys/dev/virtio/virtio_bus_if.m b/sys/dev/virtio/virtio_bus_if.m
new file mode 100644
index 0000000..ec2029d
--- /dev/null
+++ b/sys/dev/virtio/virtio_bus_if.m
@@ -0,0 +1,92 @@
+#-
+# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+#include <machine/bus.h>
+
+INTERFACE virtio_bus;
+
+HEADER {
+struct vq_alloc_info;
+};
+
+METHOD uint64_t negotiate_features {
+ device_t dev;
+ uint64_t child_features;
+};
+
+METHOD int with_feature {
+ device_t dev;
+ uint64_t feature;
+};
+
+METHOD int alloc_virtqueues {
+ device_t dev;
+ int flags;
+ int nvqs;
+ struct vq_alloc_info *info;
+};
+HEADER {
+#define VIRTIO_ALLOC_VQS_DISABLE_MSIX 0x1
+};
+
+METHOD int setup_intr {
+ device_t dev;
+ enum intr_type type;
+};
+
+METHOD void stop {
+ device_t dev;
+};
+
+METHOD int reinit {
+ device_t dev;
+ uint64_t features;
+};
+
+METHOD void reinit_complete {
+ device_t dev;
+};
+
+METHOD void notify_vq {
+ device_t dev;
+ uint16_t queue;
+};
+
+METHOD void read_device_config {
+ device_t dev;
+ bus_size_t offset;
+ void *dst;
+ int len;
+};
+
+METHOD void write_device_config {
+ device_t dev;
+ bus_size_t offset;
+ void *src;
+ int len;
+};
diff --git a/sys/dev/virtio/virtio_if.m b/sys/dev/virtio/virtio_if.m
new file mode 100644
index 0000000..701678c
--- /dev/null
+++ b/sys/dev/virtio/virtio_if.m
@@ -0,0 +1,43 @@
+#-
+# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD$
+
+#include <sys/bus.h>
+
+INTERFACE virtio;
+
+CODE {
+ static int
+ virtio_default_config_change(device_t dev)
+ {
+ /* Return that we've handled the change. */
+ return (1);
+ }
+};
+
+METHOD int config_change {
+ device_t dev;
+} DEFAULT virtio_default_config_change;
diff --git a/sys/dev/virtio/virtio_ring.h b/sys/dev/virtio/virtio_ring.h
new file mode 100644
index 0000000..124cb4d
--- /dev/null
+++ b/sys/dev/virtio/virtio_ring.h
@@ -0,0 +1,119 @@
+/*
+ * This header is BSD licensed so anyone can use the definitions
+ * to implement compatible drivers/servers.
+ *
+ * Copyright Rusty Russell IBM Corporation 2007.
+ */
+/* $FreeBSD$ */
+
+#ifndef VIRTIO_RING_H
+#define VIRTIO_RING_H
+
+#include <sys/types.h>
+
+/* This marks a buffer as continuing via the next field. */
+#define VRING_DESC_F_NEXT 1
+/* This marks a buffer as write-only (otherwise read-only). */
+#define VRING_DESC_F_WRITE 2
+/* This means the buffer contains a list of buffer descriptors. */
+#define VRING_DESC_F_INDIRECT 4
+
+/* The Host uses this in used->flags to advise the Guest: don't kick me
+ * when you add a buffer. It's unreliable, so it's simply an
+ * optimization. Guest will still kick if it's out of buffers. */
+#define VRING_USED_F_NO_NOTIFY 1
+/* The Guest uses this in avail->flags to advise the Host: don't
+ * interrupt me when you consume a buffer. It's unreliable, so it's
+ * simply an optimization. */
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+/* VirtIO ring descriptors: 16 bytes.
+ * These can chain together via "next". */
+struct vring_desc {
+ /* Address (guest-physical). */
+ uint64_t addr;
+ /* Length. */
+ uint32_t len;
+ /* The flags as indicated above. */
+ uint16_t flags;
+ /* We chain unused descriptors via this, too. */
+ uint16_t next;
+};
+
+struct vring_avail {
+ uint16_t flags;
+ uint16_t idx;
+ uint16_t ring[0];
+};
+
+/* uint32_t is used here for ids for padding reasons. */
+struct vring_used_elem {
+ /* Index of start of used descriptor chain. */
+ uint32_t id;
+ /* Total length of the descriptor chain which was written to. */
+ uint32_t len;
+};
+
+struct vring_used {
+ uint16_t flags;
+ uint16_t idx;
+ struct vring_used_elem ring[0];
+};
+
+struct vring {
+ unsigned int num;
+
+ struct vring_desc *desc;
+ struct vring_avail *avail;
+ struct vring_used *used;
+};
+
+/* The standard layout for the ring is a continuous chunk of memory which
+ * looks like this. We assume num is a power of 2.
+ *
+ * struct vring {
+ * // The actual descriptors (16 bytes each)
+ * struct vring_desc desc[num];
+ *
+ * // A ring of available descriptor heads with free-running index.
+ * __u16 avail_flags;
+ * __u16 avail_idx;
+ * __u16 available[num];
+ *
+ * // Padding to the next align boundary.
+ * char pad[];
+ *
+ * // A ring of used descriptor heads with free-running index.
+ * __u16 used_flags;
+ * __u16 used_idx;
+ * struct vring_used_elem used[num];
+ * };
+ *
+ * NOTE: for VirtIO PCI, align is 4096.
+ */
+
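+/* Total ring size in bytes: the descriptor table and avail ring, padded
+ * up to 'align', followed by the used ring. */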
+static inline int
+vring_size(unsigned int num, unsigned long align)
+{
+ int size;
+
+ size = num * sizeof(struct vring_desc);
+ size += sizeof(struct vring_avail) + (num * sizeof(uint16_t));
+ size = (size + align - 1) & ~(align - 1);
+ size += sizeof(struct vring_used) +
+ (num * sizeof(struct vring_used_elem));
+ return (size);
+}
+
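+/* Carve the descriptor table, avail ring, and used ring out of the
+ * contiguous buffer 'p'; the used ring starts at the next 'align' boundary. */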
+static inline void
+vring_init(struct vring *vr, unsigned int num, uint8_t *p,
+ unsigned long align)
+{
+ vr->num = num;
+ vr->desc = (struct vring_desc *) p;
+ vr->avail = (struct vring_avail *) (p +
+ num * sizeof(struct vring_desc));
+ vr->used = (void *)
+ (((unsigned long) &vr->avail->ring[num] + align-1) & ~(align-1));
+}
+#endif /* VIRTIO_RING_H */
diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c
new file mode 100644
index 0000000..1fb182e
--- /dev/null
+++ b/sys/dev/virtio/virtqueue.c
@@ -0,0 +1,755 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Implements the virtqueue interface as basically described
+ * in the original VirtIO paper.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/sglist.h>
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/cpu.h>
+#include <machine/bus.h>
+#include <machine/atomic.h>
+#include <machine/resource.h>
+#include <sys/bus.h>
+#include <sys/rman.h>
+
+#include <dev/virtio/virtio.h>
+#include <dev/virtio/virtqueue.h>
+#include <dev/virtio/virtio_ring.h>
+
+#include "virtio_bus_if.h"
+
+struct virtqueue {
+ device_t vq_dev;
+ char vq_name[VIRTQUEUE_MAX_NAME_SZ];
+ uint16_t vq_queue_index;
+ uint16_t vq_nentries;
+ uint32_t vq_flags;
+#define VIRTQUEUE_FLAG_INDIRECT 0x0001
+
+ int vq_alignment;
+ int vq_ring_size;
+ void *vq_ring_mem;
+ int vq_max_indirect_size;
+ int vq_indirect_mem_size;
+ virtqueue_intr_t *vq_intrhand;
+ void *vq_intrhand_arg;
+
+ struct vring vq_ring;
+ uint16_t vq_free_cnt;
+ uint16_t vq_queued_cnt;
+ /*
+ * Head of the free chain in the descriptor table. If
+ * there are no free descriptors, this will be set to
+ * VQ_RING_DESC_CHAIN_END.
+ */
+ uint16_t vq_desc_head_idx;
+ /*
+ * Last consumed descriptor in the used table,
+ * trails vq_ring.used->idx.
+ */
+ uint16_t vq_used_cons_idx;
+
+ struct vq_desc_extra {
+ void *cookie;
+ struct vring_desc *indirect;
+ vm_paddr_t indirect_paddr;
+ uint16_t ndescs;
+ } vq_descx[0];
+};
+
+/*
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+#define VQASSERT(_vq, _exp, _msg, ...) \
+ KASSERT((_exp),("%s: %s - "_msg, __func__, (_vq)->vq_name, \
+ ##__VA_ARGS__))
+
+#define VQ_RING_ASSERT_VALID_IDX(_vq, _idx) \
+ VQASSERT((_vq), (_idx) < (_vq)->vq_nentries, \
+ "invalid ring index: %d, max: %d", (_idx), \
+ (_vq)->vq_nentries)
+
+#define VQ_RING_ASSERT_CHAIN_TERM(_vq) \
+ VQASSERT((_vq), (_vq)->vq_desc_head_idx == \
+ VQ_RING_DESC_CHAIN_END, "full ring terminated " \
+ "incorrectly: head idx: %d", (_vq)->vq_desc_head_idx)
+
+static int virtqueue_init_indirect(struct virtqueue *vq, int);
+static void virtqueue_free_indirect(struct virtqueue *vq);
+static void virtqueue_init_indirect_list(struct virtqueue *,
+ struct vring_desc *);
+
+static void vq_ring_init(struct virtqueue *);
+static void vq_ring_update_avail(struct virtqueue *, uint16_t);
+static uint16_t vq_ring_enqueue_segments(struct virtqueue *,
+ struct vring_desc *, uint16_t, struct sglist *, int, int);
+static int vq_ring_use_indirect(struct virtqueue *, int);
+static void vq_ring_enqueue_indirect(struct virtqueue *, void *,
+ struct sglist *, int, int);
+static void vq_ring_notify_host(struct virtqueue *, int);
+static void vq_ring_free_chain(struct virtqueue *, uint16_t);
+
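+/*
+ * Mask a driver's feature bits down to those below the transport feature
+ * range, plus indirect descriptor support, which is the only ring feature
+ * the virtqueue code itself handles.
+ */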
+uint64_t
+virtqueue_filter_features(uint64_t features)
+{
+ uint64_t mask;
+
+ mask = (1 << VIRTIO_TRANSPORT_F_START) - 1;
+ mask |= VIRTIO_RING_F_INDIRECT_DESC;
+
+ return (features & mask);
+}
+
+int
+virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, int align,
+ vm_paddr_t highaddr, struct vq_alloc_info *info, struct virtqueue **vqp)
+{
+ struct virtqueue *vq;
+ int error;
+
+ *vqp = NULL;
+ error = 0;
+
+ if (size == 0) {
+ device_printf(dev,
+ "virtqueue %d (%s) does not exist (size is zero)\n",
+ queue, info->vqai_name);
+ return (ENODEV);
+ } else if (!powerof2(size)) {
+ device_printf(dev,
+ "virtqueue %d (%s) size is not a power of 2: %d\n",
+ queue, info->vqai_name, size);
+ return (ENXIO);
+ } else if (info->vqai_maxindirsz > VIRTIO_MAX_INDIRECT) {
+ device_printf(dev, "virtqueue %d (%s) requested too many "
+ "indirect descriptors: %d, max %d\n",
+ queue, info->vqai_name, info->vqai_maxindirsz,
+ VIRTIO_MAX_INDIRECT);
+ return (EINVAL);
+ }
+
+ vq = malloc(sizeof(struct virtqueue) +
+ size * sizeof(struct vq_desc_extra), M_DEVBUF, M_NOWAIT | M_ZERO);
+ if (vq == NULL) {
+ device_printf(dev, "cannot allocate virtqueue\n");
+ return (ENOMEM);
+ }
+
+ vq->vq_dev = dev;
+ strlcpy(vq->vq_name, info->vqai_name, sizeof(vq->vq_name));
+ vq->vq_queue_index = queue;
+ vq->vq_alignment = align;
+ vq->vq_nentries = size;
+ vq->vq_free_cnt = size;
+ vq->vq_intrhand = info->vqai_intr;
+ vq->vq_intrhand_arg = info->vqai_intr_arg;
+
+ if (info->vqai_maxindirsz > 1) {
+ error = virtqueue_init_indirect(vq, info->vqai_maxindirsz);
+ if (error)
+ goto fail;
+ }
+
+ vq->vq_ring_size = round_page(vring_size(size, align));
+ vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF,
+ M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0);
+ if (vq->vq_ring_mem == NULL) {
+ device_printf(dev,
+ "cannot allocate memory for virtqueue ring\n");
+ error = ENOMEM;
+ goto fail;
+ }
+
+ vq_ring_init(vq);
+ virtqueue_disable_intr(vq);
+
+ *vqp = vq;
+
+fail:
+ if (error)
+ virtqueue_free(vq);
+
+ return (error);
+}
+
+static int
+virtqueue_init_indirect(struct virtqueue *vq, int indirect_size)
+{
+ device_t dev;
+ struct vq_desc_extra *dxp;
+ int i, size;
+
+ dev = vq->vq_dev;
+
+ if (VIRTIO_BUS_WITH_FEATURE(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ /*
+ * Indirect descriptors requested by the driver but not
+ * negotiated. Return zero to keep the initialization
+ * going: we'll run fine without.
+ */
+ if (bootverbose)
+ device_printf(dev, "virtqueue %d (%s) requested "
+ "indirect descriptors but not negotiated\n",
+ vq->vq_queue_index, vq->vq_name);
+ return (0);
+ }
+
+ size = indirect_size * sizeof(struct vring_desc);
+ vq->vq_max_indirect_size = indirect_size;
+ vq->vq_indirect_mem_size = size;
+ vq->vq_flags |= VIRTQUEUE_FLAG_INDIRECT;
+
+ for (i = 0; i < vq->vq_nentries; i++) {
+ dxp = &vq->vq_descx[i];
+
+ dxp->indirect = malloc(size, M_DEVBUF, M_NOWAIT);
+ if (dxp->indirect == NULL) {
+ device_printf(dev, "cannot allocate indirect list\n");
+ return (ENOMEM);
+ }
+
+ dxp->indirect_paddr = vtophys(dxp->indirect);
+ virtqueue_init_indirect_list(vq, dxp->indirect);
+ }
+
+ return (0);
+}
+
+static void
+virtqueue_free_indirect(struct virtqueue *vq)
+{
+ struct vq_desc_extra *dxp;
+ int i;
+
+ for (i = 0; i < vq->vq_nentries; i++) {
+ dxp = &vq->vq_descx[i];
+
+ if (dxp->indirect == NULL)
+ break;
+
+ free(dxp->indirect, M_DEVBUF);
+ dxp->indirect = NULL;
+ dxp->indirect_paddr = 0;
+ }
+
+ vq->vq_flags &= ~VIRTQUEUE_FLAG_INDIRECT;
+ vq->vq_indirect_mem_size = 0;
+}
+
+static void
+virtqueue_init_indirect_list(struct virtqueue *vq,
+ struct vring_desc *indirect)
+{
+ int i;
+
+ bzero(indirect, vq->vq_indirect_mem_size);
+
+ for (i = 0; i < vq->vq_max_indirect_size - 1; i++)
+ indirect[i].next = i + 1;
+ indirect[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
+int
+virtqueue_reinit(struct virtqueue *vq, uint16_t size)
+{
+ struct vq_desc_extra *dxp;
+ int i;
+
+ if (vq->vq_nentries != size) {
+ device_printf(vq->vq_dev,
+ "%s: '%s' changed size; old=%hu, new=%hu\n",
+ __func__, vq->vq_name, vq->vq_nentries, size);
+ return (EINVAL);
+ }
+
+ /* Warn if the virtqueue was not properly cleaned up. */
+ if (vq->vq_free_cnt != vq->vq_nentries) {
+ device_printf(vq->vq_dev,
+ "%s: warning, '%s' virtqueue not empty, "
+ "leaking %d entries\n", __func__, vq->vq_name,
+ vq->vq_nentries - vq->vq_free_cnt);
+ }
+
+ vq->vq_desc_head_idx = 0;
+ vq->vq_used_cons_idx = 0;
+ vq->vq_queued_cnt = 0;
+ vq->vq_free_cnt = vq->vq_nentries;
+
+ /* To be safe, reset all our allocated memory. */
+ bzero(vq->vq_ring_mem, vq->vq_ring_size);
+ for (i = 0; i < vq->vq_nentries; i++) {
+ dxp = &vq->vq_descx[i];
+ dxp->cookie = NULL;
+ dxp->ndescs = 0;
+ if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT)
+ virtqueue_init_indirect_list(vq, dxp->indirect);
+ }
+
+ vq_ring_init(vq);
+ virtqueue_disable_intr(vq);
+
+ return (0);
+}
+
+void
+virtqueue_free(struct virtqueue *vq)
+{
+
+ if (vq->vq_free_cnt != vq->vq_nentries) {
+ device_printf(vq->vq_dev, "%s: freeing non-empty virtqueue, "
+ "leaking %d entries\n", vq->vq_name,
+ vq->vq_nentries - vq->vq_free_cnt);
+ }
+
+ if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT)
+ virtqueue_free_indirect(vq);
+
+ if (vq->vq_ring_mem != NULL) {
+ contigfree(vq->vq_ring_mem, vq->vq_ring_size, M_DEVBUF);
+ vq->vq_ring_size = 0;
+ vq->vq_ring_mem = NULL;
+ }
+
+ free(vq, M_DEVBUF);
+}
+
+vm_paddr_t
+virtqueue_paddr(struct virtqueue *vq)
+{
+
+ return (vtophys(vq->vq_ring_mem));
+}
+
+int
+virtqueue_size(struct virtqueue *vq)
+{
+
+ return (vq->vq_nentries);
+}
+
+int
+virtqueue_empty(struct virtqueue *vq)
+{
+
+ return (vq->vq_nentries == vq->vq_free_cnt);
+}
+
+int
+virtqueue_full(struct virtqueue *vq)
+{
+
+ return (vq->vq_free_cnt == 0);
+}
+
+void
+virtqueue_notify(struct virtqueue *vq)
+{
+
+ vq->vq_queued_cnt = 0;
+ vq_ring_notify_host(vq, 0);
+}
+
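+/*
+ * Return how many used entries the host has posted that we have not yet
+ * dequeued, accounting for the free-running index wrapping at UINT16_MAX.
+ */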
+int
+virtqueue_nused(struct virtqueue *vq)
+{
+ uint16_t used_idx, nused;
+
+ used_idx = vq->vq_ring.used->idx;
+ if (used_idx >= vq->vq_used_cons_idx)
+ nused = used_idx - vq->vq_used_cons_idx;
+ else
+ nused = UINT16_MAX - vq->vq_used_cons_idx +
+ used_idx + 1;
+ VQASSERT(vq, nused <= vq->vq_nentries, "used more than available");
+
+ return (nused);
+}
+
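+/*
+ * Invoke the queue's interrupt handler if one is set and there are
+ * unconsumed used entries; returns 1 if the handler was run.
+ */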
+int
+virtqueue_intr(struct virtqueue *vq)
+{
+
+ if (vq->vq_intrhand == NULL ||
+ vq->vq_used_cons_idx == vq->vq_ring.used->idx)
+ return (0);
+
+ vq->vq_intrhand(vq->vq_intrhand_arg);
+
+ return (1);
+}
+
+int
+virtqueue_enable_intr(struct virtqueue *vq)
+{
+
+ /*
+ * Enable interrupts, making sure we get the latest
+ * index of what's already been consumed.
+ */
+ vq->vq_ring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
+
+ mb();
+
+ /*
+ * Additional items may have been consumed in the time since we
+ * last checked and enabled interrupts above. Let our caller know
+ * so it processes the new entries.
+ */
+ if (vq->vq_used_cons_idx != vq->vq_ring.used->idx)
+ return (1);
+
+ return (0);
+}
+
+void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+
+ /*
+ * Note this is only considered a hint to the host.
+ */
+ vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+int
+virtqueue_enqueue(struct virtqueue *vq, void *cookie, struct sglist *sg,
+ int readable, int writable)
+{
+ struct vq_desc_extra *dxp;
+ int needed;
+ uint16_t head_idx, idx;
+
+ needed = readable + writable;
+
+ VQASSERT(vq, cookie != NULL, "enqueuing with no cookie");
+ VQASSERT(vq, needed == sg->sg_nseg,
+ "segment count mismatch, %d, %d", needed, sg->sg_nseg);
+ VQASSERT(vq,
+ needed <= vq->vq_nentries || needed <= vq->vq_max_indirect_size,
+ "too many segments to enqueue: %d, %d/%d", needed,
+ vq->vq_nentries, vq->vq_max_indirect_size);
+
+ if (needed < 1)
+ return (EINVAL);
+ if (vq->vq_free_cnt == 0)
+ return (ENOSPC);
+
+ if (vq_ring_use_indirect(vq, needed)) {
+ vq_ring_enqueue_indirect(vq, cookie, sg, readable, writable);
+ return (0);
+ } else if (vq->vq_free_cnt < needed)
+ return (EMSGSIZE);
+
+ head_idx = vq->vq_desc_head_idx;
+ VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
+ dxp = &vq->vq_descx[head_idx];
+
+ VQASSERT(vq, dxp->cookie == NULL,
+ "cookie already exists for index %d", head_idx);
+ dxp->cookie = cookie;
+ dxp->ndescs = needed;
+
+ idx = vq_ring_enqueue_segments(vq, vq->vq_ring.desc, head_idx,
+ sg, readable, writable);
+
+ vq->vq_desc_head_idx = idx;
+ vq->vq_free_cnt -= needed;
+ if (vq->vq_free_cnt == 0)
+ VQ_RING_ASSERT_CHAIN_TERM(vq);
+ else
+ VQ_RING_ASSERT_VALID_IDX(vq, idx);
+
+ vq_ring_update_avail(vq, head_idx);
+
+ return (0);
+}
+
+void *
+virtqueue_dequeue(struct virtqueue *vq, uint32_t *len)
+{
+ struct vring_used_elem *uep;
+ void *cookie;
+ uint16_t used_idx, desc_idx;
+
+ if (vq->vq_used_cons_idx == vq->vq_ring.used->idx)
+ return (NULL);
+
+ used_idx = vq->vq_used_cons_idx++ & (vq->vq_nentries - 1);
+ uep = &vq->vq_ring.used->ring[used_idx];
+
+ mb();
+ desc_idx = (uint16_t) uep->id;
+ if (len != NULL)
+ *len = uep->len;
+
+ vq_ring_free_chain(vq, desc_idx);
+
+ cookie = vq->vq_descx[desc_idx].cookie;
+ VQASSERT(vq, cookie != NULL, "no cookie for index %d", desc_idx);
+ vq->vq_descx[desc_idx].cookie = NULL;
+
+ return (cookie);
+}
+
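+/* Busy-wait until the host returns a used buffer. */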
+void *
+virtqueue_poll(struct virtqueue *vq, uint32_t *len)
+{
+ void *cookie;
+
+ while ((cookie = virtqueue_dequeue(vq, len)) == NULL)
+ cpu_spinwait();
+
+ return (cookie);
+}
+
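+/*
+ * Walk the descriptor array starting at *last and return the next
+ * outstanding cookie, freeing its chain; used when tearing down a queue.
+ */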
+void *
+virtqueue_drain(struct virtqueue *vq, int *last)
+{
+ void *cookie;
+ int idx;
+
+ cookie = NULL;
+ idx = *last;
+
+ while (idx < vq->vq_nentries && cookie == NULL) {
+ if ((cookie = vq->vq_descx[idx].cookie) != NULL) {
+ vq->vq_descx[idx].cookie = NULL;
+ /* Free chain to keep free count consistent. */
+ vq_ring_free_chain(vq, idx);
+ }
+ idx++;
+ }
+
+ *last = idx;
+
+ return (cookie);
+}
+
+void
+virtqueue_dump(struct virtqueue *vq)
+{
+
+ if (vq == NULL)
+ return;
+
+ printf("VQ: %s - size=%d; free=%d; used=%d; queued=%d; "
+ "desc_head_idx=%d; avail.idx=%d; used_cons_idx=%d; "
+ "used.idx=%d; avail.flags=0x%x; used.flags=0x%x\n",
+ vq->vq_name, vq->vq_nentries, vq->vq_free_cnt,
+ virtqueue_nused(vq), vq->vq_queued_cnt, vq->vq_desc_head_idx,
+ vq->vq_ring.avail->idx, vq->vq_used_cons_idx,
+ vq->vq_ring.used->idx, vq->vq_ring.avail->flags,
+ vq->vq_ring.used->flags);
+}
+
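+/* Lay out the vring in the allocated ring memory and link every
+ * descriptor into the initial free chain. */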
+static void
+vq_ring_init(struct virtqueue *vq)
+{
+ struct vring *vr;
+ char *ring_mem;
+ int i, size;
+
+ ring_mem = vq->vq_ring_mem;
+ size = vq->vq_nentries;
+ vr = &vq->vq_ring;
+
+ vring_init(vr, size, ring_mem, vq->vq_alignment);
+
+ for (i = 0; i < size - 1; i++)
+ vr->desc[i].next = i + 1;
+ vr->desc[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
+static void
+vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx)
+{
+ uint16_t avail_idx;
+
+ /*
+ * Place the head of the descriptor chain into the next slot and make
+ * it usable to the host. The chain is made available now rather than
+ * deferring to virtqueue_notify() in the hopes that if the host is
+ * currently running on another CPU, we can keep it processing the new
+ * descriptor.
+ */
+ avail_idx = vq->vq_ring.avail->idx & (vq->vq_nentries - 1);
+ vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+
+ mb();
+ vq->vq_ring.avail->idx++;
+
+ /* Keep pending count until virtqueue_notify() for debugging. */
+ vq->vq_queued_cnt++;
+}
+
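+/*
+ * Fill a chain of descriptors from the scatter/gather list starting at
+ * head_idx. The first 'readable' segments are device-readable and the
+ * remainder device-writable. Returns the index following the last
+ * descriptor written.
+ */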
+static uint16_t
+vq_ring_enqueue_segments(struct virtqueue *vq, struct vring_desc *desc,
+ uint16_t head_idx, struct sglist *sg, int readable, int writable)
+{
+ struct sglist_seg *seg;
+ struct vring_desc *dp;
+ int i, needed;
+ uint16_t idx;
+
+ needed = readable + writable;
+
+ for (i = 0, idx = head_idx, seg = sg->sg_segs;
+ i < needed;
+ i++, idx = dp->next, seg++) {
+ VQASSERT(vq, idx != VQ_RING_DESC_CHAIN_END,
+ "premature end of free desc chain");
+
+ dp = &desc[idx];
+ dp->addr = seg->ss_paddr;
+ dp->len = seg->ss_len;
+ dp->flags = 0;
+
+ if (i < needed - 1)
+ dp->flags |= VRING_DESC_F_NEXT;
+ if (i >= readable)
+ dp->flags |= VRING_DESC_F_WRITE;
+ }
+
+ return (idx);
+}
+
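+/*
+ * Use an indirect descriptor only when the feature was negotiated, the
+ * chain fits within the preallocated indirect table, and more than one
+ * segment is being enqueued.
+ */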
+static int
+vq_ring_use_indirect(struct virtqueue *vq, int needed)
+{
+
+ if ((vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT) == 0)
+ return (0);
+
+ if (vq->vq_max_indirect_size < needed)
+ return (0);
+
+ if (needed < 2)
+ return (0);
+
+ return (1);
+}
+
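+/*
+ * Enqueue the entire scatter/gather list through a single indirect
+ * descriptor, consuming only one slot in the main descriptor table.
+ */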
+static void
+vq_ring_enqueue_indirect(struct virtqueue *vq, void *cookie,
+ struct sglist *sg, int readable, int writable)
+{
+ struct vring_desc *dp;
+ struct vq_desc_extra *dxp;
+ int needed;
+ uint16_t head_idx;
+
+ needed = readable + writable;
+ VQASSERT(vq, needed <= vq->vq_max_indirect_size,
+ "enqueuing too many indirect descriptors");
+
+ head_idx = vq->vq_desc_head_idx;
+ VQ_RING_ASSERT_VALID_IDX(vq, head_idx);
+ dp = &vq->vq_ring.desc[head_idx];
+ dxp = &vq->vq_descx[head_idx];
+
+ VQASSERT(vq, dxp->cookie == NULL,
+ "cookie already exists for index %d", head_idx);
+ dxp->cookie = cookie;
+ dxp->ndescs = 1;
+
+ dp->addr = dxp->indirect_paddr;
+ dp->len = needed * sizeof(struct vring_desc);
+ dp->flags = VRING_DESC_F_INDIRECT;
+
+ vq_ring_enqueue_segments(vq, dxp->indirect, 0,
+ sg, readable, writable);
+
+ vq->vq_desc_head_idx = dp->next;
+ vq->vq_free_cnt--;
+ if (vq->vq_free_cnt == 0)
+ VQ_RING_ASSERT_CHAIN_TERM(vq);
+ else
+ VQ_RING_ASSERT_VALID_IDX(vq, vq->vq_desc_head_idx);
+
+ vq_ring_update_avail(vq, head_idx);
+}
+
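+/* Kick the host unless it has set VRING_USED_F_NO_NOTIFY, or 'force' is set. */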
+static void
+vq_ring_notify_host(struct virtqueue *vq, int force)
+{
+
+ mb();
+
+ if (force ||
+ (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) == 0)
+ VIRTIO_BUS_NOTIFY_VQ(vq->vq_dev, vq->vq_queue_index);
+}
+
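+/*
+ * Return a descriptor chain to the free list, walking any linked
+ * descriptors, and splice the old free chain onto its tail.
+ */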
+static void
+vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx)
+{
+ struct vring_desc *dp;
+ struct vq_desc_extra *dxp;
+
+ VQ_RING_ASSERT_VALID_IDX(vq, desc_idx);
+ dp = &vq->vq_ring.desc[desc_idx];
+ dxp = &vq->vq_descx[desc_idx];
+
+ if (vq->vq_free_cnt == 0)
+ VQ_RING_ASSERT_CHAIN_TERM(vq);
+
+ vq->vq_free_cnt += dxp->ndescs;
+ dxp->ndescs--;
+
+ if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) {
+ while (dp->flags & VRING_DESC_F_NEXT) {
+ VQ_RING_ASSERT_VALID_IDX(vq, dp->next);
+ dp = &vq->vq_ring.desc[dp->next];
+ dxp->ndescs--;
+ }
+ }
+ VQASSERT(vq, dxp->ndescs == 0, "failed to free entire desc chain");
+
+ /*
+ * We must append the existing free chain, if any, to the end of the
+ * newly freed chain. If the virtqueue was completely used, then
+ * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above).
+ */
+ dp->next = vq->vq_desc_head_idx;
+ vq->vq_desc_head_idx = desc_idx;
+}
diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h
new file mode 100644
index 0000000..e790e65
--- /dev/null
+++ b/sys/dev/virtio/virtqueue.h
@@ -0,0 +1,98 @@
+/*-
+ * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VIRTIO_VIRTQUEUE_H
+#define _VIRTIO_VIRTQUEUE_H
+
+#include <sys/types.h>
+
+struct virtqueue;
+struct sglist;
+
+/* Support for indirect buffer descriptors. */
+#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
+
+/* The guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field.
+ * The host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field.
+ */
+#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
+
+/* Device callback for a virtqueue interrupt. */
+typedef int virtqueue_intr_t(void *);
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+/* One for each virtqueue the device wishes to allocate. */
+struct vq_alloc_info {
+ char vqai_name[VIRTQUEUE_MAX_NAME_SZ];
+ int vqai_maxindirsz;
+ virtqueue_intr_t *vqai_intr;
+ void *vqai_intr_arg;
+ struct virtqueue **vqai_vq;
+};
+
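+/*
+ * Initialize a vq_alloc_info entry; an array of these, one per virtqueue
+ * the device requires, is handed to the bus alloc_virtqueues method.
+ */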
+#define VQ_ALLOC_INFO_INIT(_i,_nsegs,_intr,_arg,_vqp,_str,...) do { \
+ snprintf((_i)->vqai_name, VIRTQUEUE_MAX_NAME_SZ, _str, \
+ ##__VA_ARGS__); \
+ (_i)->vqai_maxindirsz = (_nsegs); \
+ (_i)->vqai_intr = (_intr); \
+ (_i)->vqai_intr_arg = (_arg); \
+ (_i)->vqai_vq = (_vqp); \
+} while (0)
+
+uint64_t virtqueue_filter_features(uint64_t features);
+
+int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size,
+ int align, vm_paddr_t highaddr, struct vq_alloc_info *info,
+ struct virtqueue **vqp);
+void *virtqueue_drain(struct virtqueue *vq, int *last);
+void virtqueue_free(struct virtqueue *vq);
+int virtqueue_reinit(struct virtqueue *vq, uint16_t size);
+
+int virtqueue_intr(struct virtqueue *vq);
+int virtqueue_enable_intr(struct virtqueue *vq);
+void virtqueue_disable_intr(struct virtqueue *vq);
+
+/* Get physical address of the virtqueue ring. */
+vm_paddr_t virtqueue_paddr(struct virtqueue *vq);
+
+int virtqueue_full(struct virtqueue *vq);
+int virtqueue_empty(struct virtqueue *vq);
+int virtqueue_size(struct virtqueue *vq);
+int virtqueue_nused(struct virtqueue *vq);
+void virtqueue_notify(struct virtqueue *vq);
+void virtqueue_dump(struct virtqueue *vq);
+
+int virtqueue_enqueue(struct virtqueue *vq, void *cookie,
+ struct sglist *sg, int readable, int writable);
+void *virtqueue_dequeue(struct virtqueue *vq, uint32_t *len);
+void *virtqueue_poll(struct virtqueue *vq, uint32_t *len);
+
+#endif /* _VIRTIO_VIRTQUEUE_H */