27 files changed, 8190 insertions, 0 deletions
diff --git a/share/man/man4/virtio.4 b/share/man/man4/virtio.4 new file mode 100644 index 0000000..7787051 --- /dev/null +++ b/share/man/man4/virtio.4 @@ -0,0 +1,91 @@ +.\" Copyright (c) 2011 Bryan Venteicher +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd July 4, 2011 +.Dt VIRTIO 4 +.Os +.Sh NAME +.Nm virtio +.Nd VirtIO Device Support +.Sh SYNOPSIS +To compile VirtIO device support into the kernel, place the following lines +in your kernel configuration file: +.Bd -ragged -offset indent +.Cd "device virtio" +.Cd "device virtio_pci" +.Ed +.Pp +Alternatively, to load VirtIO support as modules at boot time, place the +following lines in +.Xr loader.conf 5 : +.Bd -literal -offset indent +virtio_load="YES" +virtio_pci_load="YES" +.Ed +.Sh DESCRIPTION +VirtIO is a specification for para-virtualized I/O in a virtual machine (VM). +Traditionally, the hypervisor emulated real devices such as an Ethernet +interface or disk controller to provide the VM with I/O. This emulation is +often inefficient. +.Pp +VirtIO defines an interface for efficient I/O between the hypervisor and VM. +The +.Xr virtio 4 +module provides a shared memory transport called a virtqueue. +The +.Xr virtio_pci 4 +device driver represents an emulated PCI device that the hypervisor makes +available to the VM. This device provides the probing, configuration, and +interrupt notifications needed to interact with the hypervisor. +.Fx +supports the following VirtIO devices: +.Bl -hang -offset indent -width xxxxxxxx +.It Nm Ethernet +An emulated Ethernet device is provided by the +.Xr if_vtnet 4 +device driver. +.It Nm Block +An emulated disk controller is provided by the +.Xr virtio_blk 4 +device driver. +.It Nm Balloon +A pseudo-device to allow the VM to release memory back to the hypervisor is +provided by the +.Xr virtio_balloon 4 +device driver. +.El +.Sh SEE ALSO +.Xr if_vtnet 4 , +.Xr virtio_blk 4 , +.Xr virtio_balloon 4 +.Sh HISTORY +Support for VirtIO first appeared in +.Fx 9.0 . +.Sh AUTHORS +.An -nosplit +.Fx +support for VirtIO was first added by +.An Bryan Venteicher Aq bryanv@daemoninthecloset.org .
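[Editor's note] The virtqueue described above is the transport every driver in this commit builds on: the guest gathers its buffers into a scatter/gather list, enqueues them with a cookie, notifies the host, and later collects the cookie back once the host has consumed the buffers. The sketch below is an illustration only, not part of the patch; the function name and buffers are hypothetical, error handling is omitted, and it simply mirrors the polled round trip used by vtblk_poll_request() and vtballoon_send_page_frames() further down.

#include <sys/param.h>
#include <sys/sglist.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>

/*
 * Hypothetical example: hand 'hdr' to the host for reading, let the host
 * write a one-byte status into 'ack', then poll for completion.
 */
static int
example_vq_round_trip(struct virtqueue *vq, void *hdr, size_t hdr_len,
    uint8_t *ack)
{
	struct sglist_seg segs[2];
	struct sglist sg;
	int error;

	/* Gather the two buffers into a scatter/gather list. */
	sglist_init(&sg, 2, segs);
	sglist_append(&sg, hdr, hdr_len);		/* host reads this */
	sglist_append(&sg, ack, sizeof(uint8_t));	/* host writes this */

	/* One readable and one writable segment; 'hdr' is the cookie. */
	error = virtqueue_enqueue(vq, hdr, &sg, 1, 1);
	if (error)
		return (error);

	/* Kick the host, then spin until it hands the cookie back. */
	virtqueue_notify(vq);
	virtqueue_poll(vq, NULL);

	return (0);
}

The interrupt-driven paths in the drivers below follow the same pattern, except that instead of virtqueue_poll() the caller sleeps until the virtqueue interrupt handler dequeues the completed cookie.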
diff --git a/share/man/man4/virtio_balloon.4 b/share/man/man4/virtio_balloon.4 new file mode 100644 index 0000000..4053f78 --- /dev/null +++ b/share/man/man4/virtio_balloon.4 @@ -0,0 +1,64 @@ +.\" Copyright (c) 2011 Bryan Venteicher +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd July 4, 2011 +.Dt VIRTIO_BALLOON 4 +.Os +.Sh NAME +.Nm virtio_balloon +.Nd VirtIO Memory Balloon driver +.Sh SYNOPSIS +To compile this driver into the kernel, +place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device virtio_balloon" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +virtio_balloon_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +device driver provides support for VirtIO memory balloon devices. +.Pp +The memory balloon allows the guest to, at the request of the +hypervisor, return memory allocated to the hypervisor so it can +be made available to other guests. The hypervisor can later +signal the balloon to return the memory. +.Sh SEE ALSO +.Xr virtio 4 +.Sh HISTORY +The +.Nm +driver was written by +.An Bryan Venteicher Aq bryanv@daemoninthecloset.org . +It first appeared in +.Fx 9.0 . diff --git a/share/man/man4/virtio_blk.4 b/share/man/man4/virtio_blk.4 new file mode 100644 index 0000000..4250141 --- /dev/null +++ b/share/man/man4/virtio_blk.4 @@ -0,0 +1,70 @@ +.\" Copyright (c) 2011 Bryan Venteicher +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. +.\" +.\" $FreeBSD$ +.\" +.Dd July 4, 2011 +.Dt VIRTIO_BLK 4 +.Os +.Sh NAME +.Nm virtio_blk +.Nd VirtIO Block driver +.Sh SYNOPSIS +To compile this driver into the kernel, +place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device virtio_blk" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +virtio_blk_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +device driver provides support for VirtIO block devices. +.Pp +.Sh LOADER TUNABLES +Tunables can be set at the +.Xr loader 8 +prompt before booting the kernel or stored in +.Xr loader.conf 5 . +.Bl -tag -width "xxxxxx" +.It Va hw.vtblk.no_ident +This tunable disables retrieving the device identification string +from the hypervisor. The default value is 0. +.El +.Sh SEE ALSO +.Xr virtio 4 +.Sh HISTORY +The +.Nm +driver was written by +.An Bryan Venteicher Aq bryanv@daemoninthecloset.org . +It first appeared in +.Fx 9.0 . diff --git a/share/man/man4/vtnet.4 b/share/man/man4/vtnet.4 new file mode 100644 index 0000000..900d12d --- /dev/null +++ b/share/man/man4/vtnet.4 @@ -0,0 +1,98 @@ +.\" Copyright (c) 2011 Bryan Venteicher +.\" All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +.\" SUCH DAMAGE. 
+.\" +.\" $FreeBSD$ +.\" +.Dd July 4, 2011 +.Dt VTNET 4 +.Os +.Sh NAME +.Nm vtnet +.Nd VirtIO Ethernet driver +.Sh SYNOPSIS +To compile this driver into the kernel, +place the following lines in your +kernel configuration file: +.Bd -ragged -offset indent +.Cd "device if_vtnet" +.Ed +.Pp +Alternatively, to load the driver as a +module at boot time, place the following line in +.Xr loader.conf 5 : +.Bd -literal -offset indent +if_vtnet_load="YES" +.Ed +.Sh DESCRIPTION +The +.Nm +device driver provides support for VirtIO Ethernet devices. +.Pp +If the hypervisor advertises the appreciate features, the +.Nm +driver supports TCP/UDP checksum offload for both transmit and receive, +TCP segmentation offload (TSO), TCP large receive offload (LRO), and +hardware VLAN tag stripping/insertion features, as well as a multicast +hash filter, as well as Jumbo Frames (up to 9216 bytes), which can be +configured via the interface MTU setting. +Selecting an MTU larger than 1500 bytes with the +.Xr ifconfig 8 +utility configures the adapter to receive and transmit Jumbo Frames. +.Pp +For more information on configuring this device, see +.Xr ifconfig 8 . +.El +.Sh LOADER TUNABLES +Tunables can be set at the +.Xr loader 8 +prompt before booting the kernel or stored in +.Xr loader.conf 5 . +.Bl -tag -width "xxxxxx" +.It Va hw.vtnet.csum_disable +This tunable disables receive and send checksum offload. The default +value is 0. +.It Va hw.vtnet.tso_disable +This tunable disables TSO. The default value is 0. +.It Va hw.vtnet.lro_disable +This tunable disables LRO. The default value is 0. +.El +.Sh SEE ALSO +.Xr arp 4 , +.Xr netintro 4 , +.Xr ng_ether 4 , +.Xr vlan 4 , +.Xr virtio 4 , +.Xr ifconfig 8 +.Sh HISTORY +The +.Nm +driver was written by +.An Bryan Venteicher Aq bryanv@daemoninthecloset.org . +It first appeared in +.Fx 9.0 . +.Sh CAVEATS +The +.Nm +driver only supports LRO when the hypervisor advertises the +mergeable buffer feature. diff --git a/sys/dev/virtio/balloon/virtio_balloon.c b/sys/dev/virtio/balloon/virtio_balloon.c new file mode 100644 index 0000000..ef7aca9 --- /dev/null +++ b/sys/dev/virtio/balloon/virtio_balloon.c @@ -0,0 +1,569 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* Driver for VirtIO memory balloon devices. */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/endian.h> +#include <sys/kthread.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/sglist.h> +#include <sys/sysctl.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> + +#include <vm/vm.h> +#include <vm/vm_page.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus.h> +#include <sys/rman.h> + +#include <dev/virtio/virtio.h> +#include <dev/virtio/virtqueue.h> +#include <dev/virtio/balloon/virtio_balloon.h> + +#include "virtio_if.h" + +struct vtballoon_softc { + device_t vtballoon_dev; + struct mtx vtballoon_mtx; + uint64_t vtballoon_features; + uint32_t vtballoon_flags; +#define VTBALLOON_FLAG_DETACH 0x01 + + struct virtqueue *vtballoon_inflate_vq; + struct virtqueue *vtballoon_deflate_vq; + + uint32_t vtballoon_desired_npages; + uint32_t vtballoon_current_npages; + TAILQ_HEAD(,vm_page) vtballoon_pages; + + struct proc *vtballoon_kproc; + uint32_t *vtballoon_page_frames; + int vtballoon_timeout; +}; + +static struct virtio_feature_desc vtballoon_feature_desc[] = { + { VIRTIO_BALLOON_F_MUST_TELL_HOST, "MustTellHost" }, + { VIRTIO_BALLOON_F_STATS_VQ, "StatsVq" }, + + { 0, NULL } +}; + +static int vtballoon_probe(device_t); +static int vtballoon_attach(device_t); +static int vtballoon_detach(device_t); +static int vtballoon_config_change(device_t); + +static void vtballoon_negotiate_features(struct vtballoon_softc *); +static int vtballoon_alloc_virtqueues(struct vtballoon_softc *); + +static int vtballoon_vq_intr(void *); + +static void vtballoon_inflate(struct vtballoon_softc *, int); +static void vtballoon_deflate(struct vtballoon_softc *, int); + +static void vtballoon_send_page_frames(struct vtballoon_softc *, + struct virtqueue *, int); + +static void vtballoon_pop(struct vtballoon_softc *); +static void vtballoon_stop(struct vtballoon_softc *); + +static vm_page_t + vtballoon_alloc_page(struct vtballoon_softc *); +static void vtballoon_free_page(struct vtballoon_softc *, vm_page_t); + +static int vtballoon_sleep(struct vtballoon_softc *); +static void vtballoon_thread(void *); +static void vtballoon_add_sysctl(struct vtballoon_softc *); + +/* Features desired/implemented by this driver. */ +#define VTBALLOON_FEATURES 0 + +/* Timeout between retries when the balloon needs inflating. */ +#define VTBALLOON_LOWMEM_TIMEOUT hz + +/* + * Maximum number of pages we'll request to inflate or deflate + * the balloon in one virtqueue request. Both Linux and NetBSD + * have settled on 256, doing up to 1MB at a time. + */ +#define VTBALLOON_PAGES_PER_REQUEST 256 + +#define VTBALLOON_MTX(_sc) &(_sc)->vtballoon_mtx +#define VTBALLOON_LOCK_INIT(_sc, _name) mtx_init(VTBALLOON_MTX((_sc)), _name, \ + "VirtIO Balloon Lock", MTX_SPIN) +#define VTBALLOON_LOCK(_sc) mtx_lock_spin(VTBALLOON_MTX((_sc))) +#define VTBALLOON_UNLOCK(_sc) mtx_unlock_spin(VTBALLOON_MTX((_sc))) +#define VTBALLOON_LOCK_DESTROY(_sc) mtx_destroy(VTBALLOON_MTX((_sc))) + +static device_method_t vtballoon_methods[] = { + /* Device methods. */ + DEVMETHOD(device_probe, vtballoon_probe), + DEVMETHOD(device_attach, vtballoon_attach), + DEVMETHOD(device_detach, vtballoon_detach), + + /* VirtIO methods. 
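	 * virtio_config_change is called by the bus when the host updates the
	 * device configuration; the balloon wakes its worker thread here so
	 * the new target size is re-read.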
*/ + DEVMETHOD(virtio_config_change, vtballoon_config_change), + + { 0, 0 } +}; + +static driver_t vtballoon_driver = { + "vtballoon", + vtballoon_methods, + sizeof(struct vtballoon_softc) +}; +static devclass_t vtballoon_devclass; + +DRIVER_MODULE(virtio_balloon, virtio_pci, vtballoon_driver, + vtballoon_devclass, 0, 0); +MODULE_VERSION(virtio_balloon, 1); +MODULE_DEPEND(virtio_balloon, virtio, 1, 1, 1); + +static int +vtballoon_probe(device_t dev) +{ + + if (virtio_get_device_type(dev) != VIRTIO_ID_BALLOON) + return (ENXIO); + + device_set_desc(dev, "VirtIO Balloon Adapter"); + + return (BUS_PROBE_DEFAULT); +} + +static int +vtballoon_attach(device_t dev) +{ + struct vtballoon_softc *sc; + int error; + + sc = device_get_softc(dev); + sc->vtballoon_dev = dev; + + VTBALLOON_LOCK_INIT(sc, device_get_nameunit(dev)); + TAILQ_INIT(&sc->vtballoon_pages); + + vtballoon_add_sysctl(sc); + + virtio_set_feature_desc(dev, vtballoon_feature_desc); + vtballoon_negotiate_features(sc); + + sc->vtballoon_page_frames = malloc(VTBALLOON_PAGES_PER_REQUEST * + sizeof(uint32_t), M_DEVBUF, M_NOWAIT | M_ZERO); + if (sc->vtballoon_page_frames == NULL) { + error = ENOMEM; + device_printf(dev, + "cannot allocate page frame request array\n"); + goto fail; + } + + error = vtballoon_alloc_virtqueues(sc); + if (error) { + device_printf(dev, "cannot allocate virtqueues\n"); + goto fail; + } + + error = virtio_setup_intr(dev, INTR_TYPE_MISC); + if (error) { + device_printf(dev, "cannot setup virtqueue interrupts\n"); + goto fail; + } + + error = kproc_create(vtballoon_thread, sc, &sc->vtballoon_kproc, + 0, 0, "virtio_balloon"); + if (error) { + device_printf(dev, "cannot create balloon kproc\n"); + goto fail; + } + + virtqueue_enable_intr(sc->vtballoon_inflate_vq); + virtqueue_enable_intr(sc->vtballoon_deflate_vq); + +fail: + if (error) + vtballoon_detach(dev); + + return (error); +} + +static int +vtballoon_detach(device_t dev) +{ + struct vtballoon_softc *sc; + + sc = device_get_softc(dev); + + if (sc->vtballoon_kproc != NULL) { + VTBALLOON_LOCK(sc); + sc->vtballoon_flags |= VTBALLOON_FLAG_DETACH; + wakeup_one(sc); + msleep_spin(sc->vtballoon_kproc, VTBALLOON_MTX(sc), + "vtbdth", 0); + VTBALLOON_UNLOCK(sc); + + sc->vtballoon_kproc = NULL; + } + + if (device_is_attached(dev)) { + vtballoon_pop(sc); + vtballoon_stop(sc); + } + + if (sc->vtballoon_page_frames != NULL) { + free(sc->vtballoon_page_frames, M_DEVBUF); + sc->vtballoon_page_frames = NULL; + } + + VTBALLOON_LOCK_DESTROY(sc); + + return (0); +} + +static int +vtballoon_config_change(device_t dev) +{ + struct vtballoon_softc *sc; + + sc = device_get_softc(dev); + + VTBALLOON_LOCK(sc); + wakeup_one(sc); + VTBALLOON_UNLOCK(sc); + + return (1); +} + +static void +vtballoon_negotiate_features(struct vtballoon_softc *sc) +{ + device_t dev; + uint64_t features; + + dev = sc->vtballoon_dev; + features = virtio_negotiate_features(dev, VTBALLOON_FEATURES); + sc->vtballoon_features = features; +} + +static int +vtballoon_alloc_virtqueues(struct vtballoon_softc *sc) +{ + device_t dev; + struct vq_alloc_info vq_info[2]; + int nvqs; + + dev = sc->vtballoon_dev; + nvqs = 2; + + VQ_ALLOC_INFO_INIT(&vq_info[0], 0, vtballoon_vq_intr, sc, + &sc->vtballoon_inflate_vq, "%s inflate", device_get_nameunit(dev)); + + VQ_ALLOC_INFO_INIT(&vq_info[1], 0, vtballoon_vq_intr, sc, + &sc->vtballoon_deflate_vq, "%s deflate", device_get_nameunit(dev)); + + return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info)); +} + +static int +vtballoon_vq_intr(void *xsc) +{ + struct vtballoon_softc *sc; + + sc 
= xsc; + + VTBALLOON_LOCK(sc); + wakeup_one(sc); + VTBALLOON_UNLOCK(sc); + + return (1); +} + +static void +vtballoon_inflate(struct vtballoon_softc *sc, int npages) +{ + struct virtqueue *vq; + vm_page_t m; + int i; + + vq = sc->vtballoon_inflate_vq; + m = NULL; + + if (npages > VTBALLOON_PAGES_PER_REQUEST) + npages = VTBALLOON_PAGES_PER_REQUEST; + KASSERT(npages > 0, ("balloon doesn't need inflating?")); + + for (i = 0; i < npages; i++) { + if ((m = vtballoon_alloc_page(sc)) == NULL) + break; + + sc->vtballoon_page_frames[i] = + VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; + + KASSERT(m->queue == PQ_NONE, ("allocated page on queue")); + TAILQ_INSERT_TAIL(&sc->vtballoon_pages, m, pageq); + } + + if (i > 0) + vtballoon_send_page_frames(sc, vq, i); + + if (m == NULL) + sc->vtballoon_timeout = VTBALLOON_LOWMEM_TIMEOUT; +} + +static void +vtballoon_deflate(struct vtballoon_softc *sc, int npages) +{ + TAILQ_HEAD(, vm_page) free_pages; + struct virtqueue *vq; + vm_page_t m; + int i; + + vq = sc->vtballoon_deflate_vq; + TAILQ_INIT(&free_pages); + + if (npages > VTBALLOON_PAGES_PER_REQUEST) + npages = VTBALLOON_PAGES_PER_REQUEST; + KASSERT(npages > 0, ("balloon doesn't need deflating?")); + + for (i = 0; i < npages; i++) { + m = TAILQ_FIRST(&sc->vtballoon_pages); + KASSERT(m != NULL, ("no more pages to deflate")); + + sc->vtballoon_page_frames[i] = + VM_PAGE_TO_PHYS(m) >> VIRTIO_BALLOON_PFN_SHIFT; + + TAILQ_REMOVE(&sc->vtballoon_pages, m, pageq); + TAILQ_INSERT_TAIL(&free_pages, m, pageq); + } + + if (i > 0) { + /* Always tell host first before freeing the pages. */ + vtballoon_send_page_frames(sc, vq, i); + + while ((m = TAILQ_FIRST(&free_pages)) != NULL) { + TAILQ_REMOVE(&free_pages, m, pageq); + vtballoon_free_page(sc, m); + } + } + + KASSERT((TAILQ_EMPTY(&sc->vtballoon_pages) && + sc->vtballoon_current_npages == 0) || + (!TAILQ_EMPTY(&sc->vtballoon_pages) && + sc->vtballoon_current_npages != 0), ("balloon empty?")); +} + +static void +vtballoon_send_page_frames(struct vtballoon_softc *sc, struct virtqueue *vq, + int npages) +{ + struct sglist sg; + struct sglist_seg segs[1]; + void *c; + int error; + + sglist_init(&sg, 1, segs); + + error = sglist_append(&sg, sc->vtballoon_page_frames, + npages * sizeof(uint32_t)); + KASSERT(error == 0, ("error adding page frames to sglist")); + + error = virtqueue_enqueue(vq, vq, &sg, 1, 0); + KASSERT(error == 0, ("error enqueuing page frames to virtqueue")); + + /* + * Inflate and deflate operations are done synchronously. The + * interrupt handler will wake us up. 
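	 * virtqueue_notify() kicks the host; when the request completes,
	 * vtballoon_vq_intr() does a wakeup_one() and the msleep_spin()
	 * loop below dequeues the finished cookie.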
+ */ + VTBALLOON_LOCK(sc); + virtqueue_notify(vq); + + while ((c = virtqueue_dequeue(vq, NULL)) == NULL) + msleep_spin(sc, VTBALLOON_MTX(sc), "vtbspf", 0); + VTBALLOON_UNLOCK(sc); + + KASSERT(c == vq, ("unexpected balloon operation response")); +} + +static void +vtballoon_pop(struct vtballoon_softc *sc) +{ + + while (!TAILQ_EMPTY(&sc->vtballoon_pages)) + vtballoon_deflate(sc, sc->vtballoon_current_npages); +} + +static void +vtballoon_stop(struct vtballoon_softc *sc) +{ + + virtqueue_disable_intr(sc->vtballoon_inflate_vq); + virtqueue_disable_intr(sc->vtballoon_deflate_vq); + + virtio_stop(sc->vtballoon_dev); +} + +static vm_page_t +vtballoon_alloc_page(struct vtballoon_softc *sc) +{ + vm_page_t m; + + m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_WIRED | + VM_ALLOC_NOOBJ); + if (m != NULL) + sc->vtballoon_current_npages++; + + return (m); +} + +static void +vtballoon_free_page(struct vtballoon_softc *sc, vm_page_t m) +{ + + vm_page_unwire(m, 0); + vm_page_free(m); + sc->vtballoon_current_npages--; +} + +static uint32_t +vtballoon_desired_size(struct vtballoon_softc *sc) +{ + uint32_t desired; + + desired = virtio_read_dev_config_4(sc->vtballoon_dev, + offsetof(struct virtio_balloon_config, num_pages)); + + return (le32toh(desired)); +} + +static void +vtballoon_update_size(struct vtballoon_softc *sc) +{ + + virtio_write_dev_config_4(sc->vtballoon_dev, + offsetof(struct virtio_balloon_config, actual), + htole32(sc->vtballoon_current_npages)); + +} + +static int +vtballoon_sleep(struct vtballoon_softc *sc) +{ + int rc, timeout; + uint32_t current, desired; + + rc = 0; + current = sc->vtballoon_current_npages; + + VTBALLOON_LOCK(sc); + for (;;) { + if (sc->vtballoon_flags & VTBALLOON_FLAG_DETACH) { + rc = 1; + break; + } + + desired = vtballoon_desired_size(sc); + sc->vtballoon_desired_npages = desired; + + /* + * If given, use non-zero timeout on the first time through + * the loop. On subsequent times, timeout will be zero so + * we will reevaluate the desired size of the balloon and + * break out to retry if needed. 
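	 * (The timeout is only non-zero when vtballoon_inflate() failed a
	 * page allocation and set VTBALLOON_LOWMEM_TIMEOUT.)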
+ */ + timeout = sc->vtballoon_timeout; + sc->vtballoon_timeout = 0; + + if (current > desired) + break; + if (current < desired && timeout == 0) + break; + + msleep_spin(sc, VTBALLOON_MTX(sc), "vtbslp", timeout); + } + VTBALLOON_UNLOCK(sc); + + return (rc); +} + +static void +vtballoon_thread(void *xsc) +{ + struct vtballoon_softc *sc; + uint32_t current, desired; + + sc = xsc; + + for (;;) { + if (vtballoon_sleep(sc) != 0) + break; + + current = sc->vtballoon_current_npages; + desired = sc->vtballoon_desired_npages; + + if (desired != current) { + if (desired > current) + vtballoon_inflate(sc, desired - current); + else + vtballoon_deflate(sc, current - desired); + + vtballoon_update_size(sc); + } + } + + kproc_exit(0); +} + +static void +vtballoon_add_sysctl(struct vtballoon_softc *sc) +{ + device_t dev; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + + dev = sc->vtballoon_dev; + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "desired", + CTLFLAG_RD, &sc->vtballoon_desired_npages, sizeof(uint32_t), + "Desired balloon size in pages"); + + SYSCTL_ADD_UINT(ctx, child, OID_AUTO, "current", + CTLFLAG_RD, &sc->vtballoon_current_npages, sizeof(uint32_t), + "Current balloon size in pages"); +} diff --git a/sys/dev/virtio/balloon/virtio_balloon.h b/sys/dev/virtio/balloon/virtio_balloon.h new file mode 100644 index 0000000..cea84ba --- /dev/null +++ b/sys/dev/virtio/balloon/virtio_balloon.h @@ -0,0 +1,41 @@ +/* + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_BALLOON_H +#define _VIRTIO_BALLOON_H + +#include <sys/types.h> + +/* Feature bits. */ +#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0x1 /* Tell before reclaiming pages */ +#define VIRTIO_BALLOON_F_STATS_VQ 0x2 /* Memory stats virtqueue */ + +/* Size of a PFN in the balloon interface. */ +#define VIRTIO_BALLOON_PFN_SHIFT 12 + +struct virtio_balloon_config { + /* Number of pages host wants Guest to give up. */ + uint32_t num_pages; + + /* Number of pages we've actually got in balloon. */ + uint32_t actual; +}; + +#define VIRTIO_BALLOON_S_SWAP_IN 0 /* Amount of memory swapped in */ +#define VIRTIO_BALLOON_S_SWAP_OUT 1 /* Amount of memory swapped out */ +#define VIRTIO_BALLOON_S_MAJFLT 2 /* Number of major faults */ +#define VIRTIO_BALLOON_S_MINFLT 3 /* Number of minor faults */ +#define VIRTIO_BALLOON_S_MEMFREE 4 /* Total amount of free memory */ +#define VIRTIO_BALLOON_S_MEMTOT 5 /* Total amount of memory */ +#define VIRTIO_BALLOON_S_NR 6 + +struct virtio_balloon_stat { + uint16_t tag; + uint64_t val; +} __packed; + +#endif /* _VIRTIO_BALLOON_H */ diff --git a/sys/dev/virtio/block/virtio_blk.c b/sys/dev/virtio/block/virtio_blk.c new file mode 100644 index 0000000..09783a8 --- /dev/null +++ b/sys/dev/virtio/block/virtio_blk.c @@ -0,0 +1,1149 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Driver for VirtIO block devices. */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/bio.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/sglist.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> + +#include <geom/geom_disk.h> +#include <vm/uma.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus.h> +#include <sys/rman.h> + +#include <dev/virtio/virtio.h> +#include <dev/virtio/virtqueue.h> +#include <dev/virtio/block/virtio_blk.h> + +#include "virtio_if.h" + +struct vtblk_request { + struct virtio_blk_outhdr vbr_hdr; + struct bio *vbr_bp; + uint8_t vbr_ack; + + TAILQ_ENTRY(vtblk_request) vbr_link; +}; + +struct vtblk_softc { + device_t vtblk_dev; + struct mtx vtblk_mtx; + uint64_t vtblk_features; + uint32_t vtblk_flags; +#define VTBLK_FLAG_INDIRECT 0x0001 +#define VTBLK_FLAG_READONLY 0x0002 +#define VTBLK_FLAG_DETACHING 0x0004 +#define VTBLK_FLAG_SUSPENDED 0x0008 +#define VTBLK_FLAG_DUMPING 0x0010 + + struct virtqueue *vtblk_vq; + struct sglist *vtblk_sglist; + struct disk *vtblk_disk; + + struct bio_queue_head vtblk_bioq; + TAILQ_HEAD(, vtblk_request) + vtblk_req_free; + TAILQ_HEAD(, vtblk_request) + vtblk_req_ready; + + struct taskqueue *vtblk_tq; + struct task vtblk_intr_task; + + int vtblk_sector_size; + int vtblk_max_nsegs; + int vtblk_unit; + int vtblk_request_count; + + struct vtblk_request vtblk_dump_request; +}; + +static struct virtio_feature_desc vtblk_feature_desc[] = { + { VIRTIO_BLK_F_BARRIER, "HostBarrier" }, + { VIRTIO_BLK_F_SIZE_MAX, "MaxSegSize" }, + { VIRTIO_BLK_F_SEG_MAX, "MaxNumSegs" }, + { VIRTIO_BLK_F_GEOMETRY, "DiskGeometry" }, + { VIRTIO_BLK_F_RO, "ReadOnly" }, + { VIRTIO_BLK_F_BLK_SIZE, "BlockSize" }, + { VIRTIO_BLK_F_SCSI, "SCSICmds" }, + { VIRTIO_BLK_F_FLUSH, "FlushCmd" }, + { VIRTIO_BLK_F_TOPOLOGY, "Topology" }, + + { 0, NULL } +}; + +static int vtblk_modevent(module_t, int, void *); + +static int vtblk_probe(device_t); +static int vtblk_attach(device_t); +static int vtblk_detach(device_t); +static int vtblk_suspend(device_t); +static int vtblk_resume(device_t); +static int vtblk_shutdown(device_t); + +static void vtblk_negotiate_features(struct vtblk_softc *); +static int vtblk_maximum_segments(struct vtblk_softc *, + struct virtio_blk_config *); +static int vtblk_alloc_virtqueue(struct vtblk_softc *); +static void vtblk_alloc_disk(struct vtblk_softc *, + struct virtio_blk_config *); +static void vtblk_create_disk(struct 
vtblk_softc *); + +static int vtblk_open(struct disk *); +static int vtblk_close(struct disk *); +static int vtblk_ioctl(struct disk *, u_long, void *, int, + struct thread *); +static int vtblk_dump(void *, void *, vm_offset_t, off_t, size_t); +static void vtblk_strategy(struct bio *); + +static void vtblk_startio(struct vtblk_softc *); +static struct vtblk_request * vtblk_bio_request(struct vtblk_softc *); +static int vtblk_execute_request(struct vtblk_softc *, + struct vtblk_request *); + +static int vtblk_vq_intr(void *); +static void vtblk_intr_task(void *, int); + +static void vtblk_stop(struct vtblk_softc *); + +static void vtblk_get_ident(struct vtblk_softc *); +static void vtblk_prepare_dump(struct vtblk_softc *); +static int vtblk_write_dump(struct vtblk_softc *, void *, off_t, size_t); +static int vtblk_flush_dump(struct vtblk_softc *); +static int vtblk_poll_request(struct vtblk_softc *, + struct vtblk_request *); + +static void vtblk_drain_vq(struct vtblk_softc *, int); +static void vtblk_drain(struct vtblk_softc *); + +static int vtblk_alloc_requests(struct vtblk_softc *); +static void vtblk_free_requests(struct vtblk_softc *); +static struct vtblk_request * vtblk_dequeue_request(struct vtblk_softc *); +static void vtblk_enqueue_request(struct vtblk_softc *, + struct vtblk_request *); + +static struct vtblk_request * vtblk_dequeue_ready(struct vtblk_softc *); +static void vtblk_enqueue_ready(struct vtblk_softc *, + struct vtblk_request *); + +static void vtblk_bio_error(struct bio *, int); + +/* Tunables. */ +static int vtblk_no_ident = 0; +TUNABLE_INT("hw.vtblk.no_ident", &vtblk_no_ident); + +/* Features desired/implemented by this driver. */ +#define VTBLK_FEATURES \ + (VIRTIO_BLK_F_BARRIER | \ + VIRTIO_BLK_F_SIZE_MAX | \ + VIRTIO_BLK_F_SEG_MAX | \ + VIRTIO_BLK_F_GEOMETRY | \ + VIRTIO_BLK_F_RO | \ + VIRTIO_BLK_F_BLK_SIZE | \ + VIRTIO_BLK_F_FLUSH | \ + VIRTIO_RING_F_INDIRECT_DESC) + +#define VTBLK_MTX(_sc) &(_sc)->vtblk_mtx +#define VTBLK_LOCK_INIT(_sc, _name) \ + mtx_init(VTBLK_MTX((_sc)), (_name), \ + "VTBLK Lock", MTX_DEF) +#define VTBLK_LOCK(_sc) mtx_lock(VTBLK_MTX((_sc))) +#define VTBLK_TRYLOCK(_sc) mtx_trylock(VTBLK_MTX((_sc))) +#define VTBLK_UNLOCK(_sc) mtx_unlock(VTBLK_MTX((_sc))) +#define VTBLK_LOCK_DESTROY(_sc) mtx_destroy(VTBLK_MTX((_sc))) +#define VTBLK_LOCK_ASSERT(_sc) mtx_assert(VTBLK_MTX((_sc)), MA_OWNED) +#define VTBLK_LOCK_ASSERT_NOTOWNED(_sc) \ + mtx_assert(VTBLK_MTX((_sc)), MA_NOTOWNED) + +#define VTBLK_BIO_SEGMENTS(_bp) sglist_count((_bp)->bio_data, (_bp)->bio_bcount) + +#define VTBLK_DISK_NAME "vtbd" + +/* + * Each block request uses at least two segments - one for the header + * and one for the status. + */ +#define VTBLK_MIN_SEGMENTS 2 + +static uma_zone_t vtblk_req_zone; + +static device_method_t vtblk_methods[] = { + /* Device methods. 
*/ + DEVMETHOD(device_probe, vtblk_probe), + DEVMETHOD(device_attach, vtblk_attach), + DEVMETHOD(device_detach, vtblk_detach), + DEVMETHOD(device_suspend, vtblk_suspend), + DEVMETHOD(device_resume, vtblk_resume), + DEVMETHOD(device_shutdown, vtblk_shutdown), + + { 0, 0 } +}; + +static driver_t vtblk_driver = { + "vtblk", + vtblk_methods, + sizeof(struct vtblk_softc) +}; +static devclass_t vtblk_devclass; + +DRIVER_MODULE(virtio_blk, virtio_pci, vtblk_driver, vtblk_devclass, + vtblk_modevent, 0); +MODULE_VERSION(virtio_blk, 1); +MODULE_DEPEND(virtio_blk, virtio, 1, 1, 1); + +static int +vtblk_modevent(module_t mod, int type, void *unused) +{ + int error; + + error = 0; + + switch (type) { + case MOD_LOAD: + vtblk_req_zone = uma_zcreate("vtblk_request", + sizeof(struct vtblk_request), + NULL, NULL, NULL, NULL, 0, 0); + break; + case MOD_QUIESCE: + case MOD_UNLOAD: + if (uma_zone_get_cur(vtblk_req_zone) > 0) + error = EBUSY; + else if (type == MOD_UNLOAD) { + uma_zdestroy(vtblk_req_zone); + vtblk_req_zone = NULL; + } + break; + case MOD_SHUTDOWN: + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +static int +vtblk_probe(device_t dev) +{ + + if (virtio_get_device_type(dev) != VIRTIO_ID_BLOCK) + return (ENXIO); + + device_set_desc(dev, "VirtIO Block Adapter"); + + return (BUS_PROBE_DEFAULT); +} + +static int +vtblk_attach(device_t dev) +{ + struct vtblk_softc *sc; + struct virtio_blk_config blkcfg; + int error; + + sc = device_get_softc(dev); + sc->vtblk_dev = dev; + sc->vtblk_unit = device_get_unit(dev); + + VTBLK_LOCK_INIT(sc, device_get_nameunit(dev)); + + bioq_init(&sc->vtblk_bioq); + TAILQ_INIT(&sc->vtblk_req_free); + TAILQ_INIT(&sc->vtblk_req_ready); + + virtio_set_feature_desc(dev, vtblk_feature_desc); + vtblk_negotiate_features(sc); + + if (virtio_with_feature(dev, VIRTIO_RING_F_INDIRECT_DESC)) + sc->vtblk_flags |= VTBLK_FLAG_INDIRECT; + + if (virtio_with_feature(dev, VIRTIO_BLK_F_RO)) + sc->vtblk_flags |= VTBLK_FLAG_READONLY; + + /* Get local copy of config. */ + if (virtio_with_feature(dev, VIRTIO_BLK_F_TOPOLOGY) == 0) { + bzero(&blkcfg, sizeof(struct virtio_blk_config)); + virtio_read_device_config(dev, 0, &blkcfg, + offsetof(struct virtio_blk_config, physical_block_exp)); + } else + virtio_read_device_config(dev, 0, &blkcfg, + sizeof(struct virtio_blk_config)); + + /* + * With the current sglist(9) implementation, it is not easy + * for us to support a maximum segment size as adjacent + * segments are coalesced. For now, just make sure it's larger + * than the maximum supported transfer size. + */ + if (virtio_with_feature(dev, VIRTIO_BLK_F_SIZE_MAX)) { + if (blkcfg.size_max < MAXPHYS) { + error = ENOTSUP; + device_printf(dev, "host requires unsupported " + "maximum segment size feature\n"); + goto fail; + } + } + + sc->vtblk_max_nsegs = vtblk_maximum_segments(sc, &blkcfg); + + /* + * Allocate working sglist. The number of segments may be too + * large to safely store on the stack. 
+ */ + sc->vtblk_sglist = sglist_alloc(sc->vtblk_max_nsegs, M_NOWAIT); + if (sc->vtblk_sglist == NULL) { + error = ENOMEM; + device_printf(dev, "cannot allocate sglist\n"); + goto fail; + } + + error = vtblk_alloc_virtqueue(sc); + if (error) { + device_printf(dev, "cannot allocate virtqueue\n"); + goto fail; + } + + error = vtblk_alloc_requests(sc); + if (error) { + device_printf(dev, "cannot preallocate requests\n"); + goto fail; + } + + vtblk_alloc_disk(sc, &blkcfg); + + TASK_INIT(&sc->vtblk_intr_task, 0, vtblk_intr_task, sc); + sc->vtblk_tq = taskqueue_create_fast("vtblk_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &sc->vtblk_tq); + if (sc->vtblk_tq == NULL) { + error = ENOMEM; + device_printf(dev, "cannot allocate taskqueue\n"); + goto fail; + } + taskqueue_start_threads(&sc->vtblk_tq, 1, PI_DISK, "%s taskq", + device_get_nameunit(dev)); + + error = virtio_setup_intr(dev, INTR_TYPE_BIO | INTR_ENTROPY); + if (error) { + device_printf(dev, "cannot setup virtqueue interrupt\n"); + goto fail; + } + + vtblk_create_disk(sc); + + virtqueue_enable_intr(sc->vtblk_vq); + +fail: + if (error) + vtblk_detach(dev); + + return (error); +} + +static int +vtblk_detach(device_t dev) +{ + struct vtblk_softc *sc; + + sc = device_get_softc(dev); + + VTBLK_LOCK(sc); + sc->vtblk_flags |= VTBLK_FLAG_DETACHING; + if (device_is_attached(dev)) + vtblk_stop(sc); + VTBLK_UNLOCK(sc); + + if (sc->vtblk_tq != NULL) { + taskqueue_drain(sc->vtblk_tq, &sc->vtblk_intr_task); + taskqueue_free(sc->vtblk_tq); + sc->vtblk_tq = NULL; + } + + vtblk_drain(sc); + + if (sc->vtblk_disk != NULL) { + disk_destroy(sc->vtblk_disk); + sc->vtblk_disk = NULL; + } + + if (sc->vtblk_sglist != NULL) { + sglist_free(sc->vtblk_sglist); + sc->vtblk_sglist = NULL; + } + + VTBLK_LOCK_DESTROY(sc); + + return (0); +} + +static int +vtblk_suspend(device_t dev) +{ + struct vtblk_softc *sc; + + sc = device_get_softc(dev); + + VTBLK_LOCK(sc); + sc->vtblk_flags |= VTBLK_FLAG_SUSPENDED; + /* TODO Wait for any inflight IO to complete? */ + VTBLK_UNLOCK(sc); + + return (0); +} + +static int +vtblk_resume(device_t dev) +{ + struct vtblk_softc *sc; + + sc = device_get_softc(dev); + + VTBLK_LOCK(sc); + sc->vtblk_flags &= ~VTBLK_FLAG_SUSPENDED; + /* TODO Resume IO? */ + VTBLK_UNLOCK(sc); + + return (0); +} + +static int +vtblk_shutdown(device_t dev) +{ + + return (0); +} + +static int +vtblk_open(struct disk *dp) +{ + struct vtblk_softc *sc; + + if ((sc = dp->d_drv1) == NULL) + return (ENXIO); + + return (sc->vtblk_flags & VTBLK_FLAG_DETACHING ? 
ENXIO : 0); +} + +static int +vtblk_close(struct disk *dp) +{ + struct vtblk_softc *sc; + + if ((sc = dp->d_drv1) == NULL) + return (ENXIO); + + return (0); +} + +static int +vtblk_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, + struct thread *td) +{ + struct vtblk_softc *sc; + + if ((sc = dp->d_drv1) == NULL) + return (ENXIO); + + return (ENOTTY); +} + +static int +vtblk_dump(void *arg, void *virtual, vm_offset_t physical, off_t offset, + size_t length) +{ + struct disk *dp; + struct vtblk_softc *sc; + int error; + + dp = arg; + error = 0; + + if ((sc = dp->d_drv1) == NULL) + return (ENXIO); + + if (VTBLK_TRYLOCK(sc) == 0) { + device_printf(sc->vtblk_dev, + "softc already locked, cannot dump...\n"); + return (EBUSY); + } + + if ((sc->vtblk_flags & VTBLK_FLAG_DUMPING) == 0) { + vtblk_prepare_dump(sc); + sc->vtblk_flags |= VTBLK_FLAG_DUMPING; + } + + if (length > 0) + error = vtblk_write_dump(sc, virtual, offset, length); + else if (virtual == NULL && offset == 0) + error = vtblk_flush_dump(sc); + + VTBLK_UNLOCK(sc); + + return (error); +} + +static void +vtblk_strategy(struct bio *bp) +{ + struct vtblk_softc *sc; + + if ((sc = bp->bio_disk->d_drv1) == NULL) { + vtblk_bio_error(bp, EINVAL); + return; + } + + /* + * Fail any write if RO. Unfortunately, there does not seem to + * be a better way to report our readonly'ness to GEOM above. + */ + if (sc->vtblk_flags & VTBLK_FLAG_READONLY && + (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_FLUSH)) { + vtblk_bio_error(bp, EROFS); + return; + } + + /* + * Prevent read/write buffers spanning too many segments from + * getting into the queue. This should only trip if d_maxsize + * was incorrectly set. + */ + if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { + KASSERT(VTBLK_BIO_SEGMENTS(bp) <= sc->vtblk_max_nsegs - + VTBLK_MIN_SEGMENTS, + ("bio spanned too many segments: %d, max: %d", + VTBLK_BIO_SEGMENTS(bp), + sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS)); + } + + VTBLK_LOCK(sc); + if ((sc->vtblk_flags & VTBLK_FLAG_DETACHING) == 0) { + bioq_disksort(&sc->vtblk_bioq, bp); + vtblk_startio(sc); + } else + vtblk_bio_error(bp, ENXIO); + VTBLK_UNLOCK(sc); +} + +static void +vtblk_negotiate_features(struct vtblk_softc *sc) +{ + device_t dev; + uint64_t features; + + dev = sc->vtblk_dev; + features = VTBLK_FEATURES; + + sc->vtblk_features = virtio_negotiate_features(dev, features); +} + +static int +vtblk_maximum_segments(struct vtblk_softc *sc, + struct virtio_blk_config *blkcfg) +{ + device_t dev; + int nsegs; + + dev = sc->vtblk_dev; + nsegs = VTBLK_MIN_SEGMENTS; + + if (virtio_with_feature(dev, VIRTIO_BLK_F_SEG_MAX)) { + nsegs += MIN(blkcfg->seg_max, MAXPHYS / PAGE_SIZE + 1); + if (sc->vtblk_flags & VTBLK_FLAG_INDIRECT) + nsegs = MIN(nsegs, VIRTIO_MAX_INDIRECT); + } else + nsegs += 1; + + return (nsegs); +} + +static int +vtblk_alloc_virtqueue(struct vtblk_softc *sc) +{ + device_t dev; + struct vq_alloc_info vq_info; + + dev = sc->vtblk_dev; + + VQ_ALLOC_INFO_INIT(&vq_info, sc->vtblk_max_nsegs, + vtblk_vq_intr, sc, &sc->vtblk_vq, + "%s request", device_get_nameunit(dev)); + + return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info)); +} + +static void +vtblk_alloc_disk(struct vtblk_softc *sc, struct virtio_blk_config *blkcfg) +{ + device_t dev; + struct disk *dp; + + dev = sc->vtblk_dev; + + sc->vtblk_disk = dp = disk_alloc(); + dp->d_open = vtblk_open; + dp->d_close = vtblk_close; + dp->d_ioctl = vtblk_ioctl; + dp->d_strategy = vtblk_strategy; + dp->d_name = VTBLK_DISK_NAME; + dp->d_unit = sc->vtblk_unit; + dp->d_drv1 = sc; + + if 
((sc->vtblk_flags & VTBLK_FLAG_READONLY) == 0) + dp->d_dump = vtblk_dump; + + /* Capacity is always in 512-byte units. */ + dp->d_mediasize = blkcfg->capacity * 512; + + if (virtio_with_feature(dev, VIRTIO_BLK_F_BLK_SIZE)) + sc->vtblk_sector_size = blkcfg->blk_size; + else + sc->vtblk_sector_size = 512; + dp->d_sectorsize = sc->vtblk_sector_size; + + /* + * The VirtIO maximum I/O size is given in terms of segments. + * However, FreeBSD limits I/O size by logical buffer size, not + * by physically contiguous pages. Therefore, we have to assume + * no pages are contiguous. This may impose an artificially low + * maximum I/O size. But in practice, since QEMU advertises 128 + * segments, this gives us a maximum IO size of 125 * PAGE_SIZE, + * which is typically greater than MAXPHYS. Eventually we should + * just advertise MAXPHYS and split buffers that are too big. + * + * Note we must subtract one additional segment in case of non + * page aligned buffers. + */ + dp->d_maxsize = (sc->vtblk_max_nsegs - VTBLK_MIN_SEGMENTS - 1) * + PAGE_SIZE; + if (dp->d_maxsize < PAGE_SIZE) + dp->d_maxsize = PAGE_SIZE; /* XXX */ + + if (virtio_with_feature(dev, VIRTIO_BLK_F_GEOMETRY)) { + dp->d_fwsectors = blkcfg->geometry.sectors; + dp->d_fwheads = blkcfg->geometry.heads; + } + + if (virtio_with_feature(dev, VIRTIO_BLK_F_FLUSH)) + dp->d_flags |= DISKFLAG_CANFLUSHCACHE; +} + +static void +vtblk_create_disk(struct vtblk_softc *sc) +{ + struct disk *dp; + + dp = sc->vtblk_disk; + + /* + * Retrieving the identification string must be done after + * the virtqueue interrupt is setup otherwise it will hang. + */ + vtblk_get_ident(sc); + + device_printf(sc->vtblk_dev, "%juMB (%ju %u byte sectors)\n", + (uintmax_t) dp->d_mediasize >> 20, + (uintmax_t) dp->d_mediasize / dp->d_sectorsize, + dp->d_sectorsize); + + disk_create(dp, DISK_VERSION); +} + +static void +vtblk_startio(struct vtblk_softc *sc) +{ + struct virtqueue *vq; + struct vtblk_request *req; + int enq; + + vq = sc->vtblk_vq; + enq = 0; + + VTBLK_LOCK_ASSERT(sc); + + if (sc->vtblk_flags & VTBLK_FLAG_SUSPENDED) + return; + + while (!virtqueue_full(vq)) { + if ((req = vtblk_dequeue_ready(sc)) == NULL) + req = vtblk_bio_request(sc); + if (req == NULL) + break; + + if (vtblk_execute_request(sc, req) != 0) { + vtblk_enqueue_ready(sc, req); + break; + } + + enq++; + } + + if (enq > 0) + virtqueue_notify(vq); +} + +static struct vtblk_request * +vtblk_bio_request(struct vtblk_softc *sc) +{ + struct bio_queue_head *bioq; + struct vtblk_request *req; + struct bio *bp; + + bioq = &sc->vtblk_bioq; + + if (bioq_first(bioq) == NULL) + return (NULL); + + req = vtblk_dequeue_request(sc); + if (req == NULL) + return (NULL); + + bp = bioq_takefirst(bioq); + req->vbr_bp = bp; + req->vbr_ack = -1; + req->vbr_hdr.ioprio = 1; + + switch (bp->bio_cmd) { + case BIO_FLUSH: + req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH; + break; + case BIO_READ: + req->vbr_hdr.type = VIRTIO_BLK_T_IN; + req->vbr_hdr.sector = bp->bio_offset / 512; + break; + case BIO_WRITE: + req->vbr_hdr.type = VIRTIO_BLK_T_OUT; + req->vbr_hdr.sector = bp->bio_offset / 512; + break; + default: + KASSERT(0, ("bio with unhandled cmd: %d", bp->bio_cmd)); + req->vbr_hdr.type = -1; + break; + } + + if (bp->bio_flags & BIO_ORDERED) + req->vbr_hdr.type |= VIRTIO_BLK_T_BARRIER; + + return (req); +} + +static int +vtblk_execute_request(struct vtblk_softc *sc, struct vtblk_request *req) +{ + struct sglist *sg; + struct bio *bp; + int writable, error; + + sg = sc->vtblk_sglist; + bp = req->vbr_bp; + writable = 0; + + 
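	/*
	 * Lay the request out in the shared sglist: the outhdr (read by the
	 * host), the bio data buffer for reads and writes, and the one-byte
	 * ack the host writes back.  'writable' counts the host-writable
	 * segments passed to virtqueue_enqueue() below.
	 */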
VTBLK_LOCK_ASSERT(sc); + + sglist_reset(sg); + error = sglist_append(sg, &req->vbr_hdr, + sizeof(struct virtio_blk_outhdr)); + KASSERT(error == 0, ("error adding header to sglist")); + KASSERT(sg->sg_nseg == 1, + ("header spanned multiple segments: %d", sg->sg_nseg)); + + if (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE) { + error = sglist_append(sg, bp->bio_data, bp->bio_bcount); + KASSERT(error == 0, ("error adding buffer to sglist")); + + /* BIO_READ means the host writes into our buffer. */ + if (bp->bio_cmd == BIO_READ) + writable += sg->sg_nseg - 1; + } + + error = sglist_append(sg, &req->vbr_ack, sizeof(uint8_t)); + KASSERT(error == 0, ("error adding ack to sglist")); + writable++; + + KASSERT(sg->sg_nseg >= VTBLK_MIN_SEGMENTS, + ("fewer than min segments: %d", sg->sg_nseg)); + + error = virtqueue_enqueue(sc->vtblk_vq, req, sg, + sg->sg_nseg - writable, writable); + + return (error); +} + +static int +vtblk_vq_intr(void *xsc) +{ + struct vtblk_softc *sc; + + sc = xsc; + + virtqueue_disable_intr(sc->vtblk_vq); + taskqueue_enqueue_fast(sc->vtblk_tq, &sc->vtblk_intr_task); + + return (1); +} + +static void +vtblk_intr_task(void *arg, int pending) +{ + struct vtblk_softc *sc; + struct vtblk_request *req; + struct virtqueue *vq; + struct bio *bp; + + sc = arg; + vq = sc->vtblk_vq; + + VTBLK_LOCK(sc); + if (sc->vtblk_flags & VTBLK_FLAG_DETACHING) { + VTBLK_UNLOCK(sc); + return; + } + + while ((req = virtqueue_dequeue(vq, NULL)) != NULL) { + bp = req->vbr_bp; + + if (req->vbr_ack == VIRTIO_BLK_S_OK) + bp->bio_resid = 0; + else { + bp->bio_flags |= BIO_ERROR; + if (req->vbr_ack == VIRTIO_BLK_S_UNSUPP) + bp->bio_error = ENOTSUP; + else + bp->bio_error = EIO; + } + + biodone(bp); + vtblk_enqueue_request(sc, req); + } + + vtblk_startio(sc); + + if (virtqueue_enable_intr(vq) != 0) { + virtqueue_disable_intr(vq); + VTBLK_UNLOCK(sc); + taskqueue_enqueue_fast(sc->vtblk_tq, + &sc->vtblk_intr_task); + return; + } + + VTBLK_UNLOCK(sc); +} + +static void +vtblk_stop(struct vtblk_softc *sc) +{ + + virtqueue_disable_intr(sc->vtblk_vq); + virtio_stop(sc->vtblk_dev); +} + +static void +vtblk_get_ident(struct vtblk_softc *sc) +{ + struct bio buf; + struct disk *dp; + struct vtblk_request *req; + int len, error; + + dp = sc->vtblk_disk; + len = MIN(VIRTIO_BLK_ID_BYTES, DISK_IDENT_SIZE); + + if (vtblk_no_ident != 0) + return; + + req = vtblk_dequeue_request(sc); + if (req == NULL) + return; + + req->vbr_ack = -1; + req->vbr_hdr.type = VIRTIO_BLK_T_GET_ID; + req->vbr_hdr.ioprio = 1; + req->vbr_hdr.sector = 0; + + req->vbr_bp = &buf; + bzero(&buf, sizeof(struct bio)); + + buf.bio_cmd = BIO_READ; + buf.bio_data = dp->d_ident; + buf.bio_bcount = len; + + VTBLK_LOCK(sc); + error = vtblk_poll_request(sc, req); + vtblk_enqueue_request(sc, req); + VTBLK_UNLOCK(sc); + + if (error) { + device_printf(sc->vtblk_dev, + "error getting device identifier: %d\n", error); + } +} + +static void +vtblk_prepare_dump(struct vtblk_softc *sc) +{ + device_t dev; + struct virtqueue *vq; + + dev = sc->vtblk_dev; + vq = sc->vtblk_vq; + + vtblk_stop(sc); + + /* + * Drain all requests caught in-flight in the virtqueue, + * skipping biodone(). When dumping, only one request is + * outstanding at a time, and we just poll the virtqueue + * for the response. 
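	 * The virtio_reinit() below resets the device, so the host forgets
	 * any descriptors that were still outstanding when the dump began.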
+ */ + vtblk_drain_vq(sc, 1); + + if (virtio_reinit(dev, sc->vtblk_features) != 0) + panic("cannot reinit VirtIO block device during dump"); + + virtqueue_disable_intr(vq); + virtio_reinit_complete(dev); +} + +static int +vtblk_write_dump(struct vtblk_softc *sc, void *virtual, off_t offset, + size_t length) +{ + struct bio buf; + struct vtblk_request *req; + + req = &sc->vtblk_dump_request; + req->vbr_ack = -1; + req->vbr_hdr.type = VIRTIO_BLK_T_OUT; + req->vbr_hdr.ioprio = 1; + req->vbr_hdr.sector = offset / 512; + + req->vbr_bp = &buf; + bzero(&buf, sizeof(struct bio)); + + buf.bio_cmd = BIO_WRITE; + buf.bio_data = virtual; + buf.bio_bcount = length; + + return (vtblk_poll_request(sc, req)); +} + +static int +vtblk_flush_dump(struct vtblk_softc *sc) +{ + struct bio buf; + struct vtblk_request *req; + + req = &sc->vtblk_dump_request; + req->vbr_ack = -1; + req->vbr_hdr.type = VIRTIO_BLK_T_FLUSH; + req->vbr_hdr.ioprio = 1; + req->vbr_hdr.sector = 0; + + req->vbr_bp = &buf; + bzero(&buf, sizeof(struct bio)); + + buf.bio_cmd = BIO_FLUSH; + + return (vtblk_poll_request(sc, req)); +} + +static int +vtblk_poll_request(struct vtblk_softc *sc, struct vtblk_request *req) +{ + device_t dev; + struct virtqueue *vq; + struct vtblk_request *r; + int error; + + dev = sc->vtblk_dev; + vq = sc->vtblk_vq; + + if (!virtqueue_empty(vq)) + return (EBUSY); + + error = vtblk_execute_request(sc, req); + if (error) + return (error); + + virtqueue_notify(vq); + + r = virtqueue_poll(vq, NULL); + KASSERT(r == req, ("unexpected request response")); + + if (req->vbr_ack != VIRTIO_BLK_S_OK) { + error = req->vbr_ack == VIRTIO_BLK_S_UNSUPP ? ENOTSUP : EIO; + if (bootverbose) + device_printf(dev, + "vtblk_poll_request: IO error: %d\n", error); + } + + return (error); +} + +static void +vtblk_drain_vq(struct vtblk_softc *sc, int skip_done) +{ + struct virtqueue *vq; + struct vtblk_request *req; + int last; + + vq = sc->vtblk_vq; + last = 0; + + while ((req = virtqueue_drain(vq, &last)) != NULL) { + if (!skip_done) + vtblk_bio_error(req->vbr_bp, ENXIO); + + vtblk_enqueue_request(sc, req); + } + + KASSERT(virtqueue_empty(vq), ("virtqueue not empty")); +} + +static void +vtblk_drain(struct vtblk_softc *sc) +{ + struct bio_queue_head *bioq; + struct vtblk_request *req; + struct bio *bp; + + bioq = &sc->vtblk_bioq; + + if (sc->vtblk_vq != NULL) + vtblk_drain_vq(sc, 0); + + while ((req = vtblk_dequeue_ready(sc)) != NULL) { + vtblk_bio_error(req->vbr_bp, ENXIO); + vtblk_enqueue_request(sc, req); + } + + while (bioq_first(bioq) != NULL) { + bp = bioq_takefirst(bioq); + vtblk_bio_error(bp, ENXIO); + } + + vtblk_free_requests(sc); +} + +static int +vtblk_alloc_requests(struct vtblk_softc *sc) +{ + struct vtblk_request *req; + int i, size; + + size = virtqueue_size(sc->vtblk_vq); + + /* + * Preallocate sufficient requests to keep the virtqueue full. Each + * request consumes VTBLK_MIN_SEGMENTS or more descriptors so reduce + * the number allocated when indirect descriptors are not available. 
+ */ + if ((sc->vtblk_flags & VTBLK_FLAG_INDIRECT) == 0) + size /= VTBLK_MIN_SEGMENTS; + + for (i = 0; i < size; i++) { + req = uma_zalloc(vtblk_req_zone, M_NOWAIT); + if (req == NULL) + return (ENOMEM); + + sc->vtblk_request_count++; + vtblk_enqueue_request(sc, req); + } + + return (0); +} + +static void +vtblk_free_requests(struct vtblk_softc *sc) +{ + struct vtblk_request *req; + + while ((req = vtblk_dequeue_request(sc)) != NULL) { + sc->vtblk_request_count--; + uma_zfree(vtblk_req_zone, req); + } + + KASSERT(sc->vtblk_request_count == 0, ("leaked requests")); +} + +static struct vtblk_request * +vtblk_dequeue_request(struct vtblk_softc *sc) +{ + struct vtblk_request *req; + + req = TAILQ_FIRST(&sc->vtblk_req_free); + if (req != NULL) + TAILQ_REMOVE(&sc->vtblk_req_free, req, vbr_link); + + return (req); +} + +static void +vtblk_enqueue_request(struct vtblk_softc *sc, struct vtblk_request *req) +{ + + bzero(req, sizeof(struct vtblk_request)); + TAILQ_INSERT_HEAD(&sc->vtblk_req_free, req, vbr_link); +} + +static struct vtblk_request * +vtblk_dequeue_ready(struct vtblk_softc *sc) +{ + struct vtblk_request *req; + + req = TAILQ_FIRST(&sc->vtblk_req_ready); + if (req != NULL) + TAILQ_REMOVE(&sc->vtblk_req_ready, req, vbr_link); + + return (req); +} + +static void +vtblk_enqueue_ready(struct vtblk_softc *sc, struct vtblk_request *req) +{ + + TAILQ_INSERT_HEAD(&sc->vtblk_req_ready, req, vbr_link); +} + +static void +vtblk_bio_error(struct bio *bp, int error) +{ + + biofinish(bp, NULL, error); +} diff --git a/sys/dev/virtio/block/virtio_blk.h b/sys/dev/virtio/block/virtio_blk.h new file mode 100644 index 0000000..4fb32e0 --- /dev/null +++ b/sys/dev/virtio/block/virtio_blk.h @@ -0,0 +1,106 @@ +/* + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_BLK_H +#define _VIRTIO_BLK_H + +#include <sys/types.h> + +/* Feature bits */ +#define VIRTIO_BLK_F_BARRIER 0x0001 /* Does host support barriers? */ +#define VIRTIO_BLK_F_SIZE_MAX 0x0002 /* Indicates maximum segment size */ +#define VIRTIO_BLK_F_SEG_MAX 0x0004 /* Indicates maximum # of segments */ +#define VIRTIO_BLK_F_GEOMETRY 0x0010 /* Legacy geometry available */ +#define VIRTIO_BLK_F_RO 0x0020 /* Disk is read-only */ +#define VIRTIO_BLK_F_BLK_SIZE 0x0040 /* Block size of disk is available*/ +#define VIRTIO_BLK_F_SCSI 0x0080 /* Supports scsi command passthru */ +#define VIRTIO_BLK_F_FLUSH 0x0200 /* Cache flush command support */ +#define VIRTIO_BLK_F_TOPOLOGY 0x0400 /* Topology information is available */ + +#define VIRTIO_BLK_ID_BYTES 20 /* ID string length */ + +struct virtio_blk_config { + /* The capacity (in 512-byte sectors). */ + uint64_t capacity; + /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ + uint32_t size_max; + /* The maximum number of segments (if VIRTIO_BLK_F_SEG_MAX) */ + uint32_t seg_max; + /* geometry the device (if VIRTIO_BLK_F_GEOMETRY) */ + struct virtio_blk_geometry { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } geometry; + + /* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */ + uint32_t blk_size; + + /* the next 4 entries are guarded by VIRTIO_BLK_F_TOPOLOGY */ + /* exponent for physical block per logical block. */ + uint8_t physical_block_exp; + /* alignment offset in logical blocks. */ + uint8_t alignment_offset; + /* minimum I/O size without performance penalty in logical blocks. */ + uint16_t min_io_size; + /* optimal sustained I/O size in logical blocks. 
*/ + uint32_t opt_io_size; +} __packed; + +/* + * Command types + * + * Usage is a bit tricky as some bits are used as flags and some are not. + * + * Rules: + * VIRTIO_BLK_T_OUT may be combined with VIRTIO_BLK_T_SCSI_CMD or + * VIRTIO_BLK_T_BARRIER. VIRTIO_BLK_T_FLUSH is a command of its own + * and may not be combined with any of the other flags. + */ + +/* These two define direction. */ +#define VIRTIO_BLK_T_IN 0 +#define VIRTIO_BLK_T_OUT 1 + +/* This bit says it's a scsi command, not an actual read or write. */ +#define VIRTIO_BLK_T_SCSI_CMD 2 + +/* Cache flush command */ +#define VIRTIO_BLK_T_FLUSH 4 + +/* Get device ID command */ +#define VIRTIO_BLK_T_GET_ID 8 + +/* Barrier before this op. */ +#define VIRTIO_BLK_T_BARRIER 0x80000000 + +/* ID string length */ +#define VIRTIO_BLK_ID_BYTES 20 + +/* This is the first element of the read scatter-gather list. */ +struct virtio_blk_outhdr { + /* VIRTIO_BLK_T* */ + uint32_t type; + /* io priority. */ + uint32_t ioprio; + /* Sector (ie. 512 byte offset) */ + uint64_t sector; +}; + +struct virtio_scsi_inhdr { + uint32_t errors; + uint32_t data_len; + uint32_t sense_len; + uint32_t residual; +}; + +/* And this is the final byte of the write scatter-gather list. */ +#define VIRTIO_BLK_S_OK 0 +#define VIRTIO_BLK_S_IOERR 1 +#define VIRTIO_BLK_S_UNSUPP 2 + +#endif /* _VIRTIO_BLK_H */ diff --git a/sys/dev/virtio/network/if_vtnet.c b/sys/dev/virtio/network/if_vtnet.c new file mode 100644 index 0000000..22becb1 --- /dev/null +++ b/sys/dev/virtio/network/if_vtnet.c @@ -0,0 +1,2746 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Driver for VirtIO network devices. 
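+ *
+ * The driver negotiates features with the host, allocates receive,
+ * transmit and (optionally) control virtqueues, and attaches as a
+ * regular ifnet Ethernet interface.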
*/ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#ifdef HAVE_KERNEL_OPTION_HEADERS +#include "opt_device_polling.h" +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/taskqueue.h> +#include <sys/random.h> +#include <sys/sglist.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <vm/uma.h> + +#include <net/ethernet.h> +#include <net/if.h> +#include <net/if_arp.h> +#include <net/if_dl.h> +#include <net/if_types.h> +#include <net/if_media.h> +#include <net/if_vlan_var.h> + +#include <net/bpf.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> +#include <netinet/udp.h> +#include <netinet/tcp.h> +#include <netinet/sctp.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus.h> +#include <sys/rman.h> + +#include <dev/virtio/virtio.h> +#include <dev/virtio/virtqueue.h> +#include <dev/virtio/network/virtio_net.h> +#include <dev/virtio/network/if_vtnetvar.h> + +#include "virtio_if.h" + +static int vtnet_modevent(module_t, int, void *); + +static int vtnet_probe(device_t); +static int vtnet_attach(device_t); +static int vtnet_detach(device_t); +static int vtnet_suspend(device_t); +static int vtnet_resume(device_t); +static int vtnet_shutdown(device_t); +static int vtnet_config_change(device_t); + +static void vtnet_negotiate_features(struct vtnet_softc *); +static int vtnet_alloc_virtqueues(struct vtnet_softc *); +static void vtnet_get_hwaddr(struct vtnet_softc *); +static void vtnet_set_hwaddr(struct vtnet_softc *); +static int vtnet_is_link_up(struct vtnet_softc *); +static void vtnet_update_link_status(struct vtnet_softc *); +static void vtnet_watchdog(struct vtnet_softc *); +static void vtnet_config_change_task(void *, int); +static int vtnet_change_mtu(struct vtnet_softc *, int); +static int vtnet_ioctl(struct ifnet *, u_long, caddr_t); + +static int vtnet_init_rx_vq(struct vtnet_softc *); +static void vtnet_free_rx_mbufs(struct vtnet_softc *); +static void vtnet_free_tx_mbufs(struct vtnet_softc *); +static void vtnet_free_ctrl_vq(struct vtnet_softc *); + +#ifdef DEVICE_POLLING +static poll_handler_t vtnet_poll; +#endif + +static struct mbuf * vtnet_alloc_rxbuf(struct vtnet_softc *, int, + struct mbuf **); +static int vtnet_replace_rxbuf(struct vtnet_softc *, + struct mbuf *, int); +static int vtnet_newbuf(struct vtnet_softc *); +static void vtnet_discard_merged_rxbuf(struct vtnet_softc *, int); +static void vtnet_discard_rxbuf(struct vtnet_softc *, struct mbuf *); +static int vtnet_enqueue_rxbuf(struct vtnet_softc *, struct mbuf *); +static void vtnet_vlan_tag_remove(struct mbuf *); +static int vtnet_rx_csum(struct vtnet_softc *, struct mbuf *, + struct virtio_net_hdr *); +static int vtnet_rxeof_merged(struct vtnet_softc *, struct mbuf *, int); +static int vtnet_rxeof(struct vtnet_softc *, int, int *); +static void vtnet_rx_intr_task(void *, int); +static int vtnet_rx_vq_intr(void *); + +static void vtnet_txeof(struct vtnet_softc *); +static struct mbuf * vtnet_tx_offload(struct vtnet_softc *, struct mbuf *, + struct virtio_net_hdr *); +static int vtnet_enqueue_txbuf(struct vtnet_softc *, struct mbuf **, + struct vtnet_tx_header *); +static int vtnet_encap(struct vtnet_softc *, struct mbuf **); +static void vtnet_start_locked(struct ifnet *); +static void vtnet_start(struct ifnet *); +static void 
vtnet_tick(void *); +static void vtnet_tx_intr_task(void *, int); +static int vtnet_tx_vq_intr(void *); + +static void vtnet_stop(struct vtnet_softc *); +static int vtnet_reinit(struct vtnet_softc *); +static void vtnet_init_locked(struct vtnet_softc *); +static void vtnet_init(void *); + +static void vtnet_exec_ctrl_cmd(struct vtnet_softc *, void *, + struct sglist *, int, int); + +static void vtnet_rx_filter(struct vtnet_softc *sc); +static int vtnet_ctrl_rx_cmd(struct vtnet_softc *, int, int); +static int vtnet_set_promisc(struct vtnet_softc *, int); +static int vtnet_set_allmulti(struct vtnet_softc *, int); +static void vtnet_rx_filter_mac(struct vtnet_softc *); + +static int vtnet_exec_vlan_filter(struct vtnet_softc *, int, uint16_t); +static void vtnet_rx_filter_vlan(struct vtnet_softc *); +static void vtnet_set_vlan_filter(struct vtnet_softc *, int, uint16_t); +static void vtnet_register_vlan(void *, struct ifnet *, uint16_t); +static void vtnet_unregister_vlan(void *, struct ifnet *, uint16_t); + +static int vtnet_ifmedia_upd(struct ifnet *); +static void vtnet_ifmedia_sts(struct ifnet *, struct ifmediareq *); + +static void vtnet_add_statistics(struct vtnet_softc *); + +static int vtnet_enable_rx_intr(struct vtnet_softc *); +static int vtnet_enable_tx_intr(struct vtnet_softc *); +static void vtnet_disable_rx_intr(struct vtnet_softc *); +static void vtnet_disable_tx_intr(struct vtnet_softc *); + +/* Tunables. */ +static int vtnet_csum_disable = 0; +TUNABLE_INT("hw.vtnet.csum_disable", &vtnet_csum_disable); +static int vtnet_tso_disable = 0; +TUNABLE_INT("hw.vtnet.tso_disable", &vtnet_tso_disable); +static int vtnet_lro_disable = 0; +TUNABLE_INT("hw.vtnet.lro_disable", &vtnet_lro_disable); + +/* + * Reducing the number of transmit completed interrupts can + * improve performance. To do so, the define below keeps the + * Tx vq interrupt disabled and adds calls to vtnet_txeof() + * in the start and watchdog paths. The price to pay for this + * is the m_free'ing of transmitted mbufs may be delayed until + * the watchdog fires. + */ +#define VTNET_TX_INTR_MODERATION + +static uma_zone_t vtnet_tx_header_zone; + +static struct virtio_feature_desc vtnet_feature_desc[] = { + { VIRTIO_NET_F_CSUM, "TxChecksum" }, + { VIRTIO_NET_F_GUEST_CSUM, "RxChecksum" }, + { VIRTIO_NET_F_MAC, "MacAddress" }, + { VIRTIO_NET_F_GSO, "TxAllGSO" }, + { VIRTIO_NET_F_GUEST_TSO4, "RxTSOv4" }, + { VIRTIO_NET_F_GUEST_TSO6, "RxTSOv6" }, + { VIRTIO_NET_F_GUEST_ECN, "RxECN" }, + { VIRTIO_NET_F_GUEST_UFO, "RxUFO" }, + { VIRTIO_NET_F_HOST_TSO4, "TxTSOv4" }, + { VIRTIO_NET_F_HOST_TSO6, "TxTSOv6" }, + { VIRTIO_NET_F_HOST_ECN, "TxTSOECN" }, + { VIRTIO_NET_F_HOST_UFO, "TxUFO" }, + { VIRTIO_NET_F_MRG_RXBUF, "MrgRxBuf" }, + { VIRTIO_NET_F_STATUS, "Status" }, + { VIRTIO_NET_F_CTRL_VQ, "ControlVq" }, + { VIRTIO_NET_F_CTRL_RX, "RxMode" }, + { VIRTIO_NET_F_CTRL_VLAN, "VLanFilter" }, + { VIRTIO_NET_F_CTRL_RX_EXTRA, "RxModeExtra" }, + + { 0, NULL } +}; + +static device_method_t vtnet_methods[] = { + /* Device methods. */ + DEVMETHOD(device_probe, vtnet_probe), + DEVMETHOD(device_attach, vtnet_attach), + DEVMETHOD(device_detach, vtnet_detach), + DEVMETHOD(device_suspend, vtnet_suspend), + DEVMETHOD(device_resume, vtnet_resume), + DEVMETHOD(device_shutdown, vtnet_shutdown), + + /* VirtIO methods. 
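+ *
+ * The config_change method is invoked by the transport when the host
+ * updates the device configuration space, e.g. on a link status
+ * change; it is serviced asynchronously from a taskqueue.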
*/ + DEVMETHOD(virtio_config_change, vtnet_config_change), + + { 0, 0 } +}; + +static driver_t vtnet_driver = { + "vtnet", + vtnet_methods, + sizeof(struct vtnet_softc) +}; +static devclass_t vtnet_devclass; + +DRIVER_MODULE(vtnet, virtio_pci, vtnet_driver, vtnet_devclass, + vtnet_modevent, 0); +MODULE_VERSION(vtnet, 1); +MODULE_DEPEND(vtnet, virtio, 1, 1, 1); + +static int +vtnet_modevent(module_t mod, int type, void *unused) +{ + int error; + + error = 0; + + switch (type) { + case MOD_LOAD: + vtnet_tx_header_zone = uma_zcreate("vtnet_tx_hdr", + sizeof(struct vtnet_tx_header), + NULL, NULL, NULL, NULL, 0, 0); + break; + case MOD_QUIESCE: + case MOD_UNLOAD: + if (uma_zone_get_cur(vtnet_tx_header_zone) > 0) + error = EBUSY; + else if (type == MOD_UNLOAD) { + uma_zdestroy(vtnet_tx_header_zone); + vtnet_tx_header_zone = NULL; + } + break; + case MOD_SHUTDOWN: + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +static int +vtnet_probe(device_t dev) +{ + + if (virtio_get_device_type(dev) != VIRTIO_ID_NETWORK) + return (ENXIO); + + device_set_desc(dev, "VirtIO Networking Adapter"); + + return (BUS_PROBE_DEFAULT); +} + +static int +vtnet_attach(device_t dev) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + int tx_size, error; + + sc = device_get_softc(dev); + sc->vtnet_dev = dev; + + VTNET_LOCK_INIT(sc); + callout_init_mtx(&sc->vtnet_tick_ch, VTNET_MTX(sc), 0); + + ifmedia_init(&sc->vtnet_media, IFM_IMASK, vtnet_ifmedia_upd, + vtnet_ifmedia_sts); + ifmedia_add(&sc->vtnet_media, VTNET_MEDIATYPE, 0, NULL); + ifmedia_set(&sc->vtnet_media, VTNET_MEDIATYPE); + + vtnet_add_statistics(sc); + + virtio_set_feature_desc(dev, vtnet_feature_desc); + vtnet_negotiate_features(sc); + + if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF)) { + sc->vtnet_flags |= VTNET_FLAG_MRG_RXBUFS; + sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } else + sc->vtnet_hdr_size = sizeof(struct virtio_net_hdr); + + sc->vtnet_rx_mbuf_size = MCLBYTES; + sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc); + + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VQ)) { + sc->vtnet_flags |= VTNET_FLAG_CTRL_VQ; + + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_RX)) + sc->vtnet_flags |= VTNET_FLAG_CTRL_RX; + if (virtio_with_feature(dev, VIRTIO_NET_F_CTRL_VLAN)) + sc->vtnet_flags |= VTNET_FLAG_VLAN_FILTER; + } + + vtnet_get_hwaddr(sc); + + error = vtnet_alloc_virtqueues(sc); + if (error) { + device_printf(dev, "cannot allocate virtqueues\n"); + goto fail; + } + + ifp = sc->vtnet_ifp = if_alloc(IFT_ETHER); + if (ifp == NULL) { + device_printf(dev, "cannot allocate ifnet structure\n"); + error = ENOSPC; + goto fail; + } + + ifp->if_softc = sc; + if_initname(ifp, device_get_name(dev), device_get_unit(dev)); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; + ifp->if_init = vtnet_init; + ifp->if_start = vtnet_start; + ifp->if_ioctl = vtnet_ioctl; + + sc->vtnet_rx_size = virtqueue_size(sc->vtnet_rx_vq); + sc->vtnet_rx_process_limit = sc->vtnet_rx_size; + + tx_size = virtqueue_size(sc->vtnet_tx_vq); + sc->vtnet_tx_size = tx_size; + IFQ_SET_MAXLEN(&ifp->if_snd, tx_size - 1); + ifp->if_snd.ifq_drv_maxlen = tx_size - 1; + IFQ_SET_READY(&ifp->if_snd); + + ether_ifattach(ifp, sc->vtnet_hwaddr); + + if (virtio_with_feature(dev, VIRTIO_NET_F_STATUS)) + ifp->if_capabilities |= IFCAP_LINKSTATE; + + /* Tell the upper layer(s) we support long frames. 
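+ * Setting if_hdrlen to the size of an 802.1Q header lets the stack
+ * account for the larger link-layer header on VLAN interfaces.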
*/ + ifp->if_data.ifi_hdrlen = sizeof(struct ether_vlan_header); + ifp->if_capabilities |= IFCAP_JUMBO_MTU | IFCAP_VLAN_MTU; + + if (virtio_with_feature(dev, VIRTIO_NET_F_CSUM)) { + ifp->if_capabilities |= IFCAP_TXCSUM; + + if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO4)) + ifp->if_capabilities |= IFCAP_TSO4; + if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_TSO6)) + ifp->if_capabilities |= IFCAP_TSO6; + if (ifp->if_capabilities & IFCAP_TSO) + ifp->if_capabilities |= IFCAP_VLAN_HWTSO; + + if (virtio_with_feature(dev, VIRTIO_NET_F_HOST_ECN)) + sc->vtnet_flags |= VTNET_FLAG_TSO_ECN; + } + + if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_CSUM)) { + ifp->if_capabilities |= IFCAP_RXCSUM; + + if (virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO4) || + virtio_with_feature(dev, VIRTIO_NET_F_GUEST_TSO6)) + ifp->if_capabilities |= IFCAP_LRO; + } + + if (ifp->if_capabilities & IFCAP_HWCSUM) { + /* + * VirtIO does not support VLAN tagging, but we can fake + * it by inserting and removing the 802.1Q header during + * transmit and receive. We are then able to do checksum + * offloading of VLAN frames. + */ + ifp->if_capabilities |= + IFCAP_VLAN_HWTAGGING | IFCAP_VLAN_HWCSUM; + } + + ifp->if_capenable = ifp->if_capabilities; + + /* + * Capabilities after here are not enabled by default. + */ + + if (sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER) { + ifp->if_capabilities |= IFCAP_VLAN_HWFILTER; + + sc->vtnet_vlan_attach = EVENTHANDLER_REGISTER(vlan_config, + vtnet_register_vlan, sc, EVENTHANDLER_PRI_FIRST); + sc->vtnet_vlan_detach = EVENTHANDLER_REGISTER(vlan_unconfig, + vtnet_unregister_vlan, sc, EVENTHANDLER_PRI_FIRST); + } + +#ifdef DEVICE_POLLING + ifp->if_capabilities |= IFCAP_POLLING; +#endif + + TASK_INIT(&sc->vtnet_rx_intr_task, 0, vtnet_rx_intr_task, sc); + TASK_INIT(&sc->vtnet_tx_intr_task, 0, vtnet_tx_intr_task, sc); + TASK_INIT(&sc->vtnet_cfgchg_task, 0, vtnet_config_change_task, sc); + + sc->vtnet_tq = taskqueue_create_fast("vtnet_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &sc->vtnet_tq); + if (sc->vtnet_tq == NULL) { + error = ENOMEM; + device_printf(dev, "cannot allocate taskqueue\n"); + ether_ifdetach(ifp); + goto fail; + } + taskqueue_start_threads(&sc->vtnet_tq, 1, PI_NET, "%s taskq", + device_get_nameunit(dev)); + + error = virtio_setup_intr(dev, INTR_TYPE_NET); + if (error) { + device_printf(dev, "cannot setup virtqueue interrupts\n"); + taskqueue_free(sc->vtnet_tq); + sc->vtnet_tq = NULL; + ether_ifdetach(ifp); + goto fail; + } + + /* + * Device defaults to promiscuous mode for backwards + * compatibility. Turn it off if possible. 
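+ * If the host does not support the Rx mode control command, leave
+ * IFF_PROMISC set so the interface flags reflect the actual device
+ * behavior.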
+ */ + if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { + VTNET_LOCK(sc); + if (vtnet_set_promisc(sc, 0) != 0) { + ifp->if_flags |= IFF_PROMISC; + device_printf(dev, + "cannot disable promiscuous mode\n"); + } + VTNET_UNLOCK(sc); + } else + ifp->if_flags |= IFF_PROMISC; + +fail: + if (error) + vtnet_detach(dev); + + return (error); +} + +static int +vtnet_detach(device_t dev) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + + sc = device_get_softc(dev); + ifp = sc->vtnet_ifp; + + KASSERT(mtx_initialized(VTNET_MTX(sc)), + ("vtnet mutex not initialized")); + +#ifdef DEVICE_POLLING + if (ifp != NULL && ifp->if_capenable & IFCAP_POLLING) + ether_poll_deregister(ifp); +#endif + + if (device_is_attached(dev)) { + VTNET_LOCK(sc); + vtnet_stop(sc); + VTNET_UNLOCK(sc); + + callout_drain(&sc->vtnet_tick_ch); + taskqueue_drain(taskqueue_fast, &sc->vtnet_cfgchg_task); + + ether_ifdetach(ifp); + } + + if (sc->vtnet_tq != NULL) { + taskqueue_drain(sc->vtnet_tq, &sc->vtnet_rx_intr_task); + taskqueue_drain(sc->vtnet_tq, &sc->vtnet_tx_intr_task); + taskqueue_free(sc->vtnet_tq); + sc->vtnet_tq = NULL; + } + + if (sc->vtnet_vlan_attach != NULL) { + EVENTHANDLER_DEREGISTER(vlan_config, sc->vtnet_vlan_attach); + sc->vtnet_vlan_attach = NULL; + } + if (sc->vtnet_vlan_detach != NULL) { + EVENTHANDLER_DEREGISTER(vlan_unconfg, sc->vtnet_vlan_detach); + sc->vtnet_vlan_detach = NULL; + } + + if (ifp) { + if_free(ifp); + sc->vtnet_ifp = NULL; + } + + if (sc->vtnet_rx_vq != NULL) + vtnet_free_rx_mbufs(sc); + if (sc->vtnet_tx_vq != NULL) + vtnet_free_tx_mbufs(sc); + if (sc->vtnet_ctrl_vq != NULL) + vtnet_free_ctrl_vq(sc); + + ifmedia_removeall(&sc->vtnet_media); + VTNET_LOCK_DESTROY(sc); + + return (0); +} + +static int +vtnet_suspend(device_t dev) +{ + struct vtnet_softc *sc; + + sc = device_get_softc(dev); + + VTNET_LOCK(sc); + vtnet_stop(sc); + sc->vtnet_flags |= VTNET_FLAG_SUSPENDED; + VTNET_UNLOCK(sc); + + return (0); +} + +static int +vtnet_resume(device_t dev) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + + sc = device_get_softc(dev); + ifp = sc->vtnet_ifp; + + VTNET_LOCK(sc); + if (ifp->if_flags & IFF_UP) + vtnet_init_locked(sc); + sc->vtnet_flags &= ~VTNET_FLAG_SUSPENDED; + VTNET_UNLOCK(sc); + + return (0); +} + +static int +vtnet_shutdown(device_t dev) +{ + + /* + * Suspend already does all of what we need to + * do here; we just never expect to be resumed. + */ + return (vtnet_suspend(dev)); +} + +static int +vtnet_config_change(device_t dev) +{ + struct vtnet_softc *sc; + + sc = device_get_softc(dev); + + taskqueue_enqueue_fast(taskqueue_fast, &sc->vtnet_cfgchg_task); + + return (1); +} + +static void +vtnet_negotiate_features(struct vtnet_softc *sc) +{ + device_t dev; + uint64_t mask, features; + + dev = sc->vtnet_dev; + mask = 0; + + if (vtnet_csum_disable) + mask |= VIRTIO_NET_F_CSUM | VIRTIO_NET_F_GUEST_CSUM; + + /* + * TSO and LRO are only available when their corresponding + * checksum offload feature is also negotiated. + */ + + if (vtnet_csum_disable || vtnet_tso_disable) + mask |= VIRTIO_NET_F_HOST_TSO4 | VIRTIO_NET_F_HOST_TSO6 | + VIRTIO_NET_F_HOST_ECN; + + if (vtnet_csum_disable || vtnet_lro_disable) + mask |= VTNET_LRO_FEATURES; + + features = VTNET_FEATURES & ~mask; +#ifdef VTNET_TX_INTR_MODERATION + features |= VIRTIO_F_NOTIFY_ON_EMPTY; +#endif + sc->vtnet_features = virtio_negotiate_features(dev, features); + + if (virtio_with_feature(dev, VIRTIO_NET_F_MRG_RXBUF) == 0 && + virtio_with_feature(dev, VTNET_LRO_FEATURES)) { + /* + * LRO without mergeable buffers requires special care. 
This + * is not ideal because every receive buffer must be large + * enough to hold the maximum TCP packet, the Ethernet header, + * and the vtnet_rx_header. This requires up to 34 descriptors + * when using MCLBYTES clusters. If we do not have indirect + * descriptors, LRO is disabled since the virtqueue will not + * be able to contain very many receive buffers. + */ + if (virtio_with_feature(dev, + VIRTIO_RING_F_INDIRECT_DESC) == 0) { + device_printf(dev, + "LRO disabled due to lack of both mergeable " + "buffers and indirect descriptors\n"); + + sc->vtnet_features = virtio_negotiate_features(dev, + features & ~VTNET_LRO_FEATURES); + } else + sc->vtnet_flags |= VTNET_FLAG_LRO_NOMRG; + } +} + +static int +vtnet_alloc_virtqueues(struct vtnet_softc *sc) +{ + device_t dev; + struct vq_alloc_info vq_info[3]; + int nvqs, rxsegs; + + dev = sc->vtnet_dev; + nvqs = 2; + + /* + * Indirect descriptors are not needed for the Rx + * virtqueue when mergeable buffers are negotiated. + * The header is placed inline with the data, not + * in a separate descriptor, and mbuf clusters are + * always physically contiguous. + */ + if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { + rxsegs = sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG ? + VTNET_MAX_RX_SEGS : VTNET_MIN_RX_SEGS; + } else + rxsegs = 0; + + VQ_ALLOC_INFO_INIT(&vq_info[0], rxsegs, + vtnet_rx_vq_intr, sc, &sc->vtnet_rx_vq, + "%s receive", device_get_nameunit(dev)); + + VQ_ALLOC_INFO_INIT(&vq_info[1], VTNET_MAX_TX_SEGS, + vtnet_tx_vq_intr, sc, &sc->vtnet_tx_vq, + "%s transmit", device_get_nameunit(dev)); + + if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { + nvqs++; + + VQ_ALLOC_INFO_INIT(&vq_info[2], 0, NULL, NULL, + &sc->vtnet_ctrl_vq, "%s control", + device_get_nameunit(dev)); + } + + return (virtio_alloc_virtqueues(dev, 0, nvqs, vq_info)); +} + +static void +vtnet_get_hwaddr(struct vtnet_softc *sc) +{ + device_t dev; + + dev = sc->vtnet_dev; + + if (virtio_with_feature(dev, VIRTIO_NET_F_MAC)) { + virtio_read_device_config(dev, + offsetof(struct virtio_net_config, mac), + sc->vtnet_hwaddr, ETHER_ADDR_LEN); + } else { + /* Generate random locally administered unicast address. 
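+ * The 0xB2 prefix sets the locally administered bit and clears the
+ * multicast bit; the generated address is then written back to the
+ * device so the host sees the same MAC.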
*/ + sc->vtnet_hwaddr[0] = 0xB2; + arc4rand(&sc->vtnet_hwaddr[1], ETHER_ADDR_LEN - 1, 0); + + vtnet_set_hwaddr(sc); + } +} + +static void +vtnet_set_hwaddr(struct vtnet_softc *sc) +{ + device_t dev; + + dev = sc->vtnet_dev; + + virtio_write_device_config(dev, + offsetof(struct virtio_net_config, mac), + sc->vtnet_hwaddr, ETHER_ADDR_LEN); +} + +static int +vtnet_is_link_up(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + uint16_t status; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + VTNET_LOCK_ASSERT(sc); + + if ((ifp->if_capenable & IFCAP_LINKSTATE) == 0) + return (1); + + status = virtio_read_dev_config_2(dev, + offsetof(struct virtio_net_config, status)); + + return ((status & VIRTIO_NET_S_LINK_UP) != 0); +} + +static void +vtnet_update_link_status(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + int link; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + link = vtnet_is_link_up(sc); + + if (link && ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0)) { + sc->vtnet_flags |= VTNET_FLAG_LINK; + if (bootverbose) + device_printf(dev, "Link is up\n"); + + if_link_state_change(ifp, LINK_STATE_UP); + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + vtnet_start_locked(ifp); + } else if (!link && (sc->vtnet_flags & VTNET_FLAG_LINK)) { + sc->vtnet_flags &= ~VTNET_FLAG_LINK; + if (bootverbose) + device_printf(dev, "Link is down\n"); + + if_link_state_change(ifp, LINK_STATE_DOWN); + } +} + +static void +vtnet_watchdog(struct vtnet_softc *sc) +{ + struct ifnet *ifp; + + ifp = sc->vtnet_ifp; + +#ifdef VTNET_TX_INTR_MODERATION + vtnet_txeof(sc); +#endif + + if (sc->vtnet_watchdog_timer == 0 || --sc->vtnet_watchdog_timer) + return; + + if_printf(ifp, "watchdog timeout -- resetting\n"); +#ifdef VTNET_DEBUG + virtqueue_dump(sc->vtnet_tx_vq); +#endif + ifp->if_oerrors++; + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + vtnet_init_locked(sc); +} + +static void +vtnet_config_change_task(void *arg, int pending) +{ + struct vtnet_softc *sc; + + sc = arg; + + VTNET_LOCK(sc); + vtnet_update_link_status(sc); + VTNET_UNLOCK(sc); +} + +static int +vtnet_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct vtnet_softc *sc; + struct ifreq *ifr; + int reinit, mask, error; + + sc = ifp->if_softc; + ifr = (struct ifreq *) data; + reinit = 0; + error = 0; + + switch (cmd) { + case SIOCSIFMTU: + if (ifr->ifr_mtu < ETHERMIN || ifr->ifr_mtu > VTNET_MAX_MTU) + error = EINVAL; + else if (ifp->if_mtu != ifr->ifr_mtu) { + VTNET_LOCK(sc); + error = vtnet_change_mtu(sc, ifr->ifr_mtu); + VTNET_UNLOCK(sc); + } + break; + + case SIOCSIFFLAGS: + VTNET_LOCK(sc); + if ((ifp->if_flags & IFF_UP) == 0) { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + vtnet_stop(sc); + } else if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if ((ifp->if_flags ^ sc->vtnet_if_flags) & + (IFF_PROMISC | IFF_ALLMULTI)) { + if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) + vtnet_rx_filter(sc); + else + error = ENOTSUP; + } + } else + vtnet_init_locked(sc); + + if (error == 0) + sc->vtnet_if_flags = ifp->if_flags; + VTNET_UNLOCK(sc); + break; + + case SIOCADDMULTI: + case SIOCDELMULTI: + VTNET_LOCK(sc); + if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) && + (ifp->if_drv_flags & IFF_DRV_RUNNING)) + vtnet_rx_filter_mac(sc); + VTNET_UNLOCK(sc); + break; + + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = ifmedia_ioctl(ifp, ifr, &sc->vtnet_media, cmd); + break; + + case SIOCSIFCAP: + mask = ifr->ifr_reqcap ^ ifp->if_capenable; + +#ifdef DEVICE_POLLING + if (mask & IFCAP_POLLING) { + if (ifr->ifr_reqcap & IFCAP_POLLING) { + error = 
ether_poll_register(vtnet_poll, ifp); + if (error) + break; + + VTNET_LOCK(sc); + vtnet_disable_rx_intr(sc); + vtnet_disable_tx_intr(sc); + ifp->if_capenable |= IFCAP_POLLING; + VTNET_UNLOCK(sc); + } else { + error = ether_poll_deregister(ifp); + + /* Enable interrupts even in error case. */ + VTNET_LOCK(sc); + vtnet_enable_tx_intr(sc); + vtnet_enable_rx_intr(sc); + ifp->if_capenable &= ~IFCAP_POLLING; + VTNET_UNLOCK(sc); + } + } +#endif + VTNET_LOCK(sc); + + if (mask & IFCAP_TXCSUM) { + ifp->if_capenable ^= IFCAP_TXCSUM; + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist |= VTNET_CSUM_OFFLOAD; + else + ifp->if_hwassist &= ~VTNET_CSUM_OFFLOAD; + } + + if (mask & IFCAP_TSO4) { + ifp->if_capenable ^= IFCAP_TSO4; + if (ifp->if_capenable & IFCAP_TSO4) + ifp->if_hwassist |= CSUM_TSO; + else + ifp->if_hwassist &= ~CSUM_TSO; + } + + if (mask & IFCAP_RXCSUM) { + ifp->if_capenable ^= IFCAP_RXCSUM; + reinit = 1; + } + + if (mask & IFCAP_LRO) { + ifp->if_capenable ^= IFCAP_LRO; + reinit = 1; + } + + if (mask & IFCAP_VLAN_HWFILTER) { + ifp->if_capenable ^= IFCAP_VLAN_HWFILTER; + reinit = 1; + } + + if (mask & IFCAP_VLAN_HWTSO) + ifp->if_capenable ^= IFCAP_VLAN_HWTSO; + + if (mask & IFCAP_VLAN_HWTAGGING) + ifp->if_capenable ^= IFCAP_VLAN_HWTAGGING; + + if (reinit && (ifp->if_drv_flags & IFF_DRV_RUNNING)) { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + vtnet_init_locked(sc); + } + VLAN_CAPABILITIES(ifp); + + VTNET_UNLOCK(sc); + break; + + default: + error = ether_ioctl(ifp, cmd, data); + break; + } + + VTNET_LOCK_ASSERT_NOTOWNED(sc); + + return (error); +} + +static int +vtnet_change_mtu(struct vtnet_softc *sc, int new_mtu) +{ + struct ifnet *ifp; + int new_frame_size, clsize; + + ifp = sc->vtnet_ifp; + + if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { + new_frame_size = sizeof(struct vtnet_rx_header) + + sizeof(struct ether_vlan_header) + new_mtu; + + if (new_frame_size > MJUM9BYTES) + return (EINVAL); + + if (new_frame_size <= MCLBYTES) + clsize = MCLBYTES; + else + clsize = MJUM9BYTES; + } else { + new_frame_size = sizeof(struct virtio_net_hdr_mrg_rxbuf) + + sizeof(struct ether_vlan_header) + new_mtu; + + if (new_frame_size <= MCLBYTES) + clsize = MCLBYTES; + else + clsize = MJUMPAGESIZE; + } + + sc->vtnet_rx_mbuf_size = clsize; + sc->vtnet_rx_mbuf_count = VTNET_NEEDED_RX_MBUFS(sc); + KASSERT(sc->vtnet_rx_mbuf_count < VTNET_MAX_RX_SEGS, + ("too many rx mbufs: %d", sc->vtnet_rx_mbuf_count)); + + ifp->if_mtu = new_mtu; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + vtnet_init_locked(sc); + } + + return (0); +} + +static int +vtnet_init_rx_vq(struct vtnet_softc *sc) +{ + struct virtqueue *vq; + int nbufs, error; + + vq = sc->vtnet_rx_vq; + nbufs = 0; + error = ENOSPC; + + while (!virtqueue_full(vq)) { + if ((error = vtnet_newbuf(sc)) != 0) + break; + nbufs++; + } + + if (nbufs > 0) { + virtqueue_notify(vq); + + /* + * EMSGSIZE signifies the virtqueue did not have enough + * entries available to hold the last mbuf. This is not + * an error. We should not get ENOSPC since we check if + * the virtqueue is full before attempting to add a + * buffer. 
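+ * A single receive buffer may span several descriptors (e.g. the
+ * LRO_NOMRG mbuf chains), so the enqueue can still fail with
+ * EMSGSIZE even though the virtqueue is not completely full.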
+ */ + if (error == EMSGSIZE) + error = 0; + } + + return (error); +} + +static void +vtnet_free_rx_mbufs(struct vtnet_softc *sc) +{ + struct virtqueue *vq; + struct mbuf *m; + int last; + + vq = sc->vtnet_rx_vq; + last = 0; + + while ((m = virtqueue_drain(vq, &last)) != NULL) + m_freem(m); + + KASSERT(virtqueue_empty(vq), ("mbufs remaining in Rx Vq")); +} + +static void +vtnet_free_tx_mbufs(struct vtnet_softc *sc) +{ + struct virtqueue *vq; + struct vtnet_tx_header *txhdr; + int last; + + vq = sc->vtnet_tx_vq; + last = 0; + + while ((txhdr = virtqueue_drain(vq, &last)) != NULL) { + m_freem(txhdr->vth_mbuf); + uma_zfree(vtnet_tx_header_zone, txhdr); + } + + KASSERT(virtqueue_empty(vq), ("mbufs remaining in Tx Vq")); +} + +static void +vtnet_free_ctrl_vq(struct vtnet_softc *sc) +{ + + /* + * The control virtqueue is only polled, therefore + * it should already be empty. + */ + KASSERT(virtqueue_empty(sc->vtnet_ctrl_vq), + ("Ctrl Vq not empty")); +} + +#ifdef DEVICE_POLLING +static int +vtnet_poll(struct ifnet *ifp, enum poll_cmd cmd, int count) +{ + struct vtnet_softc *sc; + int rx_done; + + sc = ifp->if_softc; + rx_done = 0; + + VTNET_LOCK(sc); + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + if (cmd == POLL_AND_CHECK_STATUS) + vtnet_update_link_status(sc); + + if (virtqueue_nused(sc->vtnet_rx_vq) > 0) + vtnet_rxeof(sc, count, &rx_done); + + vtnet_txeof(sc); + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + vtnet_start_locked(ifp); + } + VTNET_UNLOCK(sc); + + return (rx_done); +} +#endif /* DEVICE_POLLING */ + +static struct mbuf * +vtnet_alloc_rxbuf(struct vtnet_softc *sc, int nbufs, struct mbuf **m_tailp) +{ + struct mbuf *m_head, *m_tail, *m; + int i, clsize; + + clsize = sc->vtnet_rx_mbuf_size; + + m_head = m_getjcl(M_DONTWAIT, MT_DATA, M_PKTHDR, clsize); + if (m_head == NULL) + goto fail; + + m_head->m_len = clsize; + m_tail = m_head; + + if (nbufs > 1) { + KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, + ("chained Rx mbuf requested without LRO_NOMRG")); + + for (i = 0; i < nbufs - 1; i++) { + m = m_getjcl(M_DONTWAIT, MT_DATA, 0, clsize); + if (m == NULL) + goto fail; + + m->m_len = clsize; + m_tail->m_next = m; + m_tail = m; + } + } + + if (m_tailp != NULL) + *m_tailp = m_tail; + + return (m_head); + +fail: + sc->vtnet_stats.mbuf_alloc_failed++; + m_freem(m_head); + + return (NULL); +} + +static int +vtnet_replace_rxbuf(struct vtnet_softc *sc, struct mbuf *m0, int len0) +{ + struct mbuf *m, *m_prev; + struct mbuf *m_new, *m_tail; + int len, clsize, nreplace, error; + + m = m0; + m_prev = NULL; + len = len0; + + m_tail = NULL; + clsize = sc->vtnet_rx_mbuf_size; + nreplace = 0; + + if (m->m_next != NULL) + KASSERT(sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG, + ("chained Rx mbuf without LRO_NOMRG")); + + /* + * Since LRO_NOMRG mbuf chains are so large, we want to avoid + * allocating an entire chain for each received frame. When + * the received frame's length is less than that of the chain, + * the unused mbufs are reassigned to the new chain. + */ + while (len > 0) { + /* + * Something is seriously wrong if we received + * a frame larger than the mbuf chain. Drop it. 
+ */ + if (m == NULL) { + sc->vtnet_stats.rx_frame_too_large++; + return (EMSGSIZE); + } + + KASSERT(m->m_len == clsize, + ("mbuf length not expected cluster size: %d", + m->m_len)); + + m->m_len = MIN(m->m_len, len); + len -= m->m_len; + + m_prev = m; + m = m->m_next; + nreplace++; + } + + KASSERT(m_prev != NULL, ("m_prev == NULL")); + KASSERT(nreplace <= sc->vtnet_rx_mbuf_count, + ("too many replacement mbufs: %d/%d", nreplace, + sc->vtnet_rx_mbuf_count)); + + m_new = vtnet_alloc_rxbuf(sc, nreplace, &m_tail); + if (m_new == NULL) { + m_prev->m_len = clsize; + return (ENOBUFS); + } + + /* + * Move unused mbufs, if any, from the original chain + * onto the end of the new chain. + */ + if (m_prev->m_next != NULL) { + m_tail->m_next = m_prev->m_next; + m_prev->m_next = NULL; + } + + error = vtnet_enqueue_rxbuf(sc, m_new); + if (error) { + /* + * BAD! We could not enqueue the replacement mbuf chain. We + * must restore the m0 chain to the original state if it was + * modified so we can subsequently discard it. + * + * NOTE: The replacement is suppose to be an identical copy + * to the one just dequeued so this is an unexpected error. + */ + sc->vtnet_stats.rx_enq_replacement_failed++; + + if (m_tail->m_next != NULL) { + m_prev->m_next = m_tail->m_next; + m_tail->m_next = NULL; + } + + m_prev->m_len = clsize; + m_freem(m_new); + } + + return (error); +} + +static int +vtnet_newbuf(struct vtnet_softc *sc) +{ + struct mbuf *m; + int error; + + m = vtnet_alloc_rxbuf(sc, sc->vtnet_rx_mbuf_count, NULL); + if (m == NULL) + return (ENOBUFS); + + error = vtnet_enqueue_rxbuf(sc, m); + if (error) + m_freem(m); + + return (error); +} + +static void +vtnet_discard_merged_rxbuf(struct vtnet_softc *sc, int nbufs) +{ + struct virtqueue *vq; + struct mbuf *m; + + vq = sc->vtnet_rx_vq; + + while (--nbufs > 0) { + if ((m = virtqueue_dequeue(vq, NULL)) == NULL) + break; + vtnet_discard_rxbuf(sc, m); + } +} + +static void +vtnet_discard_rxbuf(struct vtnet_softc *sc, struct mbuf *m) +{ + int error; + + /* + * Requeue the discarded mbuf. This should always be + * successful since it was just dequeued. + */ + error = vtnet_enqueue_rxbuf(sc, m); + KASSERT(error == 0, ("cannot requeue discarded mbuf")); +} + +static int +vtnet_enqueue_rxbuf(struct vtnet_softc *sc, struct mbuf *m) +{ + struct sglist sg; + struct sglist_seg segs[VTNET_MAX_RX_SEGS]; + struct vtnet_rx_header *rxhdr; + struct virtio_net_hdr *hdr; + uint8_t *mdata; + int offset, error; + + VTNET_LOCK_ASSERT(sc); + if ((sc->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0) + KASSERT(m->m_next == NULL, ("chained Rx mbuf")); + + sglist_init(&sg, VTNET_MAX_RX_SEGS, segs); + + mdata = mtod(m, uint8_t *); + offset = 0; + + if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { + rxhdr = (struct vtnet_rx_header *) mdata; + hdr = &rxhdr->vrh_hdr; + offset += sizeof(struct vtnet_rx_header); + + error = sglist_append(&sg, hdr, sc->vtnet_hdr_size); + KASSERT(error == 0, ("cannot add header to sglist")); + } + + error = sglist_append(&sg, mdata + offset, m->m_len - offset); + if (error) + return (error); + + if (m->m_next != NULL) { + error = sglist_append_mbuf(&sg, m->m_next); + if (error) + return (error); + } + + return (virtqueue_enqueue(sc->vtnet_rx_vq, m, &sg, 0, sg.sg_nseg)); +} + +static void +vtnet_vlan_tag_remove(struct mbuf *m) +{ + struct ether_vlan_header *evl; + + evl = mtod(m, struct ether_vlan_header *); + + m->m_pkthdr.ether_vtag = ntohs(evl->evl_tag); + m->m_flags |= M_VLANTAG; + + /* Strip the 802.1Q header. 
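+ * The destination and source addresses are slid forward over the
+ * start of the tag, then m_adj() trims the now-duplicate leading
+ * ETHER_VLAN_ENCAP_LEN bytes from the mbuf.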
*/ + bcopy((char *) evl, (char *) evl + ETHER_VLAN_ENCAP_LEN, + ETHER_HDR_LEN - ETHER_TYPE_LEN); + m_adj(m, ETHER_VLAN_ENCAP_LEN); +} + +#ifdef notyet +static int +vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m, + struct virtio_net_hdr *hdr) +{ + struct ether_header *eh; + struct ether_vlan_header *evh; + struct ip *ip; + struct ip6_hdr *ip6; + struct udphdr *udp; + int ip_offset, csum_start, csum_offset, hlen; + uint16_t eth_type; + uint8_t ip_proto; + + /* + * Convert the VirtIO checksum interface to FreeBSD's interface. + * The host only provides us with the offset at which to start + * checksumming, and the offset from that to place the completed + * checksum. While this maps well with how Linux does checksums, + * for FreeBSD, we must parse the received packet in order to set + * the appropriate CSUM_* flags. + */ + + /* + * Every mbuf added to the receive virtqueue is always at least + * MCLBYTES big, so assume something is amiss if the first mbuf + * does not contain both the Ethernet and protocol headers. + */ + ip_offset = sizeof(struct ether_header); + if (m->m_len < ip_offset) + return (1); + + eh = mtod(m, struct ether_header *); + eth_type = ntohs(eh->ether_type); + if (eth_type == ETHERTYPE_VLAN) { + ip_offset = sizeof(struct ether_vlan_header); + if (m->m_len < ip_offset) + return (1); + evh = mtod(m, struct ether_vlan_header *); + eth_type = ntohs(evh->evl_proto); + } + + switch (eth_type) { + case ETHERTYPE_IP: + if (m->m_len < ip_offset + sizeof(struct ip)) + return (1); + + ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset); + /* Sanity check the IP header. */ + if (ip->ip_v != IPVERSION) + return (1); + hlen = ip->ip_hl << 2; + if (hlen < sizeof(struct ip)) + return (1); + if (ntohs(ip->ip_len) < hlen) + return (1); + if (ntohs(ip->ip_len) != (m->m_pkthdr.len - ip_offset)) + return (1); + + ip_proto = ip->ip_p; + csum_start = ip_offset + hlen; + break; + + case ETHERTYPE_IPV6: + if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) + return (1); + + /* + * XXX FreeBSD does not handle any IPv6 checksum offloading + * at the moment. + */ + + ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset); + /* XXX Assume no extension headers are present. */ + ip_proto = ip6->ip6_nxt; + csum_start = ip_offset + sizeof(struct ip6_hdr); + break; + + default: + sc->vtnet_stats.rx_csum_bad_ethtype++; + return (1); + } + + /* Assume checksum begins right after the IP header. */ + if (hdr->csum_start != csum_start) { + sc->vtnet_stats.rx_csum_bad_start++; + return (1); + } + + switch (ip_proto) { + case IPPROTO_TCP: + csum_offset = offsetof(struct tcphdr, th_sum); + break; + + case IPPROTO_UDP: + csum_offset = offsetof(struct udphdr, uh_sum); + break; + + case IPPROTO_SCTP: + csum_offset = offsetof(struct sctphdr, checksum); + break; + + default: + sc->vtnet_stats.rx_csum_bad_ipproto++; + return (1); + } + + if (hdr->csum_offset != csum_offset) { + sc->vtnet_stats.rx_csum_bad_offset++; + return (1); + } + + /* + * The IP header checksum is almost certainly valid but I'm + * uncertain if that is guaranteed. 
+ * + * m->m_pkthdr.csum_flags |= CSUM_IP_CHECKED | CSUM_IP_VALID; + */ + + switch (ip_proto) { + case IPPROTO_UDP: + if (m->m_len < csum_start + sizeof(struct udphdr)) + return (1); + + udp = (struct udphdr *)(mtod(m, uint8_t *) + csum_start); + if (udp->uh_sum == 0) + return (0); + + /* FALLTHROUGH */ + + case IPPROTO_TCP: + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + break; + + case IPPROTO_SCTP: + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; + break; + } + + sc->vtnet_stats.rx_csum_offloaded++; + + return (0); +} +#endif + +/* + * Alternative method of doing receive checksum offloading. Rather + * than parsing the received frame down to the IP header, use the + * csum_offset to determine which CSUM_* flags are appropriate. We + * can get by with doing this only because the checksum offsets are + * unique for the things we care about. + */ +static int +vtnet_rx_csum(struct vtnet_softc *sc, struct mbuf *m, + struct virtio_net_hdr *hdr) +{ + struct ether_header *eh; + struct ether_vlan_header *evh; + struct udphdr *udp; + int csum_len; + uint16_t eth_type; + + csum_len = hdr->csum_start + hdr->csum_offset; + + if (csum_len < sizeof(struct ether_header) + sizeof(struct ip)) + return (1); + if (m->m_len < csum_len) + return (1); + + eh = mtod(m, struct ether_header *); + eth_type = ntohs(eh->ether_type); + if (eth_type == ETHERTYPE_VLAN) { + evh = mtod(m, struct ether_vlan_header *); + eth_type = ntohs(evh->evl_proto); + } + + if (eth_type != ETHERTYPE_IP && eth_type != ETHERTYPE_IPV6) { + sc->vtnet_stats.rx_csum_bad_ethtype++; + return (1); + } + + /* Use the offset to determine the appropriate CSUM_* flags. */ + switch (hdr->csum_offset) { + case offsetof(struct udphdr, uh_sum): + if (m->m_len < hdr->csum_start + sizeof(struct udphdr)) + return (1); + udp = (struct udphdr *)(mtod(m, uint8_t *) + hdr->csum_start); + if (udp->uh_sum == 0) + return (0); + + /* FALLTHROUGH */ + + case offsetof(struct tcphdr, th_sum): + m->m_pkthdr.csum_flags |= CSUM_DATA_VALID | CSUM_PSEUDO_HDR; + m->m_pkthdr.csum_data = 0xFFFF; + break; + + case offsetof(struct sctphdr, checksum): + m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID; + break; + + default: + sc->vtnet_stats.rx_csum_bad_offset++; + return (1); + } + + sc->vtnet_stats.rx_csum_offloaded++; + + return (0); +} + +static int +vtnet_rxeof_merged(struct vtnet_softc *sc, struct mbuf *m_head, int nbufs) +{ + struct ifnet *ifp; + struct virtqueue *vq; + struct mbuf *m, *m_tail; + int len; + + ifp = sc->vtnet_ifp; + vq = sc->vtnet_rx_vq; + m_tail = m_head; + + while (--nbufs > 0) { + m = virtqueue_dequeue(vq, &len); + if (m == NULL) { + ifp->if_ierrors++; + goto fail; + } + + if (vtnet_newbuf(sc) != 0) { + ifp->if_iqdrops++; + vtnet_discard_rxbuf(sc, m); + if (nbufs > 1) + vtnet_discard_merged_rxbuf(sc, nbufs); + goto fail; + } + + if (m->m_len < len) + len = m->m_len; + + m->m_len = len; + m->m_flags &= ~M_PKTHDR; + + m_head->m_pkthdr.len += len; + m_tail->m_next = m; + m_tail = m; + } + + return (0); + +fail: + sc->vtnet_stats.rx_mergeable_failed++; + m_freem(m_head); + + return (1); +} + +static int +vtnet_rxeof(struct vtnet_softc *sc, int count, int *rx_npktsp) +{ + struct virtio_net_hdr lhdr; + struct ifnet *ifp; + struct virtqueue *vq; + struct mbuf *m; + struct ether_header *eh; + struct virtio_net_hdr *hdr; + struct virtio_net_hdr_mrg_rxbuf *mhdr; + int len, deq, nbufs, adjsz, rx_npkts; + + ifp = sc->vtnet_ifp; + vq = sc->vtnet_rx_vq; + hdr = &lhdr; + deq = 0; + rx_npkts = 0; + + VTNET_LOCK_ASSERT(sc); + 
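+	/*
+	 * Dequeue up to "count" frames, replacing each receive buffer
+	 * with a fresh mbuf before passing the frame up the stack.
+	 */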
+ while (--count >= 0) { + m = virtqueue_dequeue(vq, &len); + if (m == NULL) + break; + deq++; + + if (len < sc->vtnet_hdr_size + ETHER_HDR_LEN) { + ifp->if_ierrors++; + vtnet_discard_rxbuf(sc, m); + continue; + } + + if ((sc->vtnet_flags & VTNET_FLAG_MRG_RXBUFS) == 0) { + nbufs = 1; + adjsz = sizeof(struct vtnet_rx_header); + /* + * Account for our pad between the header and + * the actual start of the frame. + */ + len += VTNET_RX_HEADER_PAD; + } else { + mhdr = mtod(m, struct virtio_net_hdr_mrg_rxbuf *); + nbufs = mhdr->num_buffers; + adjsz = sizeof(struct virtio_net_hdr_mrg_rxbuf); + } + + if (vtnet_replace_rxbuf(sc, m, len) != 0) { + ifp->if_iqdrops++; + vtnet_discard_rxbuf(sc, m); + if (nbufs > 1) + vtnet_discard_merged_rxbuf(sc, nbufs); + continue; + } + + m->m_pkthdr.len = len; + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.csum_flags = 0; + + if (nbufs > 1) { + if (vtnet_rxeof_merged(sc, m, nbufs) != 0) + continue; + } + + ifp->if_ipackets++; + + /* + * Save copy of header before we strip it. For both mergeable + * and non-mergeable, the VirtIO header is placed first in the + * mbuf's data. We no longer need num_buffers, so always use a + * virtio_net_hdr. + */ + memcpy(hdr, mtod(m, void *), sizeof(struct virtio_net_hdr)); + m_adj(m, adjsz); + + if (ifp->if_capenable & IFCAP_VLAN_HWTAGGING) { + eh = mtod(m, struct ether_header *); + if (eh->ether_type == htons(ETHERTYPE_VLAN)) { + vtnet_vlan_tag_remove(m); + + /* + * With the 802.1Q header removed, update the + * checksum starting location accordingly. + */ + if (hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) + hdr->csum_start -= + ETHER_VLAN_ENCAP_LEN; + } + } + + if (ifp->if_capenable & IFCAP_RXCSUM && + hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) { + if (vtnet_rx_csum(sc, m, hdr) != 0) + sc->vtnet_stats.rx_csum_failed++; + } + + VTNET_UNLOCK(sc); + rx_npkts++; + (*ifp->if_input)(ifp, m); + VTNET_LOCK(sc); + + /* + * The interface may have been stopped while we were + * passing the packet up the network stack. + */ + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) + break; + } + + virtqueue_notify(vq); + + if (rx_npktsp != NULL) + *rx_npktsp = rx_npkts; + + return (count > 0 ? 
0 : EAGAIN); +} + +static void +vtnet_rx_intr_task(void *arg, int pending) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + int more; + + sc = arg; + ifp = sc->vtnet_ifp; + + VTNET_LOCK(sc); + +#ifdef DEVICE_POLLING + if (ifp->if_capenable & IFCAP_POLLING) { + VTNET_UNLOCK(sc); + return; + } +#endif + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + vtnet_enable_rx_intr(sc); + VTNET_UNLOCK(sc); + return; + } + + more = vtnet_rxeof(sc, sc->vtnet_rx_process_limit, NULL); + if (!more && vtnet_enable_rx_intr(sc) != 0) { + vtnet_disable_rx_intr(sc); + more = 1; + } + + VTNET_UNLOCK(sc); + + if (more) { + sc->vtnet_stats.rx_task_rescheduled++; + taskqueue_enqueue_fast(sc->vtnet_tq, + &sc->vtnet_rx_intr_task); + } +} + +static int +vtnet_rx_vq_intr(void *xsc) +{ + struct vtnet_softc *sc; + + sc = xsc; + + vtnet_disable_rx_intr(sc); + taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_rx_intr_task); + + return (1); +} + +static void +vtnet_txeof(struct vtnet_softc *sc) +{ + struct virtqueue *vq; + struct ifnet *ifp; + struct vtnet_tx_header *txhdr; + int deq; + + vq = sc->vtnet_tx_vq; + ifp = sc->vtnet_ifp; + deq = 0; + + VTNET_LOCK_ASSERT(sc); + + while ((txhdr = virtqueue_dequeue(vq, NULL)) != NULL) { + deq++; + ifp->if_opackets++; + m_freem(txhdr->vth_mbuf); + uma_zfree(vtnet_tx_header_zone, txhdr); + } + + if (deq > 0) { + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + if (virtqueue_empty(vq)) + sc->vtnet_watchdog_timer = 0; + } +} + +static struct mbuf * +vtnet_tx_offload(struct vtnet_softc *sc, struct mbuf *m, + struct virtio_net_hdr *hdr) +{ + struct ifnet *ifp; + struct ether_header *eh; + struct ether_vlan_header *evh; + struct ip *ip; + struct ip6_hdr *ip6; + struct tcphdr *tcp; + int ip_offset; + uint16_t eth_type, csum_start; + uint8_t ip_proto, gso_type; + + ifp = sc->vtnet_ifp; + M_ASSERTPKTHDR(m); + + ip_offset = sizeof(struct ether_header); + if (m->m_len < ip_offset) { + if ((m = m_pullup(m, ip_offset)) == NULL) + return (NULL); + } + + eh = mtod(m, struct ether_header *); + eth_type = ntohs(eh->ether_type); + if (eth_type == ETHERTYPE_VLAN) { + ip_offset = sizeof(struct ether_vlan_header); + if (m->m_len < ip_offset) { + if ((m = m_pullup(m, ip_offset)) == NULL) + return (NULL); + } + evh = mtod(m, struct ether_vlan_header *); + eth_type = ntohs(evh->evl_proto); + } + + switch (eth_type) { + case ETHERTYPE_IP: + if (m->m_len < ip_offset + sizeof(struct ip)) { + m = m_pullup(m, ip_offset + sizeof(struct ip)); + if (m == NULL) + return (NULL); + } + + ip = (struct ip *)(mtod(m, uint8_t *) + ip_offset); + ip_proto = ip->ip_p; + csum_start = ip_offset + (ip->ip_hl << 2); + gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + break; + + case ETHERTYPE_IPV6: + if (m->m_len < ip_offset + sizeof(struct ip6_hdr)) { + m = m_pullup(m, ip_offset + sizeof(struct ip6_hdr)); + if (m == NULL) + return (NULL); + } + + ip6 = (struct ip6_hdr *)(mtod(m, uint8_t *) + ip_offset); + /* + * XXX Assume no extension headers are present. Presently, + * this will always be true in the case of TSO, and FreeBSD + * does not perform checksum offloading of IPv6 yet. 
+ */ + ip_proto = ip6->ip6_nxt; + csum_start = ip_offset + sizeof(struct ip6_hdr); + gso_type = VIRTIO_NET_HDR_GSO_TCPV6; + break; + + default: + return (m); + } + + if (m->m_pkthdr.csum_flags & VTNET_CSUM_OFFLOAD) { + hdr->flags |= VIRTIO_NET_HDR_F_NEEDS_CSUM; + hdr->csum_start = csum_start; + hdr->csum_offset = m->m_pkthdr.csum_data; + + sc->vtnet_stats.tx_csum_offloaded++; + } + + if (m->m_pkthdr.csum_flags & CSUM_TSO) { + if (ip_proto != IPPROTO_TCP) + return (m); + + if (m->m_len < csum_start + sizeof(struct tcphdr)) { + m = m_pullup(m, csum_start + sizeof(struct tcphdr)); + if (m == NULL) + return (NULL); + } + + tcp = (struct tcphdr *)(mtod(m, uint8_t *) + csum_start); + hdr->gso_type = gso_type; + hdr->hdr_len = csum_start + (tcp->th_off << 2); + hdr->gso_size = m->m_pkthdr.tso_segsz; + + if (tcp->th_flags & TH_CWR) { + /* + * Drop if we did not negotiate VIRTIO_NET_F_HOST_ECN. + * ECN support is only configurable globally with the + * net.inet.tcp.ecn.enable sysctl knob. + */ + if ((sc->vtnet_flags & VTNET_FLAG_TSO_ECN) == 0) { + if_printf(ifp, "TSO with ECN not supported " + "by host\n"); + m_freem(m); + return (NULL); + } + + hdr->flags |= VIRTIO_NET_HDR_GSO_ECN; + } + + sc->vtnet_stats.tx_tso_offloaded++; + } + + return (m); +} + +static int +vtnet_enqueue_txbuf(struct vtnet_softc *sc, struct mbuf **m_head, + struct vtnet_tx_header *txhdr) +{ + struct sglist sg; + struct sglist_seg segs[VTNET_MAX_TX_SEGS]; + struct virtqueue *vq; + struct mbuf *m; + int collapsed, error; + + vq = sc->vtnet_tx_vq; + m = *m_head; + collapsed = 0; + + sglist_init(&sg, VTNET_MAX_TX_SEGS, segs); + error = sglist_append(&sg, &txhdr->vth_uhdr, sc->vtnet_hdr_size); + KASSERT(error == 0 && sg.sg_nseg == 1, + ("cannot add header to sglist")); + +again: + error = sglist_append_mbuf(&sg, m); + if (error) { + if (collapsed) + goto fail; + + m = m_collapse(m, M_DONTWAIT, VTNET_MAX_TX_SEGS - 1); + if (m == NULL) + goto fail; + + *m_head = m; + collapsed = 1; + goto again; + } + + txhdr->vth_mbuf = m; + + return (virtqueue_enqueue(vq, txhdr, &sg, sg.sg_nseg, 0)); + +fail: + m_freem(*m_head); + *m_head = NULL; + + return (ENOBUFS); +} + +static int +vtnet_encap(struct vtnet_softc *sc, struct mbuf **m_head) +{ + struct vtnet_tx_header *txhdr; + struct virtio_net_hdr *hdr; + struct mbuf *m; + int error; + + txhdr = uma_zalloc(vtnet_tx_header_zone, M_NOWAIT | M_ZERO); + if (txhdr == NULL) + return (ENOMEM); + + /* + * Always use the non-mergeable header to simplify things. When + * the mergeable feature is negotiated, the num_buffers field + * must be set to zero. We use vtnet_hdr_size later to enqueue + * the correct header size to the host. 
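+ * The header was allocated with M_ZERO above, so num_buffers is
+ * already zero whenever the mergeable layout is in use.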
+ */ + hdr = &txhdr->vth_uhdr.hdr; + m = *m_head; + + error = ENOBUFS; + + if (m->m_flags & M_VLANTAG) { + m = ether_vlanencap(m, m->m_pkthdr.ether_vtag); + if ((*m_head = m) == NULL) + goto fail; + m->m_flags &= ~M_VLANTAG; + } + + if (m->m_pkthdr.csum_flags != 0) { + m = vtnet_tx_offload(sc, m, hdr); + if ((*m_head = m) == NULL) + goto fail; + } + + error = vtnet_enqueue_txbuf(sc, m_head, txhdr); +fail: + if (error) + uma_zfree(vtnet_tx_header_zone, txhdr); + + return (error); +} + +static void +vtnet_start(struct ifnet *ifp) +{ + struct vtnet_softc *sc; + + sc = ifp->if_softc; + + VTNET_LOCK(sc); + vtnet_start_locked(ifp); + VTNET_UNLOCK(sc); +} + +static void +vtnet_start_locked(struct ifnet *ifp) +{ + struct vtnet_softc *sc; + struct virtqueue *vq; + struct mbuf *m0; + int enq; + + sc = ifp->if_softc; + vq = sc->vtnet_tx_vq; + enq = 0; + + VTNET_LOCK_ASSERT(sc); + + if ((ifp->if_drv_flags & (IFF_DRV_RUNNING | IFF_DRV_OACTIVE)) != + IFF_DRV_RUNNING || ((sc->vtnet_flags & VTNET_FLAG_LINK) == 0)) + return; + +#ifdef VTNET_TX_INTR_MODERATION + if (virtqueue_nused(vq) >= sc->vtnet_tx_size / 2) + vtnet_txeof(sc); +#endif + + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + if (virtqueue_full(vq)) { + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; + } + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m0); + if (m0 == NULL) + break; + + if (vtnet_encap(sc, &m0) != 0) { + if (m0 == NULL) + break; + IFQ_DRV_PREPEND(&ifp->if_snd, m0); + ifp->if_drv_flags |= IFF_DRV_OACTIVE; + break; + } + + enq++; + ETHER_BPF_MTAP(ifp, m0); + } + + if (enq > 0) { + virtqueue_notify(vq); + sc->vtnet_watchdog_timer = VTNET_WATCHDOG_TIMEOUT; + } +} + +static void +vtnet_tick(void *xsc) +{ + struct vtnet_softc *sc; + + sc = xsc; + + VTNET_LOCK_ASSERT(sc); +#ifdef VTNET_DEBUG + virtqueue_dump(sc->vtnet_rx_vq); + virtqueue_dump(sc->vtnet_tx_vq); +#endif + + vtnet_watchdog(sc); + callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); +} + +static void +vtnet_tx_intr_task(void *arg, int pending) +{ + struct vtnet_softc *sc; + struct ifnet *ifp; + + sc = arg; + ifp = sc->vtnet_ifp; + + VTNET_LOCK(sc); + +#ifdef DEVICE_POLLING + if (ifp->if_capenable & IFCAP_POLLING) { + VTNET_UNLOCK(sc); + return; + } +#endif + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0) { + vtnet_enable_tx_intr(sc); + VTNET_UNLOCK(sc); + return; + } + + vtnet_txeof(sc); + + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + vtnet_start_locked(ifp); + + if (vtnet_enable_tx_intr(sc) != 0) { + vtnet_disable_tx_intr(sc); + sc->vtnet_stats.tx_task_rescheduled++; + VTNET_UNLOCK(sc); + taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_tx_intr_task); + return; + } + + VTNET_UNLOCK(sc); +} + +static int +vtnet_tx_vq_intr(void *xsc) +{ + struct vtnet_softc *sc; + + sc = xsc; + + vtnet_disable_tx_intr(sc); + taskqueue_enqueue_fast(sc->vtnet_tq, &sc->vtnet_tx_intr_task); + + return (1); +} + +static void +vtnet_stop(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + VTNET_LOCK_ASSERT(sc); + + sc->vtnet_watchdog_timer = 0; + callout_stop(&sc->vtnet_tick_ch); + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + vtnet_disable_rx_intr(sc); + vtnet_disable_tx_intr(sc); + + /* + * Stop the host VirtIO adapter. Note this will reset the host + * adapter's state back to the pre-initialized state, so in + * order to make the device usable again, we must drive it + * through virtio_reinit() and virtio_reinit_complete(). 
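+ * vtnet_init_locked() performs that reinitialization sequence.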
+ */ + virtio_stop(dev); + + sc->vtnet_flags &= ~VTNET_FLAG_LINK; + + vtnet_free_rx_mbufs(sc); + vtnet_free_tx_mbufs(sc); +} + +static int +vtnet_reinit(struct vtnet_softc *sc) +{ + struct ifnet *ifp; + uint64_t features; + + ifp = sc->vtnet_ifp; + features = sc->vtnet_features; + + /* + * Re-negotiate with the host, removing any disabled receive + * features. Transmit features are disabled only on our side + * via if_capenable and if_hwassist. + */ + + if (ifp->if_capabilities & IFCAP_RXCSUM) { + if ((ifp->if_capenable & IFCAP_RXCSUM) == 0) + features &= ~VIRTIO_NET_F_GUEST_CSUM; + } + + if (ifp->if_capabilities & IFCAP_LRO) { + if ((ifp->if_capenable & IFCAP_LRO) == 0) + features &= ~VTNET_LRO_FEATURES; + } + + if (ifp->if_capabilities & IFCAP_VLAN_HWFILTER) { + if ((ifp->if_capenable & IFCAP_VLAN_HWFILTER) == 0) + features &= ~VIRTIO_NET_F_CTRL_VLAN; + } + + return (virtio_reinit(sc->vtnet_dev, features)); +} + +static void +vtnet_init_locked(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + int error; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + VTNET_LOCK_ASSERT(sc); + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + /* Stop host's adapter, cancel any pending I/O. */ + vtnet_stop(sc); + + /* Reinitialize the host device. */ + error = vtnet_reinit(sc); + if (error) { + device_printf(dev, + "reinitialization failed, stopping device...\n"); + vtnet_stop(sc); + return; + } + + /* Update host with assigned MAC address. */ + bcopy(IF_LLADDR(ifp), sc->vtnet_hwaddr, ETHER_ADDR_LEN); + vtnet_set_hwaddr(sc); + + ifp->if_hwassist = 0; + if (ifp->if_capenable & IFCAP_TXCSUM) + ifp->if_hwassist |= VTNET_CSUM_OFFLOAD; + if (ifp->if_capenable & IFCAP_TSO4) + ifp->if_hwassist |= CSUM_TSO; + + error = vtnet_init_rx_vq(sc); + if (error) { + device_printf(dev, + "cannot allocate mbufs for Rx virtqueue\n"); + vtnet_stop(sc); + return; + } + + if (sc->vtnet_flags & VTNET_FLAG_CTRL_VQ) { + if (sc->vtnet_flags & VTNET_FLAG_CTRL_RX) { + /* Restore promiscuous and all-multicast modes. */ + vtnet_rx_filter(sc); + + /* Restore filtered MAC addresses. */ + vtnet_rx_filter_mac(sc); + } + + /* Restore VLAN filters. */ + if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) + vtnet_rx_filter_vlan(sc); + } + +#ifdef DEVICE_POLLING + if (ifp->if_capenable & IFCAP_POLLING) { + vtnet_disable_rx_intr(sc); + vtnet_disable_tx_intr(sc); + } else +#endif + { + vtnet_enable_rx_intr(sc); + vtnet_enable_tx_intr(sc); + } + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + virtio_reinit_complete(dev); + + vtnet_update_link_status(sc); + callout_reset(&sc->vtnet_tick_ch, hz, vtnet_tick, sc); +} + +static void +vtnet_init(void *xsc) +{ + struct vtnet_softc *sc; + + sc = xsc; + + VTNET_LOCK(sc); + vtnet_init_locked(sc); + VTNET_UNLOCK(sc); +} + +static void +vtnet_exec_ctrl_cmd(struct vtnet_softc *sc, void *cookie, + struct sglist *sg, int readable, int writable) +{ + struct virtqueue *vq; + void *c; + + vq = sc->vtnet_ctrl_vq; + + VTNET_LOCK_ASSERT(sc); + KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_VQ, + ("no control virtqueue")); + KASSERT(virtqueue_empty(vq), + ("control command already enqueued")); + + if (virtqueue_enqueue(vq, cookie, sg, readable, writable) != 0) + return; + + virtqueue_notify(vq); + + /* + * Poll until the command is complete. Previously, we would + * sleep until the control virtqueue interrupt handler woke + * us up, but dropping the VTNET_MTX leads to serialization + * difficulties. 
+ * + * Furthermore, it appears QEMU/KVM only allocates three MSIX + * vectors. Two of those vectors are needed for the Rx and Tx + * virtqueues. We do not support sharing both a Vq and config + * changed notification on the same MSIX vector. + */ + c = virtqueue_poll(vq, NULL); + KASSERT(c == cookie, ("unexpected control command response")); +} + +static void +vtnet_rx_filter(struct vtnet_softc *sc) +{ + device_t dev; + struct ifnet *ifp; + + dev = sc->vtnet_dev; + ifp = sc->vtnet_ifp; + + VTNET_LOCK_ASSERT(sc); + KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, + ("CTRL_RX feature not negotiated")); + + if (vtnet_set_promisc(sc, ifp->if_flags & IFF_PROMISC) != 0) + device_printf(dev, "cannot %s promiscuous mode\n", + ifp->if_flags & IFF_PROMISC ? "enable" : "disable"); + + if (vtnet_set_allmulti(sc, ifp->if_flags & IFF_ALLMULTI) != 0) + device_printf(dev, "cannot %s all-multicast mode\n", + ifp->if_flags & IFF_ALLMULTI ? "enable" : "disable"); +} + +static int +vtnet_ctrl_rx_cmd(struct vtnet_softc *sc, int cmd, int on) +{ + struct virtio_net_ctrl_hdr hdr; + struct sglist_seg segs[3]; + struct sglist sg; + uint8_t onoff, ack; + int error; + + if ((sc->vtnet_flags & VTNET_FLAG_CTRL_RX) == 0) + return (ENOTSUP); + + error = 0; + + hdr.class = VIRTIO_NET_CTRL_RX; + hdr.cmd = cmd; + onoff = !!on; + ack = VIRTIO_NET_ERR; + + sglist_init(&sg, 3, segs); + error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, &onoff, sizeof(uint8_t)); + error |= sglist_append(&sg, &ack, sizeof(uint8_t)); + KASSERT(error == 0 && sg.sg_nseg == 3, + ("error adding Rx filter message to sglist")); + + vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); + + return (ack == VIRTIO_NET_OK ? 0 : EIO); +} + +static int +vtnet_set_promisc(struct vtnet_softc *sc, int on) +{ + + return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_PROMISC, on)); +} + +static int +vtnet_set_allmulti(struct vtnet_softc *sc, int on) +{ + + return (vtnet_ctrl_rx_cmd(sc, VIRTIO_NET_CTRL_RX_ALLMULTI, on)); +} + +static void +vtnet_rx_filter_mac(struct vtnet_softc *sc) +{ + struct virtio_net_ctrl_hdr hdr; + struct vtnet_mac_filter *filter; + struct sglist_seg segs[4]; + struct sglist sg; + struct ifnet *ifp; + struct ifaddr *ifa; + struct ifmultiaddr *ifma; + int ucnt, mcnt, promisc, allmulti, error; + uint8_t ack; + + ifp = sc->vtnet_ifp; + ucnt = 0; + mcnt = 0; + promisc = 0; + allmulti = 0; + error = 0; + + VTNET_LOCK_ASSERT(sc); + KASSERT(sc->vtnet_flags & VTNET_FLAG_CTRL_RX, + ("CTRL_RX feature not negotiated")); + + /* + * Allocate the MAC filtering table. Note we could do this + * at attach time, but it is probably not worth keeping it + * around for an infrequent occurrence. 
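+ * The allocation uses M_NOWAIT since the VTNET lock is held; if it
+ * fails, the host filter table is simply left unchanged.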
+ */ + filter = malloc(sizeof(struct vtnet_mac_filter), M_DEVBUF, + M_NOWAIT | M_ZERO); + if (filter == NULL) { + device_printf(sc->vtnet_dev, + "cannot allocate MAC address filtering table\n"); + return; + } + + /* Unicast MAC addresses: */ + if_addr_rlock(ifp); + TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { + if (ifa->ifa_addr->sa_family != AF_LINK) + continue; + else if (ucnt == VTNET_MAX_MAC_ENTRIES) + break; + + bcopy(LLADDR((struct sockaddr_dl *)ifa->ifa_addr), + &filter->vmf_unicast.macs[ucnt], ETHER_ADDR_LEN); + ucnt++; + } + if_addr_runlock(ifp); + + if (ucnt >= VTNET_MAX_MAC_ENTRIES) { + promisc = 1; + filter->vmf_unicast.nentries = 0; + + if_printf(ifp, "more than %d MAC addresses assigned, " + "falling back to promiscuous mode\n", + VTNET_MAX_MAC_ENTRIES); + } else + filter->vmf_unicast.nentries = ucnt; + + /* Multicast MAC addresses: */ + if_maddr_rlock(ifp); + TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) { + if (ifma->ifma_addr->sa_family != AF_LINK) + continue; + else if (mcnt == VTNET_MAX_MAC_ENTRIES) + break; + + bcopy(LLADDR((struct sockaddr_dl *)ifma->ifma_addr), + &filter->vmf_multicast.macs[mcnt], ETHER_ADDR_LEN); + mcnt++; + } + if_maddr_runlock(ifp); + + if (mcnt >= VTNET_MAX_MAC_ENTRIES) { + allmulti = 1; + filter->vmf_multicast.nentries = 0; + + if_printf(ifp, "more than %d multicast MAC addresses " + "assigned, falling back to all-multicast mode\n", + VTNET_MAX_MAC_ENTRIES); + } else + filter->vmf_multicast.nentries = mcnt; + + if (promisc && allmulti) + goto out; + + hdr.class = VIRTIO_NET_CTRL_MAC; + hdr.cmd = VIRTIO_NET_CTRL_MAC_TABLE_SET; + ack = VIRTIO_NET_ERR; + + sglist_init(&sg, 4, segs); + error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, &filter->vmf_unicast, + sizeof(struct vtnet_mac_table)); + error |= sglist_append(&sg, &filter->vmf_multicast, + sizeof(struct vtnet_mac_table)); + error |= sglist_append(&sg, &ack, sizeof(uint8_t)); + KASSERT(error == 0 && sg.sg_nseg == 4, + ("error adding MAC filtering message to sglist")); + + vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); + + if (ack != VIRTIO_NET_OK) + if_printf(ifp, "error setting host MAC filter table\n"); + +out: + free(filter, M_DEVBUF); + + if (promisc) + if (vtnet_set_promisc(sc, 1) != 0) + if_printf(ifp, "cannot enable promiscuous mode\n"); + if (allmulti) + if (vtnet_set_allmulti(sc, 1) != 0) + if_printf(ifp, "cannot enable all-multicast mode\n"); +} + +static int +vtnet_exec_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) +{ + struct virtio_net_ctrl_hdr hdr; + struct sglist_seg segs[3]; + struct sglist sg; + uint8_t ack; + int error; + + hdr.class = VIRTIO_NET_CTRL_VLAN; + hdr.cmd = add ? VIRTIO_NET_CTRL_VLAN_ADD : VIRTIO_NET_CTRL_VLAN_DEL; + ack = VIRTIO_NET_ERR; + error = 0; + + sglist_init(&sg, 3, segs); + error |= sglist_append(&sg, &hdr, sizeof(struct virtio_net_ctrl_hdr)); + error |= sglist_append(&sg, &tag, sizeof(uint16_t)); + error |= sglist_append(&sg, &ack, sizeof(uint8_t)); + KASSERT(error == 0 && sg.sg_nseg == 3, + ("error adding VLAN control message to sglist")); + + vtnet_exec_ctrl_cmd(sc, &ack, &sg, sg.sg_nseg - 1, 1); + + return (ack == VIRTIO_NET_OK ? 
0 : EIO); +} + +static void +vtnet_rx_filter_vlan(struct vtnet_softc *sc) +{ + device_t dev; + uint32_t w, mask; + uint16_t tag; + int i, nvlans, error; + + VTNET_LOCK_ASSERT(sc); + KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, + ("VLAN_FILTER feature not negotiated")); + + dev = sc->vtnet_dev; + nvlans = sc->vtnet_nvlans; + error = 0; + + /* Enable filtering for each configured VLAN. */ + for (i = 0; i < VTNET_VLAN_SHADOW_SIZE && nvlans > 0; i++) { + w = sc->vtnet_vlan_shadow[i]; + for (mask = 1, tag = i * 32; w != 0; mask <<= 1, tag++) { + if ((w & mask) != 0) { + w &= ~mask; + nvlans--; + if (vtnet_exec_vlan_filter(sc, 1, tag) != 0) + error++; + } + } + } + + KASSERT(nvlans == 0, ("VLAN count incorrect")); + if (error) + device_printf(dev, "cannot restore VLAN filter table\n"); +} + +static void +vtnet_set_vlan_filter(struct vtnet_softc *sc, int add, uint16_t tag) +{ + struct ifnet *ifp; + int idx, bit; + + KASSERT(sc->vtnet_flags & VTNET_FLAG_VLAN_FILTER, + ("VLAN_FILTER feature not negotiated")); + + if ((tag == 0) || (tag > 4095)) + return; + + ifp = sc->vtnet_ifp; + idx = (tag >> 5) & 0x7F; + bit = tag & 0x1F; + + VTNET_LOCK(sc); + + /* Update shadow VLAN table. */ + if (add) { + sc->vtnet_nvlans++; + sc->vtnet_vlan_shadow[idx] |= (1 << bit); + } else { + sc->vtnet_nvlans--; + sc->vtnet_vlan_shadow[idx] &= ~(1 << bit); + } + + if (ifp->if_capenable & IFCAP_VLAN_HWFILTER) { + if (vtnet_exec_vlan_filter(sc, add, tag) != 0) { + device_printf(sc->vtnet_dev, + "cannot %s VLAN %d %s the host filter table\n", + add ? "add" : "remove", tag, + add ? "to" : "from"); + } + } + + VTNET_UNLOCK(sc); +} + +static void +vtnet_register_vlan(void *arg, struct ifnet *ifp, uint16_t tag) +{ + + if (ifp->if_softc != arg) + return; + + vtnet_set_vlan_filter(arg, 1, tag); +} + +static void +vtnet_unregister_vlan(void *arg, struct ifnet *ifp, uint16_t tag) +{ + + if (ifp->if_softc != arg) + return; + + vtnet_set_vlan_filter(arg, 0, tag); +} + +static int +vtnet_ifmedia_upd(struct ifnet *ifp) +{ + struct vtnet_softc *sc; + struct ifmedia *ifm; + + sc = ifp->if_softc; + ifm = &sc->vtnet_media; + + if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER) + return (EINVAL); + + return (0); +} + +static void +vtnet_ifmedia_sts(struct ifnet *ifp, struct ifmediareq *ifmr) +{ + struct vtnet_softc *sc; + + sc = ifp->if_softc; + + ifmr->ifm_status = IFM_AVALID; + ifmr->ifm_active = IFM_ETHER; + + VTNET_LOCK(sc); + if (vtnet_is_link_up(sc) != 0) { + ifmr->ifm_status |= IFM_ACTIVE; + ifmr->ifm_active |= VTNET_MEDIATYPE; + } else + ifmr->ifm_active |= IFM_NONE; + VTNET_UNLOCK(sc); +} + +static void +vtnet_add_statistics(struct vtnet_softc *sc) +{ + device_t dev; + struct vtnet_statistics *stats; + struct sysctl_ctx_list *ctx; + struct sysctl_oid *tree; + struct sysctl_oid_list *child; + + dev = sc->vtnet_dev; + stats = &sc->vtnet_stats; + ctx = device_get_sysctl_ctx(dev); + tree = device_get_sysctl_tree(dev); + child = SYSCTL_CHILDREN(tree); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "mbuf_alloc_failed", + CTLFLAG_RD, &stats->mbuf_alloc_failed, + "Mbuf cluster allocation failures"); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_frame_too_large", + CTLFLAG_RD, &stats->rx_frame_too_large, + "Received frame larger than the mbuf chain"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_enq_replacement_failed", + CTLFLAG_RD, &stats->rx_enq_replacement_failed, + "Enqueuing the replacement receive mbuf failed"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_mergeable_failed", + CTLFLAG_RD, &stats->rx_mergeable_failed, + "Mergeable buffers 
receive failures"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ethtype", + CTLFLAG_RD, &stats->rx_csum_bad_ethtype, + "Received checksum offloaded buffer with unsupported " + "Ethernet type"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_start", + CTLFLAG_RD, &stats->rx_csum_bad_start, + "Received checksum offloaded buffer with incorrect start offset"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_ipproto", + CTLFLAG_RD, &stats->rx_csum_bad_ipproto, + "Received checksum offloaded buffer with incorrect IP protocol"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_bad_offset", + CTLFLAG_RD, &stats->rx_csum_bad_offset, + "Received checksum offloaded buffer with incorrect offset"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_failed", + CTLFLAG_RD, &stats->rx_csum_failed, + "Received buffer checksum offload failed"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_csum_offloaded", + CTLFLAG_RD, &stats->rx_csum_offloaded, + "Received buffer checksum offload succeeded"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "rx_task_rescheduled", + CTLFLAG_RD, &stats->rx_task_rescheduled, + "Times the receive interrupt task rescheduled itself"); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_offloaded", + CTLFLAG_RD, &stats->tx_csum_offloaded, + "Offloaded checksum of transmitted buffer"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_offloaded", + CTLFLAG_RD, &stats->tx_tso_offloaded, + "Segmentation offload of transmitted buffer"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_csum_bad_ethtype", + CTLFLAG_RD, &stats->tx_csum_bad_ethtype, + "Aborted transmit of checksum offloaded buffer with unknown " + "Ethernet type"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_tso_bad_ethtype", + CTLFLAG_RD, &stats->tx_tso_bad_ethtype, + "Aborted transmit of TSO buffer with unknown Ethernet type"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "tx_task_rescheduled", + CTLFLAG_RD, &stats->tx_task_rescheduled, + "Times the transmit interrupt task rescheduled itself"); +} + +static int +vtnet_enable_rx_intr(struct vtnet_softc *sc) +{ + + return (virtqueue_enable_intr(sc->vtnet_rx_vq)); +} + +static void +vtnet_disable_rx_intr(struct vtnet_softc *sc) +{ + + virtqueue_disable_intr(sc->vtnet_rx_vq); +} + +static int +vtnet_enable_tx_intr(struct vtnet_softc *sc) +{ + +#ifdef VTNET_TX_INTR_MODERATION + return (0); +#else + return (virtqueue_enable_intr(sc->vtnet_tx_vq)); +#endif +} + +static void +vtnet_disable_tx_intr(struct vtnet_softc *sc) +{ + + virtqueue_disable_intr(sc->vtnet_tx_vq); +} diff --git a/sys/dev/virtio/network/if_vtnetvar.h b/sys/dev/virtio/network/if_vtnetvar.h new file mode 100644 index 0000000..613b2b0 --- /dev/null +++ b/sys/dev/virtio/network/if_vtnetvar.h @@ -0,0 +1,240 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IF_VTNETVAR_H +#define _IF_VTNETVAR_H + +struct vtnet_statistics { + unsigned long mbuf_alloc_failed; + + unsigned long rx_frame_too_large; + unsigned long rx_enq_replacement_failed; + unsigned long rx_mergeable_failed; + unsigned long rx_csum_bad_ethtype; + unsigned long rx_csum_bad_start; + unsigned long rx_csum_bad_ipproto; + unsigned long rx_csum_bad_offset; + unsigned long rx_csum_failed; + unsigned long rx_csum_offloaded; + unsigned long rx_task_rescheduled; + + unsigned long tx_csum_offloaded; + unsigned long tx_tso_offloaded; + unsigned long tx_csum_bad_ethtype; + unsigned long tx_tso_bad_ethtype; + unsigned long tx_task_rescheduled; +}; + +struct vtnet_softc { + device_t vtnet_dev; + struct ifnet *vtnet_ifp; + struct mtx vtnet_mtx; + + uint32_t vtnet_flags; +#define VTNET_FLAG_LINK 0x0001 +#define VTNET_FLAG_SUSPENDED 0x0002 +#define VTNET_FLAG_CTRL_VQ 0x0004 +#define VTNET_FLAG_CTRL_RX 0x0008 +#define VTNET_FLAG_VLAN_FILTER 0x0010 +#define VTNET_FLAG_TSO_ECN 0x0020 +#define VTNET_FLAG_MRG_RXBUFS 0x0040 +#define VTNET_FLAG_LRO_NOMRG 0x0080 + + struct virtqueue *vtnet_rx_vq; + struct virtqueue *vtnet_tx_vq; + struct virtqueue *vtnet_ctrl_vq; + + int vtnet_hdr_size; + int vtnet_tx_size; + int vtnet_rx_size; + int vtnet_rx_process_limit; + int vtnet_rx_mbuf_size; + int vtnet_rx_mbuf_count; + int vtnet_if_flags; + int vtnet_watchdog_timer; + uint64_t vtnet_features; + + struct taskqueue *vtnet_tq; + struct task vtnet_rx_intr_task; + struct task vtnet_tx_intr_task; + struct task vtnet_cfgchg_task; + + struct vtnet_statistics vtnet_stats; + + struct callout vtnet_tick_ch; + + eventhandler_tag vtnet_vlan_attach; + eventhandler_tag vtnet_vlan_detach; + + struct ifmedia vtnet_media; + /* + * Fake media type; the host does not provide us with + * any real media information. + */ +#define VTNET_MEDIATYPE (IFM_ETHER | IFM_1000_T | IFM_FDX) + char vtnet_hwaddr[ETHER_ADDR_LEN]; + + /* + * During reset, the host's VLAN filtering table is lost. The + * array below is used to restore all the VLANs configured on + * this interface after a reset. + */ +#define VTNET_VLAN_SHADOW_SIZE (4096 / 32) + int vtnet_nvlans; + uint32_t vtnet_vlan_shadow[VTNET_VLAN_SHADOW_SIZE]; + + char vtnet_mtx_name[16]; +}; + +/* + * When mergeable buffers are not negotiated, the vtnet_rx_header structure + * below is placed at the beginning of the mbuf data. Use 4 bytes of pad to + * both keep the VirtIO header and the data non-contiguous and to keep the + * frame's payload 4 byte aligned. + * + * When mergeable buffers are negotiated, the host puts the VirtIO header in + * the beginning of the first mbuf's data. 
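+ *
+ * In the non-mergeable case, the 10 byte virtio_net_hdr plus the
+ * 4 byte pad give a 14 byte in-mbuf header, so the Ethernet frame
+ * that follows has its IP payload 4 byte aligned.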
+ */ +#define VTNET_RX_HEADER_PAD 4 +struct vtnet_rx_header { + struct virtio_net_hdr vrh_hdr; + char vrh_pad[VTNET_RX_HEADER_PAD]; +} __packed; + +/* + * For each outgoing frame, the vtnet_tx_header below is allocated from + * the vtnet_tx_header_zone. + */ +struct vtnet_tx_header { + union { + struct virtio_net_hdr hdr; + struct virtio_net_hdr_mrg_rxbuf mhdr; + } vth_uhdr; + + struct mbuf *vth_mbuf; +}; + +/* + * The VirtIO specification does not place a limit on the number of MAC + * addresses the guest driver may request to be filtered. In practice, + * the host is constrained by available resources. To simplify this driver, + * impose a reasonably high limit of MAC addresses we will filter before + * falling back to promiscuous or all-multicast modes. + */ +#define VTNET_MAX_MAC_ENTRIES 128 + +struct vtnet_mac_table { + uint32_t nentries; + uint8_t macs[VTNET_MAX_MAC_ENTRIES][ETHER_ADDR_LEN]; +} __packed; + +struct vtnet_mac_filter { + struct vtnet_mac_table vmf_unicast; + uint32_t vmf_pad; /* Make tables non-contiguous. */ + struct vtnet_mac_table vmf_multicast; +}; + +/* + * The MAC filter table is malloc(9)'d when needed. Ensure it will + * always fit in one segment. + */ +CTASSERT(sizeof(struct vtnet_mac_filter) <= PAGE_SIZE); + +#define VTNET_WATCHDOG_TIMEOUT 5 +#define VTNET_CSUM_OFFLOAD (CSUM_TCP | CSUM_UDP | CSUM_SCTP) + +/* Features desired/implemented by this driver. */ +#define VTNET_FEATURES \ + (VIRTIO_NET_F_MAC | \ + VIRTIO_NET_F_STATUS | \ + VIRTIO_NET_F_CTRL_VQ | \ + VIRTIO_NET_F_CTRL_RX | \ + VIRTIO_NET_F_CTRL_VLAN | \ + VIRTIO_NET_F_CSUM | \ + VIRTIO_NET_F_HOST_TSO4 | \ + VIRTIO_NET_F_HOST_TSO6 | \ + VIRTIO_NET_F_HOST_ECN | \ + VIRTIO_NET_F_GUEST_CSUM | \ + VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_GUEST_TSO6 | \ + VIRTIO_NET_F_GUEST_ECN | \ + VIRTIO_NET_F_MRG_RXBUF | \ + VIRTIO_RING_F_INDIRECT_DESC) + +/* + * The VIRTIO_NET_F_GUEST_TSO[46] features permit the host to send us + * frames larger than 1514 bytes. We do not yet support software LRO + * via tcp_lro_rx(). + */ +#define VTNET_LRO_FEATURES (VIRTIO_NET_F_GUEST_TSO4 | \ + VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN) + +#define VTNET_MAX_MTU 65536 +#define VTNET_MAX_RX_SIZE 65550 + +/* + * Used to preallocate the Vq indirect descriptors. The first segment + * is reserved for the header. + */ +#define VTNET_MIN_RX_SEGS 2 +#define VTNET_MAX_RX_SEGS 34 +#define VTNET_MAX_TX_SEGS 34 + +/* + * Assert we can receive and transmit the maximum with regular + * size clusters. + */ +CTASSERT(((VTNET_MAX_RX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_RX_SIZE); +CTASSERT(((VTNET_MAX_TX_SEGS - 1) * MCLBYTES) >= VTNET_MAX_MTU); + +/* + * Determine how many mbufs are in each receive buffer. For LRO without + * mergeable descriptors, we must allocate an mbuf chain large enough to + * hold both the vtnet_rx_header and the maximum receivable data. + */ +#define VTNET_NEEDED_RX_MBUFS(_sc) \ + ((_sc)->vtnet_flags & VTNET_FLAG_LRO_NOMRG) == 0 ? 
1 : \ + howmany(sizeof(struct vtnet_rx_header) + VTNET_MAX_RX_SIZE, \ + (_sc)->vtnet_rx_mbuf_size) + +#define VTNET_MTX(_sc) &(_sc)->vtnet_mtx +#define VTNET_LOCK(_sc) mtx_lock(VTNET_MTX((_sc))) +#define VTNET_UNLOCK(_sc) mtx_unlock(VTNET_MTX((_sc))) +#define VTNET_LOCK_DESTROY(_sc) mtx_destroy(VTNET_MTX((_sc))) +#define VTNET_LOCK_ASSERT(_sc) mtx_assert(VTNET_MTX((_sc)), MA_OWNED) +#define VTNET_LOCK_ASSERT_NOTOWNED(_sc) \ + mtx_assert(VTNET_MTX((_sc)), MA_NOTOWNED) + +#define VTNET_LOCK_INIT(_sc) do { \ + snprintf((_sc)->vtnet_mtx_name, sizeof((_sc)->vtnet_mtx_name), \ + "%s", device_get_nameunit((_sc)->vtnet_dev)); \ + mtx_init(VTNET_MTX((_sc)), (_sc)->vtnet_mtx_name, \ + "VTNET Core Lock", MTX_DEF); \ +} while (0) + +#endif /* _IF_VTNETVAR_H */ diff --git a/sys/dev/virtio/network/virtio_net.h b/sys/dev/virtio/network/virtio_net.h new file mode 100644 index 0000000..7361aa1 --- /dev/null +++ b/sys/dev/virtio/network/virtio_net.h @@ -0,0 +1,138 @@ +/* + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_NET_H +#define _VIRTIO_NET_H + +#include <sys/types.h> + +/* The feature bitmap for virtio net */ +#define VIRTIO_NET_F_CSUM 0x00001 /* Host handles pkts w/ partial csum */ +#define VIRTIO_NET_F_GUEST_CSUM 0x00002 /* Guest handles pkts w/ partial csum*/ +#define VIRTIO_NET_F_MAC 0x00020 /* Host has given MAC address. */ +#define VIRTIO_NET_F_GSO 0x00040 /* Host handles pkts w/ any GSO type */ +#define VIRTIO_NET_F_GUEST_TSO4 0x00080 /* Guest can handle TSOv4 in. */ +#define VIRTIO_NET_F_GUEST_TSO6 0x00100 /* Guest can handle TSOv6 in. */ +#define VIRTIO_NET_F_GUEST_ECN 0x00200 /* Guest can handle TSO[6] w/ ECN in.*/ +#define VIRTIO_NET_F_GUEST_UFO 0x00400 /* Guest can handle UFO in. */ +#define VIRTIO_NET_F_HOST_TSO4 0x00800 /* Host can handle TSOv4 in. */ +#define VIRTIO_NET_F_HOST_TSO6 0x01000 /* Host can handle TSOv6 in. */ +#define VIRTIO_NET_F_HOST_ECN 0x02000 /* Host can handle TSO[6] w/ ECN in. */ +#define VIRTIO_NET_F_HOST_UFO 0x04000 /* Host can handle UFO in. */ +#define VIRTIO_NET_F_MRG_RXBUF 0x08000 /* Host can merge receive buffers. */ +#define VIRTIO_NET_F_STATUS 0x10000 /* virtio_net_config.status available*/ +#define VIRTIO_NET_F_CTRL_VQ 0x20000 /* Control channel available */ +#define VIRTIO_NET_F_CTRL_RX 0x40000 /* Control channel RX mode support */ +#define VIRTIO_NET_F_CTRL_VLAN 0x80000 /* Control channel VLAN filtering */ +#define VIRTIO_NET_F_CTRL_RX_EXTRA 0x100000 /* Extra RX mode control support */ + +#define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ + +struct virtio_net_config { + /* The config defining mac address (if VIRTIO_NET_F_MAC) */ + uint8_t mac[ETHER_ADDR_LEN]; + /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ + uint16_t status; +} __packed; + +/* + * This is the first element of the scatter-gather list. If you don't + * specify GSO or CSUM features, you can simply ignore the header. 
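+ *
+ * When VIRTIO_NET_HDR_F_NEEDS_CSUM is set, the checksum is computed
+ * over the data beginning at csum_start and the result is stored at
+ * csum_start + csum_offset.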
+ */ +struct virtio_net_hdr { +#define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 /* Use csum_start,csum_offset*/ + uint8_t flags; +#define VIRTIO_NET_HDR_GSO_NONE 0 /* Not a GSO frame */ +#define VIRTIO_NET_HDR_GSO_TCPV4 1 /* GSO frame, IPv4 TCP (TSO) */ +#define VIRTIO_NET_HDR_GSO_UDP 3 /* GSO frame, IPv4 UDP (UFO) */ +#define VIRTIO_NET_HDR_GSO_TCPV6 4 /* GSO frame, IPv6 TCP */ +#define VIRTIO_NET_HDR_GSO_ECN 0x80 /* TCP has ECN set */ + uint8_t gso_type; + uint16_t hdr_len; /* Ethernet + IP + tcp/udp hdrs */ + uint16_t gso_size; /* Bytes to append to hdr_len per frame */ + uint16_t csum_start; /* Position to start checksumming from */ + uint16_t csum_offset; /* Offset after that to place checksum */ +}; + +/* + * This is the version of the header to use when the MRG_RXBUF + * feature has been negotiated. + */ +struct virtio_net_hdr_mrg_rxbuf { + struct virtio_net_hdr hdr; + uint16_t num_buffers; /* Number of merged rx buffers */ +}; + +/* + * Control virtqueue data structures + * + * The control virtqueue expects a header in the first sg entry + * and an ack/status response in the last entry. Data for the + * command goes in between. + */ +struct virtio_net_ctrl_hdr { + uint8_t class; + uint8_t cmd; +} __packed; + +typedef uint8_t virtio_net_ctrl_ack; + +#define VIRTIO_NET_OK 0 +#define VIRTIO_NET_ERR 1 + +/* + * Control the RX mode, ie. promiscuous, allmulti, etc... + * All commands require an "out" sg entry containing a 1 byte + * state value, zero = disable, non-zero = enable. Commands + * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature. + * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA. + */ +#define VIRTIO_NET_CTRL_RX 0 +#define VIRTIO_NET_CTRL_RX_PROMISC 0 +#define VIRTIO_NET_CTRL_RX_ALLMULTI 1 +#define VIRTIO_NET_CTRL_RX_ALLUNI 2 +#define VIRTIO_NET_CTRL_RX_NOMULTI 3 +#define VIRTIO_NET_CTRL_RX_NOUNI 4 +#define VIRTIO_NET_CTRL_RX_NOBCAST 5 + +/* + * Control the MAC filter table. + * + * The MAC filter table is managed by the hypervisor, the guest should + * assume the size is infinite. Filtering should be considered + * non-perfect, ie. based on hypervisor resources, the guest may + * received packets from sources not specified in the filter list. + * + * In addition to the class/cmd header, the TABLE_SET command requires + * two out scatterlists. Each contains a 4 byte count of entries followed + * by a concatenated byte stream of the ETH_ALEN MAC addresses. The + * first sg list contains unicast addresses, the second is for multicast. + * This functionality is present if the VIRTIO_NET_F_CTRL_RX feature + * is available. + */ +struct virtio_net_ctrl_mac { + uint32_t entries; + uint8_t macs[][ETHER_ADDR_LEN]; +} __packed; + +#define VIRTIO_NET_CTRL_MAC 1 +#define VIRTIO_NET_CTRL_MAC_TABLE_SET 0 + +/* + * Control VLAN filtering + * + * The VLAN filter table is controlled via a simple ADD/DEL interface. + * VLAN IDs not added may be filtered by the hypervisor. Del is the + * opposite of add. Both commands expect an out entry containing a 2 + * byte VLAN ID. VLAN filtering is available with the + * VIRTIO_NET_F_CTRL_VLAN feature bit. + */ +#define VIRTIO_NET_CTRL_VLAN 2 +#define VIRTIO_NET_CTRL_VLAN_ADD 0 +#define VIRTIO_NET_CTRL_VLAN_DEL 1 + +#endif /* _VIRTIO_NET_H */ diff --git a/sys/dev/virtio/pci/virtio_pci.c b/sys/dev/virtio/pci/virtio_pci.c new file mode 100644 index 0000000..dd348a5 --- /dev/null +++ b/sys/dev/virtio/pci/virtio_pci.c @@ -0,0 +1,1081 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* Driver for the VirtIO PCI interface. */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <sys/bus.h> +#include <sys/rman.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <dev/virtio/virtio.h> +#include <dev/virtio/virtqueue.h> +#include <dev/virtio/pci/virtio_pci.h> + +#include "virtio_bus_if.h" +#include "virtio_if.h" + +struct vtpci_softc { + device_t vtpci_dev; + struct resource *vtpci_res; + struct resource *vtpci_msix_res; + uint64_t vtpci_features; + uint32_t vtpci_flags; +#define VIRTIO_PCI_FLAG_NO_MSI 0x0001 +#define VIRTIO_PCI_FLAG_MSI 0x0002 +#define VIRTIO_PCI_FLAG_NO_MSIX 0x0010 +#define VIRTIO_PCI_FLAG_MSIX 0x0020 +#define VIRTIO_PCI_FLAG_SHARED_MSIX 0x0040 + + device_t vtpci_child_dev; + struct virtio_feature_desc *vtpci_child_feat_desc; + + /* + * Ideally, each virtqueue that the driver provides a callback for + * will receive its own MSIX vector. If there are not sufficient + * vectors available, we will then attempt to have all the VQs + * share one vector. Note that when using MSIX, the configuration + * changed notifications must be on their own vector. + * + * If MSIX is not available, we will attempt to have the whole + * device share one MSI vector, and then, finally, one legacy + * interrupt. + */ + int vtpci_nvqs; + struct vtpci_virtqueue { + struct virtqueue *vq; + + /* Index into vtpci_intr_res[] below. Unused, then -1. */ + int ires_idx; + } vtpci_vqx[VIRTIO_MAX_VIRTQUEUES]; + + /* + * When using MSIX interrupts, the first element of vtpci_intr_res[] + * is always the configuration changed notifications. The remaining + * element(s) are used for the virtqueues. + * + * With MSI and legacy interrupts, only the first element of + * vtpci_intr_res[] is used. 
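+ *
+ * vtpci_nintr_res below is the number of vtpci_intr_res[] entries
+ * actually in use.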
+ */ + int vtpci_nintr_res; + struct vtpci_intr_resource { + struct resource *irq; + int rid; + void *intrhand; + } vtpci_intr_res[1 + VIRTIO_MAX_VIRTQUEUES]; +}; + +static int vtpci_probe(device_t); +static int vtpci_attach(device_t); +static int vtpci_detach(device_t); +static int vtpci_suspend(device_t); +static int vtpci_resume(device_t); +static int vtpci_shutdown(device_t); +static void vtpci_driver_added(device_t, driver_t *); +static void vtpci_child_detached(device_t, device_t); +static int vtpci_read_ivar(device_t, device_t, int, uintptr_t *); +static int vtpci_write_ivar(device_t, device_t, int, uintptr_t); + +static uint64_t vtpci_negotiate_features(device_t, uint64_t); +static int vtpci_with_feature(device_t, uint64_t); +static int vtpci_alloc_virtqueues(device_t, int, int, + struct vq_alloc_info *); +static int vtpci_setup_intr(device_t, enum intr_type); +static void vtpci_stop(device_t); +static int vtpci_reinit(device_t, uint64_t); +static void vtpci_reinit_complete(device_t); +static void vtpci_notify_virtqueue(device_t, uint16_t); +static uint8_t vtpci_get_status(device_t); +static void vtpci_set_status(device_t, uint8_t); +static void vtpci_read_dev_config(device_t, bus_size_t, void *, int); +static void vtpci_write_dev_config(device_t, bus_size_t, void *, int); + +static void vtpci_describe_features(struct vtpci_softc *, const char *, + uint64_t); +static void vtpci_probe_and_attach_child(struct vtpci_softc *); + +static int vtpci_alloc_interrupts(struct vtpci_softc *, int, int, + struct vq_alloc_info *); +static int vtpci_alloc_intr_resources(struct vtpci_softc *, int, + struct vq_alloc_info *); +static int vtpci_alloc_msi(struct vtpci_softc *); +static int vtpci_alloc_msix(struct vtpci_softc *, int); +static int vtpci_register_msix_vector(struct vtpci_softc *, int, int); + +static void vtpci_free_interrupts(struct vtpci_softc *); +static void vtpci_free_virtqueues(struct vtpci_softc *); +static void vtpci_release_child_resources(struct vtpci_softc *); +static void vtpci_reset(struct vtpci_softc *); + +static int vtpci_legacy_intr(void *); +static int vtpci_vq_shared_intr(void *); +static int vtpci_vq_intr(void *); +static int vtpci_config_intr(void *); + +/* + * I/O port read/write wrappers. + */ +#define vtpci_read_config_1(sc, o) bus_read_1((sc)->vtpci_res, (o)) +#define vtpci_read_config_2(sc, o) bus_read_2((sc)->vtpci_res, (o)) +#define vtpci_read_config_4(sc, o) bus_read_4((sc)->vtpci_res, (o)) +#define vtpci_write_config_1(sc, o, v) bus_write_1((sc)->vtpci_res, (o), (v)) +#define vtpci_write_config_2(sc, o, v) bus_write_2((sc)->vtpci_res, (o), (v)) +#define vtpci_write_config_4(sc, o, v) bus_write_4((sc)->vtpci_res, (o), (v)) + +/* Tunables. */ +static int vtpci_disable_msix = 0; +TUNABLE_INT("hw.virtio.pci.disable_msix", &vtpci_disable_msix); + +static device_method_t vtpci_methods[] = { + /* Device interface. */ + DEVMETHOD(device_probe, vtpci_probe), + DEVMETHOD(device_attach, vtpci_attach), + DEVMETHOD(device_detach, vtpci_detach), + DEVMETHOD(device_suspend, vtpci_suspend), + DEVMETHOD(device_resume, vtpci_resume), + DEVMETHOD(device_shutdown, vtpci_shutdown), + + /* Bus interface. */ + DEVMETHOD(bus_driver_added, vtpci_driver_added), + DEVMETHOD(bus_child_detached, vtpci_child_detached), + DEVMETHOD(bus_read_ivar, vtpci_read_ivar), + DEVMETHOD(bus_write_ivar, vtpci_write_ivar), + + /* VirtIO bus interface. 
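These methods back the virtio_*() wrappers that child device drivers invoke through their parent bus.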
*/ + DEVMETHOD(virtio_bus_negotiate_features, vtpci_negotiate_features), + DEVMETHOD(virtio_bus_with_feature, vtpci_with_feature), + DEVMETHOD(virtio_bus_alloc_virtqueues, vtpci_alloc_virtqueues), + DEVMETHOD(virtio_bus_setup_intr, vtpci_setup_intr), + DEVMETHOD(virtio_bus_stop, vtpci_stop), + DEVMETHOD(virtio_bus_reinit, vtpci_reinit), + DEVMETHOD(virtio_bus_reinit_complete, vtpci_reinit_complete), + DEVMETHOD(virtio_bus_notify_vq, vtpci_notify_virtqueue), + DEVMETHOD(virtio_bus_read_device_config, vtpci_read_dev_config), + DEVMETHOD(virtio_bus_write_device_config, vtpci_write_dev_config), + + { 0, 0 } +}; + +static driver_t vtpci_driver = { + "virtio_pci", + vtpci_methods, + sizeof(struct vtpci_softc) +}; + +devclass_t vtpci_devclass; + +DRIVER_MODULE(virtio_pci, pci, vtpci_driver, vtpci_devclass, 0, 0); +MODULE_VERSION(virtio_pci, 1); +MODULE_DEPEND(virtio_pci, pci, 1, 1, 1); +MODULE_DEPEND(virtio_pci, virtio, 1, 1, 1); + +static int +vtpci_probe(device_t dev) +{ + char desc[36]; + const char *name; + + if (pci_get_vendor(dev) != VIRTIO_PCI_VENDORID) + return (ENXIO); + + if (pci_get_device(dev) < VIRTIO_PCI_DEVICEID_MIN || + pci_get_device(dev) > VIRTIO_PCI_DEVICEID_MAX) + return (ENXIO); + + if (pci_get_revid(dev) != VIRTIO_PCI_ABI_VERSION) + return (ENXIO); + + name = virtio_device_name(pci_get_subdevice(dev)); + if (name == NULL) + name = "Unknown"; + + snprintf(desc, sizeof(desc), "VirtIO PCI %s adapter", name); + device_set_desc_copy(dev, desc); + + return (BUS_PROBE_DEFAULT); +} + +static int +vtpci_attach(device_t dev) +{ + struct vtpci_softc *sc; + device_t child; + int rid; + + sc = device_get_softc(dev); + sc->vtpci_dev = dev; + + pci_enable_busmaster(dev); + + rid = PCIR_BAR(0); + sc->vtpci_res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid, + RF_ACTIVE); + if (sc->vtpci_res == NULL) { + device_printf(dev, "cannot map I/O space\n"); + return (ENXIO); + } + + if (pci_find_extcap(dev, PCIY_MSI, NULL) != 0) + sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSI; + + if (pci_find_extcap(dev, PCIY_MSIX, NULL) == 0) { + rid = PCIR_BAR(1); + sc->vtpci_msix_res = bus_alloc_resource_any(dev, + SYS_RES_MEMORY, &rid, RF_ACTIVE); + } + + if (sc->vtpci_msix_res == NULL) + sc->vtpci_flags |= VIRTIO_PCI_FLAG_NO_MSIX; + + vtpci_reset(sc); + + /* Tell the host we've noticed this device. 
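DRIVER is set just before the child driver is attached and DRIVER_OK once it has attached successfully.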
*/ + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); + + if ((child = device_add_child(dev, NULL, -1)) == NULL) { + device_printf(dev, "cannot create child device\n"); + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); + vtpci_detach(dev); + return (ENOMEM); + } + + sc->vtpci_child_dev = child; + vtpci_probe_and_attach_child(sc); + + return (0); +} + +static int +vtpci_detach(device_t dev) +{ + struct vtpci_softc *sc; + device_t child; + int error; + + sc = device_get_softc(dev); + + if ((child = sc->vtpci_child_dev) != NULL) { + error = device_delete_child(dev, child); + if (error) + return (error); + sc->vtpci_child_dev = NULL; + } + + vtpci_reset(sc); + + if (sc->vtpci_msix_res != NULL) { + bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(1), + sc->vtpci_msix_res); + sc->vtpci_msix_res = NULL; + } + + if (sc->vtpci_res != NULL) { + bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(0), + sc->vtpci_res); + sc->vtpci_res = NULL; + } + + return (0); +} + +static int +vtpci_suspend(device_t dev) +{ + + return (bus_generic_suspend(dev)); +} + +static int +vtpci_resume(device_t dev) +{ + + return (bus_generic_resume(dev)); +} + +static int +vtpci_shutdown(device_t dev) +{ + + (void) bus_generic_shutdown(dev); + /* Forcibly stop the host device. */ + vtpci_stop(dev); + + return (0); +} + +static void +vtpci_driver_added(device_t dev, driver_t *driver) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + vtpci_probe_and_attach_child(sc); +} + +static void +vtpci_child_detached(device_t dev, device_t child) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + vtpci_reset(sc); + vtpci_release_child_resources(sc); +} + +static int +vtpci_read_ivar(device_t dev, device_t child, int index, uintptr_t *result) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + if (sc->vtpci_child_dev != child) + return (ENOENT); + + switch (index) { + case VIRTIO_IVAR_DEVTYPE: + *result = pci_get_subdevice(dev); + break; + default: + return (ENOENT); + } + + return (0); +} + +static int +vtpci_write_ivar(device_t dev, device_t child, int index, uintptr_t value) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + if (sc->vtpci_child_dev != child) + return (ENOENT); + + switch (index) { + case VIRTIO_IVAR_FEATURE_DESC: + sc->vtpci_child_feat_desc = (void *) value; + break; + default: + return (ENOENT); + } + + return (0); +} + +static uint64_t +vtpci_negotiate_features(device_t dev, uint64_t child_features) +{ + struct vtpci_softc *sc; + uint64_t host_features, features; + + sc = device_get_softc(dev); + + host_features = vtpci_read_config_4(sc, VIRTIO_PCI_HOST_FEATURES); + vtpci_describe_features(sc, "host", host_features); + + /* + * Limit negotiated features to what the driver, virtqueue, and + * host all support. 
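+ * virtqueue_filter_features() additionally masks off transport
+ * (ring) feature bits that the virtqueue code does not handle.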
+ */ + features = host_features & child_features; + features = virtqueue_filter_features(features); + sc->vtpci_features = features; + + vtpci_describe_features(sc, "negotiated", features); + vtpci_write_config_4(sc, VIRTIO_PCI_GUEST_FEATURES, features); + + return (features); +} + +static int +vtpci_with_feature(device_t dev, uint64_t feature) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + return ((sc->vtpci_features & feature) != 0); +} + +static int +vtpci_alloc_virtqueues(device_t dev, int flags, int nvqs, + struct vq_alloc_info *vq_info) +{ + struct vtpci_softc *sc; + struct vtpci_virtqueue *vqx; + struct vq_alloc_info *info; + int queue, error; + uint16_t vq_size; + + sc = device_get_softc(dev); + + if (sc->vtpci_nvqs != 0 || nvqs <= 0 || + nvqs > VIRTIO_MAX_VIRTQUEUES) + return (EINVAL); + + error = vtpci_alloc_interrupts(sc, flags, nvqs, vq_info); + if (error) { + device_printf(dev, "cannot allocate interrupts\n"); + return (error); + } + + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) { + error = vtpci_register_msix_vector(sc, + VIRTIO_MSI_CONFIG_VECTOR, 0); + if (error) + return (error); + } + + for (queue = 0; queue < nvqs; queue++) { + vqx = &sc->vtpci_vqx[queue]; + info = &vq_info[queue]; + + vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue); + + vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM); + error = virtqueue_alloc(dev, queue, vq_size, + VIRTIO_PCI_VRING_ALIGN, 0xFFFFFFFFUL, info, &vqx->vq); + if (error) + return (error); + + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) { + error = vtpci_register_msix_vector(sc, + VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx); + if (error) + return (error); + } + + vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN, + virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT); + + *info->vqai_vq = vqx->vq; + sc->vtpci_nvqs++; + } + + return (0); +} + +static int +vtpci_setup_intr(device_t dev, enum intr_type type) +{ + struct vtpci_softc *sc; + struct vtpci_intr_resource *ires; + struct vtpci_virtqueue *vqx; + int i, flags, error; + + sc = device_get_softc(dev); + flags = type | INTR_MPSAFE; + ires = &sc->vtpci_intr_res[0]; + + if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) == 0) { + error = bus_setup_intr(dev, ires->irq, flags, + vtpci_legacy_intr, NULL, sc, &ires->intrhand); + + return (error); + } + + error = bus_setup_intr(dev, ires->irq, flags, vtpci_config_intr, + NULL, sc, &ires->intrhand); + if (error) + return (error); + + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX) { + ires = &sc->vtpci_intr_res[1]; + error = bus_setup_intr(dev, ires->irq, flags, + vtpci_vq_shared_intr, NULL, sc, &ires->intrhand); + + return (error); + } + + /* Setup an interrupt handler for each virtqueue. */ + for (i = 0; i < sc->vtpci_nvqs; i++) { + vqx = &sc->vtpci_vqx[i]; + if (vqx->ires_idx < 1) + continue; + + ires = &sc->vtpci_intr_res[vqx->ires_idx]; + error = bus_setup_intr(dev, ires->irq, flags, + vtpci_vq_intr, NULL, vqx->vq, &ires->intrhand); + if (error) + return (error); + } + + return (0); +} + +static void +vtpci_stop(device_t dev) +{ + + vtpci_reset(device_get_softc(dev)); +} + +static int +vtpci_reinit(device_t dev, uint64_t features) +{ + struct vtpci_softc *sc; + struct vtpci_virtqueue *vqx; + struct virtqueue *vq; + int queue, error; + uint16_t vq_size; + + sc = device_get_softc(dev); + + /* + * Redrive the device initialization. This is a bit of an abuse + * of the specification, but both VirtualBox and QEMU/KVM seem + * to play nice. 
We do not allow the host device to change from + * what was originally negotiated beyond what the guest driver + * changed (MSIX state should not change, number of virtqueues + * and their size remain the same, etc). + */ + + if (vtpci_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET) + vtpci_stop(dev); + + /* + * Quickly drive the status through ACK and DRIVER. The device + * does not become usable again until vtpci_reinit_complete(). + */ + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER); + + vtpci_negotiate_features(dev, features); + + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) { + error = vtpci_register_msix_vector(sc, + VIRTIO_MSI_CONFIG_VECTOR, 0); + if (error) + return (error); + } + + for (queue = 0; queue < sc->vtpci_nvqs; queue++) { + vqx = &sc->vtpci_vqx[queue]; + vq = vqx->vq; + + KASSERT(vq != NULL, ("vq %d not allocated", queue)); + vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, queue); + + vq_size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM); + error = virtqueue_reinit(vq, vq_size); + if (error) + return (error); + + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) { + error = vtpci_register_msix_vector(sc, + VIRTIO_MSI_QUEUE_VECTOR, vqx->ires_idx); + if (error) + return (error); + } + + vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN, + virtqueue_paddr(vqx->vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT); + } + + return (0); +} + +static void +vtpci_reinit_complete(device_t dev) +{ + + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); +} + +static void +vtpci_notify_virtqueue(device_t dev, uint16_t queue) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_NOTIFY, queue); +} + +static uint8_t +vtpci_get_status(device_t dev) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + return (vtpci_read_config_1(sc, VIRTIO_PCI_STATUS)); +} + +static void +vtpci_set_status(device_t dev, uint8_t status) +{ + struct vtpci_softc *sc; + + sc = device_get_softc(dev); + + if (status != VIRTIO_CONFIG_STATUS_RESET) + status |= vtpci_get_status(dev); + + vtpci_write_config_1(sc, VIRTIO_PCI_STATUS, status); +} + +static void +vtpci_read_dev_config(device_t dev, bus_size_t offset, + void *dst, int length) +{ + struct vtpci_softc *sc; + bus_size_t off; + uint8_t *d; + int size; + + sc = device_get_softc(dev); + off = VIRTIO_PCI_CONFIG(sc) + offset; + + for (d = dst; length > 0; d += size, off += size, length -= size) { + if (length >= 4) { + size = 4; + *(uint32_t *)d = vtpci_read_config_4(sc, off); + } else if (length >= 2) { + size = 2; + *(uint16_t *)d = vtpci_read_config_2(sc, off); + } else { + size = 1; + *d = vtpci_read_config_1(sc, off); + } + } +} + +static void +vtpci_write_dev_config(device_t dev, bus_size_t offset, + void *src, int length) +{ + struct vtpci_softc *sc; + bus_size_t off; + uint8_t *s; + int size; + + sc = device_get_softc(dev); + off = VIRTIO_PCI_CONFIG(sc) + offset; + + for (s = src; length > 0; s += size, off += size, length -= size) { + if (length >= 4) { + size = 4; + vtpci_write_config_4(sc, off, *(uint32_t *)s); + } else if (length >= 2) { + size = 2; + vtpci_write_config_2(sc, off, *(uint16_t *)s); + } else { + size = 1; + vtpci_write_config_1(sc, off, *s); + } + } +} + +static void +vtpci_describe_features(struct vtpci_softc *sc, const char *msg, + uint64_t features) +{ + device_t dev, child; + + dev = sc->vtpci_dev; + child = sc->vtpci_child_dev; + + if (device_is_attached(child) && bootverbose == 0) + return; + + virtio_describe(dev, msg, features, 
sc->vtpci_child_feat_desc); +} + +static void +vtpci_probe_and_attach_child(struct vtpci_softc *sc) +{ + device_t dev, child; + + dev = sc->vtpci_dev; + child = sc->vtpci_child_dev; + + if (child == NULL) + return; + + if (device_get_state(child) != DS_NOTPRESENT) + return; + + if (device_probe(child) != 0) + return; + + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER); + if (device_attach(child) != 0) { + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED); + vtpci_reset(sc); + vtpci_release_child_resources(sc); + + /* Reset status for future attempt. */ + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK); + } else + vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK); +} + +static int +vtpci_alloc_interrupts(struct vtpci_softc *sc, int flags, int nvqs, + struct vq_alloc_info *vq_info) +{ + int i, nvectors, error; + + /* + * Only allocate a vector for virtqueues that are actually + * expecting an interrupt. + */ + for (nvectors = 0, i = 0; i < nvqs; i++) + if (vq_info[i].vqai_intr != NULL) + nvectors++; + + if (vtpci_disable_msix != 0 || + sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSIX || + flags & VIRTIO_ALLOC_VQS_DISABLE_MSIX || + vtpci_alloc_msix(sc, nvectors) != 0) { + /* + * Use MSI interrupts if available. Otherwise, we fallback + * to legacy interrupts. + */ + if ((sc->vtpci_flags & VIRTIO_PCI_FLAG_NO_MSI) == 0 && + vtpci_alloc_msi(sc) == 0) + sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSI; + + sc->vtpci_nintr_res = 1; + } + + error = vtpci_alloc_intr_resources(sc, nvqs, vq_info); + + return (error); +} + +static int +vtpci_alloc_intr_resources(struct vtpci_softc *sc, int nvqs, + struct vq_alloc_info *vq_info) +{ + device_t dev; + struct resource *irq; + struct vtpci_virtqueue *vqx; + int i, rid, flags, res_idx; + + dev = sc->vtpci_dev; + flags = RF_ACTIVE; + + if ((sc->vtpci_flags & + (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) == 0) { + rid = 0; + flags |= RF_SHAREABLE; + } else + rid = 1; + + for (i = 0; i < sc->vtpci_nintr_res; i++) { + irq = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid, flags); + if (irq == NULL) + return (ENXIO); + + sc->vtpci_intr_res[i].irq = irq; + sc->vtpci_intr_res[i].rid = rid++; + } + + /* + * Map the virtqueue into the correct index in vq_intr_res[]. Note the + * first index is reserved for configuration changes notifications. + */ + for (i = 0, res_idx = 1; i < nvqs; i++) { + vqx = &sc->vtpci_vqx[i]; + + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) { + if (vq_info[i].vqai_intr == NULL) + vqx->ires_idx = -1; + else if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX) + vqx->ires_idx = res_idx; + else + vqx->ires_idx = res_idx++; + } else + vqx->ires_idx = -1; + } + + return (0); +} + +static int +vtpci_alloc_msi(struct vtpci_softc *sc) +{ + device_t dev; + int nmsi, cnt; + + dev = sc->vtpci_dev; + nmsi = pci_msi_count(dev); + + if (nmsi < 1) + return (1); + + cnt = 1; + if (pci_alloc_msi(dev, &cnt) == 0 && cnt == 1) + return (0); + + return (1); +} + +static int +vtpci_alloc_msix(struct vtpci_softc *sc, int nvectors) +{ + device_t dev; + int nmsix, cnt, required; + + dev = sc->vtpci_dev; + + nmsix = pci_msix_count(dev); + if (nmsix < 1) + return (1); + + /* An additional vector is needed for the config changes. */ + required = nvectors + 1; + if (nmsix >= required) { + cnt = required; + if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) + goto out; + + pci_release_msi(dev); + } + + /* Attempt shared MSIX configuration. 
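In this mode only two vectors are used: one for configuration changes and a second shared by all of the virtqueues.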
*/ + required = 2; + if (nmsix >= required) { + cnt = required; + if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) { + sc->vtpci_flags |= VIRTIO_PCI_FLAG_SHARED_MSIX; + goto out; + } + + pci_release_msi(dev); + } + + return (1); + +out: + sc->vtpci_nintr_res = required; + sc->vtpci_flags |= VIRTIO_PCI_FLAG_MSIX; + + if (bootverbose) { + if (sc->vtpci_flags & VIRTIO_PCI_FLAG_SHARED_MSIX) + device_printf(dev, "using shared virtqueue MSIX\n"); + else + device_printf(dev, "using per virtqueue MSIX\n"); + } + + return (0); +} + +static int +vtpci_register_msix_vector(struct vtpci_softc *sc, int offset, int res_idx) +{ + device_t dev; + uint16_t vector; + + dev = sc->vtpci_dev; + + if (offset != VIRTIO_MSI_CONFIG_VECTOR && + offset != VIRTIO_MSI_QUEUE_VECTOR) + return (EINVAL); + + if (res_idx != -1) { + /* Map from rid to host vector. */ + vector = sc->vtpci_intr_res[res_idx].rid - 1; + } else + vector = VIRTIO_MSI_NO_VECTOR; + + /* The first resource is special; make sure it is used correctly. */ + if (res_idx == 0) { + KASSERT(vector == 0, ("unexpected config vector")); + KASSERT(offset == VIRTIO_MSI_CONFIG_VECTOR, + ("unexpected config offset")); + } + + vtpci_write_config_2(sc, offset, vector); + + if (vtpci_read_config_2(sc, offset) != vector) { + device_printf(dev, "insufficient host resources for " + "MSIX interrupts\n"); + return (ENODEV); + } + + return (0); +} + +static void +vtpci_free_interrupts(struct vtpci_softc *sc) +{ + device_t dev; + struct vtpci_intr_resource *ires; + int i; + + dev = sc->vtpci_dev; + sc->vtpci_nintr_res = 0; + + if (sc->vtpci_flags & (VIRTIO_PCI_FLAG_MSI | VIRTIO_PCI_FLAG_MSIX)) { + pci_release_msi(dev); + sc->vtpci_flags &= ~(VIRTIO_PCI_FLAG_MSI | + VIRTIO_PCI_FLAG_MSIX | VIRTIO_PCI_FLAG_SHARED_MSIX); + } + + for (i = 0; i < 1 + VIRTIO_MAX_VIRTQUEUES; i++) { + ires = &sc->vtpci_intr_res[i]; + + if (ires->intrhand != NULL) { + bus_teardown_intr(dev, ires->irq, ires->intrhand); + ires->intrhand = NULL; + } + + if (ires->irq != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, ires->rid, + ires->irq); + ires->irq = NULL; + } + + ires->rid = -1; + } +} + +static void +vtpci_free_virtqueues(struct vtpci_softc *sc) +{ + struct vtpci_virtqueue *vqx; + int i; + + sc->vtpci_nvqs = 0; + + for (i = 0; i < VIRTIO_MAX_VIRTQUEUES; i++) { + vqx = &sc->vtpci_vqx[i]; + + if (vqx->vq != NULL) { + virtqueue_free(vqx->vq); + vqx->vq = NULL; + } + } +} + +static void +vtpci_release_child_resources(struct vtpci_softc *sc) +{ + + vtpci_free_interrupts(sc); + vtpci_free_virtqueues(sc); +} + +static void +vtpci_reset(struct vtpci_softc *sc) +{ + + /* + * Setting the status to RESET sets the host device to + * the original, uninitialized state. + */ + vtpci_set_status(sc->vtpci_dev, VIRTIO_CONFIG_STATUS_RESET); +} + +static int +vtpci_legacy_intr(void *xsc) +{ + struct vtpci_softc *sc; + struct vtpci_virtqueue *vqx; + int i; + uint8_t isr; + + sc = xsc; + vqx = &sc->vtpci_vqx[0]; + + /* Reading the ISR also clears it. */ + isr = vtpci_read_config_1(sc, VIRTIO_PCI_ISR); + + if (isr & VIRTIO_PCI_ISR_CONFIG) + vtpci_config_intr(sc); + + if (isr & VIRTIO_PCI_ISR_INTR) + for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) + virtqueue_intr(vqx->vq); + + return (isr ? FILTER_HANDLED : FILTER_STRAY); +} + +static int +vtpci_vq_shared_intr(void *xsc) +{ + struct vtpci_softc *sc; + struct vtpci_virtqueue *vqx; + int i, rc; + + rc = 0; + sc = xsc; + vqx = &sc->vtpci_vqx[0]; + + for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) + rc |= virtqueue_intr(vqx->vq); + + return (rc ? 
FILTER_HANDLED : FILTER_STRAY); +} + +static int +vtpci_vq_intr(void *xvq) +{ + struct virtqueue *vq; + int rc; + + vq = xvq; + rc = virtqueue_intr(vq); + + return (rc ? FILTER_HANDLED : FILTER_STRAY); +} + +static int +vtpci_config_intr(void *xsc) +{ + struct vtpci_softc *sc; + device_t child; + int rc; + + rc = 0; + sc = xsc; + child = sc->vtpci_child_dev; + + if (child != NULL) + rc = VIRTIO_CONFIG_CHANGE(child); + + return (rc ? FILTER_HANDLED : FILTER_STRAY); +} diff --git a/sys/dev/virtio/pci/virtio_pci.h b/sys/dev/virtio/pci/virtio_pci.h new file mode 100644 index 0000000..6ebfdd5 --- /dev/null +++ b/sys/dev/virtio/pci/virtio_pci.h @@ -0,0 +1,64 @@ +/* + * Copyright IBM Corp. 2007 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_PCI_H +#define _VIRTIO_PCI_H + +/* VirtIO PCI vendor/device ID. */ +#define VIRTIO_PCI_VENDORID 0x1AF4 +#define VIRTIO_PCI_DEVICEID_MIN 0x1000 +#define VIRTIO_PCI_DEVICEID_MAX 0x103F + +/* VirtIO ABI version, this must match exactly. */ +#define VIRTIO_PCI_ABI_VERSION 0 + +/* + * VirtIO Header, located in BAR 0. + */ +#define VIRTIO_PCI_HOST_FEATURES 0 /* host's supported features (32bit, RO)*/ +#define VIRTIO_PCI_GUEST_FEATURES 4 /* guest's supported features (32, RW) */ +#define VIRTIO_PCI_QUEUE_PFN 8 /* physical address of VQ (32, RW) */ +#define VIRTIO_PCI_QUEUE_NUM 12 /* number of ring entries (16, RO) */ +#define VIRTIO_PCI_QUEUE_SEL 14 /* current VQ selection (16, RW) */ +#define VIRTIO_PCI_QUEUE_NOTIFY 16 /* notify host regarding VQ (16, RW) */ +#define VIRTIO_PCI_STATUS 18 /* device status register (8, RW) */ +#define VIRTIO_PCI_ISR 19 /* interrupt status register, reading + * also clears the register (8, RO) */ +/* Only if MSIX is enabled: */ +#define VIRTIO_MSI_CONFIG_VECTOR 20 /* configuration change vector (16, RW) */ +#define VIRTIO_MSI_QUEUE_VECTOR 22 /* vector for selected VQ notifications + (16, RW) */ + +/* The bit of the ISR which indicates a device has an interrupt. */ +#define VIRTIO_PCI_ISR_INTR 0x1 +/* The bit of the ISR which indicates a device configuration change. */ +#define VIRTIO_PCI_ISR_CONFIG 0x2 +/* Vector value used to disable MSI for queue. */ +#define VIRTIO_MSI_NO_VECTOR 0xFFFF + +/* + * The remaining space is defined by each driver as the per-driver + * configuration space. + */ +#define VIRTIO_PCI_CONFIG(sc) \ + (((sc)->vtpci_flags & VIRTIO_PCI_FLAG_MSIX) ? 24 : 20) + +/* + * How many bits to shift physical queue address written to QUEUE_PFN. + * 12 is historical, and due to x86 page size. + */ +#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 + +/* The alignment to use between consumer and producer parts of vring. */ +#define VIRTIO_PCI_VRING_ALIGN 4096 + +#endif /* _VIRTIO_PCI_H */ diff --git a/sys/dev/virtio/virtio.c b/sys/dev/virtio/virtio.c new file mode 100644 index 0000000..e385575 --- /dev/null +++ b/sys/dev/virtio/virtio.c @@ -0,0 +1,283 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/module.h> +#include <sys/sbuf.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/_inttypes.h> +#include <sys/bus.h> +#include <sys/rman.h> + +#include <dev/virtio/virtio.h> +#include <dev/virtio/virtqueue.h> + +#include "virtio_bus_if.h" + +static int virtio_modevent(module_t, int, void *); +static const char *virtio_feature_name(uint64_t, struct virtio_feature_desc *); + +static struct virtio_ident { + uint16_t devid; + char *name; +} virtio_ident_table[] = { + { VIRTIO_ID_NETWORK, "Network" }, + { VIRTIO_ID_BLOCK, "Block" }, + { VIRTIO_ID_CONSOLE, "Console" }, + { VIRTIO_ID_ENTROPY, "Entropy" }, + { VIRTIO_ID_BALLOON, "Balloon" }, + { VIRTIO_ID_IOMEMORY, "IOMemory" }, + { VIRTIO_ID_9P, "9P Transport" }, + + { 0, NULL } +}; + +/* Device independent features. */ +static struct virtio_feature_desc virtio_common_feature_desc[] = { + { VIRTIO_F_NOTIFY_ON_EMPTY, "NotifyOnEmpty" }, + { VIRTIO_RING_F_INDIRECT_DESC, "RingIndirect" }, + { VIRTIO_RING_F_EVENT_IDX, "EventIdx" }, + { VIRTIO_F_BAD_FEATURE, "BadFeature" }, + + { 0, NULL } +}; + +const char * +virtio_device_name(uint16_t devid) +{ + struct virtio_ident *ident; + + for (ident = virtio_ident_table; ident->name != NULL; ident++) { + if (ident->devid == devid) + return (ident->name); + } + + return (NULL); +} + +int +virtio_get_device_type(device_t dev) +{ + uintptr_t devtype; + + devtype = -1; + + BUS_READ_IVAR(device_get_parent(dev), dev, + VIRTIO_IVAR_DEVTYPE, &devtype); + + return ((int) devtype); +} + +void +virtio_set_feature_desc(device_t dev, + struct virtio_feature_desc *feature_desc) +{ + + BUS_WRITE_IVAR(device_get_parent(dev), dev, + VIRTIO_IVAR_FEATURE_DESC, (uintptr_t) feature_desc); +} + +void +virtio_describe(device_t dev, const char *msg, + uint64_t features, struct virtio_feature_desc *feature_desc) +{ + struct sbuf sb; + uint64_t val; + char *buf; + const char *name; + int n; + + if ((buf = malloc(512, M_TEMP, M_NOWAIT)) == NULL) { + device_printf(dev, "%s features: 0x%"PRIx64"\n", msg, + features); + return; + } + + sbuf_new(&sb, buf, 512, SBUF_FIXEDLEN); + sbuf_printf(&sb, "%s features: 0x%"PRIx64, msg, features); + + for (n = 0, val = 1ULL << 63; val != 0; val >>= 1) { + /* + * BAD_FEATURE is used to detect broken Linux clients + * and therefore is not applicable to FreeBSD. 
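+ * It is skipped here so it is never printed in the feature list.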
+ */ + if (((features & val) == 0) || val == VIRTIO_F_BAD_FEATURE) + continue; + + if (n++ == 0) + sbuf_cat(&sb, " <"); + else + sbuf_cat(&sb, ","); + + name = NULL; + if (feature_desc != NULL) + name = virtio_feature_name(val, feature_desc); + if (name == NULL) + name = virtio_feature_name(val, + virtio_common_feature_desc); + + if (name == NULL) + sbuf_printf(&sb, "0x%"PRIx64, val); + else + sbuf_cat(&sb, name); + } + + if (n > 0) + sbuf_cat(&sb, ">"); + +#if __FreeBSD_version < 900020 + sbuf_finish(&sb); + if (sbuf_overflowed(&sb) == 0) +#else + if (sbuf_finish(&sb) == 0) +#endif + device_printf(dev, "%s\n", sbuf_data(&sb)); + + sbuf_delete(&sb); + free(buf, M_TEMP); +} + +static const char * +virtio_feature_name(uint64_t val, struct virtio_feature_desc *feature_desc) +{ + int i; + + for (i = 0; feature_desc[i].vfd_val != 0; i++) + if (val == feature_desc[i].vfd_val) + return (feature_desc[i].vfd_str); + + return (NULL); +} + +/* + * VirtIO bus method wrappers. + */ + +uint64_t +virtio_negotiate_features(device_t dev, uint64_t child_features) +{ + + return (VIRTIO_BUS_NEGOTIATE_FEATURES(device_get_parent(dev), + child_features)); +} + +int +virtio_alloc_virtqueues(device_t dev, int flags, int nvqs, + struct vq_alloc_info *info) +{ + + return (VIRTIO_BUS_ALLOC_VIRTQUEUES(device_get_parent(dev), flags, + nvqs, info)); +} + +int +virtio_setup_intr(device_t dev, enum intr_type type) +{ + + return (VIRTIO_BUS_SETUP_INTR(device_get_parent(dev), type)); +} + +int +virtio_with_feature(device_t dev, uint64_t feature) +{ + + return (VIRTIO_BUS_WITH_FEATURE(device_get_parent(dev), feature)); +} + +void +virtio_stop(device_t dev) +{ + + VIRTIO_BUS_STOP(device_get_parent(dev)); +} + +int +virtio_reinit(device_t dev, uint64_t features) +{ + + return (VIRTIO_BUS_REINIT(device_get_parent(dev), features)); +} + +void +virtio_reinit_complete(device_t dev) +{ + + VIRTIO_BUS_REINIT_COMPLETE(device_get_parent(dev)); +} + +void +virtio_read_device_config(device_t dev, bus_size_t offset, void *dst, int len) +{ + + VIRTIO_BUS_READ_DEVICE_CONFIG(device_get_parent(dev), + offset, dst, len); +} + +void +virtio_write_device_config(device_t dev, bus_size_t offset, void *dst, int len) +{ + + VIRTIO_BUS_WRITE_DEVICE_CONFIG(device_get_parent(dev), + offset, dst, len); +} + +static int +virtio_modevent(module_t mod, int type, void *unused) +{ + int error; + + error = 0; + + switch (type) { + case MOD_LOAD: + case MOD_QUIESCE: + case MOD_UNLOAD: + case MOD_SHUTDOWN: + break; + default: + error = EOPNOTSUPP; + break; + } + + return (error); +} + +static moduledata_t virtio_mod = { + "virtio", + virtio_modevent, + 0 +}; + +DECLARE_MODULE(virtio, virtio_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); +MODULE_VERSION(virtio, 1); diff --git a/sys/dev/virtio/virtio.h b/sys/dev/virtio/virtio.h new file mode 100644 index 0000000..ebd3c74 --- /dev/null +++ b/sys/dev/virtio/virtio.h @@ -0,0 +1,130 @@ +/* + * This header is BSD licensed so anyone can use the definitions to implement + * compatible drivers/servers. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_H_ +#define _VIRTIO_H_ + +#include <sys/types.h> + +struct vq_alloc_info; + +/* VirtIO device IDs. */ +#define VIRTIO_ID_NETWORK 0x01 +#define VIRTIO_ID_BLOCK 0x02 +#define VIRTIO_ID_CONSOLE 0x03 +#define VIRTIO_ID_ENTROPY 0x04 +#define VIRTIO_ID_BALLOON 0x05 +#define VIRTIO_ID_IOMEMORY 0x06 +#define VIRTIO_ID_9P 0x09 + +/* Status byte for guest to report progress. 
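ACK and DRIVER are set during probe and attach, DRIVER_OK once the driver is ready, and FAILED if the guest gives up; writing RESET (zero) returns the device to its uninitialized state.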
*/ +#define VIRTIO_CONFIG_STATUS_RESET 0x00 +#define VIRTIO_CONFIG_STATUS_ACK 0x01 +#define VIRTIO_CONFIG_STATUS_DRIVER 0x02 +#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04 +#define VIRTIO_CONFIG_STATUS_FAILED 0x80 + +/* + * Generate interrupt when the virtqueue ring is + * completely used, even if we've suppressed them. + */ +#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24) + +/* + * The guest should never negotiate this feature; it + * is used to detect faulty drivers. + */ +#define VIRTIO_F_BAD_FEATURE (1 << 30) + +/* + * Some VirtIO feature bits (currently bits 28 through 31) are + * reserved for the transport being used (eg. virtio_ring), the + * rest are per-device feature bits. + */ +#define VIRTIO_TRANSPORT_F_START 28 +#define VIRTIO_TRANSPORT_F_END 32 + +/* + * Maximum number of virtqueues per device. + */ +#define VIRTIO_MAX_VIRTQUEUES 8 + +/* + * Each virtqueue indirect descriptor list must be physically contiguous. + * To allow us to malloc(9) each list individually, limit the number + * supported to what will fit in one page. With 4KB pages, this is a limit + * of 256 descriptors. If there is ever a need for more, we can switch to + * contigmalloc(9) for the larger allocations, similar to what + * bus_dmamem_alloc(9) does. + * + * Note the sizeof(struct vring_desc) is 16 bytes. + */ +#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16)) + +/* + * VirtIO instance variables indices. + */ +#define VIRTIO_IVAR_DEVTYPE 1 +#define VIRTIO_IVAR_FEATURE_DESC 2 + +struct virtio_feature_desc { + uint64_t vfd_val; + char *vfd_str; +}; + +const char *virtio_device_name(uint16_t devid); +int virtio_get_device_type(device_t dev); +void virtio_set_feature_desc(device_t dev, + struct virtio_feature_desc *feature_desc); +void virtio_describe(device_t dev, const char *msg, + uint64_t features, struct virtio_feature_desc *feature_desc); + +/* + * VirtIO Bus Methods. + */ +uint64_t virtio_negotiate_features(device_t dev, uint64_t child_features); +int virtio_alloc_virtqueues(device_t dev, int flags, int nvqs, + struct vq_alloc_info *info); +int virtio_setup_intr(device_t dev, enum intr_type type); +int virtio_with_feature(device_t dev, uint64_t feature); +void virtio_stop(device_t dev); +int virtio_reinit(device_t dev, uint64_t features); +void virtio_reinit_complete(device_t dev); + +/* + * Read/write a variable amount from the device specific (ie, network) + * configuration region. This region is encoded in the same endian as + * the guest. + */ +void virtio_read_device_config(device_t dev, bus_size_t offset, + void *dst, int length); +void virtio_write_device_config(device_t dev, bus_size_t offset, + void *src, int length); + +/* Inlined device specific read/write functions for common lengths. 
*/ +#define VIRTIO_RDWR_DEVICE_CONFIG(size, type) \ +static inline type \ +__CONCAT(virtio_read_dev_config_,size)(device_t dev, \ + bus_size_t offset) \ +{ \ + type val; \ + virtio_read_device_config(dev, offset, &val, sizeof(type)); \ + return (val); \ +} \ + \ +static inline void \ +__CONCAT(virtio_write_dev_config_,size)(device_t dev, \ + bus_size_t offset, type val) \ +{ \ + virtio_write_device_config(dev, offset, &val, sizeof(type)); \ +} + +VIRTIO_RDWR_DEVICE_CONFIG(1, uint8_t); +VIRTIO_RDWR_DEVICE_CONFIG(2, uint16_t); +VIRTIO_RDWR_DEVICE_CONFIG(4, uint32_t); + +#endif /* _VIRTIO_H_ */ diff --git a/sys/dev/virtio/virtio_bus_if.m b/sys/dev/virtio/virtio_bus_if.m new file mode 100644 index 0000000..ec2029d --- /dev/null +++ b/sys/dev/virtio/virtio_bus_if.m @@ -0,0 +1,92 @@ +#- +# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ + +#include <sys/bus.h> +#include <machine/bus.h> + +INTERFACE virtio_bus; + +HEADER { +struct vq_alloc_info; +}; + +METHOD uint64_t negotiate_features { + device_t dev; + uint64_t child_features; +}; + +METHOD int with_feature { + device_t dev; + uint64_t feature; +}; + +METHOD int alloc_virtqueues { + device_t dev; + int flags; + int nvqs; + struct vq_alloc_info *info; +}; +HEADER { +#define VIRTIO_ALLOC_VQS_DISABLE_MSIX 0x1 +}; + +METHOD int setup_intr { + device_t dev; + enum intr_type type; +}; + +METHOD void stop { + device_t dev; +}; + +METHOD int reinit { + device_t dev; + uint64_t features; +}; + +METHOD void reinit_complete { + device_t dev; +}; + +METHOD void notify_vq { + device_t dev; + uint16_t queue; +}; + +METHOD void read_device_config { + device_t dev; + bus_size_t offset; + void *dst; + int len; +}; + +METHOD void write_device_config { + device_t dev; + bus_size_t offset; + void *src; + int len; +}; diff --git a/sys/dev/virtio/virtio_if.m b/sys/dev/virtio/virtio_if.m new file mode 100644 index 0000000..701678c --- /dev/null +++ b/sys/dev/virtio/virtio_if.m @@ -0,0 +1,43 @@ +#- +# Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $FreeBSD$ + +#include <sys/bus.h> + +INTERFACE virtio; + +CODE { + static int + virtio_default_config_change(device_t dev) + { + /* Return that we've handled the change. */ + return (1); + } +}; + +METHOD int config_change { + device_t dev; +} DEFAULT virtio_default_config_change; diff --git a/sys/dev/virtio/virtio_ring.h b/sys/dev/virtio/virtio_ring.h new file mode 100644 index 0000000..124cb4d --- /dev/null +++ b/sys/dev/virtio/virtio_ring.h @@ -0,0 +1,119 @@ +/* + * This header is BSD licensed so anyone can use the definitions + * to implement compatible drivers/servers. + * + * Copyright Rusty Russell IBM Corporation 2007. + */ +/* $FreeBSD$ */ + +#ifndef VIRTIO_RING_H +#define VIRTIO_RING_H + +#include <sys/types.h> + +/* This marks a buffer as continuing via the next field. */ +#define VRING_DESC_F_NEXT 1 +/* This marks a buffer as write-only (otherwise read-only). */ +#define VRING_DESC_F_WRITE 2 +/* This means the buffer contains a list of buffer descriptors. */ +#define VRING_DESC_F_INDIRECT 4 + +/* The Host uses this in used->flags to advise the Guest: don't kick me + * when you add a buffer. It's unreliable, so it's simply an + * optimization. Guest will still kick if it's out of buffers. */ +#define VRING_USED_F_NO_NOTIFY 1 +/* The Guest uses this in avail->flags to advise the Host: don't + * interrupt me when you consume a buffer. It's unreliable, so it's + * simply an optimization. */ +#define VRING_AVAIL_F_NO_INTERRUPT 1 + +/* VirtIO ring descriptors: 16 bytes. + * These can chain together via "next". */ +struct vring_desc { + /* Address (guest-physical). */ + uint64_t addr; + /* Length. */ + uint32_t len; + /* The flags as indicated above. */ + uint16_t flags; + /* We chain unused descriptors via this, too. */ + uint16_t next; +}; + +struct vring_avail { + uint16_t flags; + uint16_t idx; + uint16_t ring[0]; +}; + +/* uint32_t is used here for ids for padding reasons. */ +struct vring_used_elem { + /* Index of start of used descriptor chain. */ + uint32_t id; + /* Total length of the descriptor chain which was written to. */ + uint32_t len; +}; + +struct vring_used { + uint16_t flags; + uint16_t idx; + struct vring_used_elem ring[0]; +}; + +struct vring { + unsigned int num; + + struct vring_desc *desc; + struct vring_avail *avail; + struct vring_used *used; +}; + +/* The standard layout for the ring is a continuous chunk of memory which + * looks like this. 
We assume num is a power of 2. + * + * struct vring { + * // The actual descriptors (16 bytes each) + * struct vring_desc desc[num]; + * + * // A ring of available descriptor heads with free-running index. + * __u16 avail_flags; + * __u16 avail_idx; + * __u16 available[num]; + * + * // Padding to the next align boundary. + * char pad[]; + * + * // A ring of used descriptor heads with free-running index. + * __u16 used_flags; + * __u16 used_idx; + * struct vring_used_elem used[num]; + * }; + * + * NOTE: for VirtIO PCI, align is 4096. + */ + +static inline int +vring_size(unsigned int num, unsigned long align) +{ + int size; + + size = num * sizeof(struct vring_desc); + size += sizeof(struct vring_avail) + (num * sizeof(uint16_t)); + size = (size + align - 1) & ~(align - 1); + size += sizeof(struct vring_used) + + (num * sizeof(struct vring_used_elem)); + return (size); +} + +static inline void +vring_init(struct vring *vr, unsigned int num, uint8_t *p, + unsigned long align) +{ + vr->num = num; + vr->desc = (struct vring_desc *) p; + vr->avail = (struct vring_avail *) (p + + num * sizeof(struct vring_desc)); + vr->used = (void *) + (((unsigned long) &vr->avail->ring[num] + align-1) & ~(align-1)); +} +#endif /* VIRTIO_RING_H */ diff --git a/sys/dev/virtio/virtqueue.c b/sys/dev/virtio/virtqueue.c new file mode 100644 index 0000000..1fb182e --- /dev/null +++ b/sys/dev/virtio/virtqueue.c @@ -0,0 +1,755 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Implements the virtqueue interface as basically described + * in the original VirtIO paper. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/sglist.h> +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/cpu.h> +#include <machine/bus.h> +#include <machine/atomic.h> +#include <machine/resource.h> +#include <sys/bus.h> +#include <sys/rman.h> + +#include <dev/virtio/virtio.h> +#include <dev/virtio/virtqueue.h> +#include <dev/virtio/virtio_ring.h> + +#include "virtio_bus_if.h" + +struct virtqueue { + device_t vq_dev; + char vq_name[VIRTQUEUE_MAX_NAME_SZ]; + uint16_t vq_queue_index; + uint16_t vq_nentries; + uint32_t vq_flags; +#define VIRTQUEUE_FLAG_INDIRECT 0x0001 + + int vq_alignment; + int vq_ring_size; + void *vq_ring_mem; + int vq_max_indirect_size; + int vq_indirect_mem_size; + virtqueue_intr_t *vq_intrhand; + void *vq_intrhand_arg; + + struct vring vq_ring; + uint16_t vq_free_cnt; + uint16_t vq_queued_cnt; + /* + * Head of the free chain in the descriptor table. If + * there are no free descriptors, this will be set to + * VQ_RING_DESC_CHAIN_END. + */ + uint16_t vq_desc_head_idx; + /* + * Last consumed descriptor in the used table, + * trails vq_ring.used->idx. + */ + uint16_t vq_used_cons_idx; + + struct vq_desc_extra { + void *cookie; + struct vring_desc *indirect; + vm_paddr_t indirect_paddr; + uint16_t ndescs; + } vq_descx[0]; +}; + +/* + * The maximum virtqueue size is 2^15. Use that value as the end of + * descriptor chain terminator since it will never be a valid index + * in the descriptor table. This is used to verify we are correctly + * handling vq_free_cnt. + */ +#define VQ_RING_DESC_CHAIN_END 32768 + +#define VQASSERT(_vq, _exp, _msg, ...) \ + KASSERT((_exp),("%s: %s - "_msg, __func__, (_vq)->vq_name, \ + ##__VA_ARGS__)) + +#define VQ_RING_ASSERT_VALID_IDX(_vq, _idx) \ + VQASSERT((_vq), (_idx) < (_vq)->vq_nentries, \ + "invalid ring index: %d, max: %d", (_idx), \ + (_vq)->vq_nentries) + +#define VQ_RING_ASSERT_CHAIN_TERM(_vq) \ + VQASSERT((_vq), (_vq)->vq_desc_head_idx == \ + VQ_RING_DESC_CHAIN_END, "full ring terminated " \ + "incorrectly: head idx: %d", (_vq)->vq_desc_head_idx) + +static int virtqueue_init_indirect(struct virtqueue *vq, int); +static void virtqueue_free_indirect(struct virtqueue *vq); +static void virtqueue_init_indirect_list(struct virtqueue *, + struct vring_desc *); + +static void vq_ring_init(struct virtqueue *); +static void vq_ring_update_avail(struct virtqueue *, uint16_t); +static uint16_t vq_ring_enqueue_segments(struct virtqueue *, + struct vring_desc *, uint16_t, struct sglist *, int, int); +static int vq_ring_use_indirect(struct virtqueue *, int); +static void vq_ring_enqueue_indirect(struct virtqueue *, void *, + struct sglist *, int, int); +static void vq_ring_notify_host(struct virtqueue *, int); +static void vq_ring_free_chain(struct virtqueue *, uint16_t); + +uint64_t +virtqueue_filter_features(uint64_t features) +{ + uint64_t mask; + + mask = (1 << VIRTIO_TRANSPORT_F_START) - 1; + mask |= VIRTIO_RING_F_INDIRECT_DESC; + + return (features & mask); +} + +int +virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, int align, + vm_paddr_t highaddr, struct vq_alloc_info *info, struct virtqueue **vqp) +{ + struct virtqueue *vq; + int error; + + *vqp = NULL; + error = 0; + + if (size == 0) { + device_printf(dev, + "virtqueue %d (%s) does not exist (size is zero)\n", + queue, info->vqai_name); + return (ENODEV); + } else if (!powerof2(size)) { + device_printf(dev, + "virtqueue %d (%s) size is not a 
power of 2: %d\n", + queue, info->vqai_name, size); + return (ENXIO); + } else if (info->vqai_maxindirsz > VIRTIO_MAX_INDIRECT) { + device_printf(dev, "virtqueue %d (%s) requested too many " + "indirect descriptors: %d, max %d\n", + queue, info->vqai_name, info->vqai_maxindirsz, + VIRTIO_MAX_INDIRECT); + return (EINVAL); + } + + vq = malloc(sizeof(struct virtqueue) + + size * sizeof(struct vq_desc_extra), M_DEVBUF, M_NOWAIT | M_ZERO); + if (vq == NULL) { + device_printf(dev, "cannot allocate virtqueue\n"); + return (ENOMEM); + } + + vq->vq_dev = dev; + strlcpy(vq->vq_name, info->vqai_name, sizeof(vq->vq_name)); + vq->vq_queue_index = queue; + vq->vq_alignment = align; + vq->vq_nentries = size; + vq->vq_free_cnt = size; + vq->vq_intrhand = info->vqai_intr; + vq->vq_intrhand_arg = info->vqai_intr_arg; + + if (info->vqai_maxindirsz > 1) { + error = virtqueue_init_indirect(vq, info->vqai_maxindirsz); + if (error) + goto fail; + } + + vq->vq_ring_size = round_page(vring_size(size, align)); + vq->vq_ring_mem = contigmalloc(vq->vq_ring_size, M_DEVBUF, + M_NOWAIT | M_ZERO, 0, highaddr, PAGE_SIZE, 0); + if (vq->vq_ring_mem == NULL) { + device_printf(dev, + "cannot allocate memory for virtqueue ring\n"); + error = ENOMEM; + goto fail; + } + + vq_ring_init(vq); + virtqueue_disable_intr(vq); + + *vqp = vq; + +fail: + if (error) + virtqueue_free(vq); + + return (error); +} + +static int +virtqueue_init_indirect(struct virtqueue *vq, int indirect_size) +{ + device_t dev; + struct vq_desc_extra *dxp; + int i, size; + + dev = vq->vq_dev; + + if (VIRTIO_BUS_WITH_FEATURE(dev, VIRTIO_RING_F_INDIRECT_DESC) == 0) { + /* + * Indirect descriptors requested by the driver but not + * negotiated. Return zero to keep the initialization + * going: we'll run fine without. + */ + if (bootverbose) + device_printf(dev, "virtqueue %d (%s) requested " + "indirect descriptors but not negotiated\n", + vq->vq_queue_index, vq->vq_name); + return (0); + } + + size = indirect_size * sizeof(struct vring_desc); + vq->vq_max_indirect_size = indirect_size; + vq->vq_indirect_mem_size = size; + vq->vq_flags |= VIRTQUEUE_FLAG_INDIRECT; + + for (i = 0; i < vq->vq_nentries; i++) { + dxp = &vq->vq_descx[i]; + + dxp->indirect = malloc(size, M_DEVBUF, M_NOWAIT); + if (dxp->indirect == NULL) { + device_printf(dev, "cannot allocate indirect list\n"); + return (ENOMEM); + } + + dxp->indirect_paddr = vtophys(dxp->indirect); + virtqueue_init_indirect_list(vq, dxp->indirect); + } + + return (0); +} + +static void +virtqueue_free_indirect(struct virtqueue *vq) +{ + struct vq_desc_extra *dxp; + int i; + + for (i = 0; i < vq->vq_nentries; i++) { + dxp = &vq->vq_descx[i]; + + if (dxp->indirect == NULL) + break; + + free(dxp->indirect, M_DEVBUF); + dxp->indirect = NULL; + dxp->indirect_paddr = 0; + } + + vq->vq_flags &= ~VIRTQUEUE_FLAG_INDIRECT; + vq->vq_indirect_mem_size = 0; +} + +static void +virtqueue_init_indirect_list(struct virtqueue *vq, + struct vring_desc *indirect) +{ + int i; + + bzero(indirect, vq->vq_indirect_mem_size); + + for (i = 0; i < vq->vq_max_indirect_size - 1; i++) + indirect[i].next = i + 1; + indirect[i].next = VQ_RING_DESC_CHAIN_END; +} + +int +virtqueue_reinit(struct virtqueue *vq, uint16_t size) +{ + struct vq_desc_extra *dxp; + int i; + + if (vq->vq_nentries != size) { + device_printf(vq->vq_dev, + "%s: '%s' changed size; old=%hu, new=%hu\n", + __func__, vq->vq_name, vq->vq_nentries, size); + return (EINVAL); + } + + /* Warn if the virtqueue was not properly cleaned up. 
*/ + if (vq->vq_free_cnt != vq->vq_nentries) { + device_printf(vq->vq_dev, + "%s: warning, '%s' virtqueue not empty, " + "leaking %d entries\n", __func__, vq->vq_name, + vq->vq_nentries - vq->vq_free_cnt); + } + + vq->vq_desc_head_idx = 0; + vq->vq_used_cons_idx = 0; + vq->vq_queued_cnt = 0; + vq->vq_free_cnt = vq->vq_nentries; + + /* To be safe, reset all our allocated memory. */ + bzero(vq->vq_ring_mem, vq->vq_ring_size); + for (i = 0; i < vq->vq_nentries; i++) { + dxp = &vq->vq_descx[i]; + dxp->cookie = NULL; + dxp->ndescs = 0; + if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT) + virtqueue_init_indirect_list(vq, dxp->indirect); + } + + vq_ring_init(vq); + virtqueue_disable_intr(vq); + + return (0); +} + +void +virtqueue_free(struct virtqueue *vq) +{ + + if (vq->vq_free_cnt != vq->vq_nentries) { + device_printf(vq->vq_dev, "%s: freeing non-empty virtqueue, " + "leaking %d entries\n", vq->vq_name, + vq->vq_nentries - vq->vq_free_cnt); + } + + if (vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT) + virtqueue_free_indirect(vq); + + if (vq->vq_ring_mem != NULL) { + contigfree(vq->vq_ring_mem, vq->vq_ring_size, M_DEVBUF); + vq->vq_ring_size = 0; + vq->vq_ring_mem = NULL; + } + + free(vq, M_DEVBUF); +} + +vm_paddr_t +virtqueue_paddr(struct virtqueue *vq) +{ + + return (vtophys(vq->vq_ring_mem)); +} + +int +virtqueue_size(struct virtqueue *vq) +{ + + return (vq->vq_nentries); +} + +int +virtqueue_empty(struct virtqueue *vq) +{ + + return (vq->vq_nentries == vq->vq_free_cnt); +} + +int +virtqueue_full(struct virtqueue *vq) +{ + + return (vq->vq_free_cnt == 0); +} + +void +virtqueue_notify(struct virtqueue *vq) +{ + + vq->vq_queued_cnt = 0; + vq_ring_notify_host(vq, 0); +} + +int +virtqueue_nused(struct virtqueue *vq) +{ + uint16_t used_idx, nused; + + used_idx = vq->vq_ring.used->idx; + if (used_idx >= vq->vq_used_cons_idx) + nused = used_idx - vq->vq_used_cons_idx; + else + nused = UINT16_MAX - vq->vq_used_cons_idx + + used_idx + 1; + VQASSERT(vq, nused <= vq->vq_nentries, "used more than available"); + + return (nused); +} + +int +virtqueue_intr(struct virtqueue *vq) +{ + + if (vq->vq_intrhand == NULL || + vq->vq_used_cons_idx == vq->vq_ring.used->idx) + return (0); + + vq->vq_intrhand(vq->vq_intrhand_arg); + + return (1); +} + +int +virtqueue_enable_intr(struct virtqueue *vq) +{ + + /* + * Enable interrupts, making sure we get the latest + * index of what's already been consumed. + */ + vq->vq_ring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; + + mb(); + + /* + * Additional items may have been consumed in the time between + * since we last checked and enabled interrupts above. Let our + * caller know so it processes the new entries. + */ + if (vq->vq_used_cons_idx != vq->vq_ring.used->idx) + return (1); + + return (0); +} + +void +virtqueue_disable_intr(struct virtqueue *vq) +{ + + /* + * Note this is only considered a hint to the host. 
+ */ + vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; +} + +int +virtqueue_enqueue(struct virtqueue *vq, void *cookie, struct sglist *sg, + int readable, int writable) +{ + struct vq_desc_extra *dxp; + int needed; + uint16_t head_idx, idx; + + needed = readable + writable; + + VQASSERT(vq, cookie != NULL, "enqueuing with no cookie"); + VQASSERT(vq, needed == sg->sg_nseg, + "segment count mismatch, %d, %d", needed, sg->sg_nseg); + VQASSERT(vq, + needed <= vq->vq_nentries || needed <= vq->vq_max_indirect_size, + "too many segments to enqueue: %d, %d/%d", needed, + vq->vq_nentries, vq->vq_max_indirect_size); + + if (needed < 1) + return (EINVAL); + if (vq->vq_free_cnt == 0) + return (ENOSPC); + + if (vq_ring_use_indirect(vq, needed)) { + vq_ring_enqueue_indirect(vq, cookie, sg, readable, writable); + return (0); + } else if (vq->vq_free_cnt < needed) + return (EMSGSIZE); + + head_idx = vq->vq_desc_head_idx; + VQ_RING_ASSERT_VALID_IDX(vq, head_idx); + dxp = &vq->vq_descx[head_idx]; + + VQASSERT(vq, dxp->cookie == NULL, + "cookie already exists for index %d", head_idx); + dxp->cookie = cookie; + dxp->ndescs = needed; + + idx = vq_ring_enqueue_segments(vq, vq->vq_ring.desc, head_idx, + sg, readable, writable); + + vq->vq_desc_head_idx = idx; + vq->vq_free_cnt -= needed; + if (vq->vq_free_cnt == 0) + VQ_RING_ASSERT_CHAIN_TERM(vq); + else + VQ_RING_ASSERT_VALID_IDX(vq, idx); + + vq_ring_update_avail(vq, head_idx); + + return (0); +} + +void * +virtqueue_dequeue(struct virtqueue *vq, uint32_t *len) +{ + struct vring_used_elem *uep; + void *cookie; + uint16_t used_idx, desc_idx; + + if (vq->vq_used_cons_idx == vq->vq_ring.used->idx) + return (NULL); + + used_idx = vq->vq_used_cons_idx++ & (vq->vq_nentries - 1); + uep = &vq->vq_ring.used->ring[used_idx]; + + mb(); + desc_idx = (uint16_t) uep->id; + if (len != NULL) + *len = uep->len; + + vq_ring_free_chain(vq, desc_idx); + + cookie = vq->vq_descx[desc_idx].cookie; + VQASSERT(vq, cookie != NULL, "no cookie for index %d", desc_idx); + vq->vq_descx[desc_idx].cookie = NULL; + + return (cookie); +} + +void * +virtqueue_poll(struct virtqueue *vq, uint32_t *len) +{ + void *cookie; + + while ((cookie = virtqueue_dequeue(vq, len)) == NULL) + cpu_spinwait(); + + return (cookie); +} + +void * +virtqueue_drain(struct virtqueue *vq, int *last) +{ + void *cookie; + int idx; + + cookie = NULL; + idx = *last; + + while (idx < vq->vq_nentries && cookie == NULL) { + if ((cookie = vq->vq_descx[idx].cookie) != NULL) { + vq->vq_descx[idx].cookie = NULL; + /* Free chain to keep free count consistent. 
*/ + vq_ring_free_chain(vq, idx); + } + idx++; + } + + *last = idx; + + return (cookie); +} + +void +virtqueue_dump(struct virtqueue *vq) +{ + + if (vq == NULL) + return; + + printf("VQ: %s - size=%d; free=%d; used=%d; queued=%d; " + "desc_head_idx=%d; avail.idx=%d; used_cons_idx=%d; " + "used.idx=%d; avail.flags=0x%x; used.flags=0x%x\n", + vq->vq_name, vq->vq_nentries, vq->vq_free_cnt, + virtqueue_nused(vq), vq->vq_queued_cnt, vq->vq_desc_head_idx, + vq->vq_ring.avail->idx, vq->vq_used_cons_idx, + vq->vq_ring.used->idx, vq->vq_ring.avail->flags, + vq->vq_ring.used->flags); +} + +static void +vq_ring_init(struct virtqueue *vq) +{ + struct vring *vr; + char *ring_mem; + int i, size; + + ring_mem = vq->vq_ring_mem; + size = vq->vq_nentries; + vr = &vq->vq_ring; + + vring_init(vr, size, ring_mem, vq->vq_alignment); + + for (i = 0; i < size - 1; i++) + vr->desc[i].next = i + 1; + vr->desc[i].next = VQ_RING_DESC_CHAIN_END; +} + +static void +vq_ring_update_avail(struct virtqueue *vq, uint16_t desc_idx) +{ + uint16_t avail_idx; + + /* + * Place the head of the descriptor chain into the next slot and make + * it usable to the host. The chain is made available now rather than + * deferring to virtqueue_notify() in the hopes that if the host is + * currently running on another CPU, we can keep it processing the new + * descriptor. + */ + avail_idx = vq->vq_ring.avail->idx & (vq->vq_nentries - 1); + vq->vq_ring.avail->ring[avail_idx] = desc_idx; + + mb(); + vq->vq_ring.avail->idx++; + + /* Keep pending count until virtqueue_notify() for debugging. */ + vq->vq_queued_cnt++; +} + +static uint16_t +vq_ring_enqueue_segments(struct virtqueue *vq, struct vring_desc *desc, + uint16_t head_idx, struct sglist *sg, int readable, int writable) +{ + struct sglist_seg *seg; + struct vring_desc *dp; + int i, needed; + uint16_t idx; + + needed = readable + writable; + + for (i = 0, idx = head_idx, seg = sg->sg_segs; + i < needed; + i++, idx = dp->next, seg++) { + VQASSERT(vq, idx != VQ_RING_DESC_CHAIN_END, + "premature end of free desc chain"); + + dp = &desc[idx]; + dp->addr = seg->ss_paddr; + dp->len = seg->ss_len; + dp->flags = 0; + + if (i < needed - 1) + dp->flags |= VRING_DESC_F_NEXT; + if (i >= readable) + dp->flags |= VRING_DESC_F_WRITE; + } + + return (idx); +} + +static int +vq_ring_use_indirect(struct virtqueue *vq, int needed) +{ + + if ((vq->vq_flags & VIRTQUEUE_FLAG_INDIRECT) == 0) + return (0); + + if (vq->vq_max_indirect_size < needed) + return (0); + + if (needed < 2) + return (0); + + return (1); +} + +static void +vq_ring_enqueue_indirect(struct virtqueue *vq, void *cookie, + struct sglist *sg, int readable, int writable) +{ + struct vring_desc *dp; + struct vq_desc_extra *dxp; + int needed; + uint16_t head_idx; + + needed = readable + writable; + VQASSERT(vq, needed <= vq->vq_max_indirect_size, + "enqueuing too many indirect descriptors"); + + head_idx = vq->vq_desc_head_idx; + VQ_RING_ASSERT_VALID_IDX(vq, head_idx); + dp = &vq->vq_ring.desc[head_idx]; + dxp = &vq->vq_descx[head_idx]; + + VQASSERT(vq, dxp->cookie == NULL, + "cookie already exists for index %d", head_idx); + dxp->cookie = cookie; + dxp->ndescs = 1; + + dp->addr = dxp->indirect_paddr; + dp->len = needed * sizeof(struct vring_desc); + dp->flags = VRING_DESC_F_INDIRECT; + + vq_ring_enqueue_segments(vq, dxp->indirect, 0, + sg, readable, writable); + + vq->vq_desc_head_idx = dp->next; + vq->vq_free_cnt--; + if (vq->vq_free_cnt == 0) + VQ_RING_ASSERT_CHAIN_TERM(vq); + else + VQ_RING_ASSERT_VALID_IDX(vq, vq->vq_desc_head_idx); + + 
vq_ring_update_avail(vq, head_idx); +} + +static void +vq_ring_notify_host(struct virtqueue *vq, int force) +{ + + mb(); + + if (force || + (vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY) == 0) + VIRTIO_BUS_NOTIFY_VQ(vq->vq_dev, vq->vq_queue_index); +} + +static void +vq_ring_free_chain(struct virtqueue *vq, uint16_t desc_idx) +{ + struct vring_desc *dp; + struct vq_desc_extra *dxp; + + VQ_RING_ASSERT_VALID_IDX(vq, desc_idx); + dp = &vq->vq_ring.desc[desc_idx]; + dxp = &vq->vq_descx[desc_idx]; + + if (vq->vq_free_cnt == 0) + VQ_RING_ASSERT_CHAIN_TERM(vq); + + vq->vq_free_cnt += dxp->ndescs; + dxp->ndescs--; + + if ((dp->flags & VRING_DESC_F_INDIRECT) == 0) { + while (dp->flags & VRING_DESC_F_NEXT) { + VQ_RING_ASSERT_VALID_IDX(vq, dp->next); + dp = &vq->vq_ring.desc[dp->next]; + dxp->ndescs--; + } + } + VQASSERT(vq, dxp->ndescs == 0, "failed to free entire desc chain"); + + /* + * We must append the existing free chain, if any, to the end of + * newly freed chain. If the virtqueue was completely used, then + * head would be VQ_RING_DESC_CHAIN_END (ASSERTed above). + */ + dp->next = vq->vq_desc_head_idx; + vq->vq_desc_head_idx = desc_idx; +} diff --git a/sys/dev/virtio/virtqueue.h b/sys/dev/virtio/virtqueue.h new file mode 100644 index 0000000..e790e65 --- /dev/null +++ b/sys/dev/virtio/virtqueue.h @@ -0,0 +1,98 @@ +/*- + * Copyright (c) 2011, Bryan Venteicher <bryanv@daemoninthecloset.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VIRTIO_VIRTQUEUE_H +#define _VIRTIO_VIRTQUEUE_H + +#include <sys/types.h> + +struct virtqueue; +struct sglist; + +/* Support for indirect buffer descriptors. */ +#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28) + +/* The guest publishes the used index for which it expects an interrupt + * at the end of the avail ring. Host should ignore the avail->flags field. + * The host publishes the avail index for which it expects a kick + * at the end of the used ring. Guest should ignore the used->flags field. + */ +#define VIRTIO_RING_F_EVENT_IDX (1 << 29) + +/* Device callback for a virtqueue interrupt. */ +typedef int virtqueue_intr_t(void *); + +#define VIRTQUEUE_MAX_NAME_SZ 32 + +/* One for each virtqueue the device wishes to allocate. 
*/ +struct vq_alloc_info { + char vqai_name[VIRTQUEUE_MAX_NAME_SZ]; + int vqai_maxindirsz; + virtqueue_intr_t *vqai_intr; + void *vqai_intr_arg; + struct virtqueue **vqai_vq; +}; + +#define VQ_ALLOC_INFO_INIT(_i,_nsegs,_intr,_arg,_vqp,_str,...) do { \ + snprintf((_i)->vqai_name, VIRTQUEUE_MAX_NAME_SZ, _str, \ + ##__VA_ARGS__); \ + (_i)->vqai_maxindirsz = (_nsegs); \ + (_i)->vqai_intr = (_intr); \ + (_i)->vqai_intr_arg = (_arg); \ + (_i)->vqai_vq = (_vqp); \ +} while (0) + +uint64_t virtqueue_filter_features(uint64_t features); + +int virtqueue_alloc(device_t dev, uint16_t queue, uint16_t size, + int align, vm_paddr_t highaddr, struct vq_alloc_info *info, + struct virtqueue **vqp); +void *virtqueue_drain(struct virtqueue *vq, int *last); +void virtqueue_free(struct virtqueue *vq); +int virtqueue_reinit(struct virtqueue *vq, uint16_t size); + +int virtqueue_intr(struct virtqueue *vq); +int virtqueue_enable_intr(struct virtqueue *vq); +void virtqueue_disable_intr(struct virtqueue *vq); + +/* Get physical address of the virtqueue ring. */ +vm_paddr_t virtqueue_paddr(struct virtqueue *vq); + +int virtqueue_full(struct virtqueue *vq); +int virtqueue_empty(struct virtqueue *vq); +int virtqueue_size(struct virtqueue *vq); +int virtqueue_nused(struct virtqueue *vq); +void virtqueue_notify(struct virtqueue *vq); +void virtqueue_dump(struct virtqueue *vq); + +int virtqueue_enqueue(struct virtqueue *vq, void *cookie, + struct sglist *sg, int readable, int writable); +void *virtqueue_dequeue(struct virtqueue *vq, uint32_t *len); +void *virtqueue_poll(struct virtqueue *vq, uint32_t *len); + +#endif /* _VIRTIO_VIRTQUEUE_H */ diff --git a/sys/modules/Makefile b/sys/modules/Makefile index c3b13e1..36fc5ca 100644 --- a/sys/modules/Makefile +++ b/sys/modules/Makefile @@ -317,6 +317,7 @@ SUBDIR= ${_3dfx} \ usb \ utopia \ ${_vesa} \ + ${_virtio} \ vge \ vkbd \ ${_vpo} \ @@ -537,6 +538,7 @@ _padlock= padlock _s3= s3 _twa= twa _vesa= vesa +_virtio= virtio _x86bios= x86bios .elif ${MACHINE} == "pc98" _canbepm= canbepm @@ -636,6 +638,7 @@ _sppp= sppp _tpm= tpm _twa= twa _vesa= vesa +_virtio= virtio _vxge= vxge _x86bios= x86bios _wi= wi diff --git a/sys/modules/virtio/Makefile b/sys/modules/virtio/Makefile new file mode 100644 index 0000000..9c9457a --- /dev/null +++ b/sys/modules/virtio/Makefile @@ -0,0 +1,28 @@ +# +# $FreeBSD$ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +SUBDIR= virtio pci network block balloon + +.include <bsd.subdir.mk> diff --git a/sys/modules/virtio/balloon/Makefile b/sys/modules/virtio/balloon/Makefile new file mode 100644 index 0000000..dc14cbc --- /dev/null +++ b/sys/modules/virtio/balloon/Makefile @@ -0,0 +1,36 @@ +# +# $FreeBSD$ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +.PATH: ${.CURDIR}/../../../dev/virtio/balloon + +KMOD= virtio_balloon +SRCS= virtio_balloon.c +SRCS+= virtio_bus_if.h virtio_if.h +SRCS+= bus_if.h device_if.h + +MFILES= kern/bus_if.m kern/device_if.m \ + dev/virtio/virtio_bus_if.m dev/virtio/virtio_if.m + +.include <bsd.kmod.mk> diff --git a/sys/modules/virtio/block/Makefile b/sys/modules/virtio/block/Makefile new file mode 100644 index 0000000..5df9eab --- /dev/null +++ b/sys/modules/virtio/block/Makefile @@ -0,0 +1,36 @@ +# +# $FreeBSD$ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +.PATH: ${.CURDIR}/../../../dev/virtio/block + +KMOD= virtio_blk +SRCS= virtio_blk.c +SRCS+= virtio_bus_if.h virtio_if.h +SRCS+= bus_if.h device_if.h + +MFILES= kern/bus_if.m kern/device_if.m \ + dev/virtio/virtio_bus_if.m dev/virtio/virtio_if.m + +.include <bsd.kmod.mk> diff --git a/sys/modules/virtio/network/Makefile b/sys/modules/virtio/network/Makefile new file mode 100644 index 0000000..8463309c --- /dev/null +++ b/sys/modules/virtio/network/Makefile @@ -0,0 +1,36 @@ +# +# $FreeBSD$ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +.PATH: ${.CURDIR}/../../../dev/virtio/network + +KMOD= if_vtnet +SRCS= if_vtnet.c +SRCS+= virtio_bus_if.h virtio_if.h +SRCS+= bus_if.h device_if.h + +MFILES= kern/bus_if.m kern/device_if.m \ + dev/virtio/virtio_bus_if.m dev/virtio/virtio_if.m + +.include <bsd.kmod.mk> diff --git a/sys/modules/virtio/pci/Makefile b/sys/modules/virtio/pci/Makefile new file mode 100644 index 0000000..a58d64c --- /dev/null +++ b/sys/modules/virtio/pci/Makefile @@ -0,0 +1,36 @@ +# +# $FreeBSD$ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +.PATH: ${.CURDIR}/../../../dev/virtio/pci + +KMOD= virtio_pci +SRCS= virtio_pci.c +SRCS+= virtio_bus_if.h virtio_if.h +SRCS+= bus_if.h device_if.h pci_if.h + +MFILES= kern/bus_if.m kern/device_if.m dev/pci/pci_if.m \ + dev/virtio/virtio_bus_if.m dev/virtio/virtio_if.m + +.include <bsd.kmod.mk> diff --git a/sys/modules/virtio/virtio/Makefile b/sys/modules/virtio/virtio/Makefile new file mode 100644 index 0000000..e8973c0 --- /dev/null +++ b/sys/modules/virtio/virtio/Makefile @@ -0,0 +1,38 @@ +# +# $FreeBSD$ +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# + +.PATH: ${.CURDIR}/../../../dev/virtio + +KMOD= virtio + +SRCS= virtio.c virtqueue.c +SRCS+= virtio_bus_if.c virtio_bus_if.h +SRCS+= virtio_if.c virtio_if.h +SRCS+= bus_if.h device_if.h + +MFILES= kern/bus_if.m kern/device_if.m \ + dev/virtio/virtio_bus_if.m dev/virtio/virtio_if.m + +.include <bsd.kmod.mk> |
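
The files above define the virtqueue API that VirtIO child drivers build on. As a rough illustration only (not part of this change), the sketch below shows how a hypothetical child driver might negotiate features, allocate a single virtqueue through the VQ_ALLOC_INFO_INIT/virtio_alloc_virtqueues path, and push a two-segment request through virtqueue_enqueue, virtqueue_notify, and virtqueue_poll. All "mydev" names and the MYDEV_FEATURES constant are invented for the example, and the buffers are assumed small enough that each sglist_append(9) yields one physically contiguous segment.

/*
 * Illustrative sketch only -- not part of this commit.  A hypothetical
 * "mydev" VirtIO child driver using the API added above.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/sglist.h>

#include <dev/virtio/virtio.h>
#include <dev/virtio/virtqueue.h>

#define MYDEV_FEATURES	0	/* no device-specific feature bits (assumption) */

static int
mydev_attach_vq(device_t dev, struct virtqueue **vqp)
{
	struct vq_alloc_info vq_info;

	/* Accept whatever subset of our features the host offers. */
	(void) virtio_negotiate_features(dev, MYDEV_FEATURES);

	/* One virtqueue, no indirect descriptors, no interrupt handler. */
	VQ_ALLOC_INFO_INIT(&vq_info, 0, NULL, NULL, vqp, "%s request",
	    device_get_nameunit(dev));

	return (virtio_alloc_virtqueues(dev, 0, 1, &vq_info));
}

static int
mydev_submit(struct virtqueue *vq, void *req, size_t reqlen,
    void *resp, size_t resplen)
{
	struct sglist_seg segs[2];
	struct sglist sg;
	uint32_t len;
	int error;

	/* Build a two-entry scatter/gather list on the stack. */
	sglist_init(&sg, 2, segs);
	error = sglist_append(&sg, req, reqlen);	/* host-readable */
	if (error == 0)
		error = sglist_append(&sg, resp, resplen); /* host-writable */
	if (error)
		return (error);

	/* First segment readable by the host, second writable by it. */
	error = virtqueue_enqueue(vq, req, &sg, 1, 1);
	if (error)
		return (error);

	/* Kick the host, then spin until it hands back our cookie. */
	virtqueue_notify(vq);
	(void) virtqueue_poll(vq, &len);

	return (0);
}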