Diffstat (limited to 'sys/dev/xen/blkback/blkback.c')
-rw-r--r-- sys/dev/xen/blkback/blkback.c 3663
1 file changed, 2644 insertions, 1019 deletions
diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index 259f2f6..72087f5 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -1,1055 +1,1919 @@
-/*
- * Copyright (c) 2006, Cisco Systems, Inc.
+/*-
+ * Copyright (c) 2009-2010 Spectra Logic Corporation
* All rights reserved.
*
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
* are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions, and the following disclaimer,
+ * without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ * substantially similar to the "NO WARRANTY" disclaimer below
+ * ("Disclaimer") and any redistribution must be conditioned upon
+ * including a substantially similar Disclaimer requirement for further
+ * binary redistribution.
*
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
*
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * Authors: Justin T. Gibbs (Spectra Logic Corporation)
+ * Ken Merry (Spectra Logic Corporation)
*/
-
#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");
+/**
+ * \file blkback.c
+ *
+ * \brief Device driver supporting the vending of block storage from
+ * a FreeBSD domain to other domains.
+ */
+
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/mbuf.h>
-#include <sys/malloc.h>
#include <sys/kernel.h>
-#include <sys/socket.h>
-#include <sys/queue.h>
-#include <sys/taskqueue.h>
+#include <sys/malloc.h>
+
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/conf.h>
+#include <sys/devicestat.h>
+#include <sys/disk.h>
+#include <sys/fcntl.h>
+#include <sys/filedesc.h>
+#include <sys/kdb.h>
+#include <sys/module.h>
#include <sys/namei.h>
#include <sys/proc.h>
-#include <sys/filedesc.h>
+#include <sys/rman.h>
+#include <sys/taskqueue.h>
+#include <sys/types.h>
#include <sys/vnode.h>
-#include <sys/fcntl.h>
-#include <sys/disk.h>
-#include <sys/bio.h>
-
-#include <sys/module.h>
-#include <sys/bus.h>
-#include <sys/sysctl.h>
+#include <sys/mount.h>
#include <geom/geom.h>
+#include <machine/_inttypes.h>
+#include <machine/xen/xen-os.h>
+
+#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
-#include <machine/xen-os.h>
-#include <machine/hypervisor.h>
-#include <machine/hypervisor-ifs.h>
-#include <machine/xen_intr.h>
-#include <machine/evtchn.h>
-#include <machine/xenbus.h>
-#include <machine/gnttab.h>
-#include <machine/xen-public/memory.h>
-#include <dev/xen/xenbus/xenbus_comms.h>
+#include <xen/blkif.h>
+#include <xen/evtchn.h>
+#include <xen/gnttab.h>
+#include <xen/xen_intr.h>
+#include <xen/interface/event_channel.h>
+#include <xen/interface/grant_table.h>
-#if XEN_BLKBACK_DEBUG
-#define DPRINTF(fmt, args...) \
- printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
-#else
-#define DPRINTF(fmt, args...) ((void)0)
-#endif
-
-#define WPRINTF(fmt, args...) \
- printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+#include <xen/xenbus/xenbusvar.h>
-#define BLKBACK_INVALID_HANDLE (~0)
+/*--------------------------- Compile-time Tunables --------------------------*/
+/**
+ * The maximum number of outstanding request blocks (request headers plus
+ * additional segment blocks) we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBB_MAX_REQUESTS 256
-struct ring_ref {
- vm_offset_t va;
- grant_handle_t handle;
- uint64_t bus_addr;
-};
+/**
+ * \brief Define to force all I/O to be performed on memory owned by the
+ * backend device, with a copy-in/out to the remote domain's memory.
+ *
+ * \note This option is currently required when this driver's domain is
+ * operating in HVM mode on a system using an IOMMU.
+ *
+ * This driver uses Xen's grant table API to gain access to the memory of
+ * the remote domains it serves. When our domain is operating in PV mode,
+ * the grant table mechanism directly updates our domain's page table entries
+ * to point to the physical pages of the remote domain. This scheme guarantees
+ * that blkback and the backing devices it uses can safely perform DMA
+ * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to
+ * ensure that our domain cannot DMA to pages owned by another domain. As
+ * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant
+ * table API. For this reason, in HVM mode, we must bounce all requests into
+ * memory that is mapped into our domain at domain startup and thus has
+ * valid IOMMU mappings.
+ */
+#define XBB_USE_BOUNCE_BUFFERS
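
A minimal sketch of the copy discipline this option implies, using hypothetical
helper names (the driver's real copies are performed in its dispatch and bio
completion paths later in this diff):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Writes: stage data from the granted (remote) pages into local
	 * bounce memory before issuing the backend I/O, so DMA only ever
	 * touches pages owned by this domain.
	 */
	static void
	bounce_before_write(uint8_t *bounce, const uint8_t *granted_kva, size_t len)
	{
		memcpy(bounce, granted_kva, len);
	}

	/*
	 * Reads: after the backend I/O completes into bounce memory, copy
	 * the result back out to the granted pages for the front-end.
	 */
	static void
	bounce_after_read(uint8_t *granted_kva, const uint8_t *bounce, size_t len)
	{
		memcpy(granted_kva, bounce, len);
	}
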
-typedef struct blkback_info {
+/**
+ * \brief Define to enable rudimentary request logging to the console.
+ */
+#undef XBB_DEBUG
- /* Schedule lists */
- STAILQ_ENTRY(blkback_info) next_req;
- int on_req_sched_list;
+/*---------------------------------- Macros ----------------------------------*/
+/**
+ * Custom malloc type for all driver allocations.
+ */
+MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data");
- struct xenbus_device *xdev;
- XenbusState frontend_state;
+#ifdef XBB_DEBUG
+#define DPRINTF(fmt, args...) \
+ printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args)
+#else
+#define DPRINTF(fmt, args...) do {} while(0)
+#endif
- domid_t domid;
+/**
+ * The maximum mapped region size per request we will allow in a negotiated
+ * block-front/back communication channel.
+ */
+#define XBB_MAX_REQUEST_SIZE \
+ MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE)
- int state;
- int ring_connected;
- struct ring_ref rr;
- blkif_back_ring_t ring;
- evtchn_port_t evtchn;
- int irq;
- void *irq_cookie;
+/**
+ * The maximum number of segments (within a request header and accompanying
+ * segment blocks) per request we will allow in a negotiated block-front/back
+ * communication channel.
+ */
+#define XBB_MAX_SEGMENTS_PER_REQUEST \
+ (MIN(UIO_MAXIOV, \
+ MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \
+ (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1)))
+
+/**
+ * The maximum number of shared memory ring pages we will allow in a
+ * negotiated block-front/back communication channel. Allow enough
+ * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd.
+ */
+#define XBB_MAX_RING_PAGES \
+ BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \
+ * XBB_MAX_REQUESTS)
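
A worked sizing example, assuming a 4 KiB PAGE_SIZE and a 128 KiB MAXPHYS and
assuming neither UIO_MAXIOV nor BLKIF_MAX_SEGMENTS_PER_REQUEST is the binding
limit (these values are illustrative assumptions, not taken from this change):

	enum {
		EX_PAGE_SIZE        = 4096,        /* assumed */
		EX_MAXPHYS          = 128 * 1024,  /* assumed */
		/* XBB_MAX_REQUEST_SIZE = MIN(MAXPHYS, segments * PAGE_SIZE) */
		EX_MAX_REQUEST_SIZE = EX_MAXPHYS,
		/*
		 * The "+ 1" covers a maximum-sized transfer that does not
		 * begin on a page boundary and so spills onto an extra page.
		 */
		EX_MAX_SEGMENTS     = (EX_MAX_REQUEST_SIZE / EX_PAGE_SIZE) + 1 /* 33 */
	};
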
- int ref_cnt;
+/*--------------------------- Forward Declarations ---------------------------*/
+struct xbb_softc;
- int handle;
- char *mode;
- char *type;
- char *dev_name;
+static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt,
+ ...) __attribute__((format(printf, 3, 4)));
+static int xbb_shutdown(struct xbb_softc *xbb);
+static int xbb_detach(device_t dev);
- struct vnode *vn;
- struct cdev *cdev;
- struct cdevsw *csw;
- u_int sector_size;
- int sector_size_shift;
- off_t media_size;
- u_int media_num_sectors;
- int major;
- int minor;
- int read_only;
-
- struct mtx blk_ring_lock;
-
- device_t ndev;
-
- /* Stats */
- int st_rd_req;
- int st_wr_req;
- int st_oo_req;
- int st_err_req;
-} blkif_t;
-
-/*
- * These are rather arbitrary. They are fairly large because adjacent requests
- * pulled from a communication ring are quite likely to end up being part of
- * the same scatter/gather request at the disc.
- *
- * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
- *
- * This will increase the chances of being able to write whole tracks.
- * 64 should be enough to keep us competitive with Linux.
+/*------------------------------ Data Structures -----------------------------*/
+/**
+ * \brief Object tracking an in-flight I/O from a Xen VBD consumer.
*/
-static int blkif_reqs = 64;
-TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs);
+struct xbb_xen_req {
+ /**
+ * Linked list links used to aggregate idle requests in the
+ * request free pool (xbb->request_free_slist).
+ */
+ SLIST_ENTRY(xbb_xen_req) links;
+
+ /**
+ * Back reference to the parent block back instance for this
+ * request. Used during bio_done handling.
+ */
+ struct xbb_softc *xbb;
+
+ /**
+ * The remote domain's identifier for this I/O request.
+ */
+ uint64_t id;
+
+ /**
+ * Kernel virtual address space reserved for this request
+ * structure and used to map the remote domain's pages for
+ * this I/O into our domain's address space.
+ */
+ uint8_t *kva;
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /**
+ * Pre-allocated domain local memory used to proxy remote
+ * domain memory during I/O operations.
+ */
+ uint8_t *bounce;
+#endif
-static int mmap_pages;
+ /**
+ * Base pseudo-physical address corresponding to the start
+ * of this request's kva region.
+ */
+ uint64_t gnt_base;
+
+ /**
+ * The number of pages currently mapped for this request.
+ */
+ int nr_pages;
+
+ /**
+ * The number of 512 byte sectors comprising this request.
+ */
+ int nr_512b_sectors;
+
+ /**
+ * The number of struct bio requests still outstanding for this
+ * request on the backend device. This field is only used for
+ * device (rather than file) backed I/O.
+ */
+ int pendcnt;
+
+ /**
+ * BLKIF_OP code for this request.
+ */
+ int operation;
+
+ /**
+ * BLKIF_RSP status code for this request.
+ *
+ * This field allows an error status to be recorded even if the
+ * delivery of this status must be deferred. Deferred reporting
+ * is necessary, for example, when an error is detected during
+ * completion processing of one bio when other bios for this
+ * request are still outstanding.
+ */
+ int status;
+
+ /**
+ * Device statistics request ordering type (ordered or simple).
+ */
+ devstat_tag_type ds_tag_type;
+
+ /**
+ * Device statistics request type (read, write, no_data).
+ */
+ devstat_trans_flags ds_trans_type;
+
+ /**
+ * The start time for this request.
+ */
+ struct bintime ds_t0;
+
+ /**
+ * Array of grant handles (one per page) used to map this request.
+ */
+ grant_handle_t *gnt_handles;
+};
+SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req);
-/*
- * Each outstanding request that we've passed to the lower device layers has a
- * 'pending_req' allocated to it. Each buffer_head that completes decrements
- * the pendcnt towards zero. When it hits zero, the specified domain has a
- * response queued for it, with the saved 'id' passed back.
+/**
+ * \brief Configuration data for the shared memory request ring
+ * used to communicate with the front-end client of this
+ * driver.
*/
-typedef struct pending_req {
- blkif_t *blkif;
- uint64_t id;
- int nr_pages;
- int pendcnt;
- unsigned short operation;
- int status;
- STAILQ_ENTRY(pending_req) free_list;
-} pending_req_t;
-
-static pending_req_t *pending_reqs;
-static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free =
- STAILQ_HEAD_INITIALIZER(pending_free);
-static struct mtx pending_free_lock;
-
-static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list =
- STAILQ_HEAD_INITIALIZER(req_sched_list);
-static struct mtx req_sched_list_lock;
-
-static unsigned long mmap_vstart;
-static unsigned long *pending_vaddrs;
-static grant_handle_t *pending_grant_handles;
-
-static struct task blk_req_task;
-
-/* Protos */
-static void disconnect_ring(blkif_t *blkif);
-static int vbd_add_dev(struct xenbus_device *xdev);
-
-static inline int vaddr_pagenr(pending_req_t *req, int seg)
-{
- return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-}
-
-static inline unsigned long vaddr(pending_req_t *req, int seg)
-{
- return pending_vaddrs[vaddr_pagenr(req, seg)];
-}
-
-#define pending_handle(_req, _seg) \
- (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+struct xbb_ring_config {
+ /** KVA address where ring memory is mapped. */
+ vm_offset_t va;
+
+ /** The pseudo-physical address where ring memory is mapped. */
+ uint64_t gnt_addr;
+
+ /**
+ * Grant table handles, one per-ring page, returned by the
+ * hypervisor upon mapping of the ring and required to
+ * unmap it when a connection is torn down.
+ */
+ grant_handle_t handle[XBB_MAX_RING_PAGES];
+
+ /**
+ * The device bus address returned by the hypervisor when
+ * mapping the ring and required to unmap it when a connection
+ * is torn down.
+ */
+ uint64_t bus_addr[XBB_MAX_RING_PAGES];
+
+ /** The number of ring pages mapped for the current connection. */
+ u_int ring_pages;
+
+ /**
+ * The grant references, one per-ring page, supplied by the
+ * front-end, allowing us to reference the ring pages in the
+ * front-end's domain and to map these pages into our own domain.
+ */
+ grant_ref_t ring_ref[XBB_MAX_RING_PAGES];
+
+ /** The interrupt driven event channel used to signal ring events. */
+ evtchn_port_t evtchn;
+};
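
The connection path that fills in these fields is outside this hunk; the
following is an illustrative sketch (not the driver's actual code) of how the
per-page grant data above is typically populated when the front-end's ring
pages are mapped:

	/* Illustrative only; assumes the Xen grant-table headers included above. */
	static int
	example_map_ring_pages(struct xbb_ring_config *ring, domid_t otherend_id)
	{
		struct gnttab_map_grant_ref ops[XBB_MAX_RING_PAGES];
		u_int i;

		for (i = 0; i < ring->ring_pages; i++) {
			ops[i].host_addr = ring->gnt_addr + (i * PAGE_SIZE);
			ops[i].flags     = GNTMAP_host_map;
			ops[i].ref       = ring->ring_ref[i];
			ops[i].dom       = otherend_id;
		}

		if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ops,
					      ring->ring_pages) != 0)
			return (EACCES);

		for (i = 0; i < ring->ring_pages; i++) {
			if (ops[i].status != 0)
				return (EACCES);
			ring->handle[i]   = ops[i].handle;
			ring->bus_addr[i] = ops[i].dev_bus_addr;
		}

		return (0);
	}
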
-static unsigned long
-alloc_empty_page_range(unsigned long nr_pages)
+/**
+ * Per-instance connection state flags.
+ */
+typedef enum
{
- void *pages;
- int i = 0, j = 0;
- multicall_entry_t mcl[17];
- unsigned long mfn_list[16];
- struct xen_memory_reservation reservation = {
- .extent_start = mfn_list,
- .nr_extents = 0,
- .address_bits = 0,
- .extent_order = 0,
- .domid = DOMID_SELF
- };
-
- pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT);
- if (pages == NULL)
- return 0;
+ /**
+ * The front-end requested a read-only mount of the
+ * back-end device/file.
+ */
+ XBBF_READ_ONLY = 0x01,
+
+ /** Communication with the front-end has been established. */
+ XBBF_RING_CONNECTED = 0x02,
+
+ /**
+ * Front-end requests exist in the ring and are waiting for
+ * xbb_xen_req objects to free up.
+ */
+ XBBF_RESOURCE_SHORTAGE = 0x04,
+
+ /** Connection teardown in progress. */
+ XBBF_SHUTDOWN = 0x08
+} xbb_flag_t;
+
+/** Backend device type. */
+typedef enum {
+ /** Backend type unknown. */
+ XBB_TYPE_NONE = 0x00,
+
+ /**
+ * Backend type disk (access via cdev switch
+ * strategy routine).
+ */
+ XBB_TYPE_DISK = 0x01,
+
+ /** Backend type file (access via vnode operations). */
+ XBB_TYPE_FILE = 0x02
+} xbb_type;
+
+/**
+ * \brief Structure used to memoize information about a per-request
+ * scatter-gather list.
+ *
+ * The chief benefit of using this data structure is it avoids having
+ * to reparse the possibly discontiguous S/G list in the original
+ * request. Due to the way that the mapping of the memory backing an
+ * I/O transaction is handled by Xen, a second pass is unavoidable.
+ * At least this way the second walk is a simple array traversal.
+ *
+ * \note A single Scatter/Gather element in the block interface covers
+ * at most 1 machine page. In this context a sector (blkif
+ * nomenclature, not what I'd choose) is a 512b aligned unit
+ * of mapping within the machine page referenced by an S/G
+ * element.
+ */
+struct xbb_sg {
+ /** The number of 512b data chunks mapped in this S/G element. */
+ int16_t nsect;
+
+ /**
+ * The index (0 based) of the first 512b data chunk mapped
+ * in this S/G element.
+ */
+ uint8_t first_sect;
+
+ /**
+ * The index (0 based) of the last 512b data chunk mapped
+ * in this S/G element.
+ */
+ uint8_t last_sect;
+};
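
As a small illustration (hypothetical helpers, not part of the driver), the
memoized fields above translate into a byte range within the S/G element's
page as follows:

	/* Byte offset of the mapped data within its page (512b sectors << 9). */
	static inline size_t
	example_sg_offset(const struct xbb_sg *sg)
	{
		return ((size_t)sg->first_sect << 9);
	}

	/* Byte length of the mapped data within its page. */
	static inline size_t
	example_sg_length(const struct xbb_sg *sg)
	{
		return ((size_t)sg->nsect << 9);
	}
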
- memset(mcl, 0, sizeof(mcl));
+/**
+ * Character device backend specific configuration data.
+ */
+struct xbb_dev_data {
+ /** Cdev used for device backend access. */
+ struct cdev *cdev;
- while (i < nr_pages) {
- unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE);
+ /** Cdev switch used for device backend access. */
+ struct cdevsw *csw;
- mcl[j].op = __HYPERVISOR_update_va_mapping;
- mcl[j].args[0] = va;
+ /** Used to hold a reference on opened cdev backend devices. */
+ int dev_ref;
+};
- mfn_list[j++] = vtomach(va) >> PAGE_SHIFT;
+/**
+ * File backend specific configuration data.
+ */
+struct xbb_file_data {
+ /** Credentials to use for vnode backed (file based) I/O. */
+ struct ucred *cred;
+
+ /**
+ * \brief Array of io vectors used to process file based I/O.
+ *
+ * Only a single file based request is outstanding per-xbb instance,
+ * so we only need one of these.
+ */
+ struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+#ifdef XBB_USE_BOUNCE_BUFFERS
+
+ /**
+ * \brief Array of io vectors used to handle bouncing of file reads.
+ *
+ * Vnode operations are free to modify uio data during their
+ * execution. In the case of a read with bounce buffering active,
+ * we need some of the data from the original uio in order to
+ * bounce-out the read data. This array serves as the temporary
+ * storage for this saved data.
+ */
+ struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST];
+
+ /**
+ * \brief Array of memoized bounce buffer kva offsets used
+ * in the file based backend.
+ *
+ * Due to the way that the mapping of the memory backing an
+ * I/O transaction is handled by Xen, a second pass through
+ * the request sg elements is unavoidable. We memoize the computed
+ * bounce address here to reduce the cost of the second walk.
+ */
+ void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST];
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+};
- xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY;
+/**
+ * Collection of backend type specific data.
+ */
+union xbb_backend_data {
+ struct xbb_dev_data dev;
+ struct xbb_file_data file;
+};
- if (j == 16 || i == nr_pages) {
- mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL;
+/**
+ * Function signature of backend specific I/O handlers.
+ */
+typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg,
+ int operation, int flags);
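
The code that binds this pointer lives in the attach/open path, which is
outside this hunk; a hypothetical sketch of how a backend type pairs with its
dispatcher (both handlers appear later in this diff) might look like:

	/* Hypothetical helper; the real backend-open logic is not shown here. */
	static void
	example_select_dispatch(struct xbb_softc *xbb, int backend_is_cdev)
	{
		if (backend_is_cdev) {
			xbb->device_type = XBB_TYPE_DISK;
			xbb->dispatch_io = xbb_dispatch_dev;
		} else {
			xbb->device_type = XBB_TYPE_FILE;
			xbb->dispatch_io = xbb_dispatch_file;
		}
	}
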
- reservation.nr_extents = j;
+/**
+ * Per-instance configuration data.
+ */
+struct xbb_softc {
+
+ /**
+ * Task-queue used to process I/O requests.
+ */
+ struct taskqueue *io_taskqueue;
+
+ /**
+ * Single "run the request queue" task enqueued
+ * on io_taskqueue.
+ */
+ struct task io_task;
+
+ /** Device type for this instance. */
+ xbb_type device_type;
+
+ /** NewBus device corresponding to this instance. */
+ device_t dev;
+
+ /** Backend specific dispatch routine for this instance. */
+ xbb_dispatch_t dispatch_io;
+
+ /** The number of requests outstanding on the backend device/file. */
+ u_int active_request_count;
+
+ /** Free pool of request tracking structures. */
+ struct xbb_xen_req_slist request_free_slist;
+
+ /** Array, sized at connection time, of request tracking structures. */
+ struct xbb_xen_req *requests;
+
+ /**
+ * Global pool of kva used for mapping remote domain ring
+ * and I/O transaction data.
+ */
+ vm_offset_t kva;
+
+ /** Pseudo-physical address corresponding to kva. */
+ uint64_t gnt_base_addr;
+
+ /** The size of the global kva pool. */
+ int kva_size;
+
+ /**
+ * \brief Cached value of the front-end's domain id.
+ *
+ * This value is used once for each mapped page in
+ * a transaction. We cache it to avoid incurring the
+ * cost of an ivar access every time this is needed.
+ */
+ domid_t otherend_id;
+
+ /**
+ * \brief The blkif protocol abi in effect.
+ *
+ * There are situations where the back and front ends can
+ * have a different, native abi (e.g. intel x86_64 and
+ * 32bit x86 domains on the same machine). The back-end
+ * always accommodates the front-end's native abi. That
+ * value is pulled from the XenStore and recorded here.
+ */
+ int abi;
+
+ /**
+ * \brief The maximum number of requests allowed to be in
+ * flight at a time.
+ *
+ * This value is negotiated via the XenStore.
+ */
+ uint32_t max_requests;
+
+ /**
+ * \brief The maximum number of segments (1 page per segment)
+ * that can be mapped by a request.
+ *
+ * This value is negotiated via the XenStore.
+ */
+ uint32_t max_request_segments;
+
+ /**
+ * The maximum size of any request to this back-end
+ * device.
+ *
+ * This value is negotiated via the XenStore.
+ */
+ uint32_t max_request_size;
+
+ /** Various configuration and state bit flags. */
+ xbb_flag_t flags;
+
+ /** Ring mapping and interrupt configuration data. */
+ struct xbb_ring_config ring_config;
+
+ /** Runtime, cross-abi safe, structures for ring access. */
+ blkif_back_rings_t rings;
+
+ /** IRQ mapping for the communication ring event channel. */
+ int irq;
+
+ /**
+ * \brief Backend access mode flags (e.g. write, or read-only).
+ *
+ * This value is passed to us by the front-end via the XenStore.
+ */
+ char *dev_mode;
+
+ /**
+ * \brief Backend device type (e.g. "disk", "cdrom", "floppy").
+ *
+ * This value is passed to us by the front-end via the XenStore.
+ * Currently unused.
+ */
+ char *dev_type;
+
+ /**
+ * \brief Backend device/file identifier.
+ *
+ * This value is passed to us by the front-end via the XenStore.
+ * We expect this to be a POSIX path indicating the file or
+ * device to open.
+ */
+ char *dev_name;
+
+ /**
+ * Vnode corresponding to the backend device node or file
+ * we are accessing.
+ */
+ struct vnode *vn;
+
+ union xbb_backend_data backend;
+ /** The native sector size of the backend. */
+ u_int sector_size;
+
+ /** log2 of sector_size. */
+ u_int sector_size_shift;
+
+ /** Size in bytes of the backend device or file. */
+ off_t media_size;
+
+ /**
+ * \brief media_size expressed in terms of the backend native
+ * sector size.
+ *
+ * (e.g. xbb->media_size >> xbb->sector_size_shift).
+ */
+ uint64_t media_num_sectors;
+
+ /**
+ * \brief Array of memoized scatter gather data computed during the
+ * conversion of blkif ring requests to internal xbb_xen_req
+ * structures.
+ *
+ * Ring processing is serialized so we only need one of these.
+ */
+ struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST];
+
+ /** Mutex protecting per-instance data. */
+ struct mtx lock;
+
+#ifdef XENHVM
+ /**
+ * Resource representing allocated physical address space
+ * associated with our per-instance kva region.
+ */
+ struct resource *pseudo_phys_res;
+
+ /** Resource id for allocated physical address space. */
+ int pseudo_phys_res_id;
+#endif
- mcl[j].op = __HYPERVISOR_memory_op;
- mcl[j].args[0] = XENMEM_decrease_reservation;
- mcl[j].args[1] = (unsigned long)&reservation;
-
- (void)HYPERVISOR_multicall(mcl, j+1);
+ /** I/O statistics. */
+ struct devstat *xbb_stats;
+};
- mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0;
- j = 0;
+/*---------------------------- Request Processing ----------------------------*/
+/**
+ * Allocate an internal transaction tracking structure from the free pool.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return On success, a pointer to the allocated xbb_xen_req structure.
+ * Otherwise NULL.
+ */
+static inline struct xbb_xen_req *
+xbb_get_req(struct xbb_softc *xbb)
+{
+ struct xbb_xen_req *req;
+
+ req = NULL;
+ mtx_lock(&xbb->lock);
+
+ /*
+ * Do not allow new requests to be allocated while we
+ * are shutting down.
+ */
+ if ((xbb->flags & XBBF_SHUTDOWN) == 0) {
+ if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) {
+ SLIST_REMOVE_HEAD(&xbb->request_free_slist, links);
+ xbb->active_request_count++;
+ } else {
+ xbb->flags |= XBBF_RESOURCE_SHORTAGE;
}
}
-
- return (unsigned long)pages;
+ mtx_unlock(&xbb->lock);
+ return (req);
}
-static pending_req_t *
-alloc_req(void)
-{
- pending_req_t *req;
- mtx_lock(&pending_free_lock);
- if ((req = STAILQ_FIRST(&pending_free))) {
- STAILQ_REMOVE(&pending_free, req, pending_req, free_list);
- STAILQ_NEXT(req, free_list) = NULL;
- }
- mtx_unlock(&pending_free_lock);
- return req;
-}
-
-static void
-free_req(pending_req_t *req)
+/**
+ * Return an allocated transaction tracking structure to the free pool.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param req The request structure to free.
+ */
+static inline void
+xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req)
{
- int was_empty;
-
- mtx_lock(&pending_free_lock);
- was_empty = STAILQ_EMPTY(&pending_free);
- STAILQ_INSERT_TAIL(&pending_free, req, free_list);
- mtx_unlock(&pending_free_lock);
- if (was_empty)
- taskqueue_enqueue(taskqueue_swi, &blk_req_task);
-}
+ int wake_thread;
-static void
-fast_flush_area(pending_req_t *req)
-{
- struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- unsigned int i, invcount = 0;
- grant_handle_t handle;
- int ret;
+ mtx_lock(&xbb->lock);
+ wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE;
+ xbb->flags &= ~XBBF_RESOURCE_SHORTAGE;
+ SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+ xbb->active_request_count--;
- for (i = 0; i < req->nr_pages; i++) {
- handle = pending_handle(req, i);
- if (handle == BLKBACK_INVALID_HANDLE)
- continue;
- unmap[invcount].host_addr = vaddr(req, i);
- unmap[invcount].dev_bus_addr = 0;
- unmap[invcount].handle = handle;
- pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
- invcount++;
+ if ((xbb->flags & XBBF_SHUTDOWN) != 0) {
+ /*
+ * Shutdown is in progress. See if we can
+ * progress further now that one more request
+ * has completed and been returned to the
+ * free pool.
+ */
+ xbb_shutdown(xbb);
}
+ mtx_unlock(&xbb->lock);
- ret = HYPERVISOR_grant_table_op(
- GNTTABOP_unmap_grant_ref, unmap, invcount);
- PANIC_IF(ret);
+ if (wake_thread != 0)
+ taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
-static void
-blkif_get(blkif_t *blkif)
+/**
+ * Given a page index and 512b sector offset within that page,
+ * calculate an offset into a request's kva region.
+ *
+ * \param req The request structure whose kva region will be accessed.
+ * \param pagenr The page index used to compute the kva offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * kva offset.
+ *
+ * \return The computed global KVA address.
+ */
+static inline uint8_t *
+xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
- atomic_add_int(&blkif->ref_cnt, 1);
+ return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9));
}
-static void
-blkif_put(blkif_t *blkif)
+#ifdef XBB_USE_BOUNCE_BUFFERS
+/**
+ * Given a page index and 512b sector offset within that page,
+ * calculate an offset into a request's local bounce memory region.
+ *
+ * \param req The request structure whose bounce region will be accessed.
+ * \param pagenr The page index used to compute the bounce offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * bounce offset.
+ *
+ * \return The computed global bounce buffer address.
+ */
+static inline uint8_t *
+xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector)
{
- if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) {
- DPRINTF("Removing %x\n", (unsigned int)blkif);
- disconnect_ring(blkif);
- if (blkif->mode)
- free(blkif->mode, M_DEVBUF);
- if (blkif->type)
- free(blkif->type, M_DEVBUF);
- if (blkif->dev_name)
- free(blkif->dev_name, M_DEVBUF);
- free(blkif, M_DEVBUF);
- }
+ return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9));
}
+#endif
-static int
-blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params)
+/**
+ * Given a page number and 512b sector offset within that page,
+ * calculate an offset into the request's memory region that the
+ * underlying backend device/file should use for I/O.
+ *
+ * \param req The request structure whose I/O region will be accessed.
+ * \param pagenr The page index used to compute the I/O offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * I/O offset.
+ *
+ * \return The computed global I/O address.
+ *
+ * Depending on configuration, this will either be a local bounce buffer
+ * or a pointer to the memory mapped in from the front-end domain for
+ * this request.
+ */
+static inline uint8_t *
+xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
- blkif_t *blkif;
-
- blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO);
- if (!blkif)
- return ENOMEM;
-
- DPRINTF("Created %x\n", (unsigned int)blkif);
-
- blkif->ref_cnt = 1;
- blkif->domid = xdev->otherend_id;
- blkif->handle = handle;
- blkif->mode = mode;
- blkif->type = type;
- blkif->dev_name = params;
- blkif->xdev = xdev;
- xdev->data = blkif;
-
- mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF);
-
- if (strcmp(mode, "w"))
- blkif->read_only = 1;
-
- return 0;
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ return (xbb_req_bounce_addr(req, pagenr, sector));
+#else
+ return (xbb_req_vaddr(req, pagenr, sector));
+#endif
}
-static void
-add_to_req_schedule_list_tail(blkif_t *blkif)
+/**
+ * Given a page index and 512b sector offset within that page, calculate
+ * an offset into the local pseudo-physical address space used to map a
+ * front-end's request data into a request.
+ *
+ * \param req The request structure whose pseudo-physical region
+ * will be accessed.
+ * \param pagenr The page index used to compute the pseudo-physical offset.
+ * \param sector The 512b sector index used to compute the page relative
+ * pseudo-physical offset.
+ *
+ * \return The computed global pseudo-physical address.
+ *
+ * This address is always relative to req->gnt_base and is the value
+ * passed to the hypervisor when mapping or unmapping the front-end's
+ * pages for this request.
+ */
+static inline uintptr_t
+xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector)
{
- if (!blkif->on_req_sched_list) {
- mtx_lock(&req_sched_list_lock);
- if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) {
- blkif_get(blkif);
- STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
- blkif->on_req_sched_list = 1;
- taskqueue_enqueue(taskqueue_swi, &blk_req_task);
- }
- mtx_unlock(&req_sched_list_lock);
- }
+ return ((uintptr_t)(req->gnt_base
+ + (PAGE_SIZE * pagenr) + (sector << 9)));
}
-/* This routine does not call blkif_get(), does not schedule the blk_req_task to run,
- and assumes that the state is connected */
+/**
+ * Unmap the front-end pages associated with this I/O request.
+ *
+ * \param req The request structure to unmap.
+ */
static void
-add_to_req_schedule_list_tail2(blkif_t *blkif)
+xbb_unmap_req(struct xbb_xen_req *req)
{
- mtx_lock(&req_sched_list_lock);
- if (!blkif->on_req_sched_list) {
- STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req);
- blkif->on_req_sched_list = 1;
- }
- mtx_unlock(&req_sched_list_lock);
-}
+ struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST];
+ u_int i;
+ u_int invcount;
+ int error;
-/* Removes blkif from front of list and does not call blkif_put() (caller must) */
-static blkif_t *
-remove_from_req_schedule_list(void)
-{
- blkif_t *blkif;
+ invcount = 0;
+ for (i = 0; i < req->nr_pages; i++) {
- mtx_lock(&req_sched_list_lock);
+ if (req->gnt_handles[i] == GRANT_REF_INVALID)
+ continue;
- if ((blkif = STAILQ_FIRST(&req_sched_list))) {
- STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req);
- STAILQ_NEXT(blkif, next_req) = NULL;
- blkif->on_req_sched_list = 0;
+ unmap[invcount].host_addr = xbb_req_gntaddr(req, i, 0);
+ unmap[invcount].dev_bus_addr = 0;
+ unmap[invcount].handle = req->gnt_handles[i];
+ req->gnt_handles[i] = GRANT_REF_INVALID;
+ invcount++;
}
- mtx_unlock(&req_sched_list_lock);
-
- return blkif;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
+ unmap, invcount);
+ KASSERT(error == 0, ("Grant table operation failed"));
}
+/**
+ * Create and transmit a response to a blkif request.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param req The request structure to which to respond.
+ * \param status The status code to report. See BLKIF_RSP_*
+ * in sys/xen/interface/io/blkif.h.
+ */
static void
-make_response(blkif_t *blkif, uint64_t id,
- unsigned short op, int st)
+xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status)
{
blkif_response_t *resp;
- blkif_back_ring_t *blk_ring = &blkif->ring;
- int more_to_do = 0;
- int notify;
+ int more_to_do;
+ int notify;
+
+ more_to_do = 0;
+
+ /*
+ * Place on the response ring for the relevant domain.
+ * For now, only the spacing between entries is different
+ * in the different ABIs, not the response entry layout.
+ */
+ mtx_lock(&xbb->lock);
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ resp = RING_GET_RESPONSE(&xbb->rings.native,
+ xbb->rings.native.rsp_prod_pvt);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ resp = (blkif_response_t *)
+ RING_GET_RESPONSE(&xbb->rings.x86_32,
+ xbb->rings.x86_32.rsp_prod_pvt);
+ break;
+ case BLKIF_PROTOCOL_X86_64:
+ resp = (blkif_response_t *)
+ RING_GET_RESPONSE(&xbb->rings.x86_64,
+ xbb->rings.x86_64.rsp_prod_pvt);
+ break;
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ }
- mtx_lock(&blkif->blk_ring_lock);
+ resp->id = req->id;
+ resp->operation = req->operation;
+ resp->status = status;
+ xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages);
+ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify);
- /* Place on the response ring for the relevant domain. */
- resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt);
- resp->id = id;
- resp->operation = op;
- resp->status = st;
- blk_ring->rsp_prod_pvt++;
- RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify);
+ if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) {
- if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) {
/*
* Tail check for pending requests. Allows frontend to avoid
* notifications if requests are already in flight (lower
* overheads and promotes batching).
*/
- RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
+ RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do);
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) {
- } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring))
more_to_do = 1;
+ }
- mtx_unlock(&blkif->blk_ring_lock);
+ mtx_unlock(&xbb->lock);
if (more_to_do)
- add_to_req_schedule_list_tail(blkif);
+ taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
if (notify)
- notify_remote_via_irq(blkif->irq);
+ notify_remote_via_irq(xbb->irq);
}
+/**
+ * Completion handler for buffer I/O requests issued by the device
+ * backend driver.
+ *
+ * \param bio The buffer I/O request on which to perform completion
+ * processing.
+ */
static void
-end_block_io_op(struct bio *bio)
+xbb_bio_done(struct bio *bio)
{
- pending_req_t *pending_req = bio->bio_caller2;
+ struct xbb_softc *xbb;
+ struct xbb_xen_req *req;
+
+ req = bio->bio_caller1;
+ xbb = req->xbb;
+ /* Only include transferred I/O in stats. */
+ req->nr_512b_sectors -= bio->bio_resid >> 9;
if (bio->bio_error) {
DPRINTF("BIO returned error %d for operation on device %s\n",
- bio->bio_error, pending_req->blkif->dev_name);
- pending_req->status = BLKIF_RSP_ERROR;
- pending_req->blkif->st_err_req++;
+ bio->bio_error, xbb->dev_name);
+ req->status = BLKIF_RSP_ERROR;
+
+ if (bio->bio_error == ENXIO
+ && xenbus_get_state(xbb->dev) == XenbusStateConnected) {
+
+ /*
+ * Backend device has disappeared. Signal the
+ * front-end that we (the device proxy) want to
+ * go away.
+ */
+ xenbus_set_state(xbb->dev, XenbusStateClosing);
+ }
}
-#if 0
- printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n",
- (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags);
-#endif
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ if (bio->bio_cmd == BIO_READ) {
+ vm_offset_t kva_offset;
- if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) {
- fast_flush_area(pending_req);
- make_response(pending_req->blkif, pending_req->id,
- pending_req->operation, pending_req->status);
- blkif_put(pending_req->blkif);
- free_req(pending_req);
+ kva_offset = (vm_offset_t)bio->bio_data
+ - (vm_offset_t)req->bounce;
+ memcpy((uint8_t *)req->kva + kva_offset,
+ bio->bio_data, bio->bio_bcount);
+ }
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+ if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) {
+ xbb_unmap_req(req);
+ xbb_send_response(xbb, req, req->status);
+ devstat_end_transaction(xbb->xbb_stats,
+ /*bytes*/req->nr_512b_sectors << 9,
+ req->ds_tag_type,
+ req->ds_trans_type,
+ /*now*/NULL,
+ /*then*/&req->ds_t0);
+ xbb_release_req(xbb, req);
}
g_destroy_bio(bio);
}
+/**
+ * Parse a blkif request into an internal request structure and send
+ * it to the backend for processing.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param req_ring_idx The location of ring_req within the shared
+ * communication ring.
+ *
+ * This routine performs the backend common aspects of request parsing
+ * including compiling an internal request structure, parsing the S/G
+ * list and any secondary ring requests in which they may reside, and
+ * mapping front-end I/O pages into our domain.
+ */
static void
-dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req)
+xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, RING_IDX req_ring_idx)
{
- struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- struct {
- unsigned long buf; unsigned int nsec;
- } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- unsigned int nseg = req->nr_segments, nr_sects = 0;
- struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int operation, ret, i, nbio = 0;
+ struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQUEST];
+ struct xbb_sg *xbb_sg;
+ struct gnttab_map_grant_ref *map;
+ struct blkif_request_segment *sg;
+ struct blkif_request_segment *last_block_sg;
+ u_int nseg;
+ u_int seg_idx;
+ u_int block_segs;
+ int nr_sects;
+ int operation;
+ uint8_t bio_flags;
+ int error;
+
+ nseg = ring_req->nr_segments;
+ nr_sects = 0;
+ req->xbb = xbb;
+ req->id = ring_req->id;
+ req->operation = ring_req->operation;
+ req->status = BLKIF_RSP_OKAY;
+ req->ds_tag_type = DEVSTAT_TAG_SIMPLE;
+ req->nr_pages = nseg;
+ req->nr_512b_sectors = 0;
+ bio_flags = 0;
+ sg = NULL;
+
+ binuptime(&req->ds_t0);
+ devstat_start_transaction(xbb->xbb_stats, &req->ds_t0);
+
+ switch (req->operation) {
+ case BLKIF_OP_WRITE_BARRIER:
+ bio_flags |= BIO_ORDERED;
+ req->ds_tag_type = DEVSTAT_TAG_ORDERED;
+ /* FALLTHROUGH */
+ case BLKIF_OP_WRITE:
+ operation = BIO_WRITE;
+ req->ds_trans_type = DEVSTAT_WRITE;
+ if ((xbb->flags & XBBF_READ_ONLY) != 0) {
+ DPRINTF("Attempt to write to read only device %s\n",
+ xbb->dev_name);
+ goto fail_send_response;
+ }
+ break;
+ case BLKIF_OP_READ:
+ operation = BIO_READ;
+ req->ds_trans_type = DEVSTAT_READ;
+ break;
+ case BLKIF_OP_FLUSH_DISKCACHE:
+ operation = BIO_FLUSH;
+ req->ds_tag_type = DEVSTAT_TAG_ORDERED;
+ req->ds_trans_type = DEVSTAT_NO_DATA;
+ goto do_dispatch;
+ /*NOTREACHED*/
+ default:
+ DPRINTF("error: unknown block io operation [%d]\n",
+ req->operation);
+ goto fail_send_response;
+ }
/* Check that number of segments is sane. */
- if (unlikely(nseg == 0) ||
- unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+ if (unlikely(nseg == 0)
+ || unlikely(nseg > xbb->max_request_segments)) {
DPRINTF("Bad number of segments in request (%d)\n", nseg);
- goto fail_response;
+ goto fail_send_response;
}
- if (req->operation == BLKIF_OP_WRITE) {
- if (blkif->read_only) {
- DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name);
- goto fail_response;
- }
- operation = BIO_WRITE;
- } else
- operation = BIO_READ;
-
- pending_req->blkif = blkif;
- pending_req->id = req->id;
- pending_req->operation = req->operation;
- pending_req->status = BLKIF_RSP_OKAY;
- pending_req->nr_pages = nseg;
+ map = maps;
+ xbb_sg = xbb->xbb_sgs;
+ block_segs = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK);
+ sg = ring_req->seg;
+ last_block_sg = sg + block_segs;
+ seg_idx = 0;
+ while (1) {
- for (i = 0; i < nseg; i++) {
- seg[i].nsec = req->seg[i].last_sect -
- req->seg[i].first_sect + 1;
+ while (sg < last_block_sg) {
+
+ xbb_sg->first_sect = sg->first_sect;
+ xbb_sg->last_sect = sg->last_sect;
+ xbb_sg->nsect =
+ (int8_t)(sg->last_sect - sg->first_sect + 1);
+
+ if ((sg->last_sect >= (PAGE_SIZE >> 9))
+ || (xbb_sg->nsect <= 0))
+ goto fail_send_response;
+
+ nr_sects += xbb_sg->nsect;
+ map->host_addr = xbb_req_gntaddr(req, seg_idx,
+ /*sector*/0);
+ map->flags = GNTMAP_host_map;
+ map->ref = sg->gref;
+ map->dom = xbb->otherend_id;
+ if (operation == BIO_WRITE)
+ map->flags |= GNTMAP_readonly;
+ sg++;
+ map++;
+ xbb_sg++;
+ seg_idx++;
+ }
- if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
- (seg[i].nsec <= 0))
- goto fail_response;
- nr_sects += seg[i].nsec;
+ block_segs = MIN(nseg - seg_idx,
+ BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK);
+ if (block_segs == 0)
+ break;
- map[i].host_addr = vaddr(pending_req, i);
- map[i].dom = blkif->domid;
- map[i].ref = req->seg[i].gref;
- map[i].flags = GNTMAP_host_map;
- if (operation == BIO_WRITE)
- map[i].flags |= GNTMAP_readonly;
+ /*
+ * Fetch the next request block full of SG elements.
+ * For now, only the spacing between entries is different
+ * in the different ABIs, not the sg entry layout.
+ */
+ req_ring_idx++;
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native,
+ req_ring_idx);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32,
+ req_ring_idx);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64,
+ req_ring_idx);
+ break;
+ }
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ /* NOTREACHED */
+ }
+ last_block_sg = sg + block_segs;
}
/* Convert to the disk's sector size */
- nr_sects = (nr_sects << 9) >> blkif->sector_size_shift;
-
- ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
- PANIC_IF(ret);
+ req->nr_512b_sectors = nr_sects;
+ nr_sects = (nr_sects << 9) >> xbb->sector_size_shift;
+
+ if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) {
+ device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple "
+ "of the backing store sector size (%d)\n",
+ __func__, req->nr_512b_sectors << 9,
+ xbb->sector_size);
+ goto fail_send_response;
+ }
- for (i = 0; i < nseg; i++) {
- if (unlikely(map[i].status != 0)) {
- DPRINTF("invalid buffer -- could not remap it\n");
- goto fail_flush;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
+ maps, req->nr_pages);
+ if (error != 0)
+ panic("Grant table operation failed (%d)", error);
+
+ for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) {
+
+ if (unlikely(map->status != 0)) {
+ DPRINTF("invalid buffer -- could not remap it (%d)\n",
+ map->status);
+ DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x "
+ "ref 0x%x, dom %d\n", seg_idx,
+ map->host_addr, map->flags, map->ref,
+ map->dom);
+ goto fail_unmap_req;
}
- pending_handle(pending_req, i) = map[i].handle;
-#if 0
- /* Can't do this in FreeBSD since vtophys() returns the pfn */
- /* of the remote domain who loaned us the machine page - DPT */
- xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] =
- map[i]dev_bus_addr >> PAGE_SHIFT;
-#endif
- seg[i].buf = map[i].dev_bus_addr |
- (req->seg[i].first_sect << 9);
+ req->gnt_handles[seg_idx] = map->handle;
}
+ if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) {
- if (req->sector_number + nr_sects > blkif->media_num_sectors) {
- DPRINTF("%s of [%llu,%llu] extends past end of device %s\n",
+ DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] "
+ "extends past end of device %s\n",
operation == BIO_READ ? "read" : "write",
- req->sector_number,
- req->sector_number + nr_sects, blkif->dev_name);
- goto fail_flush;
+ ring_req->sector_number,
+ ring_req->sector_number + nr_sects, xbb->dev_name);
+ goto fail_unmap_req;
}
- for (i = 0; i < nseg; i++) {
- struct bio *bio;
-
- if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) {
- DPRINTF("Misaligned I/O request from domain %d", blkif->domid);
- goto fail_put_bio;
- }
-
- bio = biolist[nbio++] = g_new_bio();
- if (unlikely(bio == NULL))
- goto fail_put_bio;
+do_dispatch:
- bio->bio_cmd = operation;
- bio->bio_offset = req->sector_number << blkif->sector_size_shift;
- bio->bio_length = seg[i].nsec << 9;
- bio->bio_bcount = bio->bio_length;
- bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK));
- bio->bio_done = end_block_io_op;
- bio->bio_caller2 = pending_req;
- bio->bio_dev = blkif->cdev;
+ error = xbb->dispatch_io(xbb,
+ ring_req,
+ req,
+ nseg,
+ operation,
+ bio_flags);
- req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift;
-#if 0
- printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n",
- (unsigned int)bio, req->operation, req->sector_number, seg[i].nsec,
- blkif->cdev->si_iosize_max, seg[i].buf);
-#endif
+ if (error != 0) {
+ if (operation == BIO_FLUSH)
+ goto fail_send_response;
+ else
+ goto fail_unmap_req;
}
- pending_req->pendcnt = nbio;
- blkif_get(blkif);
+ return;
- for (i = 0; i < nbio; i++)
- (*blkif->csw->d_strategy)(biolist[i]);
- return;
+fail_unmap_req:
+ xbb_unmap_req(req);
+ /* FALLTHROUGH */
- fail_put_bio:
- for (i = 0; i < (nbio-1); i++)
- g_destroy_bio(biolist[i]);
- fail_flush:
- fast_flush_area(pending_req);
- fail_response:
- make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
- free_req(pending_req);
+fail_send_response:
+ xbb_send_response(xbb, req, BLKIF_RSP_ERROR);
+ xbb_release_req(xbb, req);
+ devstat_end_transaction(xbb->xbb_stats,
+ /*bytes*/0,
+ req->ds_tag_type,
+ req->ds_trans_type,
+ /*now*/NULL,
+ /*then*/&req->ds_t0);
}
+/**
+ * Process incoming requests from the shared communication ring in response
+ * to a signal on the ring's event channel.
+ *
+ * \param context Callback argument registered during task initialization -
+ * the xbb_softc for this instance.
+ * \param pending The number of taskqueue_enqueue events that have
+ * occurred since this handler was last run.
+ */
static void
-blk_req_action(void *context, int pending)
+xbb_run_queue(void *context, int pending)
{
- blkif_t *blkif;
-
- DPRINTF("\n");
-
- while (!STAILQ_EMPTY(&req_sched_list)) {
- blkif_back_ring_t *blk_ring;
- RING_IDX rc, rp;
-
- blkif = remove_from_req_schedule_list();
-
- blk_ring = &blkif->ring;
- rc = blk_ring->req_cons;
- rp = blk_ring->sring->req_prod;
- rmb(); /* Ensure we see queued requests up to 'rp'. */
-
- while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
- blkif_request_t *req;
- pending_req_t *pending_req;
-
- pending_req = alloc_req();
- if (pending_req == NULL)
- goto out_of_preqs;
-
- req = RING_GET_REQUEST(blk_ring, rc);
- blk_ring->req_cons = ++rc; /* before make_response() */
-
- switch (req->operation) {
- case BLKIF_OP_READ:
- blkif->st_rd_req++;
- dispatch_rw_block_io(blkif, req, pending_req);
- break;
- case BLKIF_OP_WRITE:
- blkif->st_wr_req++;
- dispatch_rw_block_io(blkif, req, pending_req);
- break;
- default:
- blkif->st_err_req++;
- DPRINTF("error: unknown block io operation [%d]\n",
- req->operation);
- make_response(blkif, req->id, req->operation,
- BLKIF_RSP_ERROR);
- free_req(pending_req);
- break;
- }
+ struct xbb_softc *xbb;
+ blkif_back_rings_t *rings;
+ RING_IDX rp;
+
+
+ xbb = (struct xbb_softc *)context;
+ rings = &xbb->rings;
+
+ /*
+ * Cache req_prod to avoid accessing a cache line shared
+ * with the frontend.
+ */
+ rp = rings->common.sring->req_prod;
+
+ /* Ensure we see queued requests up to 'rp'. */
+ rmb();
+
+ /**
+ * Run so long as there is work to consume and the generation
+ * of a response will not overflow the ring.
+ *
+ * @note There's a 1 to 1 relationship between requests and responses,
+ * so an overflow should never occur. This test is to protect
+ * our domain from digesting bogus data. Shouldn't we log this?
+ */
+ while (rings->common.req_cons != rp
+ && RING_REQUEST_CONS_OVERFLOW(&rings->common,
+ rings->common.req_cons) == 0) {
+ blkif_request_t ring_req_storage;
+ blkif_request_t *ring_req;
+ struct xbb_xen_req *req;
+ RING_IDX req_ring_idx;
+
+ req = xbb_get_req(xbb);
+ if (req == NULL) {
+ /*
+ * Resource shortage has been recorded.
+ * We'll be scheduled to run once a request
+ * object frees up due to a completion.
+ */
+ break;
}
- blkif_put(blkif);
- }
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ ring_req = RING_GET_REQUEST(&xbb->rings.native,
+ rings->common.req_cons);
+ break;
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ struct blkif_x86_32_request *ring_req32;
+
+ ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32,
+ rings->common.req_cons);
+ blkif_get_x86_32_req(&ring_req_storage, ring_req32);
+ ring_req = &ring_req_storage;
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ struct blkif_x86_64_request *ring_req64;
+
+ ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64,
+ rings->common.req_cons);
+ blkif_get_x86_64_req(&ring_req_storage, ring_req64);
+ ring_req = &ring_req_storage;
+ break;
+ }
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ /* NOTREACHED */
+ }
- return;
+ /*
+ * Signify that we can overwrite this request with a
+ * response by incrementing our consumer index. The
+ * response won't be generated until after we've already
+ * consumed all necessary data out of the version of the
+ * request in the ring buffer (for native mode). We
+ * must update the consumer index before issuing back-end
+ * I/O so there is no possibility that it will complete
+ * and a response be generated before we make room in
+ * the queue for that response.
+ */
+ req_ring_idx = xbb->rings.common.req_cons;
+ xbb->rings.common.req_cons +=
+ BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments);
- out_of_preqs:
- /* We ran out of pending req structs */
- /* Just requeue interface and wait to be rescheduled to run when one is freed */
- add_to_req_schedule_list_tail2(blkif);
- blkif->st_oo_req++;
+ xbb_dispatch_io(xbb, ring_req, req, req_ring_idx);
+ }
}
-/* Handle interrupt from a frontend */
+/**
+ * Interrupt handler bound to the shared ring's event channel.
+ *
+ * \param arg Callback argument registered during event channel
+ * binding - the xbb_softc for this instance.
+ */
static void
-blkback_intr(void *arg)
+xbb_intr(void *arg)
{
- blkif_t *blkif = arg;
- DPRINTF("%x\n", (unsigned int)blkif);
- add_to_req_schedule_list_tail(blkif);
+ struct xbb_softc *xbb;
+
+ /* Defer to kernel thread. */
+ xbb = (struct xbb_softc *)arg;
+ taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task);
}
-/* Map grant ref for ring */
+/*----------------------------- Backend Handlers -----------------------------*/
+/**
+ * Backend handler for character device access.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param nseg The number of valid segments for this request in
+ * xbb->xbb_sgs.
+ * \param operation BIO_* I/O operation code.
+ * \param bio_flags Additional bio_flag data to pass to any generated
+ * bios (e.g. BIO_ORDERED).
+ *
+ * \return 0 for success, errno codes for failure.
+ */
static int
-map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring)
+xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg, int operation,
+ int bio_flags)
{
- struct gnttab_map_grant_ref op;
+ struct xbb_dev_data *dev_data;
+ struct bio *bios[XBB_MAX_SEGMENTS_PER_REQUEST];
+ off_t bio_offset;
+ struct bio *bio;
+ struct xbb_sg *xbb_sg;
+ u_int nbio;
+ u_int bio_idx;
+ u_int seg_idx;
+ int error;
+
+ dev_data = &xbb->backend.dev;
+ bio_offset = (off_t)ring_req->sector_number
+ << xbb->sector_size_shift;
+ error = 0;
+ nbio = 0;
+ bio_idx = 0;
+
+ if (operation == BIO_FLUSH) {
+ bio = g_new_bio();
+ if (unlikely(bio == NULL)) {
+ DPRINTF("Unable to allocate bio for BIO_FLUSH\n");
+ error = ENOMEM;
+ return (error);
+ }
+
+ bio->bio_cmd = BIO_FLUSH;
+ bio->bio_flags |= BIO_ORDERED;
+ bio->bio_dev = dev_data->cdev;
+ bio->bio_offset = 0;
+ bio->bio_data = 0;
+ bio->bio_done = xbb_bio_done;
+ bio->bio_caller1 = req;
+ bio->bio_pblkno = 0;
- ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE);
- if (ring->va == 0)
- return ENOMEM;
+ req->pendcnt = 1;
- op.host_addr = ring->va;
- op.flags = GNTMAP_host_map;
- op.ref = ref;
- op.dom = dom;
- HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1);
- if (op.status) {
- WPRINTF("grant table op err=%d\n", op.status);
- kmem_free(kernel_map, ring->va, PAGE_SIZE);
- ring->va = 0;
- return EACCES;
+ (*dev_data->csw->d_strategy)(bio);
+
+ return (0);
}
- ring->handle = op.handle;
- ring->bus_addr = op.dev_bus_addr;
+ for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs;
+ seg_idx < nseg;
+ seg_idx++, xbb_sg++) {
- return 0;
-}
+ /*
+ * KVA will not be contiguous, so any additional
+ * I/O will need to be represented in a new bio.
+ */
+ if ((bio != NULL)
+ && (xbb_sg->first_sect != 0)) {
+ if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
+ printf("%s: Discontiguous I/O request from "
+ "domain %d ends on non-sector "
+ "boundary\n", __func__,
+ xbb->otherend_id);
+ error = EINVAL;
+ goto fail_free_bios;
+ }
+ bio = NULL;
+ }
-/* Unmap grant ref for ring */
-static void
-unmap_ring(struct ring_ref *ring)
-{
- struct gnttab_unmap_grant_ref op;
+ if (bio == NULL) {
+ /*
+ * Make sure that the start of this bio is aligned
+ * to a device sector.
+ */
+ if ((bio_offset & (xbb->sector_size - 1)) != 0) {
+ printf("%s: Misaligned I/O request from "
+ "domain %d\n", __func__,
+ xbb->otherend_id);
+ error = EINVAL;
+ goto fail_free_bios;
+ }
- op.host_addr = ring->va;
- op.dev_bus_addr = ring->bus_addr;
- op.handle = ring->handle;
- HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1);
- if (op.status)
- WPRINTF("grant table op err=%d\n", op.status);
+ bio = bios[nbio++] = g_new_bio();
+ if (unlikely(bio == NULL)) {
+ error = ENOMEM;
+ goto fail_free_bios;
+ }
+ bio->bio_cmd = operation;
+ bio->bio_flags |= bio_flags;
+ bio->bio_dev = dev_data->cdev;
+ bio->bio_offset = bio_offset;
+ bio->bio_data = xbb_req_ioaddr(req, seg_idx,
+ xbb_sg->first_sect);
+ bio->bio_done = xbb_bio_done;
+ bio->bio_caller1 = req;
+ bio->bio_pblkno = bio_offset
+ >> xbb->sector_size_shift;
+ }
- kmem_free(kernel_map, ring->va, PAGE_SIZE);
- ring->va = 0;
-}
+ bio->bio_length += xbb_sg->nsect << 9;
+ bio->bio_bcount = bio->bio_length;
+ bio_offset += xbb_sg->nsect << 9;
-static int
-connect_ring(blkif_t *blkif)
-{
- struct xenbus_device *xdev = blkif->xdev;
- blkif_sring_t *ring;
- unsigned long ring_ref;
- evtchn_port_t evtchn;
- evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain };
- int err;
-
- if (blkif->ring_connected)
- return 0;
-
- // Grab FE data and map his memory
- err = xenbus_gather(NULL, xdev->otherend,
- "ring-ref", "%lu", &ring_ref,
- "event-channel", "%u", &evtchn, NULL);
- if (err) {
- xenbus_dev_fatal(xdev, err,
- "reading %s/ring-ref and event-channel",
- xdev->otherend);
- return err;
- }
-
- err = map_ring(ring_ref, blkif->domid, &blkif->rr);
- if (err) {
- xenbus_dev_fatal(xdev, err, "mapping ring");
- return err;
- }
- ring = (blkif_sring_t *)blkif->rr.va;
- BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE);
-
- op.u.bind_interdomain.remote_dom = blkif->domid;
- op.u.bind_interdomain.remote_port = evtchn;
- err = HYPERVISOR_event_channel_op(&op);
- if (err) {
- unmap_ring(&blkif->rr);
- xenbus_dev_fatal(xdev, err, "binding event channel");
- return err;
- }
- blkif->evtchn = op.u.bind_interdomain.local_port;
-
- /* bind evtchn to irq handler */
- blkif->irq =
- bind_evtchn_to_irqhandler(blkif->evtchn, "blkback",
- blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie);
-
- blkif->ring_connected = 1;
-
- DPRINTF("%x rings connected! evtchn=%d irq=%d\n",
- (unsigned int)blkif, blkif->evtchn, blkif->irq);
+ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) {
- return 0;
-}
+ if ((bio->bio_length & (xbb->sector_size - 1)) != 0) {
+ printf("%s: Discontiguous I/O request from "
+ "domain %d ends on non-sector "
+ "boundary\n", __func__,
+ xbb->otherend_id);
+ error = EINVAL;
+ goto fail_free_bios;
+ }
+ /*
+ * KVA will not be contiguous, so any additional
+ * I/O will need to be represented in a new bio.
+ */
+ bio = NULL;
+ }
+ }
-static void
-disconnect_ring(blkif_t *blkif)
-{
- DPRINTF("\n");
+ req->pendcnt = nbio;
+
+ for (bio_idx = 0; bio_idx < nbio; bio_idx++)
+ {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ vm_offset_t kva_offset;
- if (blkif->ring_connected) {
- unbind_from_irqhandler(blkif->irq, blkif->irq_cookie);
- blkif->irq = 0;
- unmap_ring(&blkif->rr);
- blkif->ring_connected = 0;
+ kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
+ - (vm_offset_t)req->bounce;
+ if (operation == BIO_WRITE) {
+ memcpy(bios[bio_idx]->bio_data,
+ (uint8_t *)req->kva + kva_offset,
+ bios[bio_idx]->bio_bcount);
+ }
+#endif
+ (*dev_data->csw->d_strategy)(bios[bio_idx]);
}
+
+ return (error);
+
+fail_free_bios:
+ for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
+ g_destroy_bio(bios[bio_idx]);
+
+ return (error);
}
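
The EINVAL paths above all reduce to power-of-two mask tests against the negotiated sector size: a bio may only be closed out when its accumulated length ends on a sector boundary, and a new bio may only begin at a sector-aligned device offset. A minimal illustrative sketch of that test, assuming (as the driver does elsewhere) that sector_size is a power of two; the helper name is hypothetical and not part of the diff:

#include <stdbool.h>
#include <stdint.h>

/* True if value is a multiple of the (power-of-two) sector size. */
static bool
is_sector_multiple(uint64_t value, uint32_t sector_size)
{
	return ((value & (uint64_t)(sector_size - 1)) == 0);
}
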
-static void
-connect(blkif_t *blkif)
+/**
+ * Backend handler for file access.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param nseg The number of valid segments for this request in
+ * xbb->xbb_sgs.
+ * \param operation BIO_* I/O operation code.
+ * \param flags             Additional bio flag data to pass to any generated
+ *                          bios (e.g. BIO_ORDERED).
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg, int operation,
+ int flags)
{
- struct xenbus_transaction *xbt;
- struct xenbus_device *xdev = blkif->xdev;
- int err;
+ struct xbb_file_data *file_data;
+ u_int seg_idx;
+ struct uio xuio;
+ struct xbb_sg *xbb_sg;
+ struct iovec *xiovec;
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ void **p_vaddr;
+ int saved_uio_iovcnt;
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ int vfs_is_locked;
+ int error;
+
+ file_data = &xbb->backend.file;
+ error = 0;
+ bzero(&xuio, sizeof(xuio));
+
+ req->pendcnt = 0;
+
+ switch (operation) {
+ case BIO_READ:
+ xuio.uio_rw = UIO_READ;
+ break;
+ case BIO_WRITE:
+ xuio.uio_rw = UIO_WRITE;
+ break;
+ case BIO_FLUSH: {
+ struct mount *mountpoint;
- if (!blkif->ring_connected ||
- blkif->vn == NULL ||
- blkif->state == XenbusStateConnected)
- return;
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
- DPRINTF("%s\n", xdev->otherend);
+ (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
- /* Supply the information about the device the frontend needs */
-again:
- xbt = xenbus_transaction_start();
- if (IS_ERR(xbt)) {
- xenbus_dev_fatal(xdev, PTR_ERR(xbt),
- "Error writing configuration for backend "
- "(start transaction)");
- return;
- }
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
+ VOP_UNLOCK(xbb->vn, 0);
- err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u",
- blkif->media_num_sectors);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/sectors",
- xdev->nodename);
- goto abort;
- }
+ vn_finished_write(mountpoint);
+
+ VFS_UNLOCK_GIANT(vfs_is_locked);
- err = xenbus_printf(xbt, xdev->nodename, "info", "%u",
- blkif->read_only ? VDISK_READONLY : 0);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/info",
- xdev->nodename);
- goto abort;
+ goto bailout_send_response;
+ /* NOTREACHED */
}
- err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u",
- blkif->sector_size);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/sector-size",
- xdev->nodename);
- goto abort;
+ default:
+ panic("invalid operation %d", operation);
+ /* NOTREACHED */
}
+ xuio.uio_offset = (vm_offset_t)ring_req->sector_number
+ << xbb->sector_size_shift;
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN)
- goto again;
- if (err)
- xenbus_dev_fatal(xdev, err, "ending transaction");
+ xuio.uio_segflg = UIO_SYSSPACE;
+ xuio.uio_iov = file_data->xiovecs;
+ xuio.uio_iovcnt = 0;
- err = xenbus_switch_state(xdev, NULL, XenbusStateConnected);
- if (err)
- xenbus_dev_fatal(xdev, err, "switching to Connected state",
- xdev->nodename);
+ for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs;
+ seg_idx < nseg; seg_idx++, xbb_sg++) {
- blkif->state = XenbusStateConnected;
+ /*
+ * If the first sector is not 0, the KVA will not be
+ * contiguous and we'll need to go on to another segment.
+ */
+ if (xbb_sg->first_sect != 0)
+ xiovec = NULL;
+
+ if (xiovec == NULL) {
+ xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
+ xiovec->iov_base = xbb_req_ioaddr(req, seg_idx,
+ xbb_sg->first_sect);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /*
+ * Store the address of the incoming buffer at this
+ * particular offset as well, so we can do the copy
+ * later without having to do more work to
+ * recalculate this address.
+ */
+ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
+ *p_vaddr = xbb_req_vaddr(req, seg_idx,
+ xbb_sg->first_sect);
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ xiovec->iov_len = 0;
+ xuio.uio_iovcnt++;
+ }
- return;
+ xiovec->iov_len += xbb_sg->nsect << 9;
- abort:
- xenbus_transaction_end(xbt, 1);
-}
+ xuio.uio_resid += xbb_sg->nsect << 9;
-static int
-blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
-{
- int err;
- char *p, *mode = NULL, *type = NULL, *params = NULL;
- long handle;
+ /*
+ * If the last sector is not the full page size count,
+ * the next segment will not be contiguous in KVA and we
+ * need a new iovec.
+ */
+ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
+ xiovec = NULL;
+ }
- DPRINTF("node=%s\n", xdev->nodename);
+ xuio.uio_td = curthread;
- p = strrchr(xdev->otherend, '/') + 1;
- handle = strtoul(p, NULL, 0);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ saved_uio_iovcnt = xuio.uio_iovcnt;
- mode = xenbus_read(NULL, xdev->nodename, "mode", NULL);
- if (IS_ERR(mode)) {
- xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode");
- err = PTR_ERR(mode);
- goto error;
- }
-
- type = xenbus_read(NULL, xdev->nodename, "type", NULL);
- if (IS_ERR(type)) {
- xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type");
- err = PTR_ERR(type);
- goto error;
- }
-
- params = xenbus_read(NULL, xdev->nodename, "params", NULL);
- if (IS_ERR(type)) {
- xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params");
- err = PTR_ERR(params);
- goto error;
- }
-
- err = blkif_create(xdev, handle, mode, type, params);
- if (err) {
- xenbus_dev_fatal(xdev, err, "creating blkif");
- goto error;
- }
+ if (operation == BIO_WRITE) {
+ /* Copy the write data to the local buffer. */
+ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+ xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
+ seg_idx++, xiovec++, p_vaddr++) {
- err = vbd_add_dev(xdev);
- if (err) {
- blkif_put((blkif_t *)xdev->data);
- xenbus_dev_fatal(xdev, err, "adding vbd device");
+ memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
+ }
+ } else {
+ /*
+ * We only need to save off the iovecs in the case of a
+ * read, because the copy for the read happens after the
+ * VOP_READ(). (The uio will get modified in that call
+ * sequence.)
+ */
+ memcpy(file_data->saved_xiovecs, xuio.uio_iov,
+ xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
- return err;
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+ switch (operation) {
+ case BIO_READ:
- error:
- if (mode)
- free(mode, M_DEVBUF);
- if (type)
- free(type, M_DEVBUF);
- if (params)
- free(params, M_DEVBUF);
- return err;
-}
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
-static int
-blkback_remove(struct xenbus_device *xdev)
-{
- blkif_t *blkif = xdev->data;
- device_t ndev;
+ /*
+ * UFS pays attention to IO_DIRECT for reads. If the
+ * DIRECTIO option is configured into the kernel, it calls
+ * ffs_rawread(). But that only works for single-segment
+ * uios with user space addresses. In our case, with a
+ * kernel uio, it still reads into the buffer cache, but it
+ * will just try to release the buffer from the cache later
+ * on in ffs_read().
+ *
+ * ZFS does not pay attention to IO_DIRECT for reads.
+ *
+ * UFS does not pay attention to IO_SYNC for reads.
+ *
+ * ZFS pays attention to IO_SYNC (which translates into the
+ * Solaris define FRSYNC for zfs_read()) for reads. It
+ * attempts to sync the file before reading.
+ *
+ * So, to attempt to provide some barrier semantics in the
+ * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
+ */
+ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+ (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
- DPRINTF("node=%s\n", xdev->nodename);
+ VOP_UNLOCK(xbb->vn, 0);
+ break;
+ case BIO_WRITE: {
+ struct mount *mountpoint;
- blkif->state = XenbusStateClosing;
+ (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
- if ((ndev = blkif->ndev)) {
- blkif->ndev = NULL;
- mtx_lock(&Giant);
- device_detach(ndev);
- mtx_unlock(&Giant);
- }
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
- xdev->data = NULL;
- blkif->xdev = NULL;
- blkif_put(blkif);
+ /*
+ * UFS pays attention to IO_DIRECT for writes. The write
+ * is done asynchronously. (Normally the write would just
+		 * get put into the cache.)
+ *
+ * UFS pays attention to IO_SYNC for writes. It will
+ * attempt to write the buffer out synchronously if that
+ * flag is set.
+ *
+ * ZFS does not pay attention to IO_DIRECT for writes.
+ *
+ * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
+ * for writes. It will flush the transaction from the
+ * cache before returning.
+ *
+ * So if we've got the BIO_ORDERED flag set, we want
+ * IO_SYNC in either the UFS or ZFS case.
+ */
+ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+ IO_SYNC : 0, file_data->cred);
+ VOP_UNLOCK(xbb->vn, 0);
- return 0;
-}
+ vn_finished_write(mountpoint);
-static int
-blkback_resume(struct xenbus_device *xdev)
-{
- DPRINTF("node=%s\n", xdev->nodename);
- return 0;
+ break;
+ }
+ default:
+ panic("invalid operation %d", operation);
+ /* NOTREACHED */
+ }
+ VFS_UNLOCK_GIANT(vfs_is_locked);
+
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /* We only need to copy here for read operations */
+ if (operation == BIO_READ) {
+
+ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+ xiovec = file_data->saved_xiovecs;
+ seg_idx < saved_uio_iovcnt; seg_idx++,
+ xiovec++, p_vaddr++) {
+
+ /*
+ * Note that we have to use the copy of the
+ * io vector we made above. uiomove() modifies
+ * the uio and its referenced vector as uiomove
+ * performs the copy, so we can't rely on any
+ * state from the original uio.
+ */
+ memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len);
+ }
+ }
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+
+bailout_send_response:
+
+ /*
+ * All I/O is already done, send the response. A lock is not
+ * necessary here because we're single threaded, and therefore the
+ * only context accessing this request right now. If that changes,
+ * we may need some locking here.
+ */
+ xbb_unmap_req(req);
+ xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY :
+ BLKIF_RSP_ERROR);
+ devstat_end_transaction(xbb->xbb_stats,
+ /*bytes*/error == 0 ? req->nr_512b_sectors << 9
+ : 0,
+ req->ds_tag_type,
+ req->ds_trans_type,
+ /*now*/NULL,
+ /*then*/&req->ds_t0);
+ xbb_release_req(xbb, req);
+
+ return (0);
}
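
The long comments in the read and write cases above boil down to a small mapping from the request's ordering requirement to the ioflags passed to VOP_READ()/VOP_WRITE(). A hedged sketch of that mapping; the helper is hypothetical and not part of this driver, while BIO_READ, BIO_ORDERED, IO_DIRECT, and IO_SYNC are the standard FreeBSD definitions used above:

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/vnode.h>

/*
 * Ordered reads are issued with IO_DIRECT|IO_SYNC, ordered writes with
 * IO_SYNC, and unordered I/O with no extra ioflags, mirroring the
 * comments in xbb_dispatch_file().
 */
static int
xbb_file_ioflags(int operation, int bio_flags)
{

	if ((bio_flags & BIO_ORDERED) == 0)
		return (0);

	return ((operation == BIO_READ) ? (IO_DIRECT | IO_SYNC) : IO_SYNC);
}
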
+/*--------------------------- Backend Configuration --------------------------*/
+/**
+ * Close and cleanup any backend device/file specific state for this
+ * block back instance.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
static void
-frontend_changed(struct xenbus_device *xdev,
- XenbusState frontend_state)
+xbb_close_backend(struct xbb_softc *xbb)
{
- blkif_t *blkif = xdev->data;
+ DROP_GIANT();
+ DPRINTF("closing dev=%s\n", xbb->dev_name);
+ if (xbb->vn) {
+ int flags = FREAD;
+ int vfs_is_locked = 0;
- DPRINTF("state=%d\n", frontend_state);
+ if ((xbb->flags & XBBF_READ_ONLY) == 0)
+ flags |= FWRITE;
- blkif->frontend_state = frontend_state;
+ switch (xbb->device_type) {
+ case XBB_TYPE_DISK:
+ if (xbb->backend.dev.csw) {
+ dev_relthread(xbb->backend.dev.cdev,
+ xbb->backend.dev.dev_ref);
+ xbb->backend.dev.csw = NULL;
+ xbb->backend.dev.cdev = NULL;
+ }
+ break;
+ case XBB_TYPE_FILE:
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+ break;
+ case XBB_TYPE_NONE:
+ default:
+ panic("Unexpected backend type.");
+ break;
+ }
- switch (frontend_state) {
- case XenbusStateInitialising:
- break;
- case XenbusStateInitialised:
- case XenbusStateConnected:
- connect_ring(blkif);
- connect(blkif);
- break;
- case XenbusStateClosing:
- xenbus_switch_state(xdev, NULL, XenbusStateClosing);
- break;
- case XenbusStateClosed:
- xenbus_remove_device(xdev);
- break;
- case XenbusStateUnknown:
- case XenbusStateInitWait:
- xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend",
- frontend_state);
- break;
+ (void)vn_close(xbb->vn, flags, NOCRED, curthread);
+ xbb->vn = NULL;
+
+ switch (xbb->device_type) {
+ case XBB_TYPE_DISK:
+ break;
+ case XBB_TYPE_FILE:
+ VFS_UNLOCK_GIANT(vfs_is_locked);
+ if (xbb->backend.file.cred != NULL) {
+ crfree(xbb->backend.file.cred);
+ xbb->backend.file.cred = NULL;
+ }
+ break;
+ case XBB_TYPE_NONE:
+ default:
+ panic("Unexpected backend type.");
+ break;
+ }
}
+ PICKUP_GIANT();
}
-/* ** Driver registration ** */
-
-static struct xenbus_device_id blkback_ids[] = {
- { "vbd" },
- { "" }
-};
+/**
+ * Open a character device to be used for backend I/O.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_open_dev(struct xbb_softc *xbb)
+{
+ struct vattr vattr;
+ struct cdev *dev;
+ struct cdevsw *devsw;
+ int error;
+
+ xbb->device_type = XBB_TYPE_DISK;
+ xbb->dispatch_io = xbb_dispatch_dev;
+ xbb->backend.dev.cdev = xbb->vn->v_rdev;
+ xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev,
+ &xbb->backend.dev.dev_ref);
+ if (xbb->backend.dev.csw == NULL)
+ panic("Unable to retrieve device switch");
+
+ error = VOP_GETATTR(xbb->vn, &vattr, NOCRED);
+ if (error) {
+ xenbus_dev_fatal(xbb->dev, error, "error getting "
+ "vnode attributes for device %s",
+ xbb->dev_name);
+ return (error);
+ }
-static struct xenbus_driver blkback = {
- .name = "blkback",
- .ids = blkback_ids,
- .probe = blkback_probe,
- .remove = blkback_remove,
- .resume = blkback_resume,
- .otherend_changed = frontend_changed,
-};
-static void
-blkback_init(void *unused)
-{
- int i;
-
- TASK_INIT(&blk_req_task, 0, blk_req_action, NULL);
- mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF);
-
- mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF);
-
- mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
- pending_reqs = malloc(sizeof(pending_reqs[0]) *
- blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT);
- pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) *
- mmap_pages, M_DEVBUF, M_NOWAIT);
- pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) *
- mmap_pages, M_DEVBUF, M_NOWAIT);
- mmap_vstart = alloc_empty_page_range(mmap_pages);
- if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) {
- if (pending_reqs)
- free(pending_reqs, M_DEVBUF);
- if (pending_grant_handles)
- free(pending_grant_handles, M_DEVBUF);
- if (pending_vaddrs)
- free(pending_vaddrs, M_DEVBUF);
- WPRINTF("out of memory\n");
- return;
+ dev = xbb->vn->v_rdev;
+ devsw = dev->si_devsw;
+ if (!devsw->d_ioctl) {
+ xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for "
+ "device %s!", xbb->dev_name);
+ return (ENODEV);
}
- for (i = 0; i < mmap_pages; i++) {
- pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
- pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+ error = devsw->d_ioctl(dev, DIOCGSECTORSIZE,
+ (caddr_t)&xbb->sector_size, FREAD,
+ curthread);
+ if (error) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling ioctl DIOCGSECTORSIZE "
+ "for device %s", xbb->dev_name);
+ return (error);
}
- for (i = 0; i < blkif_reqs; i++) {
- STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list);
+ error = devsw->d_ioctl(dev, DIOCGMEDIASIZE,
+ (caddr_t)&xbb->media_size, FREAD,
+ curthread);
+ if (error) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling ioctl DIOCGMEDIASIZE "
+ "for device %s", xbb->dev_name);
+ return (error);
}
- DPRINTF("registering %s\n", blkback.name);
- xenbus_register_backend(&blkback);
+ return (0);
}
-SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL)
-
-static void
-close_device(blkif_t *blkif)
+/**
+ * Open a file to be used for backend I/O.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_open_file(struct xbb_softc *xbb)
{
- DPRINTF("closing dev=%s\n", blkif->dev_name);
- if (blkif->vn) {
- int flags = FREAD;
-
- if (!blkif->read_only)
- flags |= FWRITE;
+ struct xbb_file_data *file_data;
+ struct vattr vattr;
+ int error;
+
+ file_data = &xbb->backend.file;
+ xbb->device_type = XBB_TYPE_FILE;
+ xbb->dispatch_io = xbb_dispatch_file;
+ error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling VOP_GETATTR()"
+ "for file %s", xbb->dev_name);
+ return (error);
+ }
- if (blkif->csw) {
- dev_relthread(blkif->cdev);
- blkif->csw = NULL;
+ /*
+ * Verify that we have the ability to upgrade to exclusive
+ * access on this file so we can trap errors at open instead
+ * of reporting them during first access.
+ */
+ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
+ vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
+ if (xbb->vn->v_iflag & VI_DOOMED) {
+ error = EBADF;
+ xenbus_dev_fatal(xbb->dev, error,
+ "error locking file %s",
+ xbb->dev_name);
+
+ return (error);
}
+ }
- (void)vn_close(blkif->vn, flags, NOCRED, curthread);
- blkif->vn = NULL;
+ file_data->cred = crhold(curthread->td_ucred);
+ xbb->media_size = vattr.va_size;
+
+ /*
+ * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
+ * With ZFS, it is 131072 bytes. Block sizes that large don't work
+ * with disklabel and UFS on FreeBSD at least. Large block sizes
+ * may not work with other OSes as well. So just export a sector
+ * size of 512 bytes, which should work with any OS or
+ * application. Since our backing is a file, any block size will
+ * work fine for the backing store.
+ */
+#if 0
+ xbb->sector_size = vattr.va_blocksize;
+#endif
+ xbb->sector_size = 512;
+
+ /*
+ * Sanity check. The media size has to be at least one
+ * sector long.
+ */
+ if (xbb->media_size < xbb->sector_size) {
+ error = EINVAL;
+ xenbus_dev_fatal(xbb->dev, error,
+ "file %s size %ju < block size %u",
+ xbb->dev_name,
+ (uintmax_t)xbb->media_size,
+ xbb->sector_size);
}
+ return (error);
}
+/**
+ * Open the backend provider for this connection.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
static int
-open_device(blkif_t *blkif)
+xbb_open_backend(struct xbb_softc *xbb)
{
struct nameidata nd;
- struct vattr vattr;
- struct cdev *dev;
- struct cdevsw *devsw;
- int flags = FREAD, err = 0;
+ int flags;
+ int error;
+ int vfs_is_locked;
- DPRINTF("opening dev=%s\n", blkif->dev_name);
+ flags = FREAD;
+ error = 0;
- if (!blkif->read_only)
+ DPRINTF("opening dev=%s\n", xbb->dev_name);
+
+ if ((xbb->flags & XBBF_READ_ONLY) == 0)
flags |= FWRITE;
if (!curthread->td_proc->p_fd->fd_cdir) {
@@ -1066,284 +1930,1045 @@ open_device(blkif_t *blkif)
}
again:
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread);
- err = vn_open(&nd, &flags, 0, -1);
- if (err) {
- if (blkif->dev_name[0] != '/') {
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error) {
+ /*
+	 * This is the only reasonable guess we can make as to the
+	 * path if the user doesn't give us a fully qualified path.
+ * If they want to specify a file, they need to specify the
+ * full path.
+ */
+ if (xbb->dev_name[0] != '/') {
char *dev_path = "/dev/";
char *dev_name;
/* Try adding device path at beginning of name */
- dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT);
+ dev_name = malloc(strlen(xbb->dev_name)
+ + strlen(dev_path) + 1,
+ M_XENBLOCKBACK, M_NOWAIT);
if (dev_name) {
- sprintf(dev_name, "%s%s", dev_path, blkif->dev_name);
- free(blkif->dev_name, M_DEVBUF);
- blkif->dev_name = dev_name;
+ sprintf(dev_name, "%s%s", dev_path,
+ xbb->dev_name);
+ free(xbb->dev_name, M_XENBLOCKBACK);
+ xbb->dev_name = dev_name;
goto again;
}
}
- xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name);
- return err;
+ xenbus_dev_fatal(xbb->dev, error, "error opening device %s",
+ xbb->dev_name);
+ return (error);
}
+
+ vfs_is_locked = NDHASGIANT(&nd);
+
NDFREE(&nd, NDF_ONLY_PNBUF);
- blkif->vn = nd.ni_vp;
+ xbb->vn = nd.ni_vp;
+
+ /* We only support disks and files. */
+ if (vn_isdisk(xbb->vn, &error)) {
+ error = xbb_open_dev(xbb);
+ } else if (xbb->vn->v_type == VREG) {
+ error = xbb_open_file(xbb);
+ } else {
+ error = EINVAL;
+ xenbus_dev_fatal(xbb->dev, error, "%s is not a disk "
+ "or file", xbb->dev_name);
+ }
+ VOP_UNLOCK(xbb->vn, 0);
+ VFS_UNLOCK_GIANT(vfs_is_locked);
- /* We only support disks for now */
- if (!vn_isdisk(blkif->vn, &err)) {
- xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name);
- VOP_UNLOCK(blkif->vn, 0, curthread);
- goto error;
+ if (error != 0) {
+ xbb_close_backend(xbb);
+ return (error);
}
- blkif->cdev = blkif->vn->v_rdev;
- blkif->csw = dev_refthread(blkif->cdev);
- PANIC_IF(blkif->csw == NULL);
+ xbb->sector_size_shift = fls(xbb->sector_size) - 1;
+ xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift;
+
+ DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n",
+ (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file",
+ xbb->dev_name, xbb->sector_size, xbb->media_size);
+
+ return (0);
+}
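
The size math at the end of xbb_open_backend() assumes a power-of-two sector size: fls(sector_size) - 1 gives the shift, and the media size in sectors is then a plain right shift. A standalone sketch of the same arithmetic with hypothetical helper names (a 4 GiB backing store with 512-byte sectors yields shift 9 and 8388608 sectors):

#include <stdint.h>

/* Integer log2 for power-of-two values; equivalent to fls(v) - 1. */
static int
ilog2_u32(uint32_t v)
{
	int shift = -1;

	while (v != 0) {
		v >>= 1;
		shift++;
	}
	return (shift);
}

static uint64_t
media_sectors(uint64_t media_size, uint32_t sector_size)
{
	return (media_size >> ilog2_u32(sector_size));
}
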
- err = VOP_GETATTR(blkif->vn, &vattr, NOCRED);
- if (err) {
- xenbus_dev_fatal(blkif->xdev, err,
- "error getting vnode attributes for device %s", blkif->dev_name);
- VOP_UNLOCK(blkif->vn, 0, curthread);
- goto error;
+/*------------------------ Inter-Domain Communication ------------------------*/
+/**
+ * Cleanup all inter-domain communication mechanisms.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_disconnect(struct xbb_softc *xbb)
+{
+ struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES];
+ struct gnttab_unmap_grant_ref *op;
+ u_int ring_idx;
+ int error;
+
+ DPRINTF("\n");
+
+ if ((xbb->flags & XBBF_RING_CONNECTED) == 0)
+ return;
+
+ if (xbb->irq != 0) {
+ unbind_from_irqhandler(xbb->irq);
+ xbb->irq = 0;
}
- VOP_UNLOCK(blkif->vn, 0, curthread);
+ for (ring_idx = 0, op = ops;
+ ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++, op++) {
- dev = blkif->vn->v_rdev;
- devsw = dev->si_devsw;
- if (!devsw->d_ioctl) {
- err = ENODEV;
- xenbus_dev_fatal(blkif->xdev, err,
- "no d_ioctl for device %s!", blkif->dev_name);
- goto error;
+ op->host_addr = xbb->ring_config.gnt_addr
+ + (ring_idx * PAGE_SIZE);
+ op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx];
+ op->handle = xbb->ring_config.handle[ring_idx];
}
- err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread);
- if (err) {
- xenbus_dev_fatal(blkif->xdev, err,
- "error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name);
- goto error;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops,
+ xbb->ring_config.ring_pages);
+ if (error != 0)
+ panic("Grant table op failed (%d)", error);
+
+ xbb->flags &= ~XBBF_RING_CONNECTED;
+}
+
+/**
+ * Map shared memory ring into domain local address space, initialize
+ * ring control structures, and bind an interrupt to the event channel
+ * used to notify us of ring changes.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_connect_ring(struct xbb_softc *xbb)
+{
+ struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES];
+ struct gnttab_map_grant_ref *gnt;
+ u_int ring_idx;
+ int error;
+
+ if ((xbb->flags & XBBF_RING_CONNECTED) != 0)
+ return (0);
+
+ /*
+ * Kva for our ring is at the tail of the region of kva allocated
+ * by xbb_alloc_communication_mem().
+ */
+ xbb->ring_config.va = xbb->kva
+ + (xbb->kva_size
+ - (xbb->ring_config.ring_pages * PAGE_SIZE));
+ xbb->ring_config.gnt_addr = xbb->gnt_base_addr
+ + (xbb->kva_size
+ - (xbb->ring_config.ring_pages * PAGE_SIZE));
+
+ for (ring_idx = 0, gnt = gnts;
+ ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++, gnt++) {
+
+ gnt->host_addr = xbb->ring_config.gnt_addr
+ + (ring_idx * PAGE_SIZE);
+ gnt->flags = GNTMAP_host_map;
+ gnt->ref = xbb->ring_config.ring_ref[ring_idx];
+ gnt->dom = xbb->otherend_id;
}
- blkif->sector_size_shift = fls(blkif->sector_size) - 1;
- err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread);
- if (err) {
- xenbus_dev_fatal(blkif->xdev, err,
- "error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name);
- goto error;
+ error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts,
+ xbb->ring_config.ring_pages);
+ if (error)
+ panic("blkback: Ring page grant table op failed (%d)", error);
+
+ for (ring_idx = 0, gnt = gnts;
+ ring_idx < xbb->ring_config.ring_pages;
+ ring_idx++, gnt++) {
+ if (gnt->status != 0) {
+ xbb->ring_config.va = 0;
+ xenbus_dev_fatal(xbb->dev, EACCES,
+ "Ring shared page mapping failed. "
+ "Status %d.", gnt->status);
+ return (EACCES);
+ }
+ xbb->ring_config.handle[ring_idx] = gnt->handle;
+ xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr;
}
- blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift;
- blkif->major = major(vattr.va_rdev);
- blkif->minor = minor(vattr.va_rdev);
+ /* Initialize the ring based on ABI. */
+ switch (xbb->abi) {
+ case BLKIF_PROTOCOL_NATIVE:
+ {
+ blkif_sring_t *sring;
+ sring = (blkif_sring_t *)xbb->ring_config.va;
+ BACK_RING_INIT(&xbb->rings.native, sring,
+ xbb->ring_config.ring_pages * PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_32:
+ {
+ blkif_x86_32_sring_t *sring_x86_32;
+ sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va;
+ BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32,
+ xbb->ring_config.ring_pages * PAGE_SIZE);
+ break;
+ }
+ case BLKIF_PROTOCOL_X86_64:
+ {
+ blkif_x86_64_sring_t *sring_x86_64;
+ sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va;
+ BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64,
+ xbb->ring_config.ring_pages * PAGE_SIZE);
+ break;
+ }
+ default:
+ panic("Unexpected blkif protocol ABI.");
+ }
- DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n",
- blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size);
+ xbb->flags |= XBBF_RING_CONNECTED;
+
+ error =
+ bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id,
+ xbb->ring_config.evtchn,
+ device_get_nameunit(xbb->dev),
+ xbb_intr, /*arg*/xbb,
+ INTR_TYPE_BIO | INTR_MPSAFE,
+ &xbb->irq);
+ if (error) {
+ xbb_disconnect(xbb);
+ xenbus_dev_fatal(xbb->dev, error, "binding event channel");
+ return (error);
+ }
- return 0;
+ DPRINTF("rings connected!\n");
- error:
- close_device(blkif);
- return err;
+ return 0;
}
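
As the comment in xbb_connect_ring() notes, the ring pages sit at the tail of the communication region, so both the KVA and grant-address bases are offset by (region size - ring_pages * PAGE_SIZE). A small sketch of that offset calculation; the function name and the fixed 4096-byte page size are illustrative assumptions:

#include <stddef.h>
#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096u

static uintptr_t
ring_region_base(uintptr_t region_base, size_t region_size, unsigned ring_pages)
{
	return (region_base
	    + (region_size - (size_t)ring_pages * SKETCH_PAGE_SIZE));
}
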
+/**
+ * Size KVA and pseudo-physical address allocations based on negotiated
+ * values for the size and number of I/O requests, and the size of our
+ * communication ring.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * These address spaces are used to dynamically map pages in the
+ * front-end's domain into our own.
+ */
static int
-vbd_add_dev(struct xenbus_device *xdev)
+xbb_alloc_communication_mem(struct xbb_softc *xbb)
{
- blkif_t *blkif = xdev->data;
- device_t nexus, ndev;
- devclass_t dc;
- int err = 0;
+ xbb->kva_size = (xbb->ring_config.ring_pages
+ + (xbb->max_requests * xbb->max_request_segments))
+ * PAGE_SIZE;
+#ifndef XENHVM
+ xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size);
+ if (xbb->kva == 0)
+ return (ENOMEM);
+ xbb->gnt_base_addr = xbb->kva;
+#else /* XENHVM */
+ /*
+ * Reserve a range of pseudo physical memory that we can map
+ * into kva. These pages will only be backed by machine
+ * pages ("real memory") during the lifetime of front-end requests
+ * via grant table operations.
+ */
+ xbb->pseudo_phys_res_id = 0;
+ xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY,
+ &xbb->pseudo_phys_res_id,
+ 0, ~0, xbb->kva_size,
+ RF_ACTIVE);
+ if (xbb->pseudo_phys_res == NULL) {
+ xbb->kva = 0;
+ return (ENOMEM);
+ }
+ xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res);
+ xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res);
+#endif /* XENHVM */
+ return (0);
+}
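
The region sized above is one page per ring page plus one page per potential request segment. A sketch of the same arithmetic with hypothetical names and page size (for example, 1 ring page and 32 requests of 11 segments each works out to 353 pages):

#include <stddef.h>

#define SKETCH_PAGE_SIZE	4096u

static size_t
comm_region_size(unsigned ring_pages, unsigned max_requests,
    unsigned max_request_segments)
{
	return ((size_t)(ring_pages + max_requests * max_request_segments)
	    * SKETCH_PAGE_SIZE);
}
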
- mtx_lock(&Giant);
+/**
+ * Free dynamically allocated KVA or pseudo-physical address allocations.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_free_communication_mem(struct xbb_softc *xbb)
+{
+ if (xbb->kva != 0) {
+#ifndef XENHVM
+ kmem_free(kernel_map, xbb->kva, xbb->kva_size);
+#else
+ if (xbb->pseudo_phys_res != NULL) {
+ bus_release_resource(xbb->dev, SYS_RES_MEMORY,
+ xbb->pseudo_phys_res_id,
+ xbb->pseudo_phys_res);
+ xbb->pseudo_phys_res = NULL;
+ }
+#endif
+ }
+ xbb->kva = 0;
+ xbb->gnt_base_addr = 0;
+}
- /* We will add a vbd device as a child of nexus0 (for now) */
- if (!(dc = devclass_find("nexus")) ||
- !(nexus = devclass_get_device(dc, 0))) {
- WPRINTF("could not find nexus0!\n");
- err = ENOENT;
- goto done;
+/**
+ * Collect front-end information from the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_collect_frontend_info(struct xbb_softc *xbb)
+{
+ char protocol_abi[64];
+ const char *otherend_path;
+ int error;
+ u_int ring_idx;
+
+ otherend_path = xenbus_get_otherend_path(xbb->dev);
+
+ /*
+ * Mandatory data (used in all versions of the protocol) first.
+ */
+ error = xs_gather(XST_NIL, otherend_path,
+ "ring-ref", "%" PRIu32,
+ &xbb->ring_config.ring_ref[0],
+ "event-channel", "%" PRIu32,
+ &xbb->ring_config.evtchn,
+ NULL);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to retrieve ring information from "
+ "frontend %s. Unable to connect.",
+ xenbus_get_otherend_path(xbb->dev));
+ return (error);
}
+ /*
+ * These fields are initialized to legacy protocol defaults
+ * so we only need to fail if reading the updated value succeeds
+ * and the new value is outside of its allowed range.
+ *
+ * \note xs_gather() returns on the first encountered error, so
+ *       we must use independent calls in order to guarantee
+ *       we don't miss information in a sparsely populated front-end
+ * tree.
+ */
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "ring-pages", NULL, "%" PRIu32,
+ &xbb->ring_config.ring_pages);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-requests", NULL, "%" PRIu32,
+ &xbb->max_requests);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-segments", NULL, "%" PRIu32,
+ &xbb->max_request_segments);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-size", NULL, "%" PRIu32,
+ &xbb->max_request_size);
+
+ if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+				  "Front-end specified ring-pages of %u "
+ "exceeds backend limit of %zu. "
+ "Unable to connect.",
+ xbb->ring_config.ring_pages,
+ XBB_MAX_RING_PAGES);
+ return (EINVAL);
+ } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+				  "Front-end specified max_requests of %u "
+ "exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_requests,
+ XBB_MAX_REQUESTS);
+ return (EINVAL);
+ } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+				  "Front-end specified max_request_segments "
+ "of %u exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_request_segments,
+ XBB_MAX_SEGMENTS_PER_REQUEST);
+ return (EINVAL);
+ } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+				  "Front-end specified max_request_size "
+ "of %u exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_request_size,
+ XBB_MAX_REQUEST_SIZE);
+ return (EINVAL);
+ }
- /* Create a newbus device representing the vbd */
- ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle);
- if (!ndev) {
- WPRINTF("could not create newbus device vbd%d!\n", blkif->handle);
- err = EFAULT;
- goto done;
+ /* If using a multi-page ring, pull in the remaining references. */
+ for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) {
+ char ring_ref_name[]= "ring_refXX";
+
+ snprintf(ring_ref_name, sizeof(ring_ref_name),
+ "ring-ref%u", ring_idx);
+ error = xs_scanf(XST_NIL, otherend_path,
+ ring_ref_name, NULL, "%" PRIu32,
+ &xbb->ring_config.ring_ref[ring_idx]);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+					 "Failed to retrieve grant reference "
+ "for page %u of shared ring. Unable "
+ "to connect.", ring_idx);
+ return (error);
+ }
}
-
- blkif_get(blkif);
- device_set_ivars(ndev, blkif);
- blkif->ndev = ndev;
- device_probe_and_attach(ndev);
+ error = xs_gather(XST_NIL, otherend_path,
+ "protocol", "%63s", protocol_abi,
+ NULL);
+ if (error != 0
+ || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) {
+ /*
+ * Assume native if the frontend has not
+ * published ABI data or it has published and
+ * matches our own ABI.
+ */
+ xbb->abi = BLKIF_PROTOCOL_NATIVE;
+ } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) {
- done:
+ xbb->abi = BLKIF_PROTOCOL_X86_32;
+ } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) {
- mtx_unlock(&Giant);
+ xbb->abi = BLKIF_PROTOCOL_X86_64;
+ } else {
- return err;
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Unknown protocol ABI (%s) published by "
+ "frontend. Unable to connect.", protocol_abi);
+ return (EINVAL);
+ }
+ return (0);
}
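
The optional-parameter handling above relies on each value keeping its legacy default unless an independent XenStore read both succeeds and parses. A sketch of that pattern, with a hypothetical reader callback standing in for xs_scanf():

#include <stdint.h>

static void
read_optional_u32(int (*try_read)(const char *node, uint32_t *out),
    const char *node, uint32_t *valp)
{
	uint32_t tmp;

	/* Leave *valp at its default unless the node exists and parses. */
	if (try_read(node, &tmp) == 0)
		*valp = tmp;
}
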
-enum {
- VBD_SYSCTL_DOMID,
- VBD_SYSCTL_ST_RD_REQ,
- VBD_SYSCTL_ST_WR_REQ,
- VBD_SYSCTL_ST_OO_REQ,
- VBD_SYSCTL_ST_ERR_REQ,
- VBD_SYSCTL_RING,
-};
-
-static char *
-vbd_sysctl_ring_info(blkif_t *blkif, int cmd)
+/**
+ * Allocate per-request data structures given request size and number
+ * information negotiated with the front-end.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_alloc_requests(struct xbb_softc *xbb)
{
- char *buf = malloc(256, M_DEVBUF, M_WAITOK);
- if (buf) {
- if (!blkif->ring_connected)
- sprintf(buf, "ring not connected\n");
- else {
- blkif_back_ring_t *ring = &blkif->ring;
- sprintf(buf, "nr_ents=%x req_cons=%x"
- " req_prod=%x req_event=%x"
- " rsp_prod=%x rsp_event=%x",
- ring->nr_ents, ring->req_cons,
- ring->sring->req_prod, ring->sring->req_event,
- ring->sring->rsp_prod, ring->sring->rsp_event);
+ struct xbb_xen_req *req;
+ struct xbb_xen_req *last_req;
+ uint8_t *req_kva;
+ u_long gnt_base;
+
+ /*
+	 * Allocate request bookkeeping data structures.
+ */
+ xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests),
+ M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
+ if (xbb->requests == NULL) {
+ xenbus_dev_fatal(xbb->dev, ENOMEM,
+ "Unable to allocate request structures");
+ return (ENOMEM);
+ }
+
+ req_kva = (uint8_t *)xbb->kva;
+ gnt_base = xbb->gnt_base_addr;
+ req = xbb->requests;
+ last_req = &xbb->requests[xbb->max_requests - 1];
+ while (req <= last_req) {
+ int seg;
+
+ req->xbb = xbb;
+ req->kva = req_kva;
+ req->gnt_handles = malloc(xbb->max_request_segments
+ * sizeof(*req->gnt_handles),
+ M_XENBLOCKBACK, M_NOWAIT|M_ZERO);
+ if (req->gnt_handles == NULL) {
+ xenbus_dev_fatal(xbb->dev, ENOMEM,
+ "Unable to allocate request "
+ "grant references");
+ return (ENOMEM);
+ }
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ req->bounce = malloc(xbb->max_request_size,
+ M_XENBLOCKBACK, M_NOWAIT);
+ if (req->bounce == NULL) {
+ xenbus_dev_fatal(xbb->dev, ENOMEM,
+ "Unable to allocate request "
+ "bounce buffers");
+ return (ENOMEM);
}
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ req->gnt_base = gnt_base;
+ req_kva += xbb->max_request_segments * PAGE_SIZE;
+ gnt_base += xbb->max_request_segments * PAGE_SIZE;
+ SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links);
+
+ for (seg = 0; seg < xbb->max_request_segments; seg++)
+ req->gnt_handles[seg] = GRANT_REF_INVALID;
+
+ req++;
}
- return buf;
+ return (0);
}
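
Each request allocated above owns a contiguous slice of max_request_segments pages in both the KVA and grant-address regions, so segment s of request i lands at base + (i * max_request_segments + s) * PAGE_SIZE. A sketch of that striding; the function name and fixed page size are assumptions for illustration:

#include <stdint.h>

#define SKETCH_PAGE_SIZE	4096u

static uintptr_t
req_seg_addr(uintptr_t base, unsigned req_idx, unsigned seg_idx,
    unsigned max_request_segments)
{
	return (base
	    + ((uintptr_t)req_idx * max_request_segments + seg_idx)
	    * SKETCH_PAGE_SIZE);
}
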
+/**
+ * Supply information about the physical device to the frontend
+ * via XenBus.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
static int
-vbd_sysctl_handler(SYSCTL_HANDLER_ARGS)
+xbb_publish_backend_info(struct xbb_softc *xbb)
{
- device_t dev = (device_t)arg1;
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
- const char *value;
- char *buf = NULL;
- int err;
-
- switch (arg2) {
- case VBD_SYSCTL_DOMID:
- return sysctl_handle_int(oidp, NULL, blkif->domid, req);
- case VBD_SYSCTL_ST_RD_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req);
- case VBD_SYSCTL_ST_WR_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req);
- case VBD_SYSCTL_ST_OO_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req);
- case VBD_SYSCTL_ST_ERR_REQ:
- return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req);
- case VBD_SYSCTL_RING:
- value = buf = vbd_sysctl_ring_info(blkif, arg2);
- break;
- default:
- return (EINVAL);
+ struct xs_transaction xst;
+ const char *our_path;
+ const char *leaf;
+ int error;
+
+ our_path = xenbus_get_node(xbb->dev);
+ while (1) {
+ error = xs_transaction_start(&xst);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Error publishing backend info "
+ "(start transaction)");
+ return (error);
+ }
+
+ leaf = "sectors";
+ error = xs_printf(xst, our_path, leaf,
+ "%"PRIu64, xbb->media_num_sectors);
+ if (error != 0)
+ break;
+
+ /* XXX Support all VBD attributes here. */
+ leaf = "info";
+ error = xs_printf(xst, our_path, leaf, "%u",
+ xbb->flags & XBBF_READ_ONLY
+ ? VDISK_READONLY : 0);
+ if (error != 0)
+ break;
+
+ leaf = "sector-size";
+ error = xs_printf(xst, our_path, leaf, "%u",
+ xbb->sector_size);
+ if (error != 0)
+ break;
+
+ error = xs_transaction_end(xst, 0);
+ if (error == 0) {
+ return (0);
+ } else if (error != EAGAIN) {
+ xenbus_dev_fatal(xbb->dev, error, "ending transaction");
+ return (error);
+ }
}
- err = SYSCTL_OUT(req, value, strlen(value));
- if (buf != NULL)
- free(buf, M_DEVBUF);
+ xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
+ our_path, leaf);
+ xs_transaction_end(xst, 1);
+ return (error);
+}
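
xbb_publish_backend_info() follows the usual XenStore transaction pattern: retry the whole transaction when xs_transaction_end() returns EAGAIN (another writer raced us), and abort it on any other failure. A reduced sketch of that skeleton using the same xs_* calls seen above; the node name, value, and exact headers are illustrative assumptions:

#include <sys/param.h>
#include <sys/errno.h>

#include <xen/xenstore/xenstorevar.h>

static int
publish_one_value(const char *our_path, const char *node, u_int value)
{
	struct xs_transaction xst;
	int error;

	for (;;) {
		error = xs_transaction_start(&xst);
		if (error != 0)
			return (error);

		error = xs_printf(xst, our_path, node, "%u", value);
		if (error != 0) {
			xs_transaction_end(xst, /*abort*/1);
			return (error);
		}

		error = xs_transaction_end(xst, /*abort*/0);
		if (error != EAGAIN)
			return (error);
		/* EAGAIN: transaction collided with another writer; retry. */
	}
}
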
+
+/**
+ * Connect to our blkfront peer now that it has completed publishing
+ * its configuration into the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_connect(struct xbb_softc *xbb)
+{
+ int error;
+
+ if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
+ return;
+
+ if (xbb_collect_frontend_info(xbb) != 0)
+ return;
- return err;
+ /* Allocate resources whose size depends on front-end configuration. */
+ error = xbb_alloc_communication_mem(xbb);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to allocate communication memory");
+ return;
+ }
+
+ error = xbb_alloc_requests(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_alloc_requests(). */
+ return;
+ }
+
+ /*
+ * Connect communication channel.
+ */
+ error = xbb_connect_ring(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_connect_ring(). */
+ return;
+ }
+
+ if (xbb_publish_backend_info(xbb) != 0) {
+ /*
+ * If we can't publish our data, we cannot participate
+ * in this connection, and waiting for a front-end state
+ * change will not help the situation.
+ */
+ xbb_disconnect(xbb);
+ return;
+ }
+
+ /* Ready for I/O. */
+ xenbus_set_state(xbb->dev, XenbusStateConnected);
}
-/* Newbus vbd device driver probe */
+/*-------------------------- Device Teardown Support -------------------------*/
+/**
+ * Perform device shutdown functions.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * Mark this instance as shutting down, wait for any active I/O on the
+ * backend device/file to drain, disconnect from the front-end, and notify
+ * any waiters (e.g. a thread invoking our detach method) that detach can
+ * now proceed.
+ */
static int
-vbd_probe(device_t dev)
+xbb_shutdown(struct xbb_softc *xbb)
{
- DPRINTF("vbd%d\n", device_get_unit(dev));
- return 0;
+ static int in_shutdown;
+
+ DPRINTF("\n");
+
+ /*
+ * Due to the need to drop our mutex during some
+ * xenbus operations, it is possible for two threads
+ * to attempt to close out shutdown processing at
+ * the same time. Tell the caller that hits this
+ * race to try back later.
+ */
+ if (in_shutdown != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Indicate shutdown is in progress. */
+ xbb->flags |= XBBF_SHUTDOWN;
+
+ /* Wait for requests to complete. */
+ if (xbb->active_request_count != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Disconnect from the front-end. */
+ xbb_disconnect(xbb);
+
+ in_shutdown = 1;
+ mtx_unlock(&xbb->lock);
+ xenbus_set_state(xbb->dev, XenbusStateClosed);
+ mtx_lock(&xbb->lock);
+ in_shutdown = 0;
+
+ /* Indicate to xbb_detach() that is it safe to proceed. */
+ wakeup(xbb);
+
+ return (0);
+}
+
+/**
+ * Report an attach time error to the console and Xen, and cleanup
+ * this instance by forcing immediate detach processing.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param err Errno describing the error.
+ * \param fmt Printf style format and arguments
+ */
+static void
+xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...)
+{
+ va_list ap;
+ va_list ap_hotplug;
+
+ va_start(ap, fmt);
+ va_copy(ap_hotplug, ap);
+ xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-error", fmt, ap_hotplug);
+ va_end(ap_hotplug);
+ xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "error");
+
+ xenbus_dev_vfatal(xbb->dev, err, fmt, ap);
+ va_end(ap);
+
+ xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "online", "0");
+ xbb_detach(xbb->dev);
}
-/* Newbus vbd device driver attach */
+/*---------------------------- NewBus Entrypoints ----------------------------*/
+/**
+ * Inspect a XenBus device and claim it if is of the appropriate type.
+ *
+ * \param dev NewBus device object representing a candidate XenBus device.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
static int
-vbd_attach(device_t dev)
+xbb_probe(device_t dev)
{
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
-
- DPRINTF("%s\n", blkif->dev_name);
-
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I",
- "domid of frontend");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I",
- "number of read reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I",
- "number of write reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I",
- "number of deferred reqs");
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD,
- dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I",
- "number of reqs that returned error");
-#if XEN_BLKBACK_DEBUG
- SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)),
- OID_AUTO, "ring", CTLFLAG_RD,
- dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A",
- "req ring info");
-#endif
+
+ if (!strcmp(xenbus_get_type(dev), "vbd")) {
+ device_set_desc(dev, "Backend Virtual Block Device");
+ device_quiet(dev);
+ return (0);
+ }
+
+ return (ENXIO);
+}
- if (!open_device(blkif))
- connect(blkif);
+/**
+ * Attach to a XenBus device that has been claimed by our probe routine.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_attach(device_t dev)
+{
+ struct xbb_softc *xbb;
+ int error;
+
+ DPRINTF("Attaching to %s\n", xenbus_get_node(dev));
+
+ /*
+ * Basic initialization.
+ * After this block it is safe to call xbb_detach()
+ * to clean up any allocated data for this instance.
+ */
+ xbb = device_get_softc(dev);
+ xbb->dev = dev;
+ xbb->otherend_id = xenbus_get_otherend_id(dev);
+ TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb);
+ mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF);
+ SLIST_INIT(&xbb->request_free_slist);
+
+ /*
+ * Protocol defaults valid even if all negotiation fails.
+ */
+ xbb->ring_config.ring_pages = 1;
+ xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE);
+ xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK;
+ xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE;
+
+ /*
+ * Publish protocol capabilities for consumption by the
+ * front-end.
+ */
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "feature-barrier", "1");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/feature-barrier",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "feature-flush-cache", "1");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-ring-pages", "%zu", XBB_MAX_RING_PAGES);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-ring-pages",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-requests", "%u", XBB_MAX_REQUESTS);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-requests",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-request-segments", "%u",
+ XBB_MAX_SEGMENTS_PER_REQUEST);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-request-segments",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "max-request-size", "%u",
+ XBB_MAX_REQUEST_SIZE);
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/max-request-size",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ /* Collect physical device information. */
+ error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev),
+ "device-type", NULL, &xbb->dev_type,
+ NULL);
+ if (error != 0)
+ xbb->dev_type = NULL;
+
+ error = xs_gather(XST_NIL, xenbus_get_node(dev),
+ "mode", NULL, &xbb->dev_mode,
+ "params", NULL, &xbb->dev_name,
+ NULL);
+ if (error != 0) {
+ xbb_attach_failed(xbb, error, "reading backend fields at %s",
+ xenbus_get_node(dev));
+ return (ENXIO);
+ }
+
+ /* Parse fopen style mode flags. */
+ if (strchr(xbb->dev_mode, 'w') == NULL)
+ xbb->flags |= XBBF_READ_ONLY;
+
+ /*
+ * Verify the physical device is present and can support
+ * the desired I/O mode.
+ */
+ DROP_GIANT();
+ error = xbb_open_backend(xbb);
+ PICKUP_GIANT();
+ if (error != 0) {
+ xbb_attach_failed(xbb, error, "Unable to open %s",
+ xbb->dev_name);
+ return (ENXIO);
+ }
- return bus_generic_attach(dev);
+ /* Use devstat(9) for recording statistics. */
+ xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev),
+ xbb->sector_size,
+ DEVSTAT_ALL_SUPPORTED,
+ DEVSTAT_TYPE_DIRECT
+ | DEVSTAT_TYPE_IF_OTHER,
+ DEVSTAT_PRIORITY_OTHER);
+ /*
+ * Create a taskqueue for doing work that must occur from a
+ * thread context.
+ */
+ xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT,
+ taskqueue_thread_enqueue,
+ /*context*/&xbb->io_taskqueue);
+ if (xbb->io_taskqueue == NULL) {
+ xbb_attach_failed(xbb, error, "Unable to create taskqueue");
+ return (ENOMEM);
+ }
+
+ taskqueue_start_threads(&xbb->io_taskqueue,
+ /*num threads*/1,
+ /*priority*/PWAIT,
+ /*thread name*/
+ "%s taskq", device_get_nameunit(dev));
+
+ /* Update hot-plug status to satisfy xend. */
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "connected");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ /* Tell the front end that we are ready to connect. */
+ xenbus_set_state(dev, XenbusStateInitWait);
+
+ return (0);
}
-/* Newbus vbd device driver detach */
+/**
+ * Detach from a block back device instance.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ *
+ * \note A block back device may be detached at any time in its life-cycle,
+ * including part way through the attach process. For this reason,
+ *        initialization order and the initialization state checks in this
+ * routine must be carefully coupled so that attach time failures
+ * are gracefully handled.
+ */
static int
-vbd_detach(device_t dev)
+xbb_detach(device_t dev)
{
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
+ struct xbb_softc *xbb;
- DPRINTF("%s\n", blkif->dev_name);
+ DPRINTF("\n");
- close_device(blkif);
+ xbb = device_get_softc(dev);
+ mtx_lock(&xbb->lock);
+ while (xbb_shutdown(xbb) == EAGAIN) {
+ msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
+ "xbb_shutdown", 0);
+ }
+ mtx_unlock(&xbb->lock);
+ mtx_destroy(&xbb->lock);
- bus_generic_detach(dev);
+ DPRINTF("\n");
- blkif_put(blkif);
+ taskqueue_free(xbb->io_taskqueue);
+ devstat_remove_entry(xbb->xbb_stats);
- return 0;
+ xbb_close_backend(xbb);
+ xbb_free_communication_mem(xbb);
+
+ if (xbb->dev_mode != NULL) {
+ free(xbb->dev_mode, M_XENBUS);
+ xbb->dev_mode = NULL;
+ }
+
+ if (xbb->dev_type != NULL) {
+ free(xbb->dev_type, M_XENBUS);
+ xbb->dev_type = NULL;
+ }
+
+ if (xbb->dev_name != NULL) {
+ free(xbb->dev_name, M_XENBUS);
+ xbb->dev_name = NULL;
+ }
+
+ if (xbb->requests != NULL) {
+ struct xbb_xen_req *req;
+ struct xbb_xen_req *last_req;
+
+ req = xbb->requests;
+ last_req = &xbb->requests[xbb->max_requests - 1];
+ while (req <= last_req) {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ if (req->bounce != NULL) {
+ free(req->bounce, M_XENBLOCKBACK);
+ req->bounce = NULL;
+ }
+#endif
+ if (req->gnt_handles != NULL) {
+ free (req->gnt_handles, M_XENBLOCKBACK);
+ req->gnt_handles = NULL;
+ }
+ req++;
+ }
+ free(xbb->requests, M_XENBLOCKBACK);
+ xbb->requests = NULL;
+ }
+
+ return (0);
}
-static device_method_t vbd_methods[] = {
+/**
+ * Prepare this block back device for suspension of this VM.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_suspend(device_t dev)
+{
+#ifdef NOT_YET
+ struct xbb_softc *sc = device_get_softc(dev);
+
+ /* Prevent new requests being issued until we fix things up. */
+ mtx_lock(&sc->xb_io_lock);
+ sc->connected = BLKIF_STATE_SUSPENDED;
+ mtx_unlock(&sc->xb_io_lock);
+#endif
+
+ return (0);
+}
+
+/**
+ * Perform any processing required to recover from a suspended state.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_resume(device_t dev)
+{
+ return (0);
+}
+
+/**
+ * Handle state changes expressed via the XenStore by our front-end peer.
+ *
+ * \param dev NewBus device object representing this Xen
+ * Block Back instance.
+ * \param frontend_state The new state of the front-end.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_frontend_changed(device_t dev, XenbusState frontend_state)
+{
+ struct xbb_softc *xbb = device_get_softc(dev);
+
+ DPRINTF("state=%s\n", xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateClosing:
+ break;
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ xbb_connect(xbb);
+ break;
+ case XenbusStateClosed:
+ case XenbusStateInitWait:
+
+ mtx_lock(&xbb->lock);
+ xbb_shutdown(xbb);
+ mtx_unlock(&xbb->lock);
+ break;
+ default:
+ xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+ return (0);
+}
+
+/*---------------------------- NewBus Registration ---------------------------*/
+static device_method_t xbb_methods[] = {
/* Device interface */
- DEVMETHOD(device_probe, vbd_probe),
- DEVMETHOD(device_attach, vbd_attach),
- DEVMETHOD(device_detach, vbd_detach),
+ DEVMETHOD(device_probe, xbb_probe),
+ DEVMETHOD(device_attach, xbb_attach),
+ DEVMETHOD(device_detach, xbb_detach),
DEVMETHOD(device_shutdown, bus_generic_shutdown),
- DEVMETHOD(device_suspend, bus_generic_suspend),
- DEVMETHOD(device_resume, bus_generic_resume),
- {0, 0}
-};
+ DEVMETHOD(device_suspend, xbb_suspend),
+ DEVMETHOD(device_resume, xbb_resume),
-static devclass_t vbd_devclass;
+ /* Xenbus interface */
+ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
-static driver_t vbd_driver = {
- "vbd",
- vbd_methods,
- 0,
+ { 0, 0 }
};
-DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0);
+static driver_t xbb_driver = {
+ "xbbd",
+ xbb_methods,
+ sizeof(struct xbb_softc),
+};
+devclass_t xbb_devclass;
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: t
- * End:
- */
+DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);