Diffstat (limited to 'sys/dev/xen/blkback/blkback.c')
-rw-r--r-- | sys/dev/xen/blkback/blkback.c | 3663
1 file changed, 2644 insertions, 1019 deletions
diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c
index 259f2f6..72087f5 100644
--- a/sys/dev/xen/blkback/blkback.c
+++ b/sys/dev/xen/blkback/blkback.c
@@ -1,1055 +1,1919 @@
-/*
- * Copyright (c) 2006, Cisco Systems, Inc.
+/*-
+ * Copyright (c) 2009-2010 Spectra Logic Corporation
  * All rights reserved.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
  * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions, and the following disclaimer,
+ *    without modification.
+ * 2. Redistributions in binary form must reproduce at minimum a disclaimer
+ *    substantially similar to the "NO WARRANTY" disclaimer below
+ *    ("Disclaimer") and any redistribution must be conditioned upon
+ *    including a substantially similar Disclaimer requirement for further
+ *    binary redistribution.
  *
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
+ * NO WARRANTY
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+ * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGES.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
- * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
- * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
- * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
+ * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
+ *          Ken Merry           (Spectra Logic Corporation)
  */
-
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
+/**
+ * \file blkback.c
+ *
+ * \brief Device driver supporting the vending of block storage from
+ *        a FreeBSD domain to other domains.
+ */ + #include <sys/param.h> #include <sys/systm.h> -#include <sys/mbuf.h> -#include <sys/malloc.h> #include <sys/kernel.h> -#include <sys/socket.h> -#include <sys/queue.h> -#include <sys/taskqueue.h> +#include <sys/malloc.h> + +#include <sys/bio.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/devicestat.h> +#include <sys/disk.h> +#include <sys/fcntl.h> +#include <sys/filedesc.h> +#include <sys/kdb.h> +#include <sys/module.h> #include <sys/namei.h> #include <sys/proc.h> -#include <sys/filedesc.h> +#include <sys/rman.h> +#include <sys/taskqueue.h> +#include <sys/types.h> #include <sys/vnode.h> -#include <sys/fcntl.h> -#include <sys/disk.h> -#include <sys/bio.h> - -#include <sys/module.h> -#include <sys/bus.h> -#include <sys/sysctl.h> +#include <sys/mount.h> #include <geom/geom.h> +#include <machine/_inttypes.h> +#include <machine/xen/xen-os.h> + +#include <vm/vm.h> #include <vm/vm_extern.h> #include <vm/vm_kern.h> -#include <machine/xen-os.h> -#include <machine/hypervisor.h> -#include <machine/hypervisor-ifs.h> -#include <machine/xen_intr.h> -#include <machine/evtchn.h> -#include <machine/xenbus.h> -#include <machine/gnttab.h> -#include <machine/xen-public/memory.h> -#include <dev/xen/xenbus/xenbus_comms.h> +#include <xen/blkif.h> +#include <xen/evtchn.h> +#include <xen/gnttab.h> +#include <xen/xen_intr.h> +#include <xen/interface/event_channel.h> +#include <xen/interface/grant_table.h> -#if XEN_BLKBACK_DEBUG -#define DPRINTF(fmt, args...) \ - printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) -#else -#define DPRINTF(fmt, args...) ((void)0) -#endif - -#define WPRINTF(fmt, args...) \ - printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#include <xen/xenbus/xenbusvar.h> -#define BLKBACK_INVALID_HANDLE (~0) +/*--------------------------- Compile-time Tunables --------------------------*/ +/** + * The maximum number of outstanding request blocks (request headers plus + * additional segment blocks) we will allow in a negotiated block-front/back + * communication channel. + */ +#define XBB_MAX_REQUESTS 256 -struct ring_ref { - vm_offset_t va; - grant_handle_t handle; - uint64_t bus_addr; -}; +/** + * \brief Define to force all I/O to be performed on memory owned by the + * backend device, with a copy-in/out to the remote domain's memory. + * + * \note This option is currently required when this driver's domain is + * operating in HVM mode on a system using an IOMMU. + * + * This driver uses Xen's grant table API to gain access to the memory of + * the remote domains it serves. When our domain is operating in PV mode, + * the grant table mechanism directly updates our domain's page table entries + * to point to the physical pages of the remote domain. This scheme guarantees + * that blkback and the backing devices it uses can safely perform DMA + * operations to satisfy requests. In HVM mode, Xen may use a HW IOMMU to + * insure that our domain cannot DMA to pages owned by another domain. As + * of Xen 4.0, IOMMU mappings for HVM guests are not updated via the grant + * table API. For this reason, in HVM mode, we must bounce all requests into + * memory that is mapped into our domain at domain startup and thus has + * valid IOMMU mappings. + */ +#define XBB_USE_BOUNCE_BUFFERS -typedef struct blkback_info { +/** + * \brief Define to enable rudimentary request logging to the console. 
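
The XBB_USE_BOUNCE_BUFFERS comment above describes why HVM operation forces a copy through domain-local memory. A minimal sketch of that discipline follows; the helper names are hypothetical and not part of the driver:

    /*
     * Sketch only: 'kva' is the grant-mapped view of the front-end's
     * pages; 'bounce' is memory owned by this domain and therefore
     * covered by valid IOMMU mappings.  Writes are bounced out before
     * the I/O is issued; reads are bounced back in on completion, so
     * DMA never targets foreign pages.
     */
    static void
    bounce_before_write(uint8_t *bounce, const uint8_t *kva, size_t len)
    {
            memcpy(bounce, kva, len);       /* device reads 'bounce' */
    }

    static void
    bounce_after_read(uint8_t *kva, const uint8_t *bounce, size_t len)
    {
            memcpy(kva, bounce, len);       /* return data to front-end */
    }
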
+ */ +#undef XBB_DEBUG - /* Schedule lists */ - STAILQ_ENTRY(blkback_info) next_req; - int on_req_sched_list; +/*---------------------------------- Macros ----------------------------------*/ +/** + * Custom malloc type for all driver allocations. + */ +MALLOC_DEFINE(M_XENBLOCKBACK, "xbbd", "Xen Block Back Driver Data"); - struct xenbus_device *xdev; - XenbusState frontend_state; +#ifdef XBB_DEBUG +#define DPRINTF(fmt, args...) \ + printf("xbb(%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) do {} while(0) +#endif - domid_t domid; +/** + * The maximum mapped region size per request we will allow in a negotiated + * block-front/back communication channel. + */ +#define XBB_MAX_REQUEST_SIZE \ + MIN(MAXPHYS, BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) - int state; - int ring_connected; - struct ring_ref rr; - blkif_back_ring_t ring; - evtchn_port_t evtchn; - int irq; - void *irq_cookie; +/** + * The maximum number of segments (within a request header and accompanying + * segment blocks) per request we will allow in a negotiated block-front/back + * communication channel. + */ +#define XBB_MAX_SEGMENTS_PER_REQUEST \ + (MIN(UIO_MAXIOV, \ + MIN(BLKIF_MAX_SEGMENTS_PER_REQUEST, \ + (XBB_MAX_REQUEST_SIZE / PAGE_SIZE) + 1))) + +/** + * The maximum number of shared memory ring pages we will allow in a + * negotiated block-front/back communication channel. Allow enough + * ring space for all requests to be XBB_MAX_REQUEST_SIZE'd. + */ +#define XBB_MAX_RING_PAGES \ + BLKIF_RING_PAGES(BLKIF_SEGS_TO_BLOCKS(XBB_MAX_SEGMENTS_PER_REQUEST) \ + * XBB_MAX_REQUESTS) - int ref_cnt; +/*--------------------------- Forward Declarations ---------------------------*/ +struct xbb_softc; - int handle; - char *mode; - char *type; - char *dev_name; +static void xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, + ...) __attribute__((format(printf, 3, 4))); +static int xbb_shutdown(struct xbb_softc *xbb); +static int xbb_detach(device_t dev); - struct vnode *vn; - struct cdev *cdev; - struct cdevsw *csw; - u_int sector_size; - int sector_size_shift; - off_t media_size; - u_int media_num_sectors; - int major; - int minor; - int read_only; - - struct mtx blk_ring_lock; - - device_t ndev; - - /* Stats */ - int st_rd_req; - int st_wr_req; - int st_oo_req; - int st_err_req; -} blkif_t; - -/* - * These are rather arbitrary. They are fairly large because adjacent requests - * pulled from a communication ring are quite likely to end up being part of - * the same scatter/gather request at the disc. - * - * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** - * - * This will increase the chances of being able to write whole tracks. - * 64 should be enough to keep us competitive with Linux. +/*------------------------------ Data Structures -----------------------------*/ +/** + * \brief Object tracking an in-flight I/O from a Xen VBD consumer. */ -static int blkif_reqs = 64; -TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs); +struct xbb_xen_req { + /** + * Linked list links used to aggregate idle request in the + * request free pool (xbb->request_free_slist). + */ + SLIST_ENTRY(xbb_xen_req) links; + + /** + * Back reference to the parent block back instance for this + * request. Used during bio_done handling. + */ + struct xbb_softc *xbb; + + /** + * The remote domain's identifier for this I/O request. 
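
As a worked example of how the limit macros above compose, assume PAGE_SIZE = 4096, MAXPHYS = 128 KiB, UIO_MAXIOV = 1024, and (purely for illustration) a blkif segment limit of 32:

    /*
     * XBB_MAX_REQUEST_SIZE         = MIN(131072, 32 * 4096)         = 131072
     * XBB_MAX_SEGMENTS_PER_REQUEST = MIN(1024,
     *                                    MIN(32, 131072 / 4096 + 1))
     *                              = MIN(1024, MIN(32, 33))         = 32
     *
     * XBB_MAX_RING_PAGES then sizes the shared ring so that all
     * XBB_MAX_REQUESTS (256) in-flight requests can each carry the
     * full segment count.
     */
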
+ */ + uint64_t id; + + /** + * Kernel virtual address space reserved for this request + * structure and used to map the remote domain's pages for + * this I/O, into our domain's address space. + */ + uint8_t *kva; + +#ifdef XBB_USE_BOUNCE_BUFFERS + /** + * Pre-allocated domain local memory used to proxy remote + * domain memory during I/O operations. + */ + uint8_t *bounce; +#endif -static int mmap_pages; + /** + * Base, psuedo-physical address, corresponding to the start + * of this request's kva region. + */ + uint64_t gnt_base; + + /** + * The number of pages currently mapped for this request. + */ + int nr_pages; + + /** + * The number of 512 byte sectors comprising this requests. + */ + int nr_512b_sectors; + + /** + * The number of struct bio requests still outstanding for this + * request on the backend device. This field is only used for + * device (rather than file) backed I/O. + */ + int pendcnt; + + /** + * BLKIF_OP code for this request. + */ + int operation; + + /** + * BLKIF_RSP status code for this request. + * + * This field allows an error status to be recorded even if the + * delivery of this status must be deferred. Deferred reporting + * is necessary, for example, when an error is detected during + * completion processing of one bio when other bios for this + * request are still outstanding. + */ + int status; + + /** + * Device statistics request ordering type (ordered or simple). + */ + devstat_tag_type ds_tag_type; + + /** + * Device statistics request type (read, write, no_data). + */ + devstat_trans_flags ds_trans_type; + + /** + * The start time for this request. + */ + struct bintime ds_t0; + + /** + * Array of grant handles (one per page) used to map this request. + */ + grant_handle_t *gnt_handles; +}; +SLIST_HEAD(xbb_xen_req_slist, xbb_xen_req); -/* - * Each outstanding request that we've passed to the lower device layers has a - * 'pending_req' allocated to it. Each buffer_head that completes decrements - * the pendcnt towards zero. When it hits zero, the specified domain has a - * response queued for it, with the saved 'id' passed back. +/** + * \brief Configuration data for the shared memory request ring + * used to communicate with the front-end client of this + * this driver. */ -typedef struct pending_req { - blkif_t *blkif; - uint64_t id; - int nr_pages; - int pendcnt; - unsigned short operation; - int status; - STAILQ_ENTRY(pending_req) free_list; -} pending_req_t; - -static pending_req_t *pending_reqs; -static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free = - STAILQ_HEAD_INITIALIZER(pending_free); -static struct mtx pending_free_lock; - -static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list = - STAILQ_HEAD_INITIALIZER(req_sched_list); -static struct mtx req_sched_list_lock; - -static unsigned long mmap_vstart; -static unsigned long *pending_vaddrs; -static grant_handle_t *pending_grant_handles; - -static struct task blk_req_task; - -/* Protos */ -static void disconnect_ring(blkif_t *blkif); -static int vbd_add_dev(struct xenbus_device *xdev); - -static inline int vaddr_pagenr(pending_req_t *req, int seg) -{ - return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; -} - -static inline unsigned long vaddr(pending_req_t *req, int seg) -{ - return pending_vaddrs[vaddr_pagenr(req, seg)]; -} - -#define pending_handle(_req, _seg) \ - (pending_grant_handles[vaddr_pagenr(_req, _seg)]) +struct xbb_ring_config { + /** KVA address where ring memory is mapped. 
*/ + vm_offset_t va; + + /** The pseudo-physical address where ring memory is mapped.*/ + uint64_t gnt_addr; + + /** + * Grant table handles, one per-ring page, returned by the + * hyperpervisor upon mapping of the ring and required to + * unmap it when a connection is torn down. + */ + grant_handle_t handle[XBB_MAX_RING_PAGES]; + + /** + * The device bus address returned by the hypervisor when + * mapping the ring and required to unmap it when a connection + * is torn down. + */ + uint64_t bus_addr[XBB_MAX_RING_PAGES]; + + /** The number of ring pages mapped for the current connection. */ + u_int ring_pages; + + /** + * The grant references, one per-ring page, supplied by the + * front-end, allowing us to reference the ring pages in the + * front-end's domain and to map these pages into our own domain. + */ + grant_ref_t ring_ref[XBB_MAX_RING_PAGES]; + + /** The interrupt driven even channel used to signal ring events. */ + evtchn_port_t evtchn; +}; -static unsigned long -alloc_empty_page_range(unsigned long nr_pages) +/** + * Per-instance connection state flags. + */ +typedef enum { - void *pages; - int i = 0, j = 0; - multicall_entry_t mcl[17]; - unsigned long mfn_list[16]; - struct xen_memory_reservation reservation = { - .extent_start = mfn_list, - .nr_extents = 0, - .address_bits = 0, - .extent_order = 0, - .domid = DOMID_SELF - }; - - pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT); - if (pages == NULL) - return 0; + /** + * The front-end requested a read-only mount of the + * back-end device/file. + */ + XBBF_READ_ONLY = 0x01, + + /** Communication with the front-end has been established. */ + XBBF_RING_CONNECTED = 0x02, + + /** + * Front-end requests exist in the ring and are waiting for + * xbb_xen_req objects to free up. + */ + XBBF_RESOURCE_SHORTAGE = 0x04, + + /** Connection teardown in progress. */ + XBBF_SHUTDOWN = 0x08 +} xbb_flag_t; + +/** Backend device type. */ +typedef enum { + /** Backend type unknown. */ + XBB_TYPE_NONE = 0x00, + + /** + * Backend type disk (access via cdev switch + * strategy routine). + */ + XBB_TYPE_DISK = 0x01, + + /** Backend type file (access vnode operations.). */ + XBB_TYPE_FILE = 0x02 +} xbb_type; + +/** + * \brief Structure used to memoize information about a per-request + * scatter-gather list. + * + * The chief benefit of using this data structure is it avoids having + * to reparse the possibly discontiguous S/G list in the original + * request. Due to the way that the mapping of the memory backing an + * I/O transaction is handled by Xen, a second pass is unavoidable. + * At least this way the second walk is a simple array traversal. + * + * \note A single Scatter/Gather element in the block interface covers + * at most 1 machine page. In this context a sector (blkif + * nomenclature, not what I'd choose) is a 512b aligned unit + * of mapping within the machine page referenced by an S/G + * element. + */ +struct xbb_sg { + /** The number of 512b data chunks mapped in this S/G element. */ + int16_t nsect; + + /** + * The index (0 based) of the first 512b data chunk mapped + * in this S/G element. + */ + uint8_t first_sect; + + /** + * The index (0 based) of the last 512b data chunk mapped + * in this S/G element. + */ + uint8_t last_sect; +}; - memset(mcl, 0, sizeof(mcl)); +/** + * Character device backend specific configuration data. + */ +struct xbb_dev_data { + /** Cdev used for device backend access. 
*/ + struct cdev *cdev; - while (i < nr_pages) { - unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE); + /** Cdev switch used for device backend access. */ + struct cdevsw *csw; - mcl[j].op = __HYPERVISOR_update_va_mapping; - mcl[j].args[0] = va; + /** Used to hold a reference on opened cdev backend devices. */ + int dev_ref; +}; - mfn_list[j++] = vtomach(va) >> PAGE_SHIFT; +/** + * File backend specific configuration data. + */ +struct xbb_file_data { + /** Credentials to use for vnode backed (file based) I/O. */ + struct ucred *cred; + + /** + * \brief Array of io vectors used to process file based I/O. + * + * Only a single file based request is outstanding per-xbb instance, + * so we only need one of these. + */ + struct iovec xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; +#ifdef XBB_USE_BOUNCE_BUFFERS + + /** + * \brief Array of io vectors used to handle bouncing of file reads. + * + * Vnode operations are free to modify uio data during their + * exectuion. In the case of a read with bounce buffering active, + * we need some of the data from the original uio in order to + * bounce-out the read data. This array serves as the temporary + * storage for this saved data. + */ + struct iovec saved_xiovecs[XBB_MAX_SEGMENTS_PER_REQUEST]; + + /** + * \brief Array of memoized bounce buffer kva offsets used + * in the file based backend. + * + * Due to the way that the mapping of the memory backing an + * I/O transaction is handled by Xen, a second pass through + * the request sg elements is unavoidable. We memoize the computed + * bounce address here to reduce the cost of the second walk. + */ + void *xiovecs_vaddr[XBB_MAX_SEGMENTS_PER_REQUEST]; +#endif /* XBB_USE_BOUNCE_BUFFERS */ +}; - xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY; +/** + * Collection of backend type specific data. + */ +union xbb_backend_data { + struct xbb_dev_data dev; + struct xbb_file_data file; +}; - if (j == 16 || i == nr_pages) { - mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL; +/** + * Function signature of backend specific I/O handlers. + */ +typedef int (*xbb_dispatch_t)(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, int nseg, + int operation, int flags); - reservation.nr_extents = j; +/** + * Per-instance configuration data. + */ +struct xbb_softc { + + /** + * Task-queue used to process I/O requests. + */ + struct taskqueue *io_taskqueue; + + /** + * Single "run the request queue" task enqueued + * on io_taskqueue. + */ + struct task io_task; + + /** Device type for this instance. */ + xbb_type device_type; + + /** NewBus device corresponding to this instance. */ + device_t dev; + + /** Backend specific dispatch routine for this instance. */ + xbb_dispatch_t dispatch_io; + + /** The number of requests outstanding on the backend device/file. */ + u_int active_request_count; + + /** Free pool of request tracking structures. */ + struct xbb_xen_req_slist request_free_slist; + + /** Array, sized at connection time, of request tracking structures. */ + struct xbb_xen_req *requests; + + /** + * Global pool of kva used for mapping remote domain ring + * and I/O transaction data. + */ + vm_offset_t kva; + + /** Psuedo-physical address corresponding to kva. */ + uint64_t gnt_base_addr; + + /** The size of the global kva pool. */ + int kva_size; + + /** + * \brief Cached value of the front-end's domain id. + * + * This value is used at once for each mapped page in + * a transaction. 
We cache it to avoid incuring the + * cost of an ivar access every time this is needed. + */ + domid_t otherend_id; + + /** + * \brief The blkif protocol abi in effect. + * + * There are situations where the back and front ends can + * have a different, native abi (e.g. intel x86_64 and + * 32bit x86 domains on the same machine). The back-end + * always accomodates the front-end's native abi. That + * value is pulled from the XenStore and recorded here. + */ + int abi; + + /** + * \brief The maximum number of requests allowed to be in + * flight at a time. + * + * This value is negotiated via the XenStore. + */ + uint32_t max_requests; + + /** + * \brief The maximum number of segments (1 page per segment) + * that can be mapped by a request. + * + * This value is negotiated via the XenStore. + */ + uint32_t max_request_segments; + + /** + * The maximum size of any request to this back-end + * device. + * + * This value is negotiated via the XenStore. + */ + uint32_t max_request_size; + + /** Various configuration and state bit flags. */ + xbb_flag_t flags; + + /** Ring mapping and interrupt configuration data. */ + struct xbb_ring_config ring_config; + + /** Runtime, cross-abi safe, structures for ring access. */ + blkif_back_rings_t rings; + + /** IRQ mapping for the communication ring event channel. */ + int irq; + + /** + * \brief Backend access mode flags (e.g. write, or read-only). + * + * This value is passed to us by the front-end via the XenStore. + */ + char *dev_mode; + + /** + * \brief Backend device type (e.g. "disk", "cdrom", "floppy"). + * + * This value is passed to us by the front-end via the XenStore. + * Currently unused. + */ + char *dev_type; + + /** + * \brief Backend device/file identifier. + * + * This value is passed to us by the front-end via the XenStore. + * We expect this to be a POSIX path indicating the file or + * device to open. + */ + char *dev_name; + + /** + * Vnode corresponding to the backend device node or file + * we are acessing. + */ + struct vnode *vn; + + union xbb_backend_data backend; + /** The native sector size of the backend. */ + u_int sector_size; + + /** log2 of sector_size. */ + u_int sector_size_shift; + + /** Size in bytes of the backend device or file. */ + off_t media_size; + + /** + * \brief media_size expressed in terms of the backend native + * sector size. + * + * (e.g. xbb->media_size >> xbb->sector_size_shift). + */ + uint64_t media_num_sectors; + + /** + * \brief Array of memoized scatter gather data computed during the + * conversion of blkif ring requests to internal xbb_xen_req + * structures. + * + * Ring processing is serialized so we only need one of these. + */ + struct xbb_sg xbb_sgs[XBB_MAX_SEGMENTS_PER_REQUEST]; + + /** Mutex protecting per-instance data. */ + struct mtx lock; + +#ifdef XENHVM + /** + * Resource representing allocated physical address space + * associated with our per-instance kva region. + */ + struct resource *pseudo_phys_res; + + /** Resource id for allocated physical address space. */ + int pseudo_phys_res_id; +#endif - mcl[j].op = __HYPERVISOR_memory_op; - mcl[j].args[0] = XENMEM_decrease_reservation; - mcl[j].args[1] = (unsigned long)&reservation; - - (void)HYPERVISOR_multicall(mcl, j+1); + /** I/O statistics. */ + struct devstat *xbb_stats; +}; - mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0; - j = 0; +/*---------------------------- Request Processing ----------------------------*/ +/** + * Allocate an internal transaction tracking structure from the free pool. 
+ * + * \param xbb Per-instance xbb configuration structure. + * + * \return On success, a pointer to the allocated xbb_xen_req structure. + * Otherwise NULL. + */ +static inline struct xbb_xen_req * +xbb_get_req(struct xbb_softc *xbb) +{ + struct xbb_xen_req *req; + + req = NULL; + mtx_lock(&xbb->lock); + + /* + * Do not allow new requests to be allocated while we + * are shutting down. + */ + if ((xbb->flags & XBBF_SHUTDOWN) == 0) { + if ((req = SLIST_FIRST(&xbb->request_free_slist)) != NULL) { + SLIST_REMOVE_HEAD(&xbb->request_free_slist, links); + xbb->active_request_count++; + } else { + xbb->flags |= XBBF_RESOURCE_SHORTAGE; } } - - return (unsigned long)pages; + mtx_unlock(&xbb->lock); + return (req); } -static pending_req_t * -alloc_req(void) -{ - pending_req_t *req; - mtx_lock(&pending_free_lock); - if ((req = STAILQ_FIRST(&pending_free))) { - STAILQ_REMOVE(&pending_free, req, pending_req, free_list); - STAILQ_NEXT(req, free_list) = NULL; - } - mtx_unlock(&pending_free_lock); - return req; -} - -static void -free_req(pending_req_t *req) +/** + * Return an allocated transaction tracking structure to the free pool. + * + * \param xbb Per-instance xbb configuration structure. + * \param req The request structure to free. + */ +static inline void +xbb_release_req(struct xbb_softc *xbb, struct xbb_xen_req *req) { - int was_empty; - - mtx_lock(&pending_free_lock); - was_empty = STAILQ_EMPTY(&pending_free); - STAILQ_INSERT_TAIL(&pending_free, req, free_list); - mtx_unlock(&pending_free_lock); - if (was_empty) - taskqueue_enqueue(taskqueue_swi, &blk_req_task); -} + int wake_thread; -static void -fast_flush_area(pending_req_t *req) -{ - struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int i, invcount = 0; - grant_handle_t handle; - int ret; + mtx_lock(&xbb->lock); + wake_thread = xbb->flags & XBBF_RESOURCE_SHORTAGE; + xbb->flags &= ~XBBF_RESOURCE_SHORTAGE; + SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); + xbb->active_request_count--; - for (i = 0; i < req->nr_pages; i++) { - handle = pending_handle(req, i); - if (handle == BLKBACK_INVALID_HANDLE) - continue; - unmap[invcount].host_addr = vaddr(req, i); - unmap[invcount].dev_bus_addr = 0; - unmap[invcount].handle = handle; - pending_handle(req, i) = BLKBACK_INVALID_HANDLE; - invcount++; + if ((xbb->flags & XBBF_SHUTDOWN) != 0) { + /* + * Shutdown is in progress. See if we can + * progress further now that one more request + * has completed and been returned to the + * free pool. + */ + xbb_shutdown(xbb); } + mtx_unlock(&xbb->lock); - ret = HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount); - PANIC_IF(ret); + if (wake_thread != 0) + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } -static void -blkif_get(blkif_t *blkif) +/** + * Given a page index and 512b sector offset within that page, + * calculate an offset into a request's kva region. + * + * \param req The request structure whose kva region will be accessed. + * \param pagenr The page index used to compute the kva offset. + * \param sector The 512b sector index used to compute the page relative + * kva offset. + * + * \return The computed global KVA offset. 
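
A usage sketch for the request pool implemented by xbb_get_req() and xbb_release_req() above; the caller shown here is hypothetical, but the shortage behavior mirrors the ring-processing loop later in this file:

    static void
    example_consumer(struct xbb_softc *xbb)
    {
            struct xbb_xen_req *req;

            req = xbb_get_req(xbb);
            if (req == NULL) {
                    /*
                     * XBBF_RESOURCE_SHORTAGE is now latched; when a
                     * request is returned via xbb_release_req(), the
                     * io_task is re-enqueued and processing resumes.
                     */
                    return;
            }
            /* ... fill in 'req' and dispatch it ... */
    }
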
+ */ +static inline uint8_t * +xbb_req_vaddr(struct xbb_xen_req *req, int pagenr, int sector) { - atomic_add_int(&blkif->ref_cnt, 1); + return (req->kva + (PAGE_SIZE * pagenr) + (sector << 9)); } -static void -blkif_put(blkif_t *blkif) +#ifdef XBB_USE_BOUNCE_BUFFERS +/** + * Given a page index and 512b sector offset within that page, + * calculate an offset into a request's local bounce memory region. + * + * \param req The request structure whose bounce region will be accessed. + * \param pagenr The page index used to compute the bounce offset. + * \param sector The 512b sector index used to compute the page relative + * bounce offset. + * + * \return The computed global bounce buffer address. + */ +static inline uint8_t * +xbb_req_bounce_addr(struct xbb_xen_req *req, int pagenr, int sector) { - if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) { - DPRINTF("Removing %x\n", (unsigned int)blkif); - disconnect_ring(blkif); - if (blkif->mode) - free(blkif->mode, M_DEVBUF); - if (blkif->type) - free(blkif->type, M_DEVBUF); - if (blkif->dev_name) - free(blkif->dev_name, M_DEVBUF); - free(blkif, M_DEVBUF); - } + return (req->bounce + (PAGE_SIZE * pagenr) + (sector << 9)); } +#endif -static int -blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params) +/** + * Given a page number and 512b sector offset within that page, + * calculate an offset into the request's memory region that the + * underlying backend device/file should use for I/O. + * + * \param req The request structure whose I/O region will be accessed. + * \param pagenr The page index used to compute the I/O offset. + * \param sector The 512b sector index used to compute the page relative + * I/O offset. + * + * \return The computed global I/O address. + * + * Depending on configuration, this will either be a local bounce buffer + * or a pointer to the memory mapped in from the front-end domain for + * this request. + */ +static inline uint8_t * +xbb_req_ioaddr(struct xbb_xen_req *req, int pagenr, int sector) { - blkif_t *blkif; - - blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO); - if (!blkif) - return ENOMEM; - - DPRINTF("Created %x\n", (unsigned int)blkif); - - blkif->ref_cnt = 1; - blkif->domid = xdev->otherend_id; - blkif->handle = handle; - blkif->mode = mode; - blkif->type = type; - blkif->dev_name = params; - blkif->xdev = xdev; - xdev->data = blkif; - - mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF); - - if (strcmp(mode, "w")) - blkif->read_only = 1; - - return 0; +#ifdef XBB_USE_BOUNCE_BUFFERS + return (xbb_req_bounce_addr(req, pagenr, sector)); +#else + return (xbb_req_vaddr(req, pagenr, sector)); +#endif } -static void -add_to_req_schedule_list_tail(blkif_t *blkif) +/** + * Given a page index and 512b sector offset within that page, calculate + * an offset into the local psuedo-physical address space used to map a + * front-end's request data into a request. + * + * \param req The request structure whose pseudo-physical region + * will be accessed. + * \param pagenr The page index used to compute the pseudo-physical offset. + * \param sector The 512b sector index used to compute the page relative + * pseudo-physical offset. + * + * \return The computed global pseudo-phsyical address. + * + * Depending on configuration, this will either be a local bounce buffer + * or a pointer to the memory mapped in from the front-end domain for + * this request. 
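
A quick worked example of the arithmetic these accessors share, assuming PAGE_SIZE = 4096:

    /*
     * xbb_req_vaddr(req, 2, 3) = req->kva + (4096 * 2) + (3 << 9)
     *                          = req->kva + 8192 + 1536
     *                          = req->kva + 9728
     *
     * i.e. the fourth 512-byte chunk of the third mapped page.  The
     * bounce and pseudo-physical variants apply the same offsets to
     * their respective base addresses.
     */
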
+ */ +static inline uintptr_t +xbb_req_gntaddr(struct xbb_xen_req *req, int pagenr, int sector) { - if (!blkif->on_req_sched_list) { - mtx_lock(&req_sched_list_lock); - if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) { - blkif_get(blkif); - STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); - blkif->on_req_sched_list = 1; - taskqueue_enqueue(taskqueue_swi, &blk_req_task); - } - mtx_unlock(&req_sched_list_lock); - } + return ((uintptr_t)(req->gnt_base + + (PAGE_SIZE * pagenr) + (sector << 9))); } -/* This routine does not call blkif_get(), does not schedule the blk_req_task to run, - and assumes that the state is connected */ +/** + * Unmap the front-end pages associated with this I/O request. + * + * \param req The request structure to unmap. + */ static void -add_to_req_schedule_list_tail2(blkif_t *blkif) +xbb_unmap_req(struct xbb_xen_req *req) { - mtx_lock(&req_sched_list_lock); - if (!blkif->on_req_sched_list) { - STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); - blkif->on_req_sched_list = 1; - } - mtx_unlock(&req_sched_list_lock); -} + struct gnttab_unmap_grant_ref unmap[XBB_MAX_SEGMENTS_PER_REQUEST]; + u_int i; + u_int invcount; + int error; -/* Removes blkif from front of list and does not call blkif_put() (caller must) */ -static blkif_t * -remove_from_req_schedule_list(void) -{ - blkif_t *blkif; + invcount = 0; + for (i = 0; i < req->nr_pages; i++) { - mtx_lock(&req_sched_list_lock); + if (req->gnt_handles[i] == GRANT_REF_INVALID) + continue; - if ((blkif = STAILQ_FIRST(&req_sched_list))) { - STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req); - STAILQ_NEXT(blkif, next_req) = NULL; - blkif->on_req_sched_list = 0; + unmap[invcount].host_addr = xbb_req_gntaddr(req, i, 0); + unmap[invcount].dev_bus_addr = 0; + unmap[invcount].handle = req->gnt_handles[i]; + req->gnt_handles[i] = GRANT_REF_INVALID; + invcount++; } - mtx_unlock(&req_sched_list_lock); - - return blkif; + error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, + unmap, invcount); + KASSERT(error == 0, ("Grant table operation failed")); } +/** + * Create and transmit a response to a blkif request. + * + * \param xbb Per-instance xbb configuration structure. + * \param req The request structure to which to respond. + * \param status The status code to report. See BLKIF_RSP_* + * in sys/xen/interface/io/blkif.h. + */ static void -make_response(blkif_t *blkif, uint64_t id, - unsigned short op, int st) +xbb_send_response(struct xbb_softc *xbb, struct xbb_xen_req *req, int status) { blkif_response_t *resp; - blkif_back_ring_t *blk_ring = &blkif->ring; - int more_to_do = 0; - int notify; + int more_to_do; + int notify; + + more_to_do = 0; + + /* + * Place on the response ring for the relevant domain. + * For now, only the spacing between entries is different + * in the different ABIs, not the response entry layout. 
+ */ + mtx_lock(&xbb->lock); + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + resp = RING_GET_RESPONSE(&xbb->rings.native, + xbb->rings.native.rsp_prod_pvt); + break; + case BLKIF_PROTOCOL_X86_32: + resp = (blkif_response_t *) + RING_GET_RESPONSE(&xbb->rings.x86_32, + xbb->rings.x86_32.rsp_prod_pvt); + break; + case BLKIF_PROTOCOL_X86_64: + resp = (blkif_response_t *) + RING_GET_RESPONSE(&xbb->rings.x86_64, + xbb->rings.x86_64.rsp_prod_pvt); + break; + default: + panic("Unexpected blkif protocol ABI."); + } - mtx_lock(&blkif->blk_ring_lock); + resp->id = req->id; + resp->operation = req->operation; + resp->status = status; + xbb->rings.common.rsp_prod_pvt += BLKIF_SEGS_TO_BLOCKS(req->nr_pages); + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&xbb->rings.common, notify); - /* Place on the response ring for the relevant domain. */ - resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); - resp->id = id; - resp->operation = op; - resp->status = st; - blk_ring->rsp_prod_pvt++; - RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify); + if (xbb->rings.common.rsp_prod_pvt == xbb->rings.common.req_cons) { - if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) { /* * Tail check for pending requests. Allows frontend to avoid * notifications if requests are already in flight (lower * overheads and promotes batching). */ - RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do); + RING_FINAL_CHECK_FOR_REQUESTS(&xbb->rings.common, more_to_do); + } else if (RING_HAS_UNCONSUMED_REQUESTS(&xbb->rings.common)) { - } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) more_to_do = 1; + } - mtx_unlock(&blkif->blk_ring_lock); + mtx_unlock(&xbb->lock); if (more_to_do) - add_to_req_schedule_list_tail(blkif); + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(xbb->irq); } +/** + * Completion handler for buffer I/O requests issued by the device + * backend driver. + * + * \param bio The buffer I/O request on which to perform completion + * processing. + */ static void -end_block_io_op(struct bio *bio) +xbb_bio_done(struct bio *bio) { - pending_req_t *pending_req = bio->bio_caller2; + struct xbb_softc *xbb; + struct xbb_xen_req *req; + + req = bio->bio_caller1; + xbb = req->xbb; + /* Only include transferred I/O in stats. */ + req->nr_512b_sectors -= bio->bio_resid >> 9; if (bio->bio_error) { DPRINTF("BIO returned error %d for operation on device %s\n", - bio->bio_error, pending_req->blkif->dev_name); - pending_req->status = BLKIF_RSP_ERROR; - pending_req->blkif->st_err_req++; + bio->bio_error, xbb->dev_name); + req->status = BLKIF_RSP_ERROR; + + if (bio->bio_error == ENXIO + && xenbus_get_state(xbb->dev) == XenbusStateConnected) { + + /* + * Backend device has disappeared. Signal the + * front-end that we (the device proxy) want to + * go away. 
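
The ABI switch in xbb_send_response() above exists because the same ring index lands at a different byte offset for each front-end ABI. A toy illustration of the stride math that RING_GET_RESPONSE() hides; these structs are illustrative, not the real blkif layouts:

    struct toy_entry_native { uint64_t id; uint16_t op; uint16_t pad[3]; };
    struct toy_entry_x86_32 { uint64_t id; uint16_t op; uint16_t pad; };

    static inline void *
    toy_ring_slot(char *ring, size_t entry_size, unsigned int idx,
                  unsigned int ring_mask)
    {
            /* Same index, different stride, different address. */
            return (ring + (size_t)(idx & ring_mask) * entry_size);
    }
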
+ */ + xenbus_set_state(xbb->dev, XenbusStateClosing); + } } -#if 0 - printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n", - (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags); -#endif +#ifdef XBB_USE_BOUNCE_BUFFERS + if (bio->bio_cmd == BIO_READ) { + vm_offset_t kva_offset; - if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) { - fast_flush_area(pending_req); - make_response(pending_req->blkif, pending_req->id, - pending_req->operation, pending_req->status); - blkif_put(pending_req->blkif); - free_req(pending_req); + kva_offset = (vm_offset_t)bio->bio_data + - (vm_offset_t)req->bounce; + memcpy((uint8_t *)req->kva + kva_offset, + bio->bio_data, bio->bio_bcount); + } +#endif /* XBB_USE_BOUNCE_BUFFERS */ + + if (atomic_fetchadd_int(&req->pendcnt, -1) == 1) { + xbb_unmap_req(req); + xbb_send_response(xbb, req, req->status); + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/req->nr_512b_sectors << 9, + req->ds_tag_type, + req->ds_trans_type, + /*now*/NULL, + /*then*/&req->ds_t0); + xbb_release_req(xbb, req); } g_destroy_bio(bio); } +/** + * Parse a blkif request into an internal request structure and send + * it to the backend for processing. + * + * \param xbb Per-instance xbb configuration structure. + * \param ring_req Front-end's I/O request as pulled from the shared + * communication ring. + * \param req Allocated internal request structure. + * \param req_ring_idx The location of ring_req within the shared + * communication ring. + * + * This routine performs the backend common aspects of request parsing + * including compiling an internal request structure, parsing the S/G + * list and any secondary ring requests in which they may reside, and + * the mapping of front-end I/O pages into our domain. 
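
xbb_bio_done() above uses a last-one-out convention; a distilled sketch of the idiom and its ordering requirement (generic names, not driver code):

    /*
     * N bios share one request.  pendcnt must be initialized to N
     * before the first bio is issued, or an early completion could
     * observe a stale count.  Only the completion that moves pendcnt
     * from 1 to 0 performs the one-shot teardown (unmap, send the
     * response, record stats, release the request).
     */
    static void
    example_bio_done(volatile u_int *pendcnt)
    {
            if (atomic_fetchadd_int(pendcnt, -1) == 1) {
                    /* Last completer: tear the request down here. */
            }
    }
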
+ */ static void -dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req) +xbb_dispatch_io(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, RING_IDX req_ring_idx) { - struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - struct { - unsigned long buf; unsigned int nsec; - } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - unsigned int nseg = req->nr_segments, nr_sects = 0; - struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; - int operation, ret, i, nbio = 0; + struct gnttab_map_grant_ref maps[XBB_MAX_SEGMENTS_PER_REQUEST]; + struct xbb_sg *xbb_sg; + struct gnttab_map_grant_ref *map; + struct blkif_request_segment *sg; + struct blkif_request_segment *last_block_sg; + u_int nseg; + u_int seg_idx; + u_int block_segs; + int nr_sects; + int operation; + uint8_t bio_flags; + int error; + + nseg = ring_req->nr_segments; + nr_sects = 0; + req->xbb = xbb; + req->id = ring_req->id; + req->operation = ring_req->operation; + req->status = BLKIF_RSP_OKAY; + req->ds_tag_type = DEVSTAT_TAG_SIMPLE; + req->nr_pages = nseg; + req->nr_512b_sectors = 0; + bio_flags = 0; + sg = NULL; + + binuptime(&req->ds_t0); + devstat_start_transaction(xbb->xbb_stats, &req->ds_t0); + + switch (req->operation) { + case BLKIF_OP_WRITE_BARRIER: + bio_flags |= BIO_ORDERED; + req->ds_tag_type = DEVSTAT_TAG_ORDERED; + /* FALLTHROUGH */ + case BLKIF_OP_WRITE: + operation = BIO_WRITE; + req->ds_trans_type = DEVSTAT_WRITE; + if ((xbb->flags & XBBF_READ_ONLY) != 0) { + DPRINTF("Attempt to write to read only device %s\n", + xbb->dev_name); + goto fail_send_response; + } + break; + case BLKIF_OP_READ: + operation = BIO_READ; + req->ds_trans_type = DEVSTAT_READ; + break; + case BLKIF_OP_FLUSH_DISKCACHE: + operation = BIO_FLUSH; + req->ds_tag_type = DEVSTAT_TAG_ORDERED; + req->ds_trans_type = DEVSTAT_NO_DATA; + goto do_dispatch; + /*NOTREACHED*/ + default: + DPRINTF("error: unknown block io operation [%d]\n", + req->operation); + goto fail_send_response; + } /* Check that number of segments is sane. 
*/ - if (unlikely(nseg == 0) || - unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + if (unlikely(nseg == 0) + || unlikely(nseg > xbb->max_request_segments)) { DPRINTF("Bad number of segments in request (%d)\n", nseg); - goto fail_response; + goto fail_send_response; } - if (req->operation == BLKIF_OP_WRITE) { - if (blkif->read_only) { - DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name); - goto fail_response; - } - operation = BIO_WRITE; - } else - operation = BIO_READ; - - pending_req->blkif = blkif; - pending_req->id = req->id; - pending_req->operation = req->operation; - pending_req->status = BLKIF_RSP_OKAY; - pending_req->nr_pages = nseg; + map = maps; + xbb_sg = xbb->xbb_sgs; + block_segs = MIN(req->nr_pages, BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK); + sg = ring_req->seg; + last_block_sg = sg + block_segs; + seg_idx = 0; + while (1) { - for (i = 0; i < nseg; i++) { - seg[i].nsec = req->seg[i].last_sect - - req->seg[i].first_sect + 1; + while (sg < last_block_sg) { + + xbb_sg->first_sect = sg->first_sect; + xbb_sg->last_sect = sg->last_sect; + xbb_sg->nsect = + (int8_t)(sg->last_sect - sg->first_sect + 1); + + if ((sg->last_sect >= (PAGE_SIZE >> 9)) + || (xbb_sg->nsect <= 0)) + goto fail_send_response; + + nr_sects += xbb_sg->nsect; + map->host_addr = xbb_req_gntaddr(req, seg_idx, + /*sector*/0); + map->flags = GNTMAP_host_map; + map->ref = sg->gref; + map->dom = xbb->otherend_id; + if (operation == BIO_WRITE) + map->flags |= GNTMAP_readonly; + sg++; + map++; + xbb_sg++; + seg_idx++; + } - if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || - (seg[i].nsec <= 0)) - goto fail_response; - nr_sects += seg[i].nsec; + block_segs = MIN(nseg - seg_idx, + BLKIF_MAX_SEGMENTS_PER_SEGMENT_BLOCK); + if (block_segs == 0) + break; - map[i].host_addr = vaddr(pending_req, i); - map[i].dom = blkif->domid; - map[i].ref = req->seg[i].gref; - map[i].flags = GNTMAP_host_map; - if (operation == BIO_WRITE) - map[i].flags |= GNTMAP_readonly; + /* + * Fetch the next request block full of SG elements. + * For now, only the spacing between entries is different + * in the different ABIs, not the sg entry layout. 
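
A worked example of the sector bookkeeping in the parsing loop above, assuming PAGE_SIZE = 4096 (a page holds eight 512-byte chunks, so last_sect must be less than 8):

    /*
     * An S/G element with first_sect = 2 and last_sect = 5 maps
     *   nsect = 5 - 2 + 1 = 4 chunks, i.e. bytes 1024..3071 of its page.
     * An element with first_sect != 0, or one whose predecessor ends
     * with last_sect != 7, cannot be virtually contiguous with its
     * neighbor in the request's kva region, which is why the backend
     * handlers below must start a new bio or iovec at such boundaries.
     */
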
+ */ + req_ring_idx++; + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.native, + req_ring_idx); + break; + case BLKIF_PROTOCOL_X86_32: + { + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_32, + req_ring_idx); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + sg = BLKRING_GET_SG_REQUEST(&xbb->rings.x86_64, + req_ring_idx); + break; + } + default: + panic("Unexpected blkif protocol ABI."); + /* NOTREACHED */ + } + last_block_sg = sg + block_segs; } /* Convert to the disk's sector size */ - nr_sects = (nr_sects << 9) >> blkif->sector_size_shift; - - ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); - PANIC_IF(ret); + req->nr_512b_sectors = nr_sects; + nr_sects = (nr_sects << 9) >> xbb->sector_size_shift; + + if ((req->nr_512b_sectors & ((xbb->sector_size >> 9) - 1)) != 0) { + device_printf(xbb->dev, "%s: I/O size (%d) is not a multiple " + "of the backing store sector size (%d)\n", + __func__, req->nr_512b_sectors << 9, + xbb->sector_size); + goto fail_send_response; + } - for (i = 0; i < nseg; i++) { - if (unlikely(map[i].status != 0)) { - DPRINTF("invalid buffer -- could not remap it\n"); - goto fail_flush; + error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, + maps, req->nr_pages); + if (error != 0) + panic("Grant table operation failed (%d)", error); + + for (seg_idx = 0, map = maps; seg_idx < nseg; seg_idx++, map++) { + + if (unlikely(map->status != 0)) { + DPRINTF("invalid buffer -- could not remap it (%d)\n", + map->status); + DPRINTF("Mapping(%d): Host Addr 0x%lx, flags 0x%x " + "ref 0x%x, dom %d\n", seg_idx, + map->host_addr, map->flags, map->ref, + map->dom); + goto fail_unmap_req; } - pending_handle(pending_req, i) = map[i].handle; -#if 0 - /* Can't do this in FreeBSD since vtophys() returns the pfn */ - /* of the remote domain who loaned us the machine page - DPT */ - xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] = - map[i]dev_bus_addr >> PAGE_SHIFT; -#endif - seg[i].buf = map[i].dev_bus_addr | - (req->seg[i].first_sect << 9); + req->gnt_handles[seg_idx] = map->handle; } + if (ring_req->sector_number + nr_sects > xbb->media_num_sectors) { - if (req->sector_number + nr_sects > blkif->media_num_sectors) { - DPRINTF("%s of [%llu,%llu] extends past end of device %s\n", + DPRINTF("%s of [%" PRIu64 ",%" PRIu64 "] " + "extends past end of device %s\n", operation == BIO_READ ? 
"read" : "write", - req->sector_number, - req->sector_number + nr_sects, blkif->dev_name); - goto fail_flush; + ring_req->sector_number, + ring_req->sector_number + nr_sects, xbb->dev_name); + goto fail_unmap_req; } - for (i = 0; i < nseg; i++) { - struct bio *bio; - - if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) { - DPRINTF("Misaligned I/O request from domain %d", blkif->domid); - goto fail_put_bio; - } - - bio = biolist[nbio++] = g_new_bio(); - if (unlikely(bio == NULL)) - goto fail_put_bio; +do_dispatch: - bio->bio_cmd = operation; - bio->bio_offset = req->sector_number << blkif->sector_size_shift; - bio->bio_length = seg[i].nsec << 9; - bio->bio_bcount = bio->bio_length; - bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK)); - bio->bio_done = end_block_io_op; - bio->bio_caller2 = pending_req; - bio->bio_dev = blkif->cdev; + error = xbb->dispatch_io(xbb, + ring_req, + req, + nseg, + operation, + bio_flags); - req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift; -#if 0 - printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n", - (unsigned int)bio, req->operation, req->sector_number, seg[i].nsec, - blkif->cdev->si_iosize_max, seg[i].buf); -#endif + if (error != 0) { + if (operation == BIO_FLUSH) + goto fail_send_response; + else + goto fail_unmap_req; } - pending_req->pendcnt = nbio; - blkif_get(blkif); + return; - for (i = 0; i < nbio; i++) - (*blkif->csw->d_strategy)(biolist[i]); - return; +fail_unmap_req: + xbb_unmap_req(req); + /* FALLTHROUGH */ - fail_put_bio: - for (i = 0; i < (nbio-1); i++) - g_destroy_bio(biolist[i]); - fail_flush: - fast_flush_area(pending_req); - fail_response: - make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); - free_req(pending_req); +fail_send_response: + xbb_send_response(xbb, req, BLKIF_RSP_ERROR); + xbb_release_req(xbb, req); + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/0, + req->ds_tag_type, + req->ds_trans_type, + /*now*/NULL, + /*then*/&req->ds_t0); } +/** + * Process incoming requests from the shared communication ring in response + * to a signal on the ring's event channel. + * + * \param context Callback argument registerd during task initialization - + * the xbb_softc for this instance. + * \param pending The number of taskqueue_enqueue events that have + * occurred since this handler was last run. + */ static void -blk_req_action(void *context, int pending) +xbb_run_queue(void *context, int pending) { - blkif_t *blkif; - - DPRINTF("\n"); - - while (!STAILQ_EMPTY(&req_sched_list)) { - blkif_back_ring_t *blk_ring; - RING_IDX rc, rp; - - blkif = remove_from_req_schedule_list(); - - blk_ring = &blkif->ring; - rc = blk_ring->req_cons; - rp = blk_ring->sring->req_prod; - rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ - - while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) { - blkif_request_t *req; - pending_req_t *pending_req; - - pending_req = alloc_req(); - if (pending_req == NULL) - goto out_of_preqs; - - req = RING_GET_REQUEST(blk_ring, rc); - blk_ring->req_cons = ++rc; /* before make_response() */ - - switch (req->operation) { - case BLKIF_OP_READ: - blkif->st_rd_req++; - dispatch_rw_block_io(blkif, req, pending_req); - break; - case BLKIF_OP_WRITE: - blkif->st_wr_req++; - dispatch_rw_block_io(blkif, req, pending_req); - break; - default: - blkif->st_err_req++; - DPRINTF("error: unknown block io operation [%d]\n", - req->operation); - make_response(blkif, req->id, req->operation, - BLKIF_RSP_ERROR); - free_req(pending_req); - break; - } + struct xbb_softc *xbb; + blkif_back_rings_t *rings; + RING_IDX rp; + + + xbb = (struct xbb_softc *)context; + rings = &xbb->rings; + + /* + * Cache req_prod to avoid accessing a cache line shared + * with the frontend. + */ + rp = rings->common.sring->req_prod; + + /* Ensure we see queued requests up to 'rp'. */ + rmb(); + + /** + * Run so long as there is work to consume and the generation + * of a response will not overflow the ring. + * + * @note There's a 1 to 1 relationship between requests and responses, + * so an overflow should never occur. This test is to protect + * our domain from digesting bogus data. Shouldn't we log this? + */ + while (rings->common.req_cons != rp + && RING_REQUEST_CONS_OVERFLOW(&rings->common, + rings->common.req_cons) == 0) { + blkif_request_t ring_req_storage; + blkif_request_t *ring_req; + struct xbb_xen_req *req; + RING_IDX req_ring_idx; + + req = xbb_get_req(xbb); + if (req == NULL) { + /* + * Resource shortage has been recorded. + * We'll be scheduled to run once a request + * object frees up due to a completion. + */ + break; } - blkif_put(blkif); - } + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + ring_req = RING_GET_REQUEST(&xbb->rings.native, + rings->common.req_cons); + break; + case BLKIF_PROTOCOL_X86_32: + { + struct blkif_x86_32_request *ring_req32; + + ring_req32 = RING_GET_REQUEST(&xbb->rings.x86_32, + rings->common.req_cons); + blkif_get_x86_32_req(&ring_req_storage, ring_req32); + ring_req = &ring_req_storage; + break; + } + case BLKIF_PROTOCOL_X86_64: + { + struct blkif_x86_64_request *ring_req64; + + ring_req64 = RING_GET_REQUEST(&xbb->rings.x86_64, + rings->common.req_cons); + blkif_get_x86_64_req(&ring_req_storage, ring_req64); + ring_req = &ring_req_storage; + break; + } + default: + panic("Unexpected blkif protocol ABI."); + /* NOTREACHED */ + } - return; + /* + * Signify that we can overwrite this request with a + * response by incrementing our consumer index. The + * response won't be generated until after we've already + * consumed all necessary data out of the version of the + * request in the ring buffer (for native mode). We + * must update the consumer index before issueing back-end + * I/O so there is no possibility that it will complete + * and a response be generated before we make room in + * the queue for that response. 
+ */ + req_ring_idx = xbb->rings.common.req_cons; + xbb->rings.common.req_cons += + BLKIF_SEGS_TO_BLOCKS(ring_req->nr_segments); - out_of_preqs: - /* We ran out of pending req structs */ - /* Just requeue interface and wait to be rescheduled to run when one is freed */ - add_to_req_schedule_list_tail2(blkif); - blkif->st_oo_req++; + xbb_dispatch_io(xbb, ring_req, req, req_ring_idx); + } } -/* Handle interrupt from a frontend */ +/** + * Interrupt handler bound to the shared ring's event channel. + * + * \param arg Callback argument registerd during event channel + * binding - the xbb_softc for this instance. + */ static void -blkback_intr(void *arg) +xbb_intr(void *arg) { - blkif_t *blkif = arg; - DPRINTF("%x\n", (unsigned int)blkif); - add_to_req_schedule_list_tail(blkif); + struct xbb_softc *xbb; + + /* Defer to kernel thread. */ + xbb = (struct xbb_softc *)arg; + taskqueue_enqueue(xbb->io_taskqueue, &xbb->io_task); } -/* Map grant ref for ring */ +/*----------------------------- Backend Handlers -----------------------------*/ +/** + * Backend handler for character device access. + * + * \param xbb Per-instance xbb configuration structure. + * \param ring_req Front-end's I/O request as pulled from the shared + * communication ring. + * \param req Allocated internal request structure. + * \param nseg The number of valid segments for this request in + * xbb->xbb_sgs. + * \param operation BIO_* I/O operation code. + * \param bio_flags Additional bio_flag data to pass to any generated + * bios (e.g. BIO_ORDERED).. + * + * \return 0 for success, errno codes for failure. + */ static int -map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring) +xbb_dispatch_dev(struct xbb_softc *xbb, blkif_request_t *ring_req, + struct xbb_xen_req *req, int nseg, int operation, + int bio_flags) { - struct gnttab_map_grant_ref op; + struct xbb_dev_data *dev_data; + struct bio *bios[XBB_MAX_SEGMENTS_PER_REQUEST]; + off_t bio_offset; + struct bio *bio; + struct xbb_sg *xbb_sg; + u_int nbio; + u_int bio_idx; + u_int seg_idx; + int error; + + dev_data = &xbb->backend.dev; + bio_offset = (off_t)ring_req->sector_number + << xbb->sector_size_shift; + error = 0; + nbio = 0; + bio_idx = 0; + + if (operation == BIO_FLUSH) { + bio = g_new_bio(); + if (unlikely(bio == NULL)) { + DPRINTF("Unable to allocate bio for BIO_FLUSH\n"); + error = ENOMEM; + return (error); + } + + bio->bio_cmd = BIO_FLUSH; + bio->bio_flags |= BIO_ORDERED; + bio->bio_dev = dev_data->cdev; + bio->bio_offset = 0; + bio->bio_data = 0; + bio->bio_done = xbb_bio_done; + bio->bio_caller1 = req; + bio->bio_pblkno = 0; - ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); - if (ring->va == 0) - return ENOMEM; + req->pendcnt = 1; - op.host_addr = ring->va; - op.flags = GNTMAP_host_map; - op.ref = ref; - op.dom = dom; - HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1); - if (op.status) { - WPRINTF("grant table op err=%d\n", op.status); - kmem_free(kernel_map, ring->va, PAGE_SIZE); - ring->va = 0; - return EACCES; + (*dev_data->csw->d_strategy)(bios[bio_idx]); + + return (0); } - ring->handle = op.handle; - ring->bus_addr = op.dev_bus_addr; + for (seg_idx = 0, bio = NULL, xbb_sg = xbb->xbb_sgs; + seg_idx < nseg; + seg_idx++, xbb_sg++) { - return 0; -} + /* + * KVA will not be contiguous, so any additional + * I/O will need to be represented in a new bio. 
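
The ordering rule described above is easier to see with a concrete, purely illustrative shape: suppose BLKIF_SEGS_TO_BLOCKS() maps a 25-segment request to three ring slots (one header block plus two segment blocks):

    /*
     * req_cons advances by all three slots before the I/O is issued.
     * xbb_send_response() later advances rsp_prod_pvt by the same
     * block count, so by the time the backend I/O can complete, room
     * for the response already exists and the response side can never
     * overrun req_cons.
     */
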
+ */ + if ((bio != NULL) + && (xbb_sg->first_sect != 0)) { + if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { + printf("%s: Discontiguous I/O request from " + "domain %d ends on non-sector " + "boundary\n", __func__, + xbb->otherend_id); + error = EINVAL; + goto fail_free_bios; + } + bio = NULL; + } -/* Unmap grant ref for ring */ -static void -unmap_ring(struct ring_ref *ring) -{ - struct gnttab_unmap_grant_ref op; + if (bio == NULL) { + /* + * Make sure that the start of this bio is aligned + * to a device sector. + */ + if ((bio_offset & (xbb->sector_size - 1)) != 0) { + printf("%s: Misaligned I/O request from " + "domain %d\n", __func__, + xbb->otherend_id); + error = EINVAL; + goto fail_free_bios; + } - op.host_addr = ring->va; - op.dev_bus_addr = ring->bus_addr; - op.handle = ring->handle; - HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1); - if (op.status) - WPRINTF("grant table op err=%d\n", op.status); + bio = bios[nbio++] = g_new_bio(); + if (unlikely(bio == NULL)) { + error = ENOMEM; + goto fail_free_bios; + } + bio->bio_cmd = operation; + bio->bio_flags |= bio_flags; + bio->bio_dev = dev_data->cdev; + bio->bio_offset = bio_offset; + bio->bio_data = xbb_req_ioaddr(req, seg_idx, + xbb_sg->first_sect); + bio->bio_done = xbb_bio_done; + bio->bio_caller1 = req; + bio->bio_pblkno = bio_offset + >> xbb->sector_size_shift; + } - kmem_free(kernel_map, ring->va, PAGE_SIZE); - ring->va = 0; -} + bio->bio_length += xbb_sg->nsect << 9; + bio->bio_bcount = bio->bio_length; + bio_offset += xbb_sg->nsect << 9; -static int -connect_ring(blkif_t *blkif) -{ - struct xenbus_device *xdev = blkif->xdev; - blkif_sring_t *ring; - unsigned long ring_ref; - evtchn_port_t evtchn; - evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; - int err; - - if (blkif->ring_connected) - return 0; - - // Grab FE data and map his memory - err = xenbus_gather(NULL, xdev->otherend, - "ring-ref", "%lu", &ring_ref, - "event-channel", "%u", &evtchn, NULL); - if (err) { - xenbus_dev_fatal(xdev, err, - "reading %s/ring-ref and event-channel", - xdev->otherend); - return err; - } - - err = map_ring(ring_ref, blkif->domid, &blkif->rr); - if (err) { - xenbus_dev_fatal(xdev, err, "mapping ring"); - return err; - } - ring = (blkif_sring_t *)blkif->rr.va; - BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE); - - op.u.bind_interdomain.remote_dom = blkif->domid; - op.u.bind_interdomain.remote_port = evtchn; - err = HYPERVISOR_event_channel_op(&op); - if (err) { - unmap_ring(&blkif->rr); - xenbus_dev_fatal(xdev, err, "binding event channel"); - return err; - } - blkif->evtchn = op.u.bind_interdomain.local_port; - - /* bind evtchn to irq handler */ - blkif->irq = - bind_evtchn_to_irqhandler(blkif->evtchn, "blkback", - blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie); - - blkif->ring_connected = 1; - - DPRINTF("%x rings connected! evtchn=%d irq=%d\n", - (unsigned int)blkif, blkif->evtchn, blkif->irq); + if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9) { - return 0; -} + if ((bio->bio_length & (xbb->sector_size - 1)) != 0) { + printf("%s: Discontiguous I/O request from " + "domain %d ends on non-sector " + "boundary\n", __func__, + xbb->otherend_id); + error = EINVAL; + goto fail_free_bios; + } + /* + * KVA will not be contiguous, so any additional + * I/O will need to be represented in a new bio. 
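
To illustrate when this loop starts a new bio (sector_size = 512 and 4 KiB pages assumed):

    /*
     * Segment A: first_sect = 0, last_sect = 7 -> fills its whole page;
     *            the next segment may continue the same bio, because
     *            consecutive pages of the request kva are virtually
     *            contiguous.
     * Segment B: first_sect = 0, last_sect = 3 -> ends mid-page, so the
     *            bio is closed after B (bio = NULL) and any following
     *            segment begins a new bio at a fresh bio_offset.
     * Segment C: first_sect = 4               -> does not start at a
     *            page boundary, so it can never merge into a prior bio.
     */
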
+ */
+ bio = NULL;
+ }
+ }
-static void
-disconnect_ring(blkif_t *blkif)
-{
- DPRINTF("\n");
+ req->pendcnt = nbio;
+
+ for (bio_idx = 0; bio_idx < nbio; bio_idx++)
+ {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ vm_offset_t kva_offset;
- if (blkif->ring_connected) {
- unbind_from_irqhandler(blkif->irq, blkif->irq_cookie);
- blkif->irq = 0;
- unmap_ring(&blkif->rr);
- blkif->ring_connected = 0;
+ kva_offset = (vm_offset_t)bios[bio_idx]->bio_data
+ - (vm_offset_t)req->bounce;
+ if (operation == BIO_WRITE) {
+ memcpy(bios[bio_idx]->bio_data,
+ (uint8_t *)req->kva + kva_offset,
+ bios[bio_idx]->bio_bcount);
+ }
+#endif
+ (*dev_data->csw->d_strategy)(bios[bio_idx]);
 }
+
+ return (error);
+
+fail_free_bios:
+ for (bio_idx = 0; bio_idx < (nbio-1); bio_idx++)
+ g_destroy_bio(bios[bio_idx]);
+
+ return (error);
 }
-static void
-connect(blkif_t *blkif)
+/**
+ * Backend handler for file access.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param ring_req Front-end's I/O request as pulled from the shared
+ * communication ring.
+ * \param req Allocated internal request structure.
+ * \param nseg The number of valid segments for this request in
+ * xbb->xbb_sgs.
+ * \param operation BIO_* I/O operation code.
+ * \param flags Additional bio_flag data to pass to any generated bios
+ * (e.g. BIO_ORDERED).
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_dispatch_file(struct xbb_softc *xbb, blkif_request_t *ring_req,
+ struct xbb_xen_req *req, int nseg, int operation,
+ int flags)
 {
- struct xenbus_transaction *xbt;
- struct xenbus_device *xdev = blkif->xdev;
- int err;
+ struct xbb_file_data *file_data;
+ u_int seg_idx;
+ struct uio xuio;
+ struct xbb_sg *xbb_sg;
+ struct iovec *xiovec;
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ void **p_vaddr;
+ int saved_uio_iovcnt;
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ int vfs_is_locked;
+ int error;
+
+ file_data = &xbb->backend.file;
+ error = 0;
+ bzero(&xuio, sizeof(xuio));
+
+ req->pendcnt = 0;
+
+ switch (operation) {
+ case BIO_READ:
+ xuio.uio_rw = UIO_READ;
+ break;
+ case BIO_WRITE:
+ xuio.uio_rw = UIO_WRITE;
+ break;
+ case BIO_FLUSH: {
+ struct mount *mountpoint;
- if (!blkif->ring_connected ||
- blkif->vn == NULL ||
- blkif->state == XenbusStateConnected)
- return;
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
- DPRINTF("%s\n", xdev->otherend);
+ (void) vn_start_write(xbb->vn, &mountpoint, V_WAIT);
- /* Supply the information about the device the frontend needs */
-again:
- xbt = xenbus_transaction_start();
- if (IS_ERR(xbt)) {
- xenbus_dev_fatal(xdev, PTR_ERR(xbt),
- "Error writing configuration for backend "
- "(start transaction)");
- return;
- }
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
+ error = VOP_FSYNC(xbb->vn, MNT_WAIT, curthread);
+ VOP_UNLOCK(xbb->vn, 0);
- err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u",
- blkif->media_num_sectors);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/sectors",
- xdev->nodename);
- goto abort;
- }
+ vn_finished_write(mountpoint);
+
+ VFS_UNLOCK_GIANT(vfs_is_locked);
- err = xenbus_printf(xbt, xdev->nodename, "info", "%u",
- blkif->read_only ? 
VDISK_READONLY : 0);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/info",
- xdev->nodename);
- goto abort;
+ goto bailout_send_response;
+ /* NOTREACHED */
 }
- err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u",
- blkif->sector_size);
- if (err) {
- xenbus_dev_fatal(xdev, err, "writing %s/sector-size",
- xdev->nodename);
- goto abort;
+ default:
+ panic("invalid operation %d", operation);
+ /* NOTREACHED */
 }
+ xuio.uio_offset = (vm_offset_t)ring_req->sector_number
+ << xbb->sector_size_shift;
- err = xenbus_transaction_end(xbt, 0);
- if (err == -EAGAIN)
- goto again;
- if (err)
- xenbus_dev_fatal(xdev, err, "ending transaction");
+ xuio.uio_segflg = UIO_SYSSPACE;
+ xuio.uio_iov = file_data->xiovecs;
+ xuio.uio_iovcnt = 0;
- err = xenbus_switch_state(xdev, NULL, XenbusStateConnected);
- if (err)
- xenbus_dev_fatal(xdev, err, "switching to Connected state",
- xdev->nodename);
+ for (seg_idx = 0, xiovec = NULL, xbb_sg = xbb->xbb_sgs;
+ seg_idx < nseg; seg_idx++, xbb_sg++) {
- blkif->state = XenbusStateConnected;
+ /*
+ * If the first sector is not 0, the KVA will not be
+ * contiguous and we'll need to go on to another segment.
+ */
+ if (xbb_sg->first_sect != 0)
+ xiovec = NULL;
+
+ if (xiovec == NULL) {
+ xiovec = &file_data->xiovecs[xuio.uio_iovcnt];
+ xiovec->iov_base = xbb_req_ioaddr(req, seg_idx,
+ xbb_sg->first_sect);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ /*
+ * Store the address of the incoming buffer at this
+ * particular offset as well, so we can do the copy
+ * later without having to do more work to
+ * recalculate this address.
+ */
+ p_vaddr = &file_data->xiovecs_vaddr[xuio.uio_iovcnt];
+ *p_vaddr = xbb_req_vaddr(req, seg_idx,
+ xbb_sg->first_sect);
+#endif /* XBB_USE_BOUNCE_BUFFERS */
+ xiovec->iov_len = 0;
+ xuio.uio_iovcnt++;
+ }
- return;
+ xiovec->iov_len += xbb_sg->nsect << 9;
- abort:
- xenbus_transaction_end(xbt, 1);
-}
+ xuio.uio_resid += xbb_sg->nsect << 9;
-static int
-blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id)
-{
- int err;
- char *p, *mode = NULL, *type = NULL, *params = NULL;
- long handle;
+ /*
+ * If the last sector does not extend to the end of the
+ * page, the next segment will not be contiguous in KVA
+ * and we need a new iovec.
+ */
+ if (xbb_sg->last_sect != (PAGE_SIZE - 512) >> 9)
+ xiovec = NULL;
+ }
- DPRINTF("node=%s\n", xdev->nodename);
+ xuio.uio_td = curthread;
- p = strrchr(xdev->otherend, '/') + 1;
- handle = strtoul(p, NULL, 0);
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ saved_uio_iovcnt = xuio.uio_iovcnt;
- mode = xenbus_read(NULL, xdev->nodename, "mode", NULL);
- if (IS_ERR(mode)) {
- xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode");
- err = PTR_ERR(mode);
- goto error;
- }
-
- type = xenbus_read(NULL, xdev->nodename, "type", NULL);
- if (IS_ERR(type)) {
- xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type");
- err = PTR_ERR(type);
- goto error;
- }
-
- params = xenbus_read(NULL, xdev->nodename, "params", NULL);
- if (IS_ERR(type)) {
- xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params");
- err = PTR_ERR(params);
- goto error;
- }
-
- err = blkif_create(xdev, handle, mode, type, params);
- if (err) {
- xenbus_dev_fatal(xdev, err, "creating blkif");
- goto error;
- }
+ if (operation == BIO_WRITE) {
+ /* Copy the write data to the local buffer. 
*/
+ for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr,
+ xiovec = xuio.uio_iov; seg_idx < xuio.uio_iovcnt;
+ seg_idx++, xiovec++, p_vaddr++) {
- err = vbd_add_dev(xdev);
- if (err) {
- blkif_put((blkif_t *)xdev->data);
- xenbus_dev_fatal(xdev, err, "adding vbd device");
+ memcpy(xiovec->iov_base, *p_vaddr, xiovec->iov_len);
+ }
+ } else {
+ /*
+ * We only need to save off the iovecs in the case of a
+ * read, because the copy for the read happens after the
+ * VOP_READ(). (The uio will get modified in that call
+ * sequence.)
+ */
+ memcpy(file_data->saved_xiovecs, xuio.uio_iov,
+ xuio.uio_iovcnt * sizeof(xuio.uio_iov[0]));
 }
+#endif /* XBB_USE_BOUNCE_BUFFERS */
- return err;
+ vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount);
+ switch (operation) {
+ case BIO_READ:
- error:
- if (mode)
- free(mode, M_DEVBUF);
- if (type)
- free(type, M_DEVBUF);
- if (params)
- free(params, M_DEVBUF);
- return err;
-}
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
-static int
-blkback_remove(struct xenbus_device *xdev)
-{
- blkif_t *blkif = xdev->data;
- device_t ndev;
+ /*
+ * UFS pays attention to IO_DIRECT for reads. If the
+ * DIRECTIO option is configured into the kernel, it calls
+ * ffs_rawread(). But that only works for single-segment
+ * uios with user space addresses. In our case, with a
+ * kernel uio, it still reads into the buffer cache, but it
+ * will just try to release the buffer from the cache later
+ * on in ffs_read().
+ *
+ * ZFS does not pay attention to IO_DIRECT for reads.
+ *
+ * UFS does not pay attention to IO_SYNC for reads.
+ *
+ * ZFS pays attention to IO_SYNC (which translates into the
+ * Solaris define FRSYNC for zfs_read()) for reads. It
+ * attempts to sync the file before reading.
+ *
+ * So, to attempt to provide some barrier semantics in the
+ * BIO_ORDERED case, set both IO_DIRECT and IO_SYNC.
+ */
+ error = VOP_READ(xbb->vn, &xuio, (flags & BIO_ORDERED) ?
+ (IO_DIRECT|IO_SYNC) : 0, file_data->cred);
- DPRINTF("node=%s\n", xdev->nodename);
+ VOP_UNLOCK(xbb->vn, 0);
+ break;
+ case BIO_WRITE: {
+ struct mount *mountpoint;
- blkif->state = XenbusStateClosing;
+ (void)vn_start_write(xbb->vn, &mountpoint, V_WAIT);
- if ((ndev = blkif->ndev)) {
- blkif->ndev = NULL;
- mtx_lock(&Giant);
- device_detach(ndev);
- mtx_unlock(&Giant);
- }
+ vn_lock(xbb->vn, LK_EXCLUSIVE | LK_RETRY);
- xdev->data = NULL;
- blkif->xdev = NULL;
- blkif_put(blkif);
+ /*
+ * UFS pays attention to IO_DIRECT for writes. The write
+ * is done asynchronously. (Normally the write would just
+ * get put into cache.)
+ *
+ * UFS pays attention to IO_SYNC for writes. It will
+ * attempt to write the buffer out synchronously if that
+ * flag is set.
+ *
+ * ZFS does not pay attention to IO_DIRECT for writes.
+ *
+ * ZFS pays attention to IO_SYNC (a.k.a. FSYNC or FRSYNC)
+ * for writes. It will flush the transaction from the
+ * cache before returning.
+ *
+ * So if we've got the BIO_ORDERED flag set, we want
+ * IO_SYNC in either the UFS or ZFS case.
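+ *
+ * A hedged sketch of the combined read/write mapping described
+ * above (xbb_vop_ioflags() is a hypothetical helper, not part
+ * of this driver):
+ *
+ *	static int
+ *	xbb_vop_ioflags(int operation, int flags)
+ *	{
+ *		if ((flags & BIO_ORDERED) == 0)
+ *			return (0);
+ *		return ((operation == BIO_READ)
+ *		      ? (IO_DIRECT | IO_SYNC) : IO_SYNC);
+ *	}
+ */
+ error = VOP_WRITE(xbb->vn, &xuio, (flags & BIO_ORDERED) ? 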
+ IO_SYNC : 0, file_data->cred); + VOP_UNLOCK(xbb->vn, 0); - return 0; -} + vn_finished_write(mountpoint); -static int -blkback_resume(struct xenbus_device *xdev) -{ - DPRINTF("node=%s\n", xdev->nodename); - return 0; + break; + } + default: + panic("invalid operation %d", operation); + /* NOTREACHED */ + } + VFS_UNLOCK_GIANT(vfs_is_locked); + +#ifdef XBB_USE_BOUNCE_BUFFERS + /* We only need to copy here for read operations */ + if (operation == BIO_READ) { + + for (seg_idx = 0, p_vaddr = file_data->xiovecs_vaddr, + xiovec = file_data->saved_xiovecs; + seg_idx < saved_uio_iovcnt; seg_idx++, + xiovec++, p_vaddr++) { + + /* + * Note that we have to use the copy of the + * io vector we made above. uiomove() modifies + * the uio and its referenced vector as uiomove + * performs the copy, so we can't rely on any + * state from the original uio. + */ + memcpy(*p_vaddr, xiovec->iov_base, xiovec->iov_len); + } + } +#endif /* XBB_USE_BOUNCE_BUFFERS */ + +bailout_send_response: + + /* + * All I/O is already done, send the response. A lock is not + * necessary here because we're single threaded, and therefore the + * only context accessing this request right now. If that changes, + * we may need some locking here. + */ + xbb_unmap_req(req); + xbb_send_response(xbb, req, (error == 0) ? BLKIF_RSP_OKAY : + BLKIF_RSP_ERROR); + devstat_end_transaction(xbb->xbb_stats, + /*bytes*/error == 0 ? req->nr_512b_sectors << 9 + : 0, + req->ds_tag_type, + req->ds_trans_type, + /*now*/NULL, + /*then*/&req->ds_t0); + xbb_release_req(xbb, req); + + return (0); } +/*--------------------------- Backend Configuration --------------------------*/ +/** + * Close and cleanup any backend device/file specific state for this + * block back instance. + * + * \param xbb Per-instance xbb configuration structure. 
+ */ static void -frontend_changed(struct xenbus_device *xdev, - XenbusState frontend_state) +xbb_close_backend(struct xbb_softc *xbb) { - blkif_t *blkif = xdev->data; + DROP_GIANT(); + DPRINTF("closing dev=%s\n", xbb->dev_name); + if (xbb->vn) { + int flags = FREAD; + int vfs_is_locked = 0; - DPRINTF("state=%d\n", frontend_state); + if ((xbb->flags & XBBF_READ_ONLY) == 0) + flags |= FWRITE; - blkif->frontend_state = frontend_state; + switch (xbb->device_type) { + case XBB_TYPE_DISK: + if (xbb->backend.dev.csw) { + dev_relthread(xbb->backend.dev.cdev, + xbb->backend.dev.dev_ref); + xbb->backend.dev.csw = NULL; + xbb->backend.dev.cdev = NULL; + } + break; + case XBB_TYPE_FILE: + vfs_is_locked = VFS_LOCK_GIANT(xbb->vn->v_mount); + break; + case XBB_TYPE_NONE: + default: + panic("Unexpected backend type."); + break; + } - switch (frontend_state) { - case XenbusStateInitialising: - break; - case XenbusStateInitialised: - case XenbusStateConnected: - connect_ring(blkif); - connect(blkif); - break; - case XenbusStateClosing: - xenbus_switch_state(xdev, NULL, XenbusStateClosing); - break; - case XenbusStateClosed: - xenbus_remove_device(xdev); - break; - case XenbusStateUnknown: - case XenbusStateInitWait: - xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend", - frontend_state); - break; + (void)vn_close(xbb->vn, flags, NOCRED, curthread); + xbb->vn = NULL; + + switch (xbb->device_type) { + case XBB_TYPE_DISK: + break; + case XBB_TYPE_FILE: + VFS_UNLOCK_GIANT(vfs_is_locked); + if (xbb->backend.file.cred != NULL) { + crfree(xbb->backend.file.cred); + xbb->backend.file.cred = NULL; + } + break; + case XBB_TYPE_NONE: + default: + panic("Unexpected backend type."); + break; + } } + PICKUP_GIANT(); } -/* ** Driver registration ** */ - -static struct xenbus_device_id blkback_ids[] = { - { "vbd" }, - { "" } -}; +/** + * Open a character device to be used for backend I/O. + * + * \param xbb Per-instance xbb configuration structure. + * + * \return 0 for success, errno codes for failure. 
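+ *
+ * \note The probing below uses the standard disk(4) geometry
+ * ioctls. A minimal userspace sketch of the same two queries
+ * (the device path /dev/ada0 is only an example):
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/disk.h>
+ *	#include <sys/ioctl.h>
+ *	#include <fcntl.h>
+ *	#include <stdint.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		u_int sector_size;
+ *		off_t media_size;
+ *		int fd = open("/dev/ada0", O_RDONLY);
+ *
+ *		if (fd == -1 ||
+ *		    ioctl(fd, DIOCGSECTORSIZE, &sector_size) == -1 ||
+ *		    ioctl(fd, DIOCGMEDIASIZE, &media_size) == -1)
+ *			return (1);
+ *		printf("%u byte sectors, %jd bytes total\n",
+ *		    sector_size, (intmax_t)media_size);
+ *		return (0);
+ *	}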
+ */ +static int +xbb_open_dev(struct xbb_softc *xbb) +{ + struct vattr vattr; + struct cdev *dev; + struct cdevsw *devsw; + int error; + + xbb->device_type = XBB_TYPE_DISK; + xbb->dispatch_io = xbb_dispatch_dev; + xbb->backend.dev.cdev = xbb->vn->v_rdev; + xbb->backend.dev.csw = dev_refthread(xbb->backend.dev.cdev, + &xbb->backend.dev.dev_ref); + if (xbb->backend.dev.csw == NULL) + panic("Unable to retrieve device switch"); + + error = VOP_GETATTR(xbb->vn, &vattr, NOCRED); + if (error) { + xenbus_dev_fatal(xbb->dev, error, "error getting " + "vnode attributes for device %s", + xbb->dev_name); + return (error); + } -static struct xenbus_driver blkback = { - .name = "blkback", - .ids = blkback_ids, - .probe = blkback_probe, - .remove = blkback_remove, - .resume = blkback_resume, - .otherend_changed = frontend_changed, -}; -static void -blkback_init(void *unused) -{ - int i; - - TASK_INIT(&blk_req_task, 0, blk_req_action, NULL); - mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF); - - mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF); - - mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - pending_reqs = malloc(sizeof(pending_reqs[0]) * - blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT); - pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) * - mmap_pages, M_DEVBUF, M_NOWAIT); - pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) * - mmap_pages, M_DEVBUF, M_NOWAIT); - mmap_vstart = alloc_empty_page_range(mmap_pages); - if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) { - if (pending_reqs) - free(pending_reqs, M_DEVBUF); - if (pending_grant_handles) - free(pending_grant_handles, M_DEVBUF); - if (pending_vaddrs) - free(pending_vaddrs, M_DEVBUF); - WPRINTF("out of memory\n"); - return; + dev = xbb->vn->v_rdev; + devsw = dev->si_devsw; + if (!devsw->d_ioctl) { + xenbus_dev_fatal(xbb->dev, ENODEV, "no d_ioctl for " + "device %s!", xbb->dev_name); + return (ENODEV); } - for (i = 0; i < mmap_pages; i++) { - pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); - pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + error = devsw->d_ioctl(dev, DIOCGSECTORSIZE, + (caddr_t)&xbb->sector_size, FREAD, + curthread); + if (error) { + xenbus_dev_fatal(xbb->dev, error, + "error calling ioctl DIOCGSECTORSIZE " + "for device %s", xbb->dev_name); + return (error); } - for (i = 0; i < blkif_reqs; i++) { - STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list); + error = devsw->d_ioctl(dev, DIOCGMEDIASIZE, + (caddr_t)&xbb->media_size, FREAD, + curthread); + if (error) { + xenbus_dev_fatal(xbb->dev, error, + "error calling ioctl DIOCGMEDIASIZE " + "for device %s", xbb->dev_name); + return (error); } - DPRINTF("registering %s\n", blkback.name); - xenbus_register_backend(&blkback); + return (0); } -SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL) - -static void -close_device(blkif_t *blkif) +/** + * Open a file to be used for backend I/O. + * + * \param xbb Per-instance xbb configuration structure. + * + * \return 0 for success, errno codes for failure. 
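+ *
+ * \note File backends export a fixed 512 byte sector size (see the
+ * discussion below), so the sector count the front-end sees is the
+ * backing file's size shifted down by 9. An illustrative userspace
+ * check (the file path is a placeholder):
+ *
+ *	#include <sys/stat.h>
+ *	#include <stdint.h>
+ *	#include <stdio.h>
+ *
+ *	int
+ *	main(void)
+ *	{
+ *		struct stat sb;
+ *
+ *		if (stat("/path/to/backing/file", &sb) == -1)
+ *			return (1);
+ *		printf("%jd 512b sectors\n",
+ *		    (intmax_t)(sb.st_size >> 9));
+ *		return (0);
+ *	}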
+ */
+static int
+xbb_open_file(struct xbb_softc *xbb)
 {
- DPRINTF("closing dev=%s\n", blkif->dev_name);
- if (blkif->vn) {
- int flags = FREAD;
-
- if (!blkif->read_only)
- flags |= FWRITE;
+ struct xbb_file_data *file_data;
+ struct vattr vattr;
+ int error;
+
+ file_data = &xbb->backend.file;
+ xbb->device_type = XBB_TYPE_FILE;
+ xbb->dispatch_io = xbb_dispatch_file;
+ error = VOP_GETATTR(xbb->vn, &vattr, curthread->td_ucred);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "error calling VOP_GETATTR() "
+ "for file %s", xbb->dev_name);
+ return (error);
+ }
- if (blkif->csw) {
- dev_relthread(blkif->cdev);
- blkif->csw = NULL;
+ /*
+ * Verify that we have the ability to upgrade to exclusive
+ * access on this file so we can trap errors at open instead
+ * of reporting them during first access.
+ */
+ if (VOP_ISLOCKED(xbb->vn) != LK_EXCLUSIVE) {
+ vn_lock(xbb->vn, LK_UPGRADE | LK_RETRY);
+ if (xbb->vn->v_iflag & VI_DOOMED) {
+ error = EBADF;
+ xenbus_dev_fatal(xbb->dev, error,
+ "error locking file %s",
+ xbb->dev_name);
+
+ return (error);
 }
+ }
- (void)vn_close(blkif->vn, flags, NOCRED, curthread);
- blkif->vn = NULL;
+ file_data->cred = crhold(curthread->td_ucred);
+ xbb->media_size = vattr.va_size;
+
+ /*
+ * XXX KDM vattr.va_blocksize may be larger than 512 bytes here.
+ * With ZFS, it is 131072 bytes. Block sizes that large don't work
+ * with disklabel and UFS on FreeBSD at least. Large block sizes
+ * may not work with other OSes as well. So just export a sector
+ * size of 512 bytes, which should work with any OS or
+ * application. Since our backing is a file, any block size will
+ * work fine for the backing store.
+ */
+#if 0
+ xbb->sector_size = vattr.va_blocksize;
+#endif
+ xbb->sector_size = 512;
+
+ /*
+ * Sanity check. The media size has to be at least one
+ * sector long.
+ */
+ if (xbb->media_size < xbb->sector_size) {
+ error = EINVAL;
+ xenbus_dev_fatal(xbb->dev, error,
+ "file %s size %ju < block size %u",
+ xbb->dev_name,
+ (uintmax_t)xbb->media_size,
+ xbb->sector_size);
 }
+ return (error);
 }
+/**
+ * Open the backend provider for this connection.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
 static int
-open_device(blkif_t *blkif)
+xbb_open_backend(struct xbb_softc *xbb)
 {
 struct nameidata nd;
- struct vattr vattr;
- struct cdev *dev;
- struct cdevsw *devsw;
- int flags = FREAD, err = 0;
+ int flags;
+ int error;
+ int vfs_is_locked;
- DPRINTF("opening dev=%s\n", blkif->dev_name);
+ flags = FREAD;
+ error = 0;
- if (!blkif->read_only)
+ DPRINTF("opening dev=%s\n", xbb->dev_name);
+
+ if ((xbb->flags & XBBF_READ_ONLY) == 0)
 flags |= FWRITE;
 if (!curthread->td_proc->p_fd->fd_cdir) {
@@ -1066,284 +1930,1045 @@ open_device(blkif_t *blkif)
 }
 again:
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, curthread);
- err = vn_open(&nd, &flags, 0, -1);
- if (err) {
- if (blkif->dev_name[0] != '/') {
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, xbb->dev_name, curthread);
+ error = vn_open(&nd, &flags, 0, NULL);
+ if (error) {
+ /*
+ * If the user doesn't give us a fully qualified path,
+ * looking under /dev is the only reasonable guess we
+ * can make. If they want to specify a file, they need
+ * to specify the full path. 
+ */ + if (xbb->dev_name[0] != '/') { char *dev_path = "/dev/"; char *dev_name; /* Try adding device path at beginning of name */ - dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT); + dev_name = malloc(strlen(xbb->dev_name) + + strlen(dev_path) + 1, + M_XENBLOCKBACK, M_NOWAIT); if (dev_name) { - sprintf(dev_name, "%s%s", dev_path, blkif->dev_name); - free(blkif->dev_name, M_DEVBUF); - blkif->dev_name = dev_name; + sprintf(dev_name, "%s%s", dev_path, + xbb->dev_name); + free(xbb->dev_name, M_XENBLOCKBACK); + xbb->dev_name = dev_name; goto again; } } - xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name); - return err; + xenbus_dev_fatal(xbb->dev, error, "error opening device %s", + xbb->dev_name); + return (error); } + + vfs_is_locked = NDHASGIANT(&nd); + NDFREE(&nd, NDF_ONLY_PNBUF); - blkif->vn = nd.ni_vp; + xbb->vn = nd.ni_vp; + + /* We only support disks and files. */ + if (vn_isdisk(xbb->vn, &error)) { + error = xbb_open_dev(xbb); + } else if (xbb->vn->v_type == VREG) { + error = xbb_open_file(xbb); + } else { + error = EINVAL; + xenbus_dev_fatal(xbb->dev, error, "%s is not a disk " + "or file", xbb->dev_name); + } + VOP_UNLOCK(xbb->vn, 0); + VFS_UNLOCK_GIANT(vfs_is_locked); - /* We only support disks for now */ - if (!vn_isdisk(blkif->vn, &err)) { - xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name); - VOP_UNLOCK(blkif->vn, 0, curthread); - goto error; + if (error != 0) { + xbb_close_backend(xbb); + return (error); } - blkif->cdev = blkif->vn->v_rdev; - blkif->csw = dev_refthread(blkif->cdev); - PANIC_IF(blkif->csw == NULL); + xbb->sector_size_shift = fls(xbb->sector_size) - 1; + xbb->media_num_sectors = xbb->media_size >> xbb->sector_size_shift; + + DPRINTF("opened %s=%s sector_size=%u media_size=%" PRId64 "\n", + (xbb->device_type == XBB_TYPE_DISK) ? "dev" : "file", + xbb->dev_name, xbb->sector_size, xbb->media_size); + + return (0); +} - err = VOP_GETATTR(blkif->vn, &vattr, NOCRED); - if (err) { - xenbus_dev_fatal(blkif->xdev, err, - "error getting vnode attributes for device %s", blkif->dev_name); - VOP_UNLOCK(blkif->vn, 0, curthread); - goto error; +/*------------------------ Inter-Domain Communication ------------------------*/ +/** + * Cleanup all inter-domain communication mechanisms. + * + * \param xbb Per-instance xbb configuration structure. 
+ */ +static void +xbb_disconnect(struct xbb_softc *xbb) +{ + struct gnttab_unmap_grant_ref ops[XBB_MAX_RING_PAGES]; + struct gnttab_unmap_grant_ref *op; + u_int ring_idx; + int error; + + DPRINTF("\n"); + + if ((xbb->flags & XBBF_RING_CONNECTED) == 0) + return; + + if (xbb->irq != 0) { + unbind_from_irqhandler(xbb->irq); + xbb->irq = 0; } - VOP_UNLOCK(blkif->vn, 0, curthread); + for (ring_idx = 0, op = ops; + ring_idx < xbb->ring_config.ring_pages; + ring_idx++, op++) { - dev = blkif->vn->v_rdev; - devsw = dev->si_devsw; - if (!devsw->d_ioctl) { - err = ENODEV; - xenbus_dev_fatal(blkif->xdev, err, - "no d_ioctl for device %s!", blkif->dev_name); - goto error; + op->host_addr = xbb->ring_config.gnt_addr + + (ring_idx * PAGE_SIZE); + op->dev_bus_addr = xbb->ring_config.bus_addr[ring_idx]; + op->handle = xbb->ring_config.handle[ring_idx]; } - err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread); - if (err) { - xenbus_dev_fatal(blkif->xdev, err, - "error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name); - goto error; + error = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ops, + xbb->ring_config.ring_pages); + if (error != 0) + panic("Grant table op failed (%d)", error); + + xbb->flags &= ~XBBF_RING_CONNECTED; +} + +/** + * Map shared memory ring into domain local address space, initialize + * ring control structures, and bind an interrupt to the event channel + * used to notify us of ring changes. + * + * \param xbb Per-instance xbb configuration structure. + */ +static int +xbb_connect_ring(struct xbb_softc *xbb) +{ + struct gnttab_map_grant_ref gnts[XBB_MAX_RING_PAGES]; + struct gnttab_map_grant_ref *gnt; + u_int ring_idx; + int error; + + if ((xbb->flags & XBBF_RING_CONNECTED) != 0) + return (0); + + /* + * Kva for our ring is at the tail of the region of kva allocated + * by xbb_alloc_communication_mem(). + */ + xbb->ring_config.va = xbb->kva + + (xbb->kva_size + - (xbb->ring_config.ring_pages * PAGE_SIZE)); + xbb->ring_config.gnt_addr = xbb->gnt_base_addr + + (xbb->kva_size + - (xbb->ring_config.ring_pages * PAGE_SIZE)); + + for (ring_idx = 0, gnt = gnts; + ring_idx < xbb->ring_config.ring_pages; + ring_idx++, gnt++) { + + gnt->host_addr = xbb->ring_config.gnt_addr + + (ring_idx * PAGE_SIZE); + gnt->flags = GNTMAP_host_map; + gnt->ref = xbb->ring_config.ring_ref[ring_idx]; + gnt->dom = xbb->otherend_id; } - blkif->sector_size_shift = fls(blkif->sector_size) - 1; - err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread); - if (err) { - xenbus_dev_fatal(blkif->xdev, err, - "error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name); - goto error; + error = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, gnts, + xbb->ring_config.ring_pages); + if (error) + panic("blkback: Ring page grant table op failed (%d)", error); + + for (ring_idx = 0, gnt = gnts; + ring_idx < xbb->ring_config.ring_pages; + ring_idx++, gnt++) { + if (gnt->status != 0) { + xbb->ring_config.va = 0; + xenbus_dev_fatal(xbb->dev, EACCES, + "Ring shared page mapping failed. " + "Status %d.", gnt->status); + return (EACCES); + } + xbb->ring_config.handle[ring_idx] = gnt->handle; + xbb->ring_config.bus_addr[ring_idx] = gnt->dev_bus_addr; } - blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift; - blkif->major = major(vattr.va_rdev); - blkif->minor = minor(vattr.va_rdev); + /* Initialize the ring based on ABI. 
*/ + switch (xbb->abi) { + case BLKIF_PROTOCOL_NATIVE: + { + blkif_sring_t *sring; + sring = (blkif_sring_t *)xbb->ring_config.va; + BACK_RING_INIT(&xbb->rings.native, sring, + xbb->ring_config.ring_pages * PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + blkif_x86_32_sring_t *sring_x86_32; + sring_x86_32 = (blkif_x86_32_sring_t *)xbb->ring_config.va; + BACK_RING_INIT(&xbb->rings.x86_32, sring_x86_32, + xbb->ring_config.ring_pages * PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + blkif_x86_64_sring_t *sring_x86_64; + sring_x86_64 = (blkif_x86_64_sring_t *)xbb->ring_config.va; + BACK_RING_INIT(&xbb->rings.x86_64, sring_x86_64, + xbb->ring_config.ring_pages * PAGE_SIZE); + break; + } + default: + panic("Unexpected blkif protocol ABI."); + } - DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n", - blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size); + xbb->flags |= XBBF_RING_CONNECTED; + + error = + bind_interdomain_evtchn_to_irqhandler(xbb->otherend_id, + xbb->ring_config.evtchn, + device_get_nameunit(xbb->dev), + xbb_intr, /*arg*/xbb, + INTR_TYPE_BIO | INTR_MPSAFE, + &xbb->irq); + if (error) { + xbb_disconnect(xbb); + xenbus_dev_fatal(xbb->dev, error, "binding event channel"); + return (error); + } - return 0; + DPRINTF("rings connected!\n"); - error: - close_device(blkif); - return err; + return 0; } +/** + * Size KVA and pseudo-physical address allocations based on negotiated + * values for the size and number of I/O requests, and the size of our + * communication ring. + * + * \param xbb Per-instance xbb configuration structure. + * + * These address spaces are used to dynamically map pages in the + * front-end's domain into our own. + */ static int -vbd_add_dev(struct xenbus_device *xdev) +xbb_alloc_communication_mem(struct xbb_softc *xbb) { - blkif_t *blkif = xdev->data; - device_t nexus, ndev; - devclass_t dc; - int err = 0; + xbb->kva_size = (xbb->ring_config.ring_pages + + (xbb->max_requests * xbb->max_request_segments)) + * PAGE_SIZE; +#ifndef XENHVM + xbb->kva = kmem_alloc_nofault(kernel_map, xbb->kva_size); + if (xbb->kva == 0) + return (ENOMEM); + xbb->gnt_base_addr = xbb->kva; +#else /* XENHVM */ + /* + * Reserve a range of pseudo physical memory that we can map + * into kva. These pages will only be backed by machine + * pages ("real memory") during the lifetime of front-end requests + * via grant table operations. + */ + xbb->pseudo_phys_res_id = 0; + xbb->pseudo_phys_res = bus_alloc_resource(xbb->dev, SYS_RES_MEMORY, + &xbb->pseudo_phys_res_id, + 0, ~0, xbb->kva_size, + RF_ACTIVE); + if (xbb->pseudo_phys_res == NULL) { + xbb->kva = 0; + return (ENOMEM); + } + xbb->kva = (vm_offset_t)rman_get_virtual(xbb->pseudo_phys_res); + xbb->gnt_base_addr = rman_get_start(xbb->pseudo_phys_res); +#endif /* XENHVM */ + return (0); +} - mtx_lock(&Giant); +/** + * Free dynamically allocated KVA or pseudo-physical address allocations. + * + * \param xbb Per-instance xbb configuration structure. 
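+ *
+ * \note The region released here is sized by
+ * xbb_alloc_communication_mem() as:
+ *
+ *	kva_size = (ring_pages + max_requests * max_request_segments)
+ *	         * PAGE_SIZE;
+ *
+ * For example, assuming the legacy defaults of one ring page, 32
+ * requests, and 11 segments per request with 4KB pages, that is
+ * (1 + 32 * 11) * 4096 = 1445888 bytes (353 pages).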
+ */
+static void
+xbb_free_communication_mem(struct xbb_softc *xbb)
+{
+ if (xbb->kva != 0) {
+#ifndef XENHVM
+ kmem_free(kernel_map, xbb->kva, xbb->kva_size);
+#else
+ if (xbb->pseudo_phys_res != NULL) {
+ bus_release_resource(xbb->dev, SYS_RES_MEMORY,
+ xbb->pseudo_phys_res_id,
+ xbb->pseudo_phys_res);
+ xbb->pseudo_phys_res = NULL;
+ }
+#endif
+ }
+ xbb->kva = 0;
+ xbb->gnt_base_addr = 0;
+}
- /* We will add a vbd device as a child of nexus0 (for now) */
- if (!(dc = devclass_find("nexus")) ||
- !(nexus = devclass_get_device(dc, 0))) {
- WPRINTF("could not find nexus0!\n");
- err = ENOENT;
- goto done;
+/**
+ * Collect front-end information from the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static int
+xbb_collect_frontend_info(struct xbb_softc *xbb)
+{
+ char protocol_abi[64];
+ const char *otherend_path;
+ int error;
+ u_int ring_idx;
+
+ otherend_path = xenbus_get_otherend_path(xbb->dev);
+
+ /*
+ * Mandatory data (used in all versions of the protocol) first.
+ */
+ error = xs_gather(XST_NIL, otherend_path,
+ "ring-ref", "%" PRIu32,
+ &xbb->ring_config.ring_ref[0],
+ "event-channel", "%" PRIu32,
+ &xbb->ring_config.evtchn,
+ NULL);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to retrieve ring information from "
+ "frontend %s. Unable to connect.",
+ xenbus_get_otherend_path(xbb->dev));
+ return (error);
 }
+ /*
+ * These fields are initialized to legacy protocol defaults
+ * so we only need to fail if reading the updated value succeeds
+ * and the new value is outside of its allowed range.
+ *
+ * \note xs_gather() returns on the first encountered error, so
+ * we must use independent calls in order to guarantee
+ * we don't miss information in a sparsely populated front-end
+ * tree.
+ */
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "ring-pages", NULL, "%" PRIu32,
+ &xbb->ring_config.ring_pages);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-requests", NULL, "%" PRIu32,
+ &xbb->max_requests);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-segments", NULL, "%" PRIu32,
+ &xbb->max_request_segments);
+
+ (void)xs_scanf(XST_NIL, otherend_path,
+ "max-request-size", NULL, "%" PRIu32,
+ &xbb->max_request_size);
+
+ if (xbb->ring_config.ring_pages > XBB_MAX_RING_PAGES) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified ring-pages of %u "
+ "exceeds backend limit of %zu. "
+ "Unable to connect.",
+ xbb->ring_config.ring_pages,
+ XBB_MAX_RING_PAGES);
+ return (EINVAL);
+ } else if (xbb->max_requests > XBB_MAX_REQUESTS) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified max_requests of %u "
+ "exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_requests,
+ XBB_MAX_REQUESTS);
+ return (EINVAL);
+ } else if (xbb->max_request_segments > XBB_MAX_SEGMENTS_PER_REQUEST) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified max_request_segments "
+ "of %u exceeds backend limit of %u. "
+ "Unable to connect.",
+ xbb->max_request_segments,
+ XBB_MAX_SEGMENTS_PER_REQUEST);
+ return (EINVAL);
+ } else if (xbb->max_request_size > XBB_MAX_REQUEST_SIZE) {
+ xenbus_dev_fatal(xbb->dev, EINVAL,
+ "Front-end specified max_request_size "
+ "of %u exceeds backend limit of %u. 
" + "Unable to connect.", + xbb->max_request_size, + XBB_MAX_REQUEST_SIZE); + return (EINVAL); + } - /* Create a newbus device representing the vbd */ - ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle); - if (!ndev) { - WPRINTF("could not create newbus device vbd%d!\n", blkif->handle); - err = EFAULT; - goto done; + /* If using a multi-page ring, pull in the remaining references. */ + for (ring_idx = 1; ring_idx < xbb->ring_config.ring_pages; ring_idx++) { + char ring_ref_name[]= "ring_refXX"; + + snprintf(ring_ref_name, sizeof(ring_ref_name), + "ring-ref%u", ring_idx); + error = xs_scanf(XST_NIL, otherend_path, + ring_ref_name, NULL, "%" PRIu32, + &xbb->ring_config.ring_ref[ring_idx]); + if (error != 0) { + xenbus_dev_fatal(xbb->dev, error, + "Failed to retriev grant reference " + "for page %u of shared ring. Unable " + "to connect.", ring_idx); + return (error); + } } - - blkif_get(blkif); - device_set_ivars(ndev, blkif); - blkif->ndev = ndev; - device_probe_and_attach(ndev); + error = xs_gather(XST_NIL, otherend_path, + "protocol", "%63s", protocol_abi, + NULL); + if (error != 0 + || !strcmp(protocol_abi, XEN_IO_PROTO_ABI_NATIVE)) { + /* + * Assume native if the frontend has not + * published ABI data or it has published and + * matches our own ABI. + */ + xbb->abi = BLKIF_PROTOCOL_NATIVE; + } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_32)) { - done: + xbb->abi = BLKIF_PROTOCOL_X86_32; + } else if (!strcmp(protocol_abi, XEN_IO_PROTO_ABI_X86_64)) { - mtx_unlock(&Giant); + xbb->abi = BLKIF_PROTOCOL_X86_64; + } else { - return err; + xenbus_dev_fatal(xbb->dev, EINVAL, + "Unknown protocol ABI (%s) published by " + "frontend. Unable to connect.", protocol_abi); + return (EINVAL); + } + return (0); } -enum { - VBD_SYSCTL_DOMID, - VBD_SYSCTL_ST_RD_REQ, - VBD_SYSCTL_ST_WR_REQ, - VBD_SYSCTL_ST_OO_REQ, - VBD_SYSCTL_ST_ERR_REQ, - VBD_SYSCTL_RING, -}; - -static char * -vbd_sysctl_ring_info(blkif_t *blkif, int cmd) +/** + * Allocate per-request data structures given request size and number + * information negotiated with the front-end. + * + * \param xbb Per-instance xbb configuration structure. + */ +static int +xbb_alloc_requests(struct xbb_softc *xbb) { - char *buf = malloc(256, M_DEVBUF, M_WAITOK); - if (buf) { - if (!blkif->ring_connected) - sprintf(buf, "ring not connected\n"); - else { - blkif_back_ring_t *ring = &blkif->ring; - sprintf(buf, "nr_ents=%x req_cons=%x" - " req_prod=%x req_event=%x" - " rsp_prod=%x rsp_event=%x", - ring->nr_ents, ring->req_cons, - ring->sring->req_prod, ring->sring->req_event, - ring->sring->rsp_prod, ring->sring->rsp_event); + struct xbb_xen_req *req; + struct xbb_xen_req *last_req; + uint8_t *req_kva; + u_long gnt_base; + + /* + * Allocate request book keeping datastructures. 
+ */ + xbb->requests = malloc(xbb->max_requests * sizeof(*xbb->requests), + M_XENBLOCKBACK, M_NOWAIT|M_ZERO); + if (xbb->requests == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request structures"); + return (ENOMEM); + } + + req_kva = (uint8_t *)xbb->kva; + gnt_base = xbb->gnt_base_addr; + req = xbb->requests; + last_req = &xbb->requests[xbb->max_requests - 1]; + while (req <= last_req) { + int seg; + + req->xbb = xbb; + req->kva = req_kva; + req->gnt_handles = malloc(xbb->max_request_segments + * sizeof(*req->gnt_handles), + M_XENBLOCKBACK, M_NOWAIT|M_ZERO); + if (req->gnt_handles == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request " + "grant references"); + return (ENOMEM); + } +#ifdef XBB_USE_BOUNCE_BUFFERS + req->bounce = malloc(xbb->max_request_size, + M_XENBLOCKBACK, M_NOWAIT); + if (req->bounce == NULL) { + xenbus_dev_fatal(xbb->dev, ENOMEM, + "Unable to allocate request " + "bounce buffers"); + return (ENOMEM); } +#endif /* XBB_USE_BOUNCE_BUFFERS */ + req->gnt_base = gnt_base; + req_kva += xbb->max_request_segments * PAGE_SIZE; + gnt_base += xbb->max_request_segments * PAGE_SIZE; + SLIST_INSERT_HEAD(&xbb->request_free_slist, req, links); + + for (seg = 0; seg < xbb->max_request_segments; seg++) + req->gnt_handles[seg] = GRANT_REF_INVALID; + + req++; } - return buf; + return (0); } +/** + * Supply information about the physical device to the frontend + * via XenBus. + * + * \param xbb Per-instance xbb configuration structure. + */ static int -vbd_sysctl_handler(SYSCTL_HANDLER_ARGS) +xbb_publish_backend_info(struct xbb_softc *xbb) { - device_t dev = (device_t)arg1; - blkif_t *blkif = (blkif_t *)device_get_ivars(dev); - const char *value; - char *buf = NULL; - int err; - - switch (arg2) { - case VBD_SYSCTL_DOMID: - return sysctl_handle_int(oidp, NULL, blkif->domid, req); - case VBD_SYSCTL_ST_RD_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req); - case VBD_SYSCTL_ST_WR_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req); - case VBD_SYSCTL_ST_OO_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req); - case VBD_SYSCTL_ST_ERR_REQ: - return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req); - case VBD_SYSCTL_RING: - value = buf = vbd_sysctl_ring_info(blkif, arg2); - break; - default: - return (EINVAL); + struct xs_transaction xst; + const char *our_path; + const char *leaf; + int error; + + our_path = xenbus_get_node(xbb->dev); + while (1) { + error = xs_transaction_start(&xst); + if (error != 0) { + xenbus_dev_fatal(xbb->dev, error, + "Error publishing backend info " + "(start transaction)"); + return (error); + } + + leaf = "sectors"; + error = xs_printf(xst, our_path, leaf, + "%"PRIu64, xbb->media_num_sectors); + if (error != 0) + break; + + /* XXX Support all VBD attributes here. */ + leaf = "info"; + error = xs_printf(xst, our_path, leaf, "%u", + xbb->flags & XBBF_READ_ONLY + ? 
VDISK_READONLY : 0);
+ if (error != 0)
+ break;
+
+ leaf = "sector-size";
+ error = xs_printf(xst, our_path, leaf, "%u",
+ xbb->sector_size);
+ if (error != 0)
+ break;
+
+ error = xs_transaction_end(xst, 0);
+ if (error == 0) {
+ return (0);
+ } else if (error != EAGAIN) {
+ xenbus_dev_fatal(xbb->dev, error, "ending transaction");
+ return (error);
+ }
 }
- err = SYSCTL_OUT(req, value, strlen(value));
- if (buf != NULL)
- free(buf, M_DEVBUF);
+ xenbus_dev_fatal(xbb->dev, error, "writing %s/%s",
+ our_path, leaf);
+ xs_transaction_end(xst, 1);
+ return (error);
+}
+
+/**
+ * Connect to our blkfront peer now that it has completed publishing
+ * its configuration into the XenStore.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ */
+static void
+xbb_connect(struct xbb_softc *xbb)
+{
+ int error;
+
+ if (xenbus_get_state(xbb->dev) == XenbusStateConnected)
+ return;
+
+ if (xbb_collect_frontend_info(xbb) != 0)
+ return;
- return err;
+ /* Allocate resources whose size depends on front-end configuration. */
+ error = xbb_alloc_communication_mem(xbb);
+ if (error != 0) {
+ xenbus_dev_fatal(xbb->dev, error,
+ "Unable to allocate communication memory");
+ return;
+ }
+
+ error = xbb_alloc_requests(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_alloc_requests(). */
+ return;
+ }
+
+ /*
+ * Connect communication channel.
+ */
+ error = xbb_connect_ring(xbb);
+ if (error != 0) {
+ /* Specific errors are reported by xbb_connect_ring(). */
+ return;
+ }
+
+ if (xbb_publish_backend_info(xbb) != 0) {
+ /*
+ * If we can't publish our data, we cannot participate
+ * in this connection, and waiting for a front-end state
+ * change will not help the situation.
+ */
+ xbb_disconnect(xbb);
+ return;
+ }
+
+ /* Ready for I/O. */
+ xenbus_set_state(xbb->dev, XenbusStateConnected);
 }
-/* Newbus vbd device driver probe */
+/*-------------------------- Device Teardown Support -------------------------*/
+/**
+ * Perform device shutdown functions.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ *
+ * Mark this instance as shutting down, wait for any active I/O on the
+ * backend device/file to drain, disconnect from the front-end, and notify
+ * any waiters (e.g. a thread invoking our detach method) that detach can
+ * now proceed.
+ */
 static int
-vbd_probe(device_t dev)
+xbb_shutdown(struct xbb_softc *xbb)
 {
- DPRINTF("vbd%d\n", device_get_unit(dev));
- return 0;
+ static int in_shutdown;
+
+ DPRINTF("\n");
+
+ /*
+ * Due to the need to drop our mutex during some
+ * xenbus operations, it is possible for two threads
+ * to attempt to close out shutdown processing at
+ * the same time. Tell the caller that hits this
+ * race to try back later.
+ */
+ if (in_shutdown != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Indicate shutdown is in progress. */
+ xbb->flags |= XBBF_SHUTDOWN;
+
+ /* Wait for requests to complete. */
+ if (xbb->active_request_count != 0)
+ return (EAGAIN);
+
+ DPRINTF("\n");
+
+ /* Disconnect from the front-end. */
+ xbb_disconnect(xbb);
+
+ in_shutdown = 1;
+ mtx_unlock(&xbb->lock);
+ xenbus_set_state(xbb->dev, XenbusStateClosed);
+ mtx_lock(&xbb->lock);
+ in_shutdown = 0;
+
+ /* Indicate to xbb_detach() that it is safe to proceed. */
+ wakeup(xbb);
+
+ return (0);
+}
+
+/**
+ * Report an attach time error to the console and Xen, and cleanup
+ * this instance by forcing immediate detach processing.
+ *
+ * \param xbb Per-instance xbb configuration structure.
+ * \param err Errno describing the error. 
+ * \param fmt Printf style format and arguments + */ +static void +xbb_attach_failed(struct xbb_softc *xbb, int err, const char *fmt, ...) +{ + va_list ap; + va_list ap_hotplug; + + va_start(ap, fmt); + va_copy(ap_hotplug, ap); + xs_vprintf(XST_NIL, xenbus_get_node(xbb->dev), + "hotplug-error", fmt, ap_hotplug); + va_end(ap_hotplug); + xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "hotplug-status", "error"); + + xenbus_dev_vfatal(xbb->dev, err, fmt, ap); + va_end(ap); + + xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "online", "0"); + xbb_detach(xbb->dev); } -/* Newbus vbd device driver attach */ +/*---------------------------- NewBus Entrypoints ----------------------------*/ +/** + * Inspect a XenBus device and claim it if is of the appropriate type. + * + * \param dev NewBus device object representing a candidate XenBus device. + * + * \return 0 for success, errno codes for failure. + */ static int -vbd_attach(device_t dev) +xbb_probe(device_t dev) { - blkif_t *blkif = (blkif_t *)device_get_ivars(dev); - - DPRINTF("%s\n", blkif->dev_name); - - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD, - dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I", - "domid of frontend"); - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD, - dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I", - "number of read reqs"); - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD, - dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I", - "number of write reqs"); - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD, - dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I", - "number of deferred reqs"); - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD, - dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I", - "number of reqs that returned error"); -#if XEN_BLKBACK_DEBUG - SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), - OID_AUTO, "ring", CTLFLAG_RD, - dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A", - "req ring info"); -#endif + + if (!strcmp(xenbus_get_type(dev), "vbd")) { + device_set_desc(dev, "Backend Virtual Block Device"); + device_quiet(dev); + return (0); + } + + return (ENXIO); +} - if (!open_device(blkif)) - connect(blkif); +/** + * Attach to a XenBus device that has been claimed by our probe routine. + * + * \param dev NewBus device object representing this Xen Block Back instance. + * + * \return 0 for success, errno codes for failure. + */ +static int +xbb_attach(device_t dev) +{ + struct xbb_softc *xbb; + int error; + + DPRINTF("Attaching to %s\n", xenbus_get_node(dev)); + + /* + * Basic initialization. + * After this block it is safe to call xbb_detach() + * to clean up any allocated data for this instance. + */ + xbb = device_get_softc(dev); + xbb->dev = dev; + xbb->otherend_id = xenbus_get_otherend_id(dev); + TASK_INIT(&xbb->io_task, /*priority*/0, xbb_run_queue, xbb); + mtx_init(&xbb->lock, device_get_nameunit(dev), NULL, MTX_DEF); + SLIST_INIT(&xbb->request_free_slist); + + /* + * Protocol defaults valid even if all negotiation fails. 
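+ * As a worked illustration (assuming 4KB pages, the classic
+ * 32-entry single-page blkif ring, and 11 segments per request
+ * header block), the defaults below come out to:
+ *
+ *	ring_pages	     = 1
+ *	max_requests	     = 32
+ *	max_request_segments = 11
+ *	max_request_size     = 11 * 4096 = 45056 bytes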
+ */ + xbb->ring_config.ring_pages = 1; + xbb->max_requests = BLKIF_MAX_RING_REQUESTS(PAGE_SIZE); + xbb->max_request_segments = BLKIF_MAX_SEGMENTS_PER_HEADER_BLOCK; + xbb->max_request_size = xbb->max_request_segments * PAGE_SIZE; + + /* + * Publish protocol capabilities for consumption by the + * front-end. + */ + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "feature-barrier", "1"); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/feature-barrier", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "feature-flush-cache", "1"); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/feature-flush-cache", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-ring-pages", "%zu", XBB_MAX_RING_PAGES); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-ring-pages", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-requests", "%u", XBB_MAX_REQUESTS); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-requests", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-request-segments", "%u", + XBB_MAX_SEGMENTS_PER_REQUEST); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-request-segments", + xenbus_get_node(xbb->dev)); + return (error); + } + + error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev), + "max-request-size", "%u", + XBB_MAX_REQUEST_SIZE); + if (error) { + xbb_attach_failed(xbb, error, "writing %s/max-request-size", + xenbus_get_node(xbb->dev)); + return (error); + } + + /* Collect physical device information. */ + error = xs_gather(XST_NIL, xenbus_get_otherend_path(xbb->dev), + "device-type", NULL, &xbb->dev_type, + NULL); + if (error != 0) + xbb->dev_type = NULL; + + error = xs_gather(XST_NIL, xenbus_get_node(dev), + "mode", NULL, &xbb->dev_mode, + "params", NULL, &xbb->dev_name, + NULL); + if (error != 0) { + xbb_attach_failed(xbb, error, "reading backend fields at %s", + xenbus_get_node(dev)); + return (ENXIO); + } + + /* Parse fopen style mode flags. */ + if (strchr(xbb->dev_mode, 'w') == NULL) + xbb->flags |= XBBF_READ_ONLY; + + /* + * Verify the physical device is present and can support + * the desired I/O mode. + */ + DROP_GIANT(); + error = xbb_open_backend(xbb); + PICKUP_GIANT(); + if (error != 0) { + xbb_attach_failed(xbb, error, "Unable to open %s", + xbb->dev_name); + return (ENXIO); + } - return bus_generic_attach(dev); + /* Use devstat(9) for recording statistics. */ + xbb->xbb_stats = devstat_new_entry("xbb", device_get_unit(xbb->dev), + xbb->sector_size, + DEVSTAT_ALL_SUPPORTED, + DEVSTAT_TYPE_DIRECT + | DEVSTAT_TYPE_IF_OTHER, + DEVSTAT_PRIORITY_OTHER); + /* + * Create a taskqueue for doing work that must occur from a + * thread context. + */ + xbb->io_taskqueue = taskqueue_create(device_get_nameunit(dev), M_NOWAIT, + taskqueue_thread_enqueue, + /*context*/&xbb->io_taskqueue); + if (xbb->io_taskqueue == NULL) { + xbb_attach_failed(xbb, error, "Unable to create taskqueue"); + return (ENOMEM); + } + + taskqueue_start_threads(&xbb->io_taskqueue, + /*num threads*/1, + /*priority*/PWAIT, + /*thread name*/ + "%s taskq", device_get_nameunit(dev)); + + /* Update hot-plug status to satisfy xend. 
*/
+ error = xs_printf(XST_NIL, xenbus_get_node(xbb->dev),
+ "hotplug-status", "connected");
+ if (error) {
+ xbb_attach_failed(xbb, error, "writing %s/hotplug-status",
+ xenbus_get_node(xbb->dev));
+ return (error);
+ }
+
+ /* Tell the front end that we are ready to connect. */
+ xenbus_set_state(dev, XenbusStateInitWait);
+
+ return (0);
 }
-/* Newbus vbd device driver detach */
+/**
+ * Detach from a block back device instance.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ *
+ * \note A block back device may be detached at any time in its life-cycle,
+ * including part way through the attach process. For this reason,
+ * initialization order and the initialization state checks in this
+ * routine must be carefully coupled so that attach time failures
+ * are gracefully handled.
+ */
 static int
-vbd_detach(device_t dev)
+xbb_detach(device_t dev)
 {
- blkif_t *blkif = (blkif_t *)device_get_ivars(dev);
+ struct xbb_softc *xbb;
- DPRINTF("%s\n", blkif->dev_name);
+ DPRINTF("\n");
- close_device(blkif);
+ xbb = device_get_softc(dev);
+ mtx_lock(&xbb->lock);
+ while (xbb_shutdown(xbb) == EAGAIN) {
+ msleep(xbb, &xbb->lock, /*wakeup prio unchanged*/0,
+ "xbb_shutdown", 0);
+ }
+ mtx_unlock(&xbb->lock);
+ mtx_destroy(&xbb->lock);
- bus_generic_detach(dev);
+ DPRINTF("\n");
- blkif_put(blkif);
+ taskqueue_free(xbb->io_taskqueue);
+ devstat_remove_entry(xbb->xbb_stats);
- return 0;
+ xbb_close_backend(xbb);
+ xbb_free_communication_mem(xbb);
+
+ if (xbb->dev_mode != NULL) {
+ free(xbb->dev_mode, M_XENBUS);
+ xbb->dev_mode = NULL;
+ }
+
+ if (xbb->dev_type != NULL) {
+ free(xbb->dev_type, M_XENBUS);
+ xbb->dev_type = NULL;
+ }
+
+ if (xbb->dev_name != NULL) {
+ free(xbb->dev_name, M_XENBUS);
+ xbb->dev_name = NULL;
+ }
+
+ if (xbb->requests != NULL) {
+ struct xbb_xen_req *req;
+ struct xbb_xen_req *last_req;
+
+ req = xbb->requests;
+ last_req = &xbb->requests[xbb->max_requests - 1];
+ while (req <= last_req) {
+#ifdef XBB_USE_BOUNCE_BUFFERS
+ if (req->bounce != NULL) {
+ free(req->bounce, M_XENBLOCKBACK);
+ req->bounce = NULL;
+ }
+#endif
+ if (req->gnt_handles != NULL) {
+ free(req->gnt_handles, M_XENBLOCKBACK);
+ req->gnt_handles = NULL;
+ }
+ req++;
+ }
+ free(xbb->requests, M_XENBLOCKBACK);
+ xbb->requests = NULL;
+ }
+
+ return (0);
 }
-static device_method_t vbd_methods[] = {
+/**
+ * Prepare this block back device for suspension of this VM.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_suspend(device_t dev)
+{
+#ifdef NOT_YET
+ struct xbb_softc *sc = device_get_softc(dev);
+
+ /* Prevent new requests being issued until we fix things up. */
+ mtx_lock(&sc->xb_io_lock);
+ sc->connected = BLKIF_STATE_SUSPENDED;
+ mtx_unlock(&sc->xb_io_lock);
+#endif
+
+ return (0);
+}
+
+/**
+ * Perform any processing required to recover from a suspended state.
+ *
+ * \param dev NewBus device object representing this Xen Block Back instance.
+ *
+ * \return 0 for success, errno codes for failure.
+ */
+static int
+xbb_resume(device_t dev)
+{
+ return (0);
+}
+
+/**
+ * Handle state changes expressed via the XenStore by our front-end peer.
+ *
+ * \param dev NewBus device object representing this Xen
+ * Block Back instance.
+ * \param frontend_state The new state of the front-end.
+ *
+ * \return 0 for success, errno codes for failure. 
+ */
+static int
+xbb_frontend_changed(device_t dev, XenbusState frontend_state)
+{
+ struct xbb_softc *xbb = device_get_softc(dev);
+
+ DPRINTF("state=%s\n", xenbus_strstate(frontend_state));
+
+ switch (frontend_state) {
+ case XenbusStateInitialising:
+ case XenbusStateClosing:
+ break;
+ case XenbusStateInitialised:
+ case XenbusStateConnected:
+ xbb_connect(xbb);
+ break;
+ case XenbusStateClosed:
+ case XenbusStateInitWait:
+
+ mtx_lock(&xbb->lock);
+ xbb_shutdown(xbb);
+ mtx_unlock(&xbb->lock);
+ break;
+ default:
+ xenbus_dev_fatal(xbb->dev, EINVAL, "saw state %d at frontend",
+ frontend_state);
+ break;
+ }
+ return (0);
+}
+
+/*---------------------------- NewBus Registration ---------------------------*/
+static device_method_t xbb_methods[] = {
 /* Device interface */
- DEVMETHOD(device_probe, vbd_probe),
- DEVMETHOD(device_attach, vbd_attach),
- DEVMETHOD(device_detach, vbd_detach),
+ DEVMETHOD(device_probe, xbb_probe),
+ DEVMETHOD(device_attach, xbb_attach),
+ DEVMETHOD(device_detach, xbb_detach),
 DEVMETHOD(device_shutdown, bus_generic_shutdown),
- DEVMETHOD(device_suspend, bus_generic_suspend),
- DEVMETHOD(device_resume, bus_generic_resume),
- {0, 0}
-};
+ DEVMETHOD(device_suspend, xbb_suspend),
+ DEVMETHOD(device_resume, xbb_resume),
-static devclass_t vbd_devclass;
+ /* Xenbus interface */
+ DEVMETHOD(xenbus_otherend_changed, xbb_frontend_changed),
-static driver_t vbd_driver = {
- "vbd",
- vbd_methods,
- 0,
+ { 0, 0 }
 };
-DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0);
+static driver_t xbb_driver = {
+ "xbbd",
+ xbb_methods,
+ sizeof(struct xbb_softc),
+};
+devclass_t xbb_devclass;
-/*
- * Local variables:
- * mode: C
- * c-set-style: "BSD"
- * c-basic-offset: 4
- * tab-width: 4
- * indent-tabs-mode: t
- * End:
- */
+DRIVER_MODULE(xbbd, xenbusb_back, xbb_driver, xbb_devclass, 0, 0);