Diffstat (limited to 'sys')
-rw-r--r-- | sys/dev/xen/balloon/balloon.c      |  446
-rw-r--r-- | sys/dev/xen/blkback/blkback.c      | 1349
-rw-r--r-- | sys/dev/xen/blkfront/blkfront.c    | 1021
-rw-r--r-- | sys/dev/xen/blkfront/block.h       |   97
-rw-r--r-- | sys/dev/xen/console/console.c      |  564
-rw-r--r-- | sys/dev/xen/console/xencons_ring.c |  154
-rw-r--r-- | sys/dev/xen/console/xencons_ring.h |   20
-rw-r--r-- | sys/dev/xen/evtchn/evtchn_dev.c    |  394
-rw-r--r-- | sys/dev/xen/netback/netback.c      | 1585
-rw-r--r-- | sys/dev/xen/netfront/mbufq.h       |  123
-rw-r--r-- | sys/dev/xen/netfront/netfront.c    | 1829
-rw-r--r-- | sys/dev/xen/pcifront/pcifront.c    |  688
12 files changed, 8270 insertions(+), 0 deletions(-)
diff --git a/sys/dev/xen/balloon/balloon.c b/sys/dev/xen/balloon/balloon.c new file mode 100644 index 0000000..fa49196 --- /dev/null +++ b/sys/dev/xen/balloon/balloon.c @@ -0,0 +1,446 @@ +/****************************************************************************** + * balloon.c + * + * Xen balloon driver - enables returning/claiming memory to/from Xen. + * + * Copyright (c) 2003, B Dragovic + * Copyright (c) 2003-2004, M Williamson, K Fraser + * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * + * This file may be distributed separately from the Linux kernel, or + * incorporated into other software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/lock.h> +#include <sys/mutex.h> + +#include <machine/hypervisor-ifs.h> +#include <machine/xen-os.h> +#include <machine/xenbus.h> + +/* + * Protects atomic reservation decrease/increase against concurrent increases. + * Also protects non-atomic updates of current_pages and driver_pages, and + * balloon lists. + */ +struct mtx balloon_lock; +#ifdef notyet + +/* We aim for 'current allocation' == 'target allocation'. */ +static unsigned long current_pages; +static unsigned long target_pages; + +/* VM /proc information for memory */ +extern unsigned long totalram_pages; + +/* We may hit the hard limit in Xen. If we do then we remember it. */ +static unsigned long hard_limit; + +/* + * Drivers may alter the memory reservation independently, but they must + * inform the balloon driver so that we can avoid hitting the hard limit. + */ +static unsigned long driver_pages; + +struct balloon_entry { + vm_page_t page; + STAILQ_ENTRY(balloon_entry) list; +}; + +/* List of ballooned pages, threaded through the mem_map array. */ +static STAILQ_HEAD(,balloon_entry) ballooned_pages; + +static unsigned long balloon_low, balloon_high; + + +/* Main work function, always executed in process context. */ +static void balloon_process(void *unused); + +#define IPRINTK(fmt, args...) \ + printk(KERN_INFO "xen_mem: " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printk(KERN_WARNING "xen_mem: " fmt, ##args) + +/* balloon_append: add the given page to the balloon. 
*/ +static void +balloon_append(vm_page_t page) +{ + struct balloon_entry *entry; + + entry = malloc(sizeof(struct balloon_entry), M_WAITOK); + + STAILQ_INSERT_HEAD(&ballooned_pages, entry, list); + balloon_low++; +} + +/* balloon_retrieve: rescue a page from the balloon, if it is not empty. */ +static vm_page_t +balloon_retrieve(void) +{ + vm_page_t page; + struct balloon_entry *entry; + + if (STAILQ_EMPTY(&ballooned_pages)) + return NULL; + + entry = STAILQ_FIRST(&ballooned_pages); + STAILQ_REMOVE_HEAD(&ballooned_pages, list); + + page = entry->page; + free(entry, M_DEVBUF); + + balloon_low--; + + return page; +} + +static void +balloon_alarm(unsigned long unused) +{ + wakeup(balloon_process); +} + +static unsigned long +current_target(void) +{ + unsigned long target = min(target_pages, hard_limit); + if (target > (current_pages + balloon_low + balloon_high)) + target = current_pages + balloon_low + balloon_high; + return target; +} + +static int +increase_reservation(unsigned long nr_pages) +{ + unsigned long *mfn_list, pfn, i, flags; + struct page *page; + long rc; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > (PAGE_SIZE / sizeof(unsigned long))) + nr_pages = PAGE_SIZE / sizeof(unsigned long); + + mfn_list = (unsigned long *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (mfn_list == NULL) + return ENOMEM; + + + reservation.extent_start = mfn_list; + reservation.nr_extents = nr_pages; + rc = HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation); + if (rc < nr_pages) { + int ret; + /* We hit the Xen hard limit: reprobe. */ + reservation.extent_start = mfn_list; + reservation.nr_extents = rc; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, + &reservation); + PANIC_IF(ret != rc); + hard_limit = current_pages + rc - driver_pages; + goto out; + } + + for (i = 0; i < nr_pages; i++) { + page = balloon_retrieve(); + PANIC_IF(page == NULL); + + pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); + PANIC_IF(phys_to_machine_mapping_valid(pfn)); + + /* Update P->M and M->P tables. */ + PFNTOMFN(pfn) = mfn_list[i]; + xen_machphys_update(mfn_list[i], pfn); + + /* Relinquish the page back to the allocator. */ + ClearPageReserved(page); + set_page_count(page, 1); + vm_page_free(page); + } + + current_pages += nr_pages; + totalram_pages = current_pages; + + out: + balloon_unlock(flags); + + free((mfn_list); + + return 0; +} + +static int +decrease_reservation(unsigned long nr_pages) +{ + unsigned long *mfn_list, pfn, i, flags; + struct page *page; + void *v; + int need_sleep = 0; + int ret; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + if (nr_pages > (PAGE_SIZE / sizeof(unsigned long))) + nr_pages = PAGE_SIZE / sizeof(unsigned long); + + mfn_list = (unsigned long *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (mfn_list == NULL) + return ENOMEM; + + for (i = 0; i < nr_pages; i++) { + int color = 0; + if ((page = vm_page_alloc(NULL, color++, + VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ | + VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) { + nr_pages = i; + need_sleep = 1; + break; + } + pfn = (VM_PAGE_TO_PHYS(page) >> PAGE_SHIFT); + mfn_list[i] = PFNTOMFN(pfn); + } + + balloon_lock(flags); + + /* No more mappings: invalidate P2M and add to balloon. 
*/ + for (i = 0; i < nr_pages; i++) { + pfn = MFNTOPFN(mfn_list[i]); + PFNTOMFN(pfn) = INVALID_P2M_ENTRY; + balloon_append(PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT)); + } + + reservation.extent_start = mfn_list; + reservation.nr_extents = nr_pages; + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + PANIC_IF(ret != nr_pages); + + current_pages -= nr_pages; + totalram_pages = current_pages; + + balloon_unlock(flags); + + free(mfn_list, M_DEVBUF); + + return need_sleep; +} + +/* + * We avoid multiple worker processes conflicting via the balloon mutex. + * We may of course race updates of the target counts (which are protected + * by the balloon lock), or with changes to the Xen hard limit, but we will + * recover from these in time. + */ +static void +balloon_process(void *unused) +{ + int need_sleep = 0; + long credit; + + for (;;) { + do { + credit = current_target() - current_pages; + if (credit > 0) + need_sleep = (increase_reservation(credit) != 0); + if (credit < 0) + need_sleep = (decrease_reservation(-credit) != 0); + +#ifndef CONFIG_PREEMPT + if (need_resched()) + schedule(); +#endif + } while ((credit != 0) && !need_sleep); + + /* Schedule more work if there is some still to be done. */ + if (current_target() != current_pages) + timeout(balloon_alarm, NULL, ticks + HZ); + + msleep(balloon_process, balloon_lock, 0, "balloon", -1); + } + +} + +/* Resets the Xen limit, sets new target, and kicks off processing. */ +static void +set_new_target(unsigned long target) +{ + /* No need for lock. Not read-modify-write updates. */ + hard_limit = ~0UL; + target_pages = target; + wakeup(balloon_process); +} + +static struct xenbus_watch target_watch = +{ + .node = "memory/target" +}; + +/* React to a change in the target key */ +static void +watch_target(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + unsigned long long new_target; + int err; + + err = xenbus_scanf(NULL, "memory", "target", "%llu", &new_target); + if (err != 1) { + /* This is ok (for domain0 at least) - so just return */ + return; + } + + /* The given memory/target value is in KiB, so it needs converting to + pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. + */ + set_new_target(new_target >> (PAGE_SHIFT - 10)); + +} + +static void +balloon_init_watcher(void *) +{ + int err; + + err = register_xenbus_watch(&target_watch); + if (err) + printf("Failed to set balloon watcher\n"); + +} + +static void +balloon_init(void *) +{ + unsigned long pfn; + struct page *page; + + IPRINTK("Initialising balloon driver.\n"); + + if (xen_init() < 0) + return -1; + + current_pages = min(xen_start_info->nr_pages, max_pfn); + target_pages = current_pages; + balloon_low = 0; + balloon_high = 0; + driver_pages = 0UL; + hard_limit = ~0UL; + + init_timer(&balloon_timer); + balloon_timer.data = 0; + balloon_timer.function = balloon_alarm; + + /* Initialise the balloon with excess memory space. 
*/ + for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { + page = PHYS_TO_VM_PAGE(pfn << PAGE_SHIFT); + balloon_append(page); + } + + target_watch.callback = watch_target; + + return 0; +} + +void +balloon_update_driver_allowance(long delta) +{ + unsigned long flags; + + balloon_lock(flags); + driver_pages += delta; + balloon_unlock(flags); +} + +#if 0 +static int dealloc_pte_fn( + pte_t *pte, struct page *pte_page, unsigned long addr, void *data) +{ + unsigned long mfn = pte_mfn(*pte); + int ret; + struct xen_memory_reservation reservation = { + .extent_start = &mfn, + .nr_extents = 1, + .extent_order = 0, + .domid = DOMID_SELF + }; + set_pte_at(&init_mm, addr, pte, __pte_ma(0)); + set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); + ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); + PANIC_IF(ret != 1); + return 0; +} + +#endif +vm_page_t +balloon_alloc_empty_page_range(unsigned long nr_pages) +{ + unsigned long flags; + vm_page_t pages; + int i; + unsigned long *mfn_list; + struct xen_memory_reservation reservation = { + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + pages = vm_page_alloc_contig(nr_pages, 0, -1, 4, 4) + if (pages == NULL) + return NULL; + + mfn_list = malloc(nr_pages*sizeof(unsigned long), M_DEVBUF, M_WAITOK); + + for (i = 0; i < nr_pages; i++) { + mfn_list[i] = PFNTOMFN(VM_PAGE_TO_PHYS(pages[i]) >> PAGE_SHIFT); + PFNTOMFN(i) = INVALID_P2M_ENTRY; + reservation.extent_start = mfn_list; + reservation.nr_extents = nr_pages; + PANIC_IF(HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation) != nr_pages); + } + + current_pages -= nr_pages; + + wakeup(balloon_process); + + return pages; +} + +void +balloon_dealloc_empty_page_range(vm_page_t page, unsigned long nr_pages) +{ + unsigned long i, flags; + + for (i = 0; i < nr_pages; i++) + balloon_append(page + i); + + wakeup(balloon_process); +} + +#endif diff --git a/sys/dev/xen/blkback/blkback.c b/sys/dev/xen/blkback/blkback.c new file mode 100644 index 0000000..630a0bd --- /dev/null +++ b/sys/dev/xen/blkback/blkback.c @@ -0,0 +1,1349 @@ +/* + * Copyright (c) 2006, Cisco Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/filedesc.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/disk.h> +#include <sys/bio.h> + +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/sysctl.h> + +#include <geom/geom.h> + +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> + +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs.h> +#include <machine/xen_intr.h> +#include <machine/evtchn.h> +#include <machine/xenbus.h> +#include <machine/gnttab.h> +#include <machine/xen-public/memory.h> +#include <dev/xen/xenbus/xenbus_comms.h> + + +#if XEN_BLKBACK_DEBUG +#define DPRINTF(fmt, args...) \ + printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) ((void)0) +#endif + +#define WPRINTF(fmt, args...) \ + printf("blkback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) + +#define BLKBACK_INVALID_HANDLE (~0) + +struct ring_ref { + vm_offset_t va; + grant_handle_t handle; + uint64_t bus_addr; +}; + +typedef struct blkback_info { + + /* Schedule lists */ + STAILQ_ENTRY(blkback_info) next_req; + int on_req_sched_list; + + struct xenbus_device *xdev; + XenbusState frontend_state; + + domid_t domid; + + int state; + int ring_connected; + struct ring_ref rr; + blkif_back_ring_t ring; + evtchn_port_t evtchn; + int irq; + void *irq_cookie; + + int ref_cnt; + + int handle; + char *mode; + char *type; + char *dev_name; + + struct vnode *vn; + struct cdev *cdev; + struct cdevsw *csw; + u_int sector_size; + int sector_size_shift; + off_t media_size; + u_int media_num_sectors; + int major; + int minor; + int read_only; + + struct mtx blk_ring_lock; + + device_t ndev; + + /* Stats */ + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_err_req; +} blkif_t; + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +static int blkif_reqs = 64; +TUNABLE_INT("xen.vbd.blkif_reqs", &blkif_reqs); + +static int mmap_pages; + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. 
+ */ +typedef struct pending_req { + blkif_t *blkif; + uint64_t id; + int nr_pages; + int pendcnt; + unsigned short operation; + int status; + STAILQ_ENTRY(pending_req) free_list; +} pending_req_t; + +static pending_req_t *pending_reqs; +static STAILQ_HEAD(pending_reqs_list, pending_req) pending_free = + STAILQ_HEAD_INITIALIZER(pending_free); +static struct mtx pending_free_lock; + +static STAILQ_HEAD(blkback_req_sched_list, blkback_info) req_sched_list = + STAILQ_HEAD_INITIALIZER(req_sched_list); +static struct mtx req_sched_list_lock; + +static unsigned long mmap_vstart; +static unsigned long *pending_vaddrs; +static grant_handle_t *pending_grant_handles; + +static struct task blk_req_task; + +/* Protos */ +static void disconnect_ring(blkif_t *blkif); +static int vbd_add_dev(struct xenbus_device *xdev); + +static inline int vaddr_pagenr(pending_req_t *req, int seg) +{ + return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; +} + +static inline unsigned long vaddr(pending_req_t *req, int seg) +{ + return pending_vaddrs[vaddr_pagenr(req, seg)]; +} + +#define pending_handle(_req, _seg) \ + (pending_grant_handles[vaddr_pagenr(_req, _seg)]) + +static unsigned long +alloc_empty_page_range(unsigned long nr_pages) +{ + void *pages; + int i = 0, j = 0; + multicall_entry_t mcl[17]; + unsigned long mfn_list[16]; + struct xen_memory_reservation reservation = { + .extent_start = mfn_list, + .nr_extents = 0, + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (pages == NULL) + return 0; + + memset(mcl, 0, sizeof(mcl)); + + while (i < nr_pages) { + unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE); + + mcl[j].op = __HYPERVISOR_update_va_mapping; + mcl[j].args[0] = va; + + mfn_list[j++] = vtomach(va) >> PAGE_SHIFT; + + xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY; + + if (j == 16 || i == nr_pages) { + mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL; + + reservation.nr_extents = j; + + mcl[j].op = __HYPERVISOR_memory_op; + mcl[j].args[0] = XENMEM_decrease_reservation; + mcl[j].args[1] = (unsigned long)&reservation; + + (void)HYPERVISOR_multicall(mcl, j+1); + + mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0; + j = 0; + } + } + + return (unsigned long)pages; +} + +static pending_req_t * +alloc_req(void) +{ + pending_req_t *req; + mtx_lock(&pending_free_lock); + if ((req = STAILQ_FIRST(&pending_free))) { + STAILQ_REMOVE(&pending_free, req, pending_req, free_list); + STAILQ_NEXT(req, free_list) = NULL; + } + mtx_unlock(&pending_free_lock); + return req; +} + +static void +free_req(pending_req_t *req) +{ + int was_empty; + + mtx_lock(&pending_free_lock); + was_empty = STAILQ_EMPTY(&pending_free); + STAILQ_INSERT_TAIL(&pending_free, req, free_list); + mtx_unlock(&pending_free_lock); + if (was_empty) + taskqueue_enqueue(taskqueue_swi, &blk_req_task); +} + +static void +fast_flush_area(pending_req_t *req) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int ret; + + for (i = 0; i < req->nr_pages; i++) { + handle = pending_handle(req, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + unmap[invcount].host_addr = vaddr(req, i); + unmap[invcount].dev_bus_addr = 0; + unmap[invcount].handle = handle; + pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + PANIC_IF(ret); +} + +static void 
+blkif_get(blkif_t *blkif) +{ + atomic_add_int(&blkif->ref_cnt, 1); +} + +static void +blkif_put(blkif_t *blkif) +{ + if (atomic_fetchadd_int(&blkif->ref_cnt, -1) == 1) { + DPRINTF("Removing %x\n", (unsigned int)blkif); + disconnect_ring(blkif); + if (blkif->mode) + free(blkif->mode, M_DEVBUF); + if (blkif->type) + free(blkif->type, M_DEVBUF); + if (blkif->dev_name) + free(blkif->dev_name, M_DEVBUF); + free(blkif, M_DEVBUF); + } +} + +static int +blkif_create(struct xenbus_device *xdev, long handle, char *mode, char *type, char *params) +{ + blkif_t *blkif; + + blkif = (blkif_t *)malloc(sizeof(*blkif), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!blkif) + return ENOMEM; + + DPRINTF("Created %x\n", (unsigned int)blkif); + + blkif->ref_cnt = 1; + blkif->domid = xdev->otherend_id; + blkif->handle = handle; + blkif->mode = mode; + blkif->type = type; + blkif->dev_name = params; + blkif->xdev = xdev; + xdev->data = blkif; + + mtx_init(&blkif->blk_ring_lock, "blk_ring_ock", "blkback ring lock", MTX_DEF); + + if (strcmp(mode, "w")) + blkif->read_only = 1; + + return 0; +} + +static void +add_to_req_schedule_list_tail(blkif_t *blkif) +{ + if (!blkif->on_req_sched_list) { + mtx_lock(&req_sched_list_lock); + if (!blkif->on_req_sched_list && (blkif->state == XenbusStateConnected)) { + blkif_get(blkif); + STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); + blkif->on_req_sched_list = 1; + taskqueue_enqueue(taskqueue_swi, &blk_req_task); + } + mtx_unlock(&req_sched_list_lock); + } +} + +/* This routine does not call blkif_get(), does not schedule the blk_req_task to run, + and assumes that the state is connected */ +static void +add_to_req_schedule_list_tail2(blkif_t *blkif) +{ + mtx_lock(&req_sched_list_lock); + if (!blkif->on_req_sched_list) { + STAILQ_INSERT_TAIL(&req_sched_list, blkif, next_req); + blkif->on_req_sched_list = 1; + } + mtx_unlock(&req_sched_list_lock); +} + +/* Removes blkif from front of list and does not call blkif_put() (caller must) */ +static blkif_t * +remove_from_req_schedule_list(void) +{ + blkif_t *blkif; + + mtx_lock(&req_sched_list_lock); + + if ((blkif = STAILQ_FIRST(&req_sched_list))) { + STAILQ_REMOVE(&req_sched_list, blkif, blkback_info, next_req); + STAILQ_NEXT(blkif, next_req) = NULL; + blkif->on_req_sched_list = 0; + } + + mtx_unlock(&req_sched_list_lock); + + return blkif; +} + +static void +make_response(blkif_t *blkif, uint64_t id, + unsigned short op, int st) +{ + blkif_response_t *resp; + blkif_back_ring_t *blk_ring = &blkif->ring; + int more_to_do = 0; + int notify; + + mtx_lock(&blkif->blk_ring_lock); + + + /* Place on the response ring for the relevant domain. */ + resp = RING_GET_RESPONSE(blk_ring, blk_ring->rsp_prod_pvt); + resp->id = id; + resp->operation = op; + resp->status = st; + blk_ring->rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(blk_ring, notify); + + if (blk_ring->rsp_prod_pvt == blk_ring->req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). 
+ */ + RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do); + + } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) + more_to_do = 1; + + mtx_unlock(&blkif->blk_ring_lock); + + if (more_to_do) + add_to_req_schedule_list_tail(blkif); + + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static void +end_block_io_op(struct bio *bio) +{ + pending_req_t *pending_req = bio->bio_caller2; + + if (bio->bio_error) { + DPRINTF("BIO returned error %d for operation on device %s\n", + bio->bio_error, pending_req->blkif->dev_name); + pending_req->status = BLKIF_RSP_ERROR; + pending_req->blkif->st_err_req++; + } + +#if 0 + printf("done: bio=%x error=%x completed=%llu resid=%lu flags=%x\n", + (unsigned int)bio, bio->bio_error, bio->bio_completed, bio->bio_resid, bio->bio_flags); +#endif + + if (atomic_fetchadd_int(&pending_req->pendcnt, -1) == 1) { + fast_flush_area(pending_req); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + blkif_put(pending_req->blkif); + free_req(pending_req); + } + + g_destroy_bio(bio); +} + +static void +dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req, pending_req_t *pending_req) +{ + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct { + unsigned long buf; unsigned int nsec; + } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int nseg = req->nr_segments, nr_sects = 0; + struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int operation, ret, i, nbio = 0; + + /* Check that number of segments is sane. */ + if (unlikely(nseg == 0) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + DPRINTF("Bad number of segments in request (%d)\n", nseg); + goto fail_response; + } + + if (req->operation == BLKIF_OP_WRITE) { + if (blkif->read_only) { + DPRINTF("Attempt to write to read only device %s\n", blkif->dev_name); + goto fail_response; + } + operation = BIO_WRITE; + } else + operation = BIO_READ; + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + for (i = 0; i < nseg; i++) { + seg[i].nsec = req->seg[i].last_sect - + req->seg[i].first_sect + 1; + + if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (seg[i].nsec <= 0)) + goto fail_response; + nr_sects += seg[i].nsec; + + map[i].host_addr = vaddr(pending_req, i); + map[i].dom = blkif->domid; + map[i].ref = req->seg[i].gref; + map[i].flags = GNTMAP_host_map; + if (operation == BIO_WRITE) + map[i].flags |= GNTMAP_readonly; + } + + /* Convert to the disk's sector size */ + nr_sects = (nr_sects << 9) >> blkif->sector_size_shift; + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + PANIC_IF(ret); + + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status != 0)) { + DPRINTF("invalid buffer -- could not remap it\n"); + goto fail_flush; + } + + pending_handle(pending_req, i) = map[i].handle; +#if 0 + /* Can't do this in FreeBSD since vtophys() returns the pfn */ + /* of the remote domain who loaned us the machine page - DPT */ + xen_phys_machine[(vtophys(vaddr(pending_req, i)) >> PAGE_SHIFT)] = + map[i]dev_bus_addr >> PAGE_SHIFT; +#endif + seg[i].buf = map[i].dev_bus_addr | + (req->seg[i].first_sect << 9); + } + + if (req->sector_number + nr_sects > blkif->media_num_sectors) { + DPRINTF("%s of [%llu,%llu] extends past end of device %s\n", + operation == BIO_READ ? 
"read" : "write", + req->sector_number, + req->sector_number + nr_sects, blkif->dev_name); + goto fail_flush; + } + + for (i = 0; i < nseg; i++) { + struct bio *bio; + + if ((int)seg[i].nsec & ((blkif->sector_size >> 9) - 1)) { + DPRINTF("Misaligned I/O request from domain %d", blkif->domid); + goto fail_put_bio; + } + + bio = biolist[nbio++] = g_new_bio(); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + bio->bio_cmd = operation; + bio->bio_offset = req->sector_number << blkif->sector_size_shift; + bio->bio_length = seg[i].nsec << 9; + bio->bio_bcount = bio->bio_length; + bio->bio_data = (caddr_t)(vaddr(pending_req, i) | (seg[i].buf & PAGE_MASK)); + bio->bio_done = end_block_io_op; + bio->bio_caller2 = pending_req; + bio->bio_dev = blkif->cdev; + + req->sector_number += (seg[i].nsec << 9) >> blkif->sector_size_shift; +#if 0 + printf("new: bio=%x cmd=%d sect=%llu nsect=%u iosize_max=%u @ %08lx\n", + (unsigned int)bio, req->operation, req->sector_number, seg[i].nsec, + blkif->cdev->si_iosize_max, seg[i].buf); +#endif + } + + pending_req->pendcnt = nbio; + blkif_get(blkif); + + for (i = 0; i < nbio; i++) + (*blkif->csw->d_strategy)(biolist[i]); + + return; + + fail_put_bio: + for (i = 0; i < (nbio-1); i++) + g_destroy_bio(biolist[i]); + fail_flush: + fast_flush_area(pending_req); + fail_response: + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); +} + +static void +blk_req_action(void *context, int pending) +{ + blkif_t *blkif; + + DPRINTF("\n"); + + while (!STAILQ_EMPTY(&req_sched_list)) { + blkif_back_ring_t *blk_ring; + RING_IDX rc, rp; + + blkif = remove_from_req_schedule_list(); + + blk_ring = &blkif->ring; + rc = blk_ring->req_cons; + rp = blk_ring->sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ + + while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) { + blkif_request_t *req; + pending_req_t *pending_req; + + pending_req = alloc_req(); + if (pending_req == NULL) + goto out_of_preqs; + + req = RING_GET_REQUEST(blk_ring, rc); + blk_ring->req_cons = ++rc; /* before make_response() */ + + switch (req->operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + dispatch_rw_block_io(blkif, req, pending_req); + break; + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + dispatch_rw_block_io(blkif, req, pending_req); + break; + default: + blkif->st_err_req++; + DPRINTF("error: unknown block io operation [%d]\n", + req->operation); + make_response(blkif, req->id, req->operation, + BLKIF_RSP_ERROR); + free_req(pending_req); + break; + } + } + + blkif_put(blkif); + } + + return; + + out_of_preqs: + /* We ran out of pending req structs */ + /* Just requeue interface and wait to be rescheduled to run when one is freed */ + add_to_req_schedule_list_tail2(blkif); + blkif->st_oo_req++; +} + +/* Handle interrupt from a frontend */ +static void +blkback_intr(void *arg) +{ + blkif_t *blkif = arg; + DPRINTF("%x\n", (unsigned int)blkif); + add_to_req_schedule_list_tail(blkif); +} + +/* Map grant ref for ring */ +static int +map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring) +{ + struct gnttab_map_grant_ref op; + + ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + if (ring->va == 0) + return ENOMEM; + + op.host_addr = ring->va; + op.flags = GNTMAP_host_map; + op.ref = ref; + op.dom = dom; + HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1); + if (op.status) { + WPRINTF("grant table op err=%d\n", op.status); + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; + return EACCES; + } + + ring->handle = op.handle; + ring->bus_addr = op.dev_bus_addr; + + return 0; +} + +/* Unmap grant ref for ring */ +static void +unmap_ring(struct ring_ref *ring) +{ + struct gnttab_unmap_grant_ref op; + + op.host_addr = ring->va; + op.dev_bus_addr = ring->bus_addr; + op.handle = ring->handle; + HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1); + if (op.status) + WPRINTF("grant table op err=%d\n", op.status); + + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; +} + +static int +connect_ring(blkif_t *blkif) +{ + struct xenbus_device *xdev = blkif->xdev; + blkif_sring_t *ring; + unsigned long ring_ref; + evtchn_port_t evtchn; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + if (blkif->ring_connected) + return 0; + + // Grab FE data and map his memory + err = xenbus_gather(NULL, xdev->otherend, + "ring-ref", "%lu", &ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(xdev, err, + "reading %s/ring-ref and event-channel", + xdev->otherend); + return err; + } + + err = map_ring(ring_ref, blkif->domid, &blkif->rr); + if (err) { + xenbus_dev_fatal(xdev, err, "mapping ring"); + return err; + } + ring = (blkif_sring_t *)blkif->rr.va; + BACK_RING_INIT(&blkif->ring, ring, PAGE_SIZE); + + op.u.bind_interdomain.remote_dom = blkif->domid; + op.u.bind_interdomain.remote_port = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_ring(&blkif->rr); + xenbus_dev_fatal(xdev, err, "binding event channel"); + return err; + } + blkif->evtchn = op.u.bind_interdomain.local_port; + + /* bind evtchn to irq handler */ + blkif->irq = + bind_evtchn_to_irqhandler(blkif->evtchn, "blkback", + blkback_intr, blkif, INTR_TYPE_NET|INTR_MPSAFE, &blkif->irq_cookie); + + blkif->ring_connected = 1; + + DPRINTF("%x rings connected! 
evtchn=%d irq=%d\n", + (unsigned int)blkif, blkif->evtchn, blkif->irq); + + return 0; +} + +static void +disconnect_ring(blkif_t *blkif) +{ + DPRINTF("\n"); + + if (blkif->ring_connected) { + unbind_from_irqhandler(blkif->irq, blkif->irq_cookie); + blkif->irq = 0; + unmap_ring(&blkif->rr); + blkif->ring_connected = 0; + } +} + +static void +connect(blkif_t *blkif) +{ + struct xenbus_transaction *xbt; + struct xenbus_device *xdev = blkif->xdev; + int err; + + if (!blkif->ring_connected || + blkif->vn == NULL || + blkif->state == XenbusStateConnected) + return; + + DPRINTF("%s\n", xdev->otherend); + + /* Supply the information about the device the frontend needs */ +again: + xbt = xenbus_transaction_start(); + if (IS_ERR(xbt)) { + xenbus_dev_fatal(xdev, PTR_ERR(xbt), + "Error writing configuration for backend " + "(start transaction)"); + return; + } + + err = xenbus_printf(xbt, xdev->nodename, "sectors", "%u", + blkif->media_num_sectors); + if (err) { + xenbus_dev_fatal(xdev, err, "writing %s/sectors", + xdev->nodename); + goto abort; + } + + err = xenbus_printf(xbt, xdev->nodename, "info", "%u", + blkif->read_only ? VDISK_READONLY : 0); + if (err) { + xenbus_dev_fatal(xdev, err, "writing %s/info", + xdev->nodename); + goto abort; + } + err = xenbus_printf(xbt, xdev->nodename, "sector-size", "%u", + blkif->sector_size); + if (err) { + xenbus_dev_fatal(xdev, err, "writing %s/sector-size", + xdev->nodename); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(xdev, err, "ending transaction"); + + err = xenbus_switch_state(xdev, NULL, XenbusStateConnected); + if (err) + xenbus_dev_fatal(xdev, err, "switching to Connected state", + xdev->nodename); + + blkif->state = XenbusStateConnected; + + return; + + abort: + xenbus_transaction_end(xbt, 1); +} + +static int +blkback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id) +{ + int err; + char *p, *mode = NULL, *type = NULL, *params = NULL; + long handle; + + DPRINTF("node=%s\n", xdev->nodename); + + p = strrchr(xdev->otherend, '/') + 1; + handle = strtoul(p, NULL, 0); + + mode = xenbus_read(NULL, xdev->nodename, "mode", NULL); + if (IS_ERR(mode)) { + xenbus_dev_fatal(xdev, PTR_ERR(mode), "reading mode"); + err = PTR_ERR(mode); + goto error; + } + + type = xenbus_read(NULL, xdev->nodename, "type", NULL); + if (IS_ERR(type)) { + xenbus_dev_fatal(xdev, PTR_ERR(type), "reading type"); + err = PTR_ERR(type); + goto error; + } + + params = xenbus_read(NULL, xdev->nodename, "params", NULL); + if (IS_ERR(type)) { + xenbus_dev_fatal(xdev, PTR_ERR(params), "reading params"); + err = PTR_ERR(params); + goto error; + } + + err = blkif_create(xdev, handle, mode, type, params); + if (err) { + xenbus_dev_fatal(xdev, err, "creating blkif"); + goto error; + } + + err = vbd_add_dev(xdev); + if (err) { + blkif_put((blkif_t *)xdev->data); + xenbus_dev_fatal(xdev, err, "adding vbd device"); + } + + return err; + + error: + if (mode) + free(mode, M_DEVBUF); + if (type) + free(type, M_DEVBUF); + if (params) + free(params, M_DEVBUF); + return err; +} + +static int +blkback_remove(struct xenbus_device *xdev) +{ + blkif_t *blkif = xdev->data; + device_t ndev; + + DPRINTF("node=%s\n", xdev->nodename); + + blkif->state = XenbusStateClosing; + + if ((ndev = blkif->ndev)) { + blkif->ndev = NULL; + mtx_lock(&Giant); + device_detach(ndev); + mtx_unlock(&Giant); + } + + xdev->data = NULL; + blkif->xdev = NULL; + blkif_put(blkif); + + return 0; +} + +static int +blkback_resume(struct 
xenbus_device *xdev) +{ + DPRINTF("node=%s\n", xdev->nodename); + return 0; +} + +static void +frontend_changed(struct xenbus_device *xdev, + XenbusState frontend_state) +{ + blkif_t *blkif = xdev->data; + + DPRINTF("state=%d\n", frontend_state); + + blkif->frontend_state = frontend_state; + + switch (frontend_state) { + case XenbusStateInitialising: + break; + case XenbusStateInitialised: + case XenbusStateConnected: + connect_ring(blkif); + connect(blkif); + break; + case XenbusStateClosing: + xenbus_switch_state(xdev, NULL, XenbusStateClosing); + break; + case XenbusStateClosed: + xenbus_remove_device(xdev); + break; + case XenbusStateUnknown: + case XenbusStateInitWait: + xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* ** Driver registration ** */ + +static struct xenbus_device_id blkback_ids[] = { + { "vbd" }, + { "" } +}; + +static struct xenbus_driver blkback = { + .name = "blkback", + .ids = blkback_ids, + .probe = blkback_probe, + .remove = blkback_remove, + .resume = blkback_resume, + .otherend_changed = frontend_changed, +}; + +static void +blkback_init(void *unused) +{ + int i; + + TASK_INIT(&blk_req_task, 0, blk_req_action, NULL); + mtx_init(&req_sched_list_lock, "blk_req_sched_lock", "blkback req sched lock", MTX_DEF); + + mtx_init(&pending_free_lock, "blk_pending_req_ock", "blkback pending request lock", MTX_DEF); + + mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; + pending_reqs = malloc(sizeof(pending_reqs[0]) * + blkif_reqs, M_DEVBUF, M_ZERO|M_NOWAIT); + pending_grant_handles = malloc(sizeof(pending_grant_handles[0]) * + mmap_pages, M_DEVBUF, M_NOWAIT); + pending_vaddrs = malloc(sizeof(pending_vaddrs[0]) * + mmap_pages, M_DEVBUF, M_NOWAIT); + mmap_vstart = alloc_empty_page_range(mmap_pages); + if (!pending_reqs || !pending_grant_handles || !pending_vaddrs || !mmap_vstart) { + if (pending_reqs) + free(pending_reqs, M_DEVBUF); + if (pending_grant_handles) + free(pending_grant_handles, M_DEVBUF); + if (pending_vaddrs) + free(pending_vaddrs, M_DEVBUF); + WPRINTF("out of memory\n"); + return; + } + + for (i = 0; i < mmap_pages; i++) { + pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT); + pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + } + + for (i = 0; i < blkif_reqs; i++) { + STAILQ_INSERT_TAIL(&pending_free, &pending_reqs[i], free_list); + } + + DPRINTF("registering %s\n", blkback.name); + xenbus_register_backend(&blkback); +} + +SYSINIT(xbbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, blkback_init, NULL) + +static void +close_device(blkif_t *blkif) +{ + DPRINTF("closing dev=%s\n", blkif->dev_name); + if (blkif->vn) { + int flags = FREAD; + + if (!blkif->read_only) + flags |= FWRITE; + + if (blkif->csw) { + dev_relthread(blkif->cdev); + blkif->csw = NULL; + } + + (void)vn_close(blkif->vn, flags, NOCRED, curthread); + blkif->vn = NULL; + } +} + +static int +open_device(blkif_t *blkif) +{ + struct nameidata nd; + struct vattr vattr; + struct cdev *dev; + struct cdevsw *devsw; + int flags = FREAD, err = 0; + + DPRINTF("opening dev=%s\n", blkif->dev_name); + + if (!blkif->read_only) + flags |= FWRITE; + + if (!curthread->td_proc->p_fd->fd_cdir) { + curthread->td_proc->p_fd->fd_cdir = rootvnode; + VREF(rootvnode); + } + if (!curthread->td_proc->p_fd->fd_rdir) { + curthread->td_proc->p_fd->fd_rdir = rootvnode; + VREF(rootvnode); + } + if (!curthread->td_proc->p_fd->fd_jdir) { + curthread->td_proc->p_fd->fd_jdir = rootvnode; + VREF(rootvnode); + } + + again: + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, blkif->dev_name, 
curthread); + err = vn_open(&nd, &flags, 0, -1); + if (err) { + if (blkif->dev_name[0] != '/') { + char *dev_path = "/dev/"; + char *dev_name; + + /* Try adding device path at beginning of name */ + dev_name = malloc(strlen(blkif->dev_name) + strlen(dev_path) + 1, M_DEVBUF, M_NOWAIT); + if (dev_name) { + sprintf(dev_name, "%s%s", dev_path, blkif->dev_name); + free(blkif->dev_name, M_DEVBUF); + blkif->dev_name = dev_name; + goto again; + } + } + xenbus_dev_fatal(blkif->xdev, err, "error opening device %s", blkif->dev_name); + return err; + } + NDFREE(&nd, NDF_ONLY_PNBUF); + + blkif->vn = nd.ni_vp; + + /* We only support disks for now */ + if (!vn_isdisk(blkif->vn, &err)) { + xenbus_dev_fatal(blkif->xdev, err, "device %s is not a disk", blkif->dev_name); + VOP_UNLOCK(blkif->vn, 0, curthread); + goto error; + } + + blkif->cdev = blkif->vn->v_rdev; + blkif->csw = dev_refthread(blkif->cdev); + PANIC_IF(blkif->csw == NULL); + + err = VOP_GETATTR(blkif->vn, &vattr, NOCRED, curthread); + if (err) { + xenbus_dev_fatal(blkif->xdev, err, + "error getting vnode attributes for device %s", blkif->dev_name); + VOP_UNLOCK(blkif->vn, 0, curthread); + goto error; + } + + VOP_UNLOCK(blkif->vn, 0, curthread); + + dev = blkif->vn->v_rdev; + devsw = dev->si_devsw; + if (!devsw->d_ioctl) { + err = ENODEV; + xenbus_dev_fatal(blkif->xdev, err, + "no d_ioctl for device %s!", blkif->dev_name); + goto error; + } + + err = (*devsw->d_ioctl)(dev, DIOCGSECTORSIZE, (caddr_t)&blkif->sector_size, FREAD, curthread); + if (err) { + xenbus_dev_fatal(blkif->xdev, err, + "error calling ioctl DIOCGSECTORSIZE for device %s", blkif->dev_name); + goto error; + } + blkif->sector_size_shift = fls(blkif->sector_size) - 1; + + err = (*devsw->d_ioctl)(dev, DIOCGMEDIASIZE, (caddr_t)&blkif->media_size, FREAD, curthread); + if (err) { + xenbus_dev_fatal(blkif->xdev, err, + "error calling ioctl DIOCGMEDIASIZE for device %s", blkif->dev_name); + goto error; + } + blkif->media_num_sectors = blkif->media_size >> blkif->sector_size_shift; + + blkif->major = umajor(vattr.va_rdev); + blkif->minor = uminor(vattr.va_rdev); + + DPRINTF("opened dev=%s major=%d minor=%d sector_size=%u media_size=%lld\n", + blkif->dev_name, blkif->major, blkif->minor, blkif->sector_size, blkif->media_size); + + return 0; + + error: + close_device(blkif); + return err; +} + +static int +vbd_add_dev(struct xenbus_device *xdev) +{ + blkif_t *blkif = xdev->data; + device_t nexus, ndev; + devclass_t dc; + int err = 0; + + mtx_lock(&Giant); + + /* We will add a vbd device as a child of nexus0 (for now) */ + if (!(dc = devclass_find("nexus")) || + !(nexus = devclass_get_device(dc, 0))) { + WPRINTF("could not find nexus0!\n"); + err = ENOENT; + goto done; + } + + + /* Create a newbus device representing the vbd */ + ndev = BUS_ADD_CHILD(nexus, 0, "vbd", blkif->handle); + if (!ndev) { + WPRINTF("could not create newbus device vbd%d!\n", blkif->handle); + err = EFAULT; + goto done; + } + + blkif_get(blkif); + device_set_ivars(ndev, blkif); + blkif->ndev = ndev; + + device_probe_and_attach(ndev); + + done: + + mtx_unlock(&Giant); + + return err; +} + +enum { + VBD_SYSCTL_DOMID, + VBD_SYSCTL_ST_RD_REQ, + VBD_SYSCTL_ST_WR_REQ, + VBD_SYSCTL_ST_OO_REQ, + VBD_SYSCTL_ST_ERR_REQ, + VBD_SYSCTL_RING, +}; + +static char * +vbd_sysctl_ring_info(blkif_t *blkif, int cmd) +{ + char *buf = malloc(256, M_DEVBUF, M_WAITOK); + if (buf) { + if (!blkif->ring_connected) + sprintf(buf, "ring not connected\n"); + else { + blkif_back_ring_t *ring = &blkif->ring; + sprintf(buf, "nr_ents=%x req_cons=%x" + 
" req_prod=%x req_event=%x" + " rsp_prod=%x rsp_event=%x", + ring->nr_ents, ring->req_cons, + ring->sring->req_prod, ring->sring->req_event, + ring->sring->rsp_prod, ring->sring->rsp_event); + } + } + return buf; +} + +static int +vbd_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + device_t dev = (device_t)arg1; + blkif_t *blkif = (blkif_t *)device_get_ivars(dev); + const char *value; + char *buf = NULL; + int err; + + switch (arg2) { + case VBD_SYSCTL_DOMID: + return sysctl_handle_int(oidp, NULL, blkif->domid, req); + case VBD_SYSCTL_ST_RD_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_rd_req, req); + case VBD_SYSCTL_ST_WR_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_wr_req, req); + case VBD_SYSCTL_ST_OO_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_oo_req, req); + case VBD_SYSCTL_ST_ERR_REQ: + return sysctl_handle_int(oidp, NULL, blkif->st_err_req, req); + case VBD_SYSCTL_RING: + value = buf = vbd_sysctl_ring_info(blkif, arg2); + break; + default: + return (EINVAL); + } + + err = SYSCTL_OUT(req, value, strlen(value)); + if (buf != NULL) + free(buf, M_DEVBUF); + + return err; +} + +/* Newbus vbd device driver probe */ +static int +vbd_probe(device_t dev) +{ + DPRINTF("vbd%d\n", device_get_unit(dev)); + return 0; +} + +/* Newbus vbd device driver attach */ +static int +vbd_attach(device_t dev) +{ + blkif_t *blkif = (blkif_t *)device_get_ivars(dev); + + DPRINTF("%s\n", blkif->dev_name); + + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_DOMID, vbd_sysctl_handler, "I", + "domid of frontend"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "rd_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_RD_REQ, vbd_sysctl_handler, "I", + "number of read reqs"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "wr_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_WR_REQ, vbd_sysctl_handler, "I", + "number of write reqs"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "oo_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_OO_REQ, vbd_sysctl_handler, "I", + "number of deferred reqs"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "err_reqs", CTLTYPE_INT|CTLFLAG_RD, + dev, VBD_SYSCTL_ST_ERR_REQ, vbd_sysctl_handler, "I", + "number of reqs that returned error"); +#if XEN_BLKBACK_DEBUG + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "ring", CTLFLAG_RD, + dev, VBD_SYSCTL_RING, vbd_sysctl_handler, "A", + "req ring info"); +#endif + + if (!open_device(blkif)) + connect(blkif); + + return bus_generic_attach(dev); +} + +/* Newbus vbd device driver detach */ +static int +vbd_detach(device_t dev) +{ + blkif_t *blkif = (blkif_t *)device_get_ivars(dev); + + DPRINTF("%s\n", blkif->dev_name); + + close_device(blkif); + + bus_generic_detach(dev); + + blkif_put(blkif); + + return 0; +} + +static device_method_t vbd_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vbd_probe), + DEVMETHOD(device_attach, vbd_attach), + DEVMETHOD(device_detach, vbd_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + {0, 0} +}; + +static devclass_t vbd_devclass; + +static driver_t vbd_driver = { + "vbd", + 
vbd_methods, + 0, +}; + +DRIVER_MODULE(vbd, nexus, vbd_driver, vbd_devclass, 0, 0); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/sys/dev/xen/blkfront/blkfront.c b/sys/dev/xen/blkfront/blkfront.c new file mode 100644 index 0000000..c448b81 --- /dev/null +++ b/sys/dev/xen/blkfront/blkfront.c @@ -0,0 +1,1021 @@ +/*- + * All rights reserved. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* + * XenoBSD block device driver + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <sys/bio.h> +#include <sys/bus.h> +#include <sys/conf.h> +#include <sys/module.h> + +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/resource.h> +#include <machine/intr_machdep.h> +#include <machine/vmparam.h> + +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/xen_intr.h> +#include <machine/xen/xenbus.h> +#include <machine/xen/evtchn.h> +#include <xen/interface/grant_table.h> + +#include <geom/geom_disk.h> +#include <machine/xen/xenfunc.h> +#include <xen/gnttab.h> + +#include <dev/xen/blkfront/block.h> + +#define ASSERT(S) KASSERT(S, (#S)) +/* prototypes */ +struct xb_softc; +static void xb_startio(struct xb_softc *sc); +static void connect(struct blkfront_info *); +static void blkfront_closing(struct xenbus_device *); +static int blkfront_remove(struct xenbus_device *); +static int talk_to_backend(struct xenbus_device *, struct blkfront_info *); +static int setup_blkring(struct xenbus_device *, struct blkfront_info *); +static void blkif_int(void *); +#if 0 +static void blkif_restart_queue(void *arg); +#endif +static void blkif_recover(struct blkfront_info *); +static void blkif_completion(struct blk_shadow *); +static void blkif_free(struct blkfront_info *, int); + +#define GRANT_INVALID_REF 0 +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) + +LIST_HEAD(xb_softc_list_head, xb_softc) xbsl_head; + +/* Control whether runtime update of vbds is enabled. 
*/ +#define ENABLE_VBD_UPDATE 0 + +#if ENABLE_VBD_UPDATE +static void vbd_update(void); +#endif + + +#define BLKIF_STATE_DISCONNECTED 0 +#define BLKIF_STATE_CONNECTED 1 +#define BLKIF_STATE_SUSPENDED 2 + +#ifdef notyet +static char *blkif_state_name[] = { + [BLKIF_STATE_DISCONNECTED] = "disconnected", + [BLKIF_STATE_CONNECTED] = "connected", + [BLKIF_STATE_SUSPENDED] = "closed", +}; + +static char * blkif_status_name[] = { + [BLKIF_INTERFACE_STATUS_CLOSED] = "closed", + [BLKIF_INTERFACE_STATUS_DISCONNECTED] = "disconnected", + [BLKIF_INTERFACE_STATUS_CONNECTED] = "connected", + [BLKIF_INTERFACE_STATUS_CHANGED] = "changed", +}; +#endif +#define WPRINTK(fmt, args...) printf("[XEN] " fmt, ##args) +#if 0 +#define DPRINTK(fmt, args...) printf("[XEN] %s:%d" fmt ".\n", __FUNCTION__, __LINE__,##args) +#else +#define DPRINTK(fmt, args...) +#endif + +static grant_ref_t gref_head; +#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ + (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) + +static void kick_pending_request_queues(struct blkfront_info *); +static int blkif_open(struct disk *dp); +static int blkif_close(struct disk *dp); +static int blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td); +static int blkif_queue_request(struct bio *bp); +static void xb_strategy(struct bio *bp); + + + +/* XXX move to xb_vbd.c when VBD update support is added */ +#define MAX_VBDS 64 + +#define XBD_SECTOR_SIZE 512 /* XXX: assume for now */ +#define XBD_SECTOR_SHFT 9 + +static struct mtx blkif_io_lock; + +static unsigned long +pfn_to_mfn(unsigned long pfn) +{ + return (phystomach(pfn << PAGE_SHIFT) >> PAGE_SHIFT); +} + + +int +xlvbd_add(blkif_sector_t capacity, int unit, uint16_t vdisk_info, uint16_t sector_size, + struct blkfront_info *info) +{ + struct xb_softc *sc; + int error = 0; + + sc = (struct xb_softc *)malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_ZERO); + sc->xb_unit = unit; + sc->xb_info = info; + info->sc = sc; + + memset(&sc->xb_disk, 0, sizeof(sc->xb_disk)); + sc->xb_disk = disk_alloc(); + sc->xb_disk->d_unit = unit; + sc->xb_disk->d_open = blkif_open; + sc->xb_disk->d_close = blkif_close; + sc->xb_disk->d_ioctl = blkif_ioctl; + sc->xb_disk->d_strategy = xb_strategy; + sc->xb_disk->d_name = "xbd"; + sc->xb_disk->d_drv1 = sc; + sc->xb_disk->d_sectorsize = sector_size; + + /* XXX */ + sc->xb_disk->d_mediasize = capacity << XBD_SECTOR_SHFT; +#if 0 + sc->xb_disk->d_maxsize = DFLTPHYS; +#else /* XXX: xen can't handle large single i/o requests */ + sc->xb_disk->d_maxsize = 4096; +#endif +#ifdef notyet + XENPRINTF("attaching device 0x%x unit %d capacity %llu\n", + xb_diskinfo[sc->xb_unit].device, sc->xb_unit, + sc->xb_disk->d_mediasize); +#endif + sc->xb_disk->d_flags = 0; + disk_create(sc->xb_disk, DISK_VERSION_00); + bioq_init(&sc->xb_bioq); + + return error; +} + +void +xlvbd_del(struct blkfront_info *info) +{ + struct xb_softc *sc; + + sc = info->sc; + disk_destroy(sc->xb_disk); +} +/************************ end VBD support *****************/ + +/* + * Read/write routine for a buffer. Finds the proper unit, place it on + * the sortq and kick the controller. + */ +static void +xb_strategy(struct bio *bp) +{ + struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1; + + /* bogus disk? 
*/ + if (sc == NULL) { + bp->bio_error = EINVAL; + bp->bio_flags |= BIO_ERROR; + goto bad; + } + + DPRINTK(""); + + /* + * Place it in the queue of disk activities for this disk + */ + mtx_lock(&blkif_io_lock); + bioq_disksort(&sc->xb_bioq, bp); + + xb_startio(sc); + mtx_unlock(&blkif_io_lock); + return; + + bad: + /* + * Correctly set the bio to indicate a failed tranfer. + */ + bp->bio_resid = bp->bio_bcount; + biodone(bp); + return; +} + + +/* Setup supplies the backend dir, virtual device. + +We place an event channel and shared frame entries. +We watch backend to wait if it's ok. */ +static int blkfront_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err, vdevice, i; + struct blkfront_info *info; + + /* FIXME: Use dynamic device id if this is not set. */ + err = xenbus_scanf(XBT_NIL, dev->nodename, + "virtual-device", "%i", &vdevice); + if (err != 1) { + xenbus_dev_fatal(dev, err, "reading virtual-device"); + return err; + } + + info = malloc(sizeof(*info), M_DEVBUF, M_NOWAIT|M_ZERO); + if (info == NULL) { + xenbus_dev_fatal(dev, ENOMEM, "allocating info structure"); + return ENOMEM; + } + + /* + * XXX debug only + */ + for (i = 0; i < sizeof(*info); i++) + if (((uint8_t *)info)[i] != 0) + panic("non-null memory"); + + info->shadow_free = 0; + info->xbdev = dev; + info->vdevice = vdevice; + info->connected = BLKIF_STATE_DISCONNECTED; + + /* work queue needed ? */ + for (i = 0; i < BLK_RING_SIZE; i++) + info->shadow[i].req.id = i+1; + info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; + + /* Front end dir is a number, which is used as the id. */ + info->handle = strtoul(strrchr(dev->nodename,'/')+1, NULL, 0); + dev->dev_driver_data = info; + + err = talk_to_backend(dev, info); + if (err) { + free(info, M_DEVBUF); + dev->dev_driver_data = NULL; + return err; + } + + return 0; +} + + +static int blkfront_resume(struct xenbus_device *dev) +{ + struct blkfront_info *info = dev->dev_driver_data; + int err; + + DPRINTK("blkfront_resume: %s\n", dev->nodename); + + blkif_free(info, 1); + + err = talk_to_backend(dev, info); + if (!err) + blkif_recover(info); + + return err; +} + +/* Common code used when first setting up, and when resuming. */ +static int talk_to_backend(struct xenbus_device *dev, + struct blkfront_info *info) +{ + const char *message = NULL; + struct xenbus_transaction xbt; + int err; + + /* Create shared ring, alloc event channel. 
*/ + err = setup_blkring(dev, info); + if (err) + goto out; + + again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_blkring; + } + + err = xenbus_printf(xbt, dev->nodename, + "ring-ref","%u", info->ring_ref); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, + "event-channel", "%u", irq_to_evtchn_port(info->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == -EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_blkring; + } + xenbus_switch_state(dev, XenbusStateInitialised); + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(dev, err, "%s", message); + destroy_blkring: + blkif_free(info, 0); + out: + return err; +} + +static int +setup_blkring(struct xenbus_device *dev, struct blkfront_info *info) +{ + blkif_sring_t *sring; + int err; + + info->ring_ref = GRANT_INVALID_REF; + + sring = (blkif_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); + if (sring == NULL) { + xenbus_dev_fatal(dev, ENOMEM, "allocating shared ring"); + return ENOMEM; + } + SHARED_RING_INIT(sring); + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); + + err = xenbus_grant_ring(dev, (vtomach(info->ring.sring) >> PAGE_SHIFT)); + if (err < 0) { + free(sring, M_DEVBUF); + info->ring.sring = NULL; + goto fail; + } + info->ring_ref = err; + + err = bind_listening_port_to_irqhandler(dev->otherend_id, + "xbd", (driver_intr_t *)blkif_int, info, + INTR_TYPE_BIO | INTR_MPSAFE, NULL); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_evtchn_to_irqhandler failed"); + goto fail; + } + info->irq = err; + + return 0; + fail: + blkif_free(info, 0); + return err; +} + + +/** + * Callback received when the backend's state changes. + */ +static void backend_changed(struct xenbus_device *dev, + XenbusState backend_state) +{ + struct blkfront_info *info = dev->dev_driver_data; + + DPRINTK("blkfront:backend_changed.\n"); + + switch (backend_state) { + case XenbusStateUnknown: + case XenbusStateInitialising: + case XenbusStateInitWait: + case XenbusStateInitialised: + case XenbusStateClosed: + break; + + case XenbusStateConnected: + connect(info); + break; + + case XenbusStateClosing: + if (info->users > 0) + xenbus_dev_error(dev, -EBUSY, + "Device in use; refusing to close"); + else + blkfront_closing(dev); +#ifdef notyet + bd = bdget(info->dev); + if (bd == NULL) + xenbus_dev_fatal(dev, -ENODEV, "bdget failed"); + + down(&bd->bd_sem); + if (info->users > 0) + xenbus_dev_error(dev, -EBUSY, + "Device in use; refusing to close"); + else + blkfront_closing(dev); + up(&bd->bd_sem); + bdput(bd); +#endif + } +} + +/* +** Invoked when the backend is finally 'ready' (and has told produced +** the details about the physical device - #sectors, size, etc). 
+*/
+static void 
+connect(struct blkfront_info *info)
+{
+	unsigned long sectors, sector_size;
+	unsigned int binfo;
+	int err;
+
+	if( (info->connected == BLKIF_STATE_CONNECTED) || 
+	    (info->connected == BLKIF_STATE_SUSPENDED) )
+		return;
+
+	DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "sectors", "%lu", &sectors,
+			    "info", "%u", &binfo,
+			    "sector-size", "%lu", &sector_size,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err,
+				 "reading backend fields at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-barrier", "%lu", &info->feature_barrier,
+			    NULL);
+	if (err)
+		info->feature_barrier = 0;
+
+	xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
+
+	(void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	/* Kick pending requests. */
+	mtx_lock(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	kick_pending_request_queues(info);
+	mtx_unlock(&blkif_io_lock);
+
+#if 0
+	add_disk(info->gd);
+#endif
+}
+
+/**
+ * Handle the change of state of the backend to Closing.  We must delete our
+ * device-layer structures now, to ensure that writes are flushed through to
+ * the backend.  Once this is done, we can switch to Closed in
+ * acknowledgement.
+ */
+static void blkfront_closing(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->dev_driver_data;
+
+	DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
+
+	if (info->mi) {
+		DPRINTK("Calling xlvbd_del\n");
+		xlvbd_del(info);
+		info->mi = NULL;
+	}
+
+	xenbus_switch_state(dev, XenbusStateClosed);
+}
+
+
+static int blkfront_remove(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->dev_driver_data;
+
+	DPRINTK("blkfront_remove: %s removed\n", dev->nodename);
+
+	blkif_free(info, 0);
+
+	free(info, M_DEVBUF);
+
+	return 0;
+}
+
+
+static inline int 
+GET_ID_FROM_FREELIST(struct blkfront_info *info)
+{
+	unsigned long nfree = info->shadow_free;
+
+	KASSERT(nfree <= BLK_RING_SIZE, ("free %lu > RING_SIZE", nfree));
+	info->shadow_free = info->shadow[nfree].req.id;
+	info->shadow[nfree].req.id = 0x0fffffee; /* debug */
+	return nfree;
+}
+
+static inline void 
+ADD_ID_TO_FREELIST(struct blkfront_info *info, unsigned long id)
+{
+	info->shadow[id].req.id = info->shadow_free;
+	info->shadow[id].request = 0;
+	info->shadow_free = id;
+}
+
+static inline void 
+flush_requests(struct blkfront_info *info)
+{
+	int notify;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
+
+	if (notify)
+		notify_remote_via_irq(info->irq);
+}
+
+static void 
+kick_pending_request_queues(struct blkfront_info *info)
+{
+	/* XXX check if we can't simplify */
+#if 0
+	if (!RING_FULL(&info->ring)) {
+		/* Re-enable calldowns. */
+		blk_start_queue(info->rq);
+		/* Kick things off immediately. */
+		do_blkif_request(info->rq);
+	}
+#endif
+	if (!RING_FULL(&info->ring)) {
+#if 0
+		sc = LIST_FIRST(&xbsl_head);
+		LIST_REMOVE(sc, entry);
+		/* Re-enable calldowns. */
+		blk_start_queue(di->rq);
+#endif
+		/* Kick things off immediately. */
+		xb_startio(info->sc);
+	}
+}
+
+#if 0
+/* XXX */
+static void blkif_restart_queue(void *arg) 
+{
+	struct blkfront_info *info = (struct blkfront_info *)arg;
+
+	mtx_lock(&blkif_io_lock);
+	kick_pending_request_queues(info);
+	mtx_unlock(&blkif_io_lock);
+}
+#endif
+
+static void blkif_restart_queue_callback(void *arg)
+{
+#if 0
+	struct blkfront_info *info = (struct blkfront_info *)arg;
+	/* XXX BSD equiv ?
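	   (The Linux schedule_work() call that follows defers the queue
	   restart to process context.  A FreeBSD equivalent would be a
	   taskqueue, which the netback driver later in this change already
	   uses; a minimal sketch, with the task and handler names being
	   illustrative only:

		static struct task xb_restart_task;

		static void
		xb_restart_task_fn(void *arg, int pending)
		{
			struct blkfront_info *info = arg;

			mtx_lock(&blkif_io_lock);
			kick_pending_request_queues(info);
			mtx_unlock(&blkif_io_lock);
		}

	   TASK_INIT() would be run once with the info pointer as the task
	   context, and this callback would then reduce to
	   taskqueue_enqueue(taskqueue_swi, &xb_restart_task).)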
*/ + + schedule_work(&info->work); +#endif +} + +static int +blkif_open(struct disk *dp) +{ + struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; + + if (sc == NULL) { + printk("xb%d: not found", sc->xb_unit); + return (ENXIO); + } + + sc->xb_flags |= XB_OPEN; + sc->xb_info->users++; + return (0); +} + +static int +blkif_close(struct disk *dp) +{ + struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; + + if (sc == NULL) + return (ENXIO); + sc->xb_flags &= ~XB_OPEN; + if (--(sc->xb_info->users) == 0) { + /* Check whether we have been instructed to close. We will + have ignored this request initially, as the device was + still mounted. */ + struct xenbus_device * dev = sc->xb_info->xbdev; + XenbusState state = xenbus_read_driver_state(dev->otherend); + + if (state == XenbusStateClosing) + blkfront_closing(dev); + } + return (0); +} + +static int +blkif_ioctl(struct disk *dp, u_long cmd, void *addr, int flag, struct thread *td) +{ + struct xb_softc *sc = (struct xb_softc *)dp->d_drv1; + + if (sc == NULL) + return (ENXIO); + + return (ENOTTY); +} + + +/* + * blkif_queue_request + * + * request block io + * + * id: for guest use only. + * operation: BLKIF_OP_{READ,WRITE,PROBE} + * buffer: buffer to read/write into. this should be a + * virtual address in the guest os. + */ +static int blkif_queue_request(struct bio *bp) +{ + caddr_t alignbuf; + unsigned long buffer_ma; + blkif_request_t *ring_req; + unsigned long id; + unsigned int fsect, lsect; + struct xb_softc *sc = (struct xb_softc *)bp->bio_disk->d_drv1; + struct blkfront_info *info = sc->xb_info; + int ref; + + if (unlikely(sc->xb_info->connected != BLKIF_STATE_CONNECTED)) + return 1; + + if (gnttab_alloc_grant_references( + BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { + gnttab_request_free_callback( + &info->callback, + blkif_restart_queue_callback, + info, + BLKIF_MAX_SEGMENTS_PER_REQUEST); + return 1; + } + + /* Check if the buffer is properly aligned */ + if ((vm_offset_t)bp->bio_data & PAGE_MASK) { + int align = (bp->bio_bcount < PAGE_SIZE/2) ? XBD_SECTOR_SIZE : + PAGE_SIZE; + caddr_t newbuf = malloc(bp->bio_bcount + align, M_DEVBUF, + M_NOWAIT); + + alignbuf = (char *)roundup2((u_long)newbuf, align); + + /* save a copy of the current buffer */ + bp->bio_driver1 = newbuf; + bp->bio_driver2 = alignbuf; + + /* Copy the data for a write */ + if (bp->bio_cmd == BIO_WRITE) + bcopy(bp->bio_data, alignbuf, bp->bio_bcount); + } else + alignbuf = bp->bio_data; + + /* Fill out a communications ring structure. */ + ring_req = RING_GET_REQUEST(&info->ring, + info->ring.req_prod_pvt); + id = GET_ID_FROM_FREELIST(info); + info->shadow[id].request = (unsigned long)bp; + + ring_req->id = id; + ring_req->operation = (bp->bio_cmd == BIO_READ) ? BLKIF_OP_READ : + BLKIF_OP_WRITE; + + ring_req->sector_number= (blkif_sector_t)bp->bio_pblkno; + ring_req->handle = (blkif_vdev_t)(uintptr_t)sc->xb_disk; + + ring_req->nr_segments = 0; /* XXX not doing scatter/gather since buffer + * chaining is not supported. + */ + + buffer_ma = vtomach(alignbuf); + fsect = (buffer_ma & PAGE_MASK) >> XBD_SECTOR_SHFT; + lsect = fsect + (bp->bio_bcount >> XBD_SECTOR_SHFT) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + KASSERT( ref != -ENOSPC, ("grant_reference failed") ); + + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_ma >> PAGE_SHIFT, + ring_req->operation & 1 ); /* ??? 
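	   (The last argument is the read-only flag.  BLKIF_OP_READ is 0 and
	   BLKIF_OP_WRITE is 1 in the blkif interface, so the buffer page is
	   granted read-only exactly when this is a write (the backend only
	   reads it) and read-write for a read (the backend must fill it).
	   A more explicit spelling of the same expression, assuming those
	   opcode values:

		int readonly = (ring_req->operation == BLKIF_OP_WRITE);

		gnttab_grant_foreign_access_ref(ref, info->xbdev->otherend_id,
		    buffer_ma >> PAGE_SHIFT, readonly);
	   )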
*/ + info->shadow[id].frame[ring_req->nr_segments] = + buffer_ma >> PAGE_SHIFT; + + ring_req->seg[ring_req->nr_segments] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + + ring_req->nr_segments++; + KASSERT((buffer_ma & (XBD_SECTOR_SIZE-1)) == 0, + ("XEN buffer must be sector aligned")); + KASSERT(lsect <= 7, + ("XEN disk driver data cannot cross a page boundary")); + + buffer_ma &= ~PAGE_MASK; + + info->ring.req_prod_pvt++; + + /* Keep a private copy so we can reissue requests when recovering. */ + info->shadow[id].req = *ring_req; + + gnttab_free_grant_references(gref_head); + + return 0; +} + + + +/* + * Dequeue buffers and place them in the shared communication ring. + * Return when no more requests can be accepted or all buffers have + * been queued. + * + * Signal XEN once the ring has been filled out. + */ +static void +xb_startio(struct xb_softc *sc) +{ + struct bio *bp; + int queued = 0; + struct blkfront_info *info = sc->xb_info; + DPRINTK(""); + + mtx_assert(&blkif_io_lock, MA_OWNED); + + while ((bp = bioq_takefirst(&sc->xb_bioq)) != NULL) { + + if (RING_FULL(&info->ring)) + goto wait; + + if (blkif_queue_request(bp)) { + wait: + bioq_insert_head(&sc->xb_bioq, bp); + break; + } + queued++; + } + + if (queued != 0) + flush_requests(sc->xb_info); +} + +static void +blkif_int(void *xsc) +{ + struct xb_softc *sc = NULL; + struct bio *bp; + blkif_response_t *bret; + RING_IDX i, rp; + struct blkfront_info *info = xsc; + DPRINTK(""); + + TRACE_ENTER; + + mtx_lock(&blkif_io_lock); + + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { + mtx_unlock(&blkif_io_lock); + return; + } + + again: + rp = info->ring.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + for (i = info->ring.rsp_cons; i != rp; i++) { + unsigned long id; + + bret = RING_GET_RESPONSE(&info->ring, i); + id = bret->id; + bp = (struct bio *)info->shadow[id].request; + + blkif_completion(&info->shadow[id]); + + ADD_ID_TO_FREELIST(info, id); + + switch (bret->operation) { + case BLKIF_OP_READ: + /* had an unaligned buffer that needs to be copied */ + if (bp->bio_driver1) + bcopy(bp->bio_driver2, bp->bio_data, bp->bio_bcount); + /* FALLTHROUGH */ + case BLKIF_OP_WRITE: + + /* free the copy buffer */ + if (bp->bio_driver1) { + free(bp->bio_driver1, M_DEVBUF); + bp->bio_driver1 = NULL; + } + + if ( unlikely(bret->status != BLKIF_RSP_OKAY) ) { + XENPRINTF("Bad return from blkdev data request: %x\n", + bret->status); + bp->bio_flags |= BIO_ERROR; + } + + sc = (struct xb_softc *)bp->bio_disk->d_drv1; + + if (bp->bio_flags & BIO_ERROR) + bp->bio_error = EIO; + else + bp->bio_resid = 0; + + biodone(bp); + break; + default: + panic("received invalid operation"); + break; + } + } + + info->ring.rsp_cons = i; + + if (i != info->ring.req_prod_pvt) { + int more_to_do; + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + if (more_to_do) + goto again; + } else { + info->ring.sring->rsp_event = i + 1; + } + + kick_pending_request_queues(info); + + mtx_unlock(&blkif_io_lock); +} + +static void +blkif_free(struct blkfront_info *info, int suspend) +{ + +/* Prevent new requests being issued until we fix things up. */ + mtx_lock(&blkif_io_lock); + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + mtx_unlock(&blkif_io_lock); + + /* Free resources associated with old device channel. 
*/
+	if (info->ring_ref != GRANT_INVALID_REF) {
+		gnttab_end_foreign_access(info->ring_ref, 0,
+					  info->ring.sring);
+		info->ring_ref = GRANT_INVALID_REF;
+		info->ring.sring = NULL;
+	}
+	if (info->irq)
+		unbind_from_irqhandler(info->irq, info); 
+	info->irq = 0;
+
+}
+
+static void 
+blkif_completion(struct blk_shadow *s)
+{
+	int i;
+
+	for (i = 0; i < s->req.nr_segments; i++)
+		gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
+}
+
+static void 
+blkif_recover(struct blkfront_info *info)
+{
+	int i, j;
+	blkif_request_t *req;
+	struct blk_shadow *copy;
+
+	/* Stage 1: Make a safe copy of the shadow state. */
+	copy = (struct blk_shadow *)malloc(sizeof(info->shadow), M_DEVBUF, M_NOWAIT|M_ZERO);
+	PANIC_IF(copy == NULL);
+	memcpy(copy, info->shadow, sizeof(info->shadow));
+
+	/* Stage 2: Set up free list. */
+	memset(&info->shadow, 0, sizeof(info->shadow));
+	for (i = 0; i < BLK_RING_SIZE; i++)
+		info->shadow[i].req.id = i+1;
+	info->shadow_free = info->ring.req_prod_pvt;
+	info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
+
+	/* Stage 3: Find pending requests and requeue them. */
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/* Not in use? */
+		if (copy[i].request == 0)
+			continue;
+
+		/* Grab a request slot and copy shadow state into it. */
+		req = RING_GET_REQUEST(
+			&info->ring, info->ring.req_prod_pvt);
+		*req = copy[i].req;
+
+		/* We get a new request id, and must reset the shadow state. */
+		req->id = GET_ID_FROM_FREELIST(info);
+		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
+
+		/* Rewrite any grant references invalidated by suspend/resume. */
+		for (j = 0; j < req->nr_segments; j++)
+			gnttab_grant_foreign_access_ref(
+				req->seg[j].gref,
+				info->xbdev->otherend_id,
+				pfn_to_mfn(info->shadow[req->id].frame[j]),
+				0 /* assume not readonly */);
+
+		info->shadow[req->id].req = *req;
+
+		info->ring.req_prod_pvt++;
+	}
+
+	free(copy, M_DEVBUF);
+
+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	/* Now safe for us to use the shared ring */
+	mtx_lock(&blkif_io_lock);
+	info->connected = BLKIF_STATE_CONNECTED;
+	mtx_unlock(&blkif_io_lock);
+
+	/* Send off requeued requests */
+	mtx_lock(&blkif_io_lock);
+	flush_requests(info);
+
+	/* Kick any other new requests queued since we resumed */
+	kick_pending_request_queues(info);
+	mtx_unlock(&blkif_io_lock);
+}
+
+static int
+blkfront_is_ready(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev->dev_driver_data;
+
+	return info->is_ready;
+}
+
+static struct xenbus_device_id blkfront_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+
+static struct xenbus_driver blkfront = {
+	.name             = "vbd",
+	.ids              = blkfront_ids,
+	.probe            = blkfront_probe,
+	.remove           = blkfront_remove,
+	.resume           = blkfront_resume,
+	.otherend_changed = backend_changed,
+	.is_ready         = blkfront_is_ready,
+};
+
+
+
+static void 
+xenbus_init(void)
+{
+	xenbus_register_frontend(&blkfront);
+}
+
+MTX_SYSINIT(ioreq, &blkif_io_lock, "BIO LOCK", MTX_NOWITNESS); /* XXX how does one enroll a lock?
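	   (MTX_SYSINIT() is the usual way: it registers blkif_io_lock so the
	   mutex is initialised automatically during early SYSINIT processing,
	   before the SI_SUB_PSEUDO hook below runs.  The open-coded
	   equivalent would look roughly like the following, with the function
	   and SYSINIT names being illustrative only:

		static void
		blkif_lock_init(void *arg)
		{
			mtx_init(&blkif_io_lock, "BIO LOCK", NULL, MTX_NOWITNESS);
		}
		SYSINIT(blkif_lock, SI_SUB_LOCK, SI_ORDER_ANY, blkif_lock_init, NULL);
	   )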
*/ +SYSINIT(xbdev, SI_SUB_PSEUDO, SI_ORDER_SECOND, xenbus_init, NULL); + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 8 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/sys/dev/xen/blkfront/block.h b/sys/dev/xen/blkfront/block.h new file mode 100644 index 0000000..0d14459 --- /dev/null +++ b/sys/dev/xen/blkfront/block.h @@ -0,0 +1,97 @@ +/* + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + * $FreeBSD$ + */ + + +#ifndef __XEN_DRIVERS_BLOCK_H__ +#define __XEN_DRIVERS_BLOCK_H__ +#include <xen/interface/io/blkif.h> + +struct xlbd_type_info +{ + int partn_shift; + int disks_per_major; + char *devname; + char *diskname; +}; + +struct xlbd_major_info +{ + int major; + int index; + int usage; + struct xlbd_type_info *type; +}; + +struct blk_shadow { + blkif_request_t req; + unsigned long request; + unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) + + +struct xb_softc { + device_t xb_dev; + struct disk *xb_disk; /* disk params */ + struct bio_queue_head xb_bioq; /* sort queue */ + int xb_unit; + int xb_flags; + struct blkfront_info *xb_info; + LIST_ENTRY(xb_softc) entry; +#define XB_OPEN (1<<0) /* drive is open (can't shut down) */ +}; + + +/* + * We have one of these per vbd, whether ide, scsi or 'other'. They + * hang in private_data off the gendisk structure. We may end up + * putting all kinds of interesting stuff here :-) + */ +struct blkfront_info +{ + struct xenbus_device *xbdev; + dev_t dev; + struct gendisk *gd; + int vdevice; + blkif_vdev_t handle; + int connected; + int ring_ref; + blkif_front_ring_t ring; + unsigned int irq; + struct xlbd_major_info *mi; +#if 0 + request_queue_t *rq; + struct work_struct work; +#endif + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_RING_SIZE]; + unsigned long shadow_free; + struct xb_softc *sc; + int feature_barrier; + int is_ready; + /** + * The number of people holding this device open. We won't allow a + * hot-unplug unless this is 0. + */ + int users; +}; +/* Note that xlvbd_add doesn't call add_disk for you: you're expected + to call add_disk on info->gd once the disk is properly connected + up. 
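   (The add_disk()/gendisk wording is carried over from the Linux driver;
   there is no add_disk() on FreeBSD.  In this port the caller is connect()
   in blkfront.c, which, once the backend has published the disk geometry,
   does roughly:

	xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);

   where sectors, binfo and sector_size have just been read from the
   backend's xenstore directory.)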
*/ +int xlvbd_add(blkif_sector_t capacity, int device, + uint16_t vdisk_info, uint16_t sector_size, struct blkfront_info *info); +void xlvbd_del(struct blkfront_info *info); + +#endif /* __XEN_DRIVERS_BLOCK_H__ */ + diff --git a/sys/dev/xen/console/console.c b/sys/dev/xen/console/console.c new file mode 100644 index 0000000..dc9fe6f --- /dev/null +++ b/sys/dev/xen/console/console.c @@ -0,0 +1,564 @@ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/module.h> +#include <sys/systm.h> +#include <sys/consio.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <machine/stdarg.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen_intr.h> +#include <sys/cons.h> +#include <sys/priv.h> +#include <sys/proc.h> + +#include <dev/xen/console/xencons_ring.h> +#include <xen/interface/io/console.h> + + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> +#endif + +static char driver_name[] = "xc"; +devclass_t xc_devclass; /* do not make static */ +static void xcstart (struct tty *); +static int xcparam (struct tty *, struct termios *); +static void xcstop (struct tty *, int); +static void xc_timeout(void *); +static void __xencons_tx_flush(void); +static boolean_t xcons_putc(int c); + +/* switch console so that shutdown can occur gracefully */ +static void xc_shutdown(void *arg, int howto); +static int xc_mute; + +static void xcons_force_flush(void); +static void xencons_priv_interrupt(void *); + +static cn_probe_t xccnprobe; +static cn_init_t xccninit; +static cn_getc_t xccngetc; +static cn_putc_t xccnputc; +static cn_putc_t xccnputc_dom0; +static cn_checkc_t xccncheckc; + +#define XC_POLLTIME (hz/10) + +CONS_DRIVER(xc, xccnprobe, xccninit, NULL, xccngetc, + xccncheckc, xccnputc, NULL); + +static int xen_console_up; +static boolean_t xc_start_needed; +static struct callout xc_callout; +struct mtx cn_mtx; + +#define RBUF_SIZE 1024 +#define RBUF_MASK(_i) ((_i)&(RBUF_SIZE-1)) +#define WBUF_SIZE 4096 +#define WBUF_MASK(_i) ((_i)&(WBUF_SIZE-1)) +static char wbuf[WBUF_SIZE]; +static char rbuf[RBUF_SIZE]; +static int rc, rp; +static unsigned int cnsl_evt_reg; +static unsigned int wc, wp; /* write_cons, write_prod */ + +#define CDEV_MAJOR 12 +#define XCUNIT(x) (minor(x)) +#define ISTTYOPEN(tp) ((tp) && ((tp)->t_state & TS_ISOPEN)) +#define CN_LOCK_INIT(x, _name) \ + mtx_init(&x, _name, NULL, MTX_SPIN|MTX_RECURSE) + +#define CN_LOCK(l) \ + do { \ + if (panicstr == NULL) \ + mtx_lock_spin(&(l)); \ + } while (0) +#define CN_UNLOCK(l) \ + do { \ + if (panicstr == NULL) \ + mtx_unlock_spin(&(l)); \ + } while (0) +#define CN_LOCK_ASSERT(x) mtx_assert(&x, MA_OWNED) +#define CN_LOCK_DESTROY(x) mtx_destroy(&x) + + +static struct tty *xccons; + +struct xc_softc { + int xc_unit; + struct cdev *xc_dev; +}; + + +static d_open_t xcopen; +static d_close_t xcclose; +static d_ioctl_t xcioctl; + +static struct cdevsw xc_cdevsw = { + .d_version = D_VERSION, + .d_flags = D_TTY | D_NEEDGIANT, + .d_name = driver_name, + .d_open = xcopen, + .d_close = xcclose, + .d_read = ttyread, + .d_write = ttywrite, + .d_ioctl = xcioctl, + .d_poll = ttypoll, + .d_kqfilter = ttykqfilter, +}; + +static void +xccnprobe(struct consdev *cp) +{ + cp->cn_pri = CN_REMOTE; + cp->cn_tp = xccons; + sprintf(cp->cn_name, "%s0", driver_name); +} + + +static void +xccninit(struct consdev *cp) +{ + CN_LOCK_INIT(cn_mtx,"XCONS 
LOCK"); + +} +int +xccngetc(struct consdev *dev) +{ + int c; + if (xc_mute) + return 0; + do { + if ((c = xccncheckc(dev)) == -1) { + /* polling without sleeping in Xen doesn't work well. + * Sleeping gives other things like clock a chance to + * run + */ + tsleep(&cn_mtx, PWAIT | PCATCH, "console sleep", + XC_POLLTIME); + } + } while(c == -1); + return c; +} + +int +xccncheckc(struct consdev *dev) +{ + int ret = (xc_mute ? 0 : -1); + if (xencons_has_input()) + xencons_handle_input(NULL); + + CN_LOCK(cn_mtx); + if ((rp - rc)) { + /* we need to return only one char */ + ret = (int)rbuf[RBUF_MASK(rc)]; + rc++; + } + CN_UNLOCK(cn_mtx); + return(ret); +} + +static void +xccnputc(struct consdev *dev, int c) +{ + xcons_putc(c); +} + +static void +xccnputc_dom0(struct consdev *dev, int c) +{ + HYPERVISOR_console_io(CONSOLEIO_write, 1, (char *)&c); +} + +extern int db_active; +static boolean_t +xcons_putc(int c) +{ + int force_flush = xc_mute || +#ifdef DDB + db_active || +#endif + panicstr; /* we're not gonna recover, so force + * flush + */ + + if ((wp-wc) < (WBUF_SIZE-1)) { + if ((wbuf[WBUF_MASK(wp++)] = c) == '\n') { + wbuf[WBUF_MASK(wp++)] = '\r'; +#ifdef notyet + if (force_flush) + xcons_force_flush(); +#endif + } + } else if (force_flush) { +#ifdef notyet + xcons_force_flush(); +#endif + } + if (cnsl_evt_reg) + __xencons_tx_flush(); + + /* inform start path that we're pretty full */ + return ((wp - wc) >= WBUF_SIZE - 100) ? TRUE : FALSE; +} + +static void +xc_identify(driver_t *driver, device_t parent) +{ + device_t child; + child = BUS_ADD_CHILD(parent, 0, driver_name, 0); + device_set_driver(child, driver); + device_set_desc(child, "Xen Console"); +} + +static int +xc_probe(device_t dev) +{ + struct xc_softc *sc = (struct xc_softc *)device_get_softc(dev); + + sc->xc_unit = device_get_unit(dev); + return (0); +} + +static int +xc_attach(device_t dev) +{ + struct xc_softc *sc = (struct xc_softc *)device_get_softc(dev); + + + if (xen_start_info->flags & SIF_INITDOMAIN) { + xc_consdev.cn_putc = xccnputc_dom0; + } + + sc->xc_dev = make_dev(&xc_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "xc%r", 0); + xccons = ttyalloc(); + + sc->xc_dev->si_drv1 = (void *)sc; + sc->xc_dev->si_tty = xccons; + + xccons->t_oproc = xcstart; + xccons->t_param = xcparam; + xccons->t_stop = xcstop; + xccons->t_dev = sc->xc_dev; + + callout_init(&xc_callout, 0); + + xencons_ring_init(); + + cnsl_evt_reg = 1; + callout_reset(&xc_callout, XC_POLLTIME, xc_timeout, xccons); + + if (xen_start_info->flags & SIF_INITDOMAIN) { + PANIC_IF(bind_virq_to_irqhandler( + VIRQ_CONSOLE, + 0, + "console", + NULL, + xencons_priv_interrupt, + INTR_TYPE_TTY) < 0); + + } + + + /* register handler to flush console on shutdown */ + if ((EVENTHANDLER_REGISTER(shutdown_post_sync, xc_shutdown, + NULL, SHUTDOWN_PRI_DEFAULT)) == NULL) + printf("xencons: shutdown event registration failed!\n"); + + TRACE_EXIT; + return (0); +} + +/* + * return 0 for all console input, force flush all output. 
+ */ +static void +xc_shutdown(void *arg, int howto) +{ + xc_mute = 1; + xcons_force_flush(); +} + +void +xencons_rx(char *buf, unsigned len) +{ + int i; + struct tty *tp = xccons; + + for (i = 0; i < len; i++) { + if (xen_console_up) + (*linesw[tp->t_line]->l_rint)(buf[i], tp); + else + rbuf[RBUF_MASK(rp++)] = buf[i]; + } +} + +static void +__xencons_tx_flush(void) +{ + int sz, work_done = 0; + + CN_LOCK(cn_mtx); + while (wc != wp) { + int sent; + sz = wp - wc; + if (sz > (WBUF_SIZE - WBUF_MASK(wc))) + sz = WBUF_SIZE - WBUF_MASK(wc); + if (xen_start_info->flags & SIF_INITDOMAIN) { + HYPERVISOR_console_io(CONSOLEIO_write, sz, &wbuf[WBUF_MASK(wc)]); + wc += sz; + } else { + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent == 0) + break; + wc += sent; + } + work_done = 1; + } + CN_UNLOCK(cn_mtx); + + /* + * ttwakeup calls routines using blocking locks + * + */ + if (work_done && xen_console_up && curthread->td_critnest == 0) + ttwakeup(xccons); +} + +void +xencons_tx(void) +{ + __xencons_tx_flush(); +} + +static void +xencons_priv_interrupt(void *arg) +{ + + static char rbuf[16]; + int l; + + while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0) + xencons_rx(rbuf, l); + + xencons_tx(); +} + +int +xcopen(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct xc_softc *sc; + int unit = XCUNIT(dev); + struct tty *tp; + int s, error; + + sc = (struct xc_softc *)device_get_softc( + devclass_get_device(xc_devclass, unit)); + if (sc == NULL) + return (ENXIO); + + TRACE_ENTER; + tp = dev->si_tty; + s = spltty(); + if (!ISTTYOPEN(tp)) { + tp->t_state |= TS_CARR_ON; + ttychars(tp); + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_cflag = TTYDEF_CFLAG|CLOCAL; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + xcparam(tp, &tp->t_termios); + ttsetwater(tp); + } else if (tp->t_state & TS_XCLUDE && priv_check(td, PRIV_ROOT)) { + splx(s); + return (EBUSY); + } + splx(s); + + xen_console_up = 1; + + error = (*linesw[tp->t_line]->l_open)(dev, tp); + TRACE_EXIT; + return error; +} + +int +xcclose(struct cdev *dev, int flag, int mode, struct thread *td) +{ + struct tty *tp = dev->si_tty; + + if (tp == NULL) + return (0); + xen_console_up = 0; + + spltty(); + (*linesw[tp->t_line]->l_close)(tp, flag); + tty_close(tp); + spl0(); + return (0); +} + + +int +xcioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td) +{ + struct tty *tp = dev->si_tty; + int error; + + error = (*linesw[tp->t_line]->l_ioctl)(tp, cmd, data, flag, td); + if (error != ENOIOCTL) + return (error); + + error = ttioctl(tp, cmd, data, flag); + + if (error != ENOIOCTL) + return (error); + + return (ENOTTY); +} + +static inline int +__xencons_put_char(int ch) +{ + char _ch = (char)ch; + if ((wp - wc) == WBUF_SIZE) + return 0; + wbuf[WBUF_MASK(wp++)] = _ch; + return 1; +} + + +static void +xcstart(struct tty *tp) +{ + boolean_t cons_full = FALSE; + + CN_LOCK(cn_mtx); + if (tp->t_state & (TS_TIMEOUT | TS_TTSTOP)) { + CN_UNLOCK(cn_mtx); + + ttwwakeup(tp); + return; + } + + tp->t_state |= TS_BUSY; + CN_UNLOCK(cn_mtx); + + while (tp->t_outq.c_cc != 0 && !cons_full) + cons_full = xcons_putc(getc(&tp->t_outq)); + + /* if the console is close to full leave our state as busy */ + if (!cons_full) { + CN_LOCK(cn_mtx); + tp->t_state &= ~TS_BUSY; + CN_UNLOCK(cn_mtx); + ttwwakeup(tp); + } else { + /* let the timeout kick us in a bit */ + xc_start_needed = TRUE; + } + +} + +static void +xcstop(struct tty *tp, int flag) +{ + + if (tp->t_state & TS_BUSY) { 
+ if ((tp->t_state & TS_TTSTOP) == 0) { + tp->t_state |= TS_FLUSH; + } + } +} + +static void +xc_timeout(void *v) +{ + struct tty *tp; + int c; + + tp = (struct tty *)v; + + while ((c = xccncheckc(NULL)) != -1) { + if (tp->t_state & TS_ISOPEN) { + (*linesw[tp->t_line]->l_rint)(c, tp); + } + } + + if (xc_start_needed) { + xc_start_needed = FALSE; + xcstart(tp); + } + + callout_reset(&xc_callout, XC_POLLTIME, xc_timeout, tp); +} + +/* + * Set line parameters. + */ +int +xcparam(struct tty *tp, struct termios *t) +{ + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; + tp->t_cflag = t->c_cflag; + return (0); +} + + +static device_method_t xc_methods[] = { + DEVMETHOD(device_identify, xc_identify), + DEVMETHOD(device_probe, xc_probe), + DEVMETHOD(device_attach, xc_attach), + {0, 0} +}; + +static driver_t xc_driver = { + driver_name, + xc_methods, + sizeof(struct xc_softc), +}; + +/*** Forcibly flush console data before dying. ***/ +void +xcons_force_flush(void) +{ + int sz; + + if (xen_start_info->flags & SIF_INITDOMAIN) + return; + + /* Spin until console data is flushed through to the domain controller. */ + while (wc != wp) { + int sent = 0; + if ((sz = wp - wc) == 0) + continue; + + sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); + if (sent > 0) + wc += sent; + } +} + +DRIVER_MODULE(xc, nexus, xc_driver, xc_devclass, 0, 0); +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 8 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/sys/dev/xen/console/xencons_ring.c b/sys/dev/xen/console/xencons_ring.c new file mode 100644 index 0000000..c9b60ac --- /dev/null +++ b/sys/dev/xen/console/xencons_ring.c @@ -0,0 +1,154 @@ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/module.h> +#include <sys/systm.h> +#include <sys/consio.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/taskqueue.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/bus.h> +#include <machine/stdarg.h> +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen_intr.h> +#include <sys/cons.h> + + +#include <dev/xen/console/xencons_ring.h> +#include <machine/xen/evtchn.h> +#include <xen/interface/io/console.h> + + +#define console_evtchn console.domU.evtchn +extern char *console_page; + +static inline struct xencons_interface * +xencons_interface(void) +{ + return (struct xencons_interface *)console_page; +} + + +int +xencons_has_input(void) +{ + struct xencons_interface *intf; + + intf = xencons_interface(); + + return (intf->in_cons != intf->in_prod); +} + + +int +xencons_ring_send(const char *data, unsigned len) +{ + struct xencons_interface *intf; + XENCONS_RING_IDX cons, prod; + int sent; + + intf = xencons_interface(); + cons = intf->out_cons; + prod = intf->out_prod; + sent = 0; + + mb(); + PANIC_IF((prod - cons) > sizeof(intf->out)); + + while ((sent < len) && ((prod - cons) < sizeof(intf->out))) + intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; + + wmb(); + intf->out_prod = prod; + + notify_remote_via_evtchn(xen_start_info->console_evtchn); + + return sent; + +} + + +static xencons_receiver_func *xencons_receiver; + +void +xencons_handle_input(void *unused) +{ + struct xencons_interface *intf; + XENCONS_RING_IDX cons, prod; + + intf = xencons_interface(); + + cons = intf->in_cons; + prod = intf->in_prod; + + /* XXX needs locking */ + while (cons != prod) { + xencons_rx(intf->in + MASK_XENCONS_IDX(cons, 
intf->in), 1); + cons++; + } + + mb(); + intf->in_cons = cons; + + notify_remote_via_evtchn(xen_start_info->console_evtchn); + + xencons_tx(); +} + +void +xencons_ring_register_receiver(xencons_receiver_func *f) +{ + xencons_receiver = f; +} + +int +xencons_ring_init(void) +{ + int err; + + if (!xen_start_info->console_evtchn) + return 0; + + err = bind_caller_port_to_irqhandler(xen_start_info->console_evtchn, + "xencons", xencons_handle_input, NULL, + INTR_TYPE_MISC | INTR_MPSAFE, NULL); + if (err) { + XENPRINTF("XEN console request irq failed %i\n", err); + return err; + } + + return 0; +} +#ifdef notyet +void +xencons_suspend(void) +{ + + if (!xen_start_info->console_evtchn) + return; + + unbind_evtchn_from_irqhandler(xen_start_info->console_evtchn, NULL); +} + +void +xencons_resume(void) +{ + + (void)xencons_ring_init(); +} +#endif +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 8 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/sys/dev/xen/console/xencons_ring.h b/sys/dev/xen/console/xencons_ring.h new file mode 100644 index 0000000..fc97d95 --- /dev/null +++ b/sys/dev/xen/console/xencons_ring.h @@ -0,0 +1,20 @@ +/* + * $FreeBSD$ + * + */ +#ifndef _XENCONS_RING_H +#define _XENCONS_RING_H + +int xencons_ring_init(void); +int xencons_ring_send(const char *data, unsigned len); +void xencons_rx(char *buf, unsigned len); +void xencons_tx(void); + + +typedef void (xencons_receiver_func)(char *buf, unsigned len); +void xencons_ring_register_receiver(xencons_receiver_func *f); + +void xencons_handle_input(void *unused); +int xencons_has_input(void); + +#endif /* _XENCONS_RING_H */ diff --git a/sys/dev/xen/evtchn/evtchn_dev.c b/sys/dev/xen/evtchn/evtchn_dev.c new file mode 100644 index 0000000..a206708 --- /dev/null +++ b/sys/dev/xen/evtchn/evtchn_dev.c @@ -0,0 +1,394 @@ +/****************************************************************************** + * evtchn.c + * + * Xenolinux driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004, K A Fraser + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/uio.h> +#include <sys/bus.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/selinfo.h> +#include <sys/poll.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/ioccom.h> + +#include <machine/cpufunc.h> +#include <machine/intr_machdep.h> +#include <machine/xen-os.h> +#include <machine/xen_intr.h> +#include <machine/bus.h> +#include <sys/rman.h> +#include <machine/resource.h> +#include <machine/synch_bitops.h> + +#include <machine/hypervisor.h> + + +typedef struct evtchn_sotfc { + + struct selinfo ev_rsel; +} evtchn_softc_t; + + +#ifdef linuxcrap +/* NB. This must be shared amongst drivers if more things go in /dev/xen */ +static devfs_handle_t xen_dev_dir; +#endif + +/* Only one process may open /dev/xen/evtchn at any time. */ +static unsigned long evtchn_dev_inuse; + +/* Notification ring, accessed via /dev/xen/evtchn. */ + +#define EVTCHN_RING_SIZE 2048 /* 2048 16-bit entries */ + +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) +static uint16_t *ring; +static unsigned int ring_cons, ring_prod, ring_overflow; + +/* Which ports is user-space bound to? 
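   (bound_ports is a bitmap with one bit per event-channel port, 32 words of
   32 bits each, set and cleared from the EVTCHN_BIND/EVTCHN_UNBIND ioctls
   below using the synch_* bit operations.  An illustrative helper, not part
   of this driver, showing how a port would be tested:

	static int
	port_is_bound(int port)
	{
		return test_bit(port, &bound_ports[0]);
	}
   )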
*/
+static uint32_t bound_ports[32];
+
+/* Unique address for processes to sleep on */
+static void *evtchn_waddr = &ring;
+
+static struct mtx lock, upcall_lock;
+
+static d_read_t      evtchn_read;
+static d_write_t     evtchn_write;
+static d_ioctl_t     evtchn_ioctl;
+static d_poll_t      evtchn_poll;
+static d_open_t      evtchn_open;
+static d_close_t     evtchn_close;
+
+
+void 
+evtchn_device_upcall(int port)
+{
+	mtx_lock(&upcall_lock);
+
+	mask_evtchn(port);
+	clear_evtchn(port);
+
+	if ( ring != NULL ) {
+		if ( (ring_prod - ring_cons) < EVTCHN_RING_SIZE ) {
+			ring[EVTCHN_RING_MASK(ring_prod)] = (uint16_t)port;
+			if ( ring_cons == ring_prod++ ) {
+				wakeup(evtchn_waddr);
+			}
+		}
+		else {
+			ring_overflow = 1;
+		}
+	}
+
+	mtx_unlock(&upcall_lock);
+}
+
+static void 
+__evtchn_reset_buffer_ring(void)
+{
+	/* Initialise the ring to empty. Clear errors. */
+	ring_cons = ring_prod = ring_overflow = 0;
+}
+
+static int
+evtchn_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+	int rc;
+	unsigned int count, c, p, sst = 0, bytes1 = 0, bytes2 = 0;
+	count = uio->uio_resid;
+
+	count &= ~1; /* even number of bytes */
+
+	if ( count == 0 )
+	{
+		rc = 0;
+		goto out;
+	}
+
+	if ( count > PAGE_SIZE )
+		count = PAGE_SIZE;
+
+	for ( ; ; ) {
+		if ( (c = ring_cons) != (p = ring_prod) )
+			break;
+
+		if ( ring_overflow ) {
+			rc = EFBIG;
+			goto out;
+		}
+
+		if (sst != 0) {
+			rc = EINTR;
+			goto out;
+		}
+
+		/* PCATCH == check for signals before and after sleeping
+		 * PWAIT == priority of waiting on resource
+		 */
+		sst = tsleep(evtchn_waddr, PWAIT|PCATCH, "evchwt", 10);
+	}
+
+	/* Byte lengths of two chunks.  Chunk split (if any) is at ring wrap. */
+	if ( ((c ^ p) & EVTCHN_RING_SIZE) != 0 ) {
+		bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * sizeof(uint16_t);
+		bytes2 = EVTCHN_RING_MASK(p) * sizeof(uint16_t);
+	}
+	else {
+		bytes1 = (p - c) * sizeof(uint16_t);
+		bytes2 = 0;
+	}
+
+	/* Truncate chunks according to caller's maximum byte count.
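	   (The split above is at the ring wrap: c and p are free-running
	   indices, so when ((c ^ p) & EVTCHN_RING_SIZE) is non-zero the
	   consumer and producer are on different laps of the 2048-entry ring
	   and the valid data wraps around.  A worked example, assuming
	   EVTCHN_RING_SIZE is 2048 as defined above: with c = 2040 and
	   p = 2054, bytes1 = (2048 - 2040) * sizeof(uint16_t) = 16 and
	   bytes2 = (2054 & 2047) * sizeof(uint16_t) = 12.  The code below
	   then clips bytes1 and bytes2 so no more than the caller's byte
	   count is copied out.)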
*/ + if ( bytes1 > count ) { + bytes1 = count; + bytes2 = 0; + } + else if ( (bytes1 + bytes2) > count ) { + bytes2 = count - bytes1; + } + + if ( uiomove(&ring[EVTCHN_RING_MASK(c)], bytes1, uio) || + ((bytes2 != 0) && uiomove(&ring[0], bytes2, uio))) + /* keeping this around as its replacement is not equivalent + * copyout(&ring[0], &buf[bytes1], bytes2) + */ + { + rc = EFAULT; + goto out; + } + + ring_cons += (bytes1 + bytes2) / sizeof(uint16_t); + + rc = bytes1 + bytes2; + + out: + + return rc; +} + +static int +evtchn_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + int rc, i, count; + + count = uio->uio_resid; + + uint16_t *kbuf = (uint16_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK); + + + if ( kbuf == NULL ) + return ENOMEM; + + count &= ~1; /* even number of bytes */ + + if ( count == 0 ) { + rc = 0; + goto out; + } + + if ( count > PAGE_SIZE ) + count = PAGE_SIZE; + + if ( uiomove(kbuf, count, uio) != 0 ) { + rc = EFAULT; + goto out; + } + + mtx_lock_spin(&lock); + for ( i = 0; i < (count/2); i++ ) + if ( test_bit(kbuf[i], &bound_ports[0]) ) + unmask_evtchn(kbuf[i]); + mtx_unlock_spin(&lock); + + rc = count; + + out: + free(kbuf, M_DEVBUF); + return rc; +} + +static int +evtchn_ioctl(struct cdev *dev, unsigned long cmd, caddr_t arg, + int mode, struct thread *td __unused) +{ + int rc = 0; + + mtx_lock_spin(&lock); + + switch ( cmd ) + { + case EVTCHN_RESET: + __evtchn_reset_buffer_ring(); + break; + case EVTCHN_BIND: + if ( !synch_test_and_set_bit((int)arg, &bound_ports[0]) ) + unmask_evtchn((int)arg); + else + rc = EINVAL; + break; + case EVTCHN_UNBIND: + if ( synch_test_and_clear_bit((int)arg, &bound_ports[0]) ) + mask_evtchn((int)arg); + else + rc = EINVAL; + break; + default: + rc = ENOSYS; + break; + } + + mtx_unlock_spin(&lock); + + return rc; +} + +static int +evtchn_poll(struct cdev *dev, int poll_events, struct thread *td) +{ + + evtchn_softc_t *sc; + unsigned int mask = POLLOUT | POLLWRNORM; + + sc = dev->si_drv1; + + if ( ring_cons != ring_prod ) + mask |= POLLIN | POLLRDNORM; + else if ( ring_overflow ) + mask = POLLERR; + else + selrecord(td, &sc->ev_rsel); + + + return mask; +} + + +static int +evtchn_open(struct cdev *dev, int flag, int otyp, struct thread *td) +{ + uint16_t *_ring; + + if (flag & O_NONBLOCK) + return EBUSY; + + if ( synch_test_and_set_bit(0, &evtchn_dev_inuse) ) + return EBUSY; + + if ( (_ring = (uint16_t *)malloc(PAGE_SIZE, M_DEVBUF, M_WAITOK)) == NULL ) + return ENOMEM; + + mtx_lock_spin(&lock); + ring = _ring; + __evtchn_reset_buffer_ring(); + mtx_unlock_spin(&lock); + + + return 0; +} + +static int +evtchn_close(struct cdev *dev, int flag, int otyp, struct thread *td __unused) +{ + int i; + + mtx_lock_spin(&lock); + if (ring != NULL) { + free(ring, M_DEVBUF); + ring = NULL; + } + for ( i = 0; i < NR_EVENT_CHANNELS; i++ ) + if ( synch_test_and_clear_bit(i, &bound_ports[0]) ) + mask_evtchn(i); + mtx_unlock_spin(&lock); + + evtchn_dev_inuse = 0; + + return 0; +} + +static struct cdevsw evtchn_devsw = { + d_version: D_VERSION, + d_open: evtchn_open, + d_close: evtchn_close, + d_read: evtchn_read, + d_write: evtchn_write, + d_ioctl: evtchn_ioctl, + d_poll: evtchn_poll, + d_name: "evtchn", + d_flags: 0, +}; + + +/* XXX - if this device is ever supposed to support use by more than one process + * this global static will have to go away + */ +static struct cdev *evtchn_dev; + + + +static int +evtchn_init(void *dummy __unused) +{ + /* XXX I believe we don't need these leaving them here for now until we + * have some semblance of it working + */ + 
mtx_init(&upcall_lock, "evtchup", NULL, MTX_DEF); + + /* (DEVFS) create '/dev/misc/evtchn'. */ + evtchn_dev = make_dev(&evtchn_devsw, 0, UID_ROOT, GID_WHEEL, 0600, "xen/evtchn"); + + mtx_init(&lock, "evch", NULL, MTX_SPIN | MTX_NOWITNESS); + + evtchn_dev->si_drv1 = malloc(sizeof(evtchn_softc_t), M_DEVBUF, M_WAITOK); + bzero(evtchn_dev->si_drv1, sizeof(evtchn_softc_t)); + + /* XXX I don't think we need any of this rubbish */ +#if 0 + if ( err != 0 ) + { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + /* (DEVFS) create directory '/dev/xen'. */ + xen_dev_dir = devfs_mk_dir(NULL, "xen", NULL); + + /* (DEVFS) &link_dest[pos] == '../misc/evtchn'. */ + pos = devfs_generate_path(evtchn_miscdev.devfs_handle, + &link_dest[3], + sizeof(link_dest) - 3); + if ( pos >= 0 ) + strncpy(&link_dest[pos], "../", 3); + /* (DEVFS) symlink '/dev/xen/evtchn' -> '../misc/evtchn'. */ + (void)devfs_mk_symlink(xen_dev_dir, + "evtchn", + DEVFS_FL_DEFAULT, + &link_dest[pos], + &symlink_handle, + NULL); + + /* (DEVFS) automatically destroy the symlink with its destination. */ + devfs_auto_unregister(evtchn_miscdev.devfs_handle, symlink_handle); +#endif + printk("Event-channel device installed.\n"); + + return 0; +} + + +SYSINIT(evtchn_init, SI_SUB_DRIVERS, SI_ORDER_FIRST, evtchn_init, NULL); + + diff --git a/sys/dev/xen/netback/netback.c b/sys/dev/xen/netback/netback.c new file mode 100644 index 0000000..950a68c --- /dev/null +++ b/sys/dev/xen/netback/netback.c @@ -0,0 +1,1585 @@ +/* + * Copyright (c) 2006, Cisco Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/taskqueue.h> + +#include <sys/module.h> +#include <sys/bus.h> +#include <sys/sysctl.h> + +#include <net/if.h> +#include <net/if_arp.h> +#include <net/if_types.h> +#include <net/ethernet.h> +#include <net/if_bridgevar.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/in_var.h> +#include <netinet/ip.h> +#include <netinet/tcp.h> +#include <netinet/udp.h> + +#include <vm/vm_extern.h> +#include <vm/vm_kern.h> + +#include <machine/in_cksum.h> +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs.h> +#include <machine/xen_intr.h> +#include <machine/evtchn.h> +#include <machine/xenbus.h> +#include <machine/gnttab.h> +#include <machine/xen-public/memory.h> +#include <dev/xen/xenbus/xenbus_comms.h> + + +#ifdef XEN_NETBACK_DEBUG +#define DPRINTF(fmt, args...) \ + printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) ((void)0) +#endif + +#ifdef XEN_NETBACK_DEBUG_LOTS +#define DDPRINTF(fmt, args...) \ + printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#define DPRINTF_MBUF(_m) print_mbuf(_m, 0) +#define DPRINTF_MBUF_LEN(_m, _len) print_mbuf(_m, _len) +#else +#define DDPRINTF(fmt, args...) ((void)0) +#define DPRINTF_MBUF(_m) ((void)0) +#define DPRINTF_MBUF_LEN(_m, _len) ((void)0) +#endif + +#define WPRINTF(fmt, args...) \ + printf("netback (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) +#define BUG_ON PANIC_IF + +#define IFNAME(_np) (_np)->ifp->if_xname + +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) + +struct ring_ref { + vm_offset_t va; + grant_handle_t handle; + uint64_t bus_addr; +}; + +typedef struct netback_info { + + /* Schedule lists */ + STAILQ_ENTRY(netback_info) next_tx; + STAILQ_ENTRY(netback_info) next_rx; + int on_tx_sched_list; + int on_rx_sched_list; + + struct xenbus_device *xdev; + XenbusState frontend_state; + + domid_t domid; + int handle; + char *bridge; + + int rings_connected; + struct ring_ref tx_ring_ref; + struct ring_ref rx_ring_ref; + netif_tx_back_ring_t tx; + netif_rx_back_ring_t rx; + evtchn_port_t evtchn; + int irq; + void *irq_cookie; + + struct ifnet *ifp; + int ref_cnt; + + device_t ndev; + int attached; +} netif_t; + + +#define MAX_PENDING_REQS 256 +#define PKT_PROT_LEN 64 + +static struct { + netif_tx_request_t req; + netif_t *netif; +} pending_tx_info[MAX_PENDING_REQS]; +static uint16_t pending_ring[MAX_PENDING_REQS]; +typedef unsigned int PEND_RING_IDX; +#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) +static PEND_RING_IDX pending_prod, pending_cons; +#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) + +static unsigned long mmap_vstart; +#define MMAP_VADDR(_req) (mmap_vstart + ((_req) * PAGE_SIZE)) + +/* Freed TX mbufs get batched on this ring before return to pending_ring. 
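   (Both pending_ring and dealloc_ring use free-running producer/consumer
   indices that are reduced modulo the ring size only at access time via
   MASK_PEND_IDX(), and NR_PENDING_REQS above is the number of requests
   currently in flight.  The producer side of this batching is
   netif_idx_release() further down, essentially:

	mtx_lock_spin(&dealloc_lock);
	dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx;
	mtx_unlock_spin(&dealloc_lock);

   net_tx_action_dealloc() later drains dealloc_cons up to dealloc_prod and
   hands the freed indices back to pending_ring.)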
*/ +static uint16_t dealloc_ring[MAX_PENDING_REQS]; +static PEND_RING_IDX dealloc_prod, dealloc_cons; + +static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; +static mmu_update_t rx_mmu[NET_RX_RING_SIZE]; +static gnttab_transfer_t grant_rx_op[NET_RX_RING_SIZE]; + +static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; +static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; +static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; + +static struct task net_tx_task, net_rx_task; +static struct callout rx_task_callout; + +static STAILQ_HEAD(netback_tx_sched_list, netback_info) tx_sched_list = + STAILQ_HEAD_INITIALIZER(tx_sched_list); +static STAILQ_HEAD(netback_rx_sched_list, netback_info) rx_sched_list = + STAILQ_HEAD_INITIALIZER(rx_sched_list); +static struct mtx tx_sched_list_lock; +static struct mtx rx_sched_list_lock; + +static int vif_unit_maker = 0; + +/* Protos */ +static void netback_start(struct ifnet *ifp); +static int netback_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data); +static int vif_add_dev(struct xenbus_device *xdev); +static void disconnect_rings(netif_t *netif); + +#ifdef XEN_NETBACK_DEBUG_LOTS +/* Debug code to display the contents of an mbuf */ +static void +print_mbuf(struct mbuf *m, int max) +{ + int i, j=0; + printf("mbuf %08x len = %d", (unsigned int)m, m->m_pkthdr.len); + for (; m; m = m->m_next) { + unsigned char *d = m->m_data; + for (i=0; i < m->m_len; i++) { + if (max && j == max) + break; + if ((j++ % 16) == 0) + printf("\n%04x:", j); + printf(" %02x", d[i]); + } + } + printf("\n"); +} +#endif + + +#define MAX_MFN_ALLOC 64 +static unsigned long mfn_list[MAX_MFN_ALLOC]; +static unsigned int alloc_index = 0; + +static unsigned long +alloc_mfn(void) +{ + unsigned long mfn = 0; + struct xen_memory_reservation reservation = { + .extent_start = mfn_list, + .nr_extents = MAX_MFN_ALLOC, + .extent_order = 0, + .domid = DOMID_SELF + }; + if ( unlikely(alloc_index == 0) ) + alloc_index = HYPERVISOR_memory_op( + XENMEM_increase_reservation, &reservation); + if ( alloc_index != 0 ) + mfn = mfn_list[--alloc_index]; + return mfn; +} + +static unsigned long +alloc_empty_page_range(unsigned long nr_pages) +{ + void *pages; + int i = 0, j = 0; + multicall_entry_t mcl[17]; + unsigned long mfn_list[16]; + struct xen_memory_reservation reservation = { + .extent_start = mfn_list, + .nr_extents = 0, + .address_bits = 0, + .extent_order = 0, + .domid = DOMID_SELF + }; + + pages = malloc(nr_pages*PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (pages == NULL) + return 0; + + memset(mcl, 0, sizeof(mcl)); + + while (i < nr_pages) { + unsigned long va = (unsigned long)pages + (i++ * PAGE_SIZE); + + mcl[j].op = __HYPERVISOR_update_va_mapping; + mcl[j].args[0] = va; + + mfn_list[j++] = vtomach(va) >> PAGE_SHIFT; + + xen_phys_machine[(vtophys(va) >> PAGE_SHIFT)] = INVALID_P2M_ENTRY; + + if (j == 16 || i == nr_pages) { + mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_LOCAL; + + reservation.nr_extents = j; + + mcl[j].op = __HYPERVISOR_memory_op; + mcl[j].args[0] = XENMEM_decrease_reservation; + mcl[j].args[1] = (unsigned long)&reservation; + + (void)HYPERVISOR_multicall(mcl, j+1); + + mcl[j-1].args[MULTI_UVMFLAGS_INDEX] = 0; + j = 0; + } + } + + return (unsigned long)pages; +} + +#ifdef XEN_NETBACK_FIXUP_CSUM +static void +fixup_checksum(struct mbuf *m) +{ + struct ether_header *eh = mtod(m, struct ether_header *); + struct ip *ip = (struct ip *)(eh + 1); + int iphlen = ip->ip_hl << 2; + int iplen = ntohs(ip->ip_len); + + if ((m->m_pkthdr.csum_flags & CSUM_TCP)) { + struct 
tcphdr *th = (struct tcphdr *)((caddr_t)ip + iphlen); + th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(IPPROTO_TCP + (iplen - iphlen))); + th->th_sum = in_cksum_skip(m, iplen + sizeof(*eh), sizeof(*eh) + iphlen); + m->m_pkthdr.csum_flags &= ~CSUM_TCP; + } else { + u_short csum; + struct udphdr *uh = (struct udphdr *)((caddr_t)ip + iphlen); + uh->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, + htons(IPPROTO_UDP + (iplen - iphlen))); + if ((csum = in_cksum_skip(m, iplen + sizeof(*eh), sizeof(*eh) + iphlen)) == 0) + csum = 0xffff; + uh->uh_sum = csum; + m->m_pkthdr.csum_flags &= ~CSUM_UDP; + } +} +#endif + +/* Add the interface to the specified bridge */ +static int +add_to_bridge(struct ifnet *ifp, char *bridge) +{ + struct ifdrv ifd; + struct ifbreq ifb; + struct ifnet *ifp_bridge = ifunit(bridge); + + if (!ifp_bridge) + return ENOENT; + + bzero(&ifd, sizeof(ifd)); + bzero(&ifb, sizeof(ifb)); + + strcpy(ifb.ifbr_ifsname, ifp->if_xname); + strcpy(ifd.ifd_name, ifp->if_xname); + ifd.ifd_cmd = BRDGADD; + ifd.ifd_len = sizeof(ifb); + ifd.ifd_data = &ifb; + + return bridge_ioctl_kern(ifp_bridge, SIOCSDRVSPEC, &ifd); + +} + +static int +netif_create(int handle, struct xenbus_device *xdev, char *bridge) +{ + netif_t *netif; + struct ifnet *ifp; + + netif = (netif_t *)malloc(sizeof(*netif), M_DEVBUF, M_NOWAIT | M_ZERO); + if (!netif) + return ENOMEM; + + netif->ref_cnt = 1; + netif->handle = handle; + netif->domid = xdev->otherend_id; + netif->xdev = xdev; + netif->bridge = bridge; + xdev->data = netif; + + /* Set up ifnet structure */ + ifp = netif->ifp = if_alloc(IFT_ETHER); + if (!ifp) { + if (bridge) + free(bridge, M_DEVBUF); + free(netif, M_DEVBUF); + return ENOMEM; + } + + ifp->if_softc = netif; + if_initname(ifp, "vif", + atomic_fetchadd_int(&vif_unit_maker, 1) /* ifno */ ); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX; + ifp->if_output = ether_output; + ifp->if_start = netback_start; + ifp->if_ioctl = netback_ioctl; + ifp->if_mtu = ETHERMTU; + ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; + + DPRINTF("Created %s for domid=%d handle=%d\n", IFNAME(netif), netif->domid, netif->handle); + + return 0; +} + +static void +netif_get(netif_t *netif) +{ + atomic_add_int(&netif->ref_cnt, 1); +} + +static void +netif_put(netif_t *netif) +{ + if (atomic_fetchadd_int(&netif->ref_cnt, -1) == 1) { + DPRINTF("%s\n", IFNAME(netif)); + disconnect_rings(netif); + if (netif->ifp) { + if_free(netif->ifp); + netif->ifp = NULL; + } + if (netif->bridge) + free(netif->bridge, M_DEVBUF); + free(netif, M_DEVBUF); + } +} + +static int +netback_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + switch (cmd) { + case SIOCSIFFLAGS: + DDPRINTF("%s cmd=SIOCSIFFLAGS flags=%x\n", + IFNAME((struct netback_info *)ifp->if_softc), ((struct ifreq *)data)->ifr_flags); + return 0; + } + + DDPRINTF("%s cmd=%lx\n", IFNAME((struct netback_info *)ifp->if_softc), cmd); + + return ether_ioctl(ifp, cmd, data); +} + +static inline void +maybe_schedule_tx_action(void) +{ + smp_mb(); + if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && !STAILQ_EMPTY(&tx_sched_list)) + taskqueue_enqueue(taskqueue_swi, &net_tx_task); +} + +/* Removes netif from front of list and does not call netif_put() (caller must) */ +static netif_t * +remove_from_tx_schedule_list(void) +{ + netif_t *netif; + + mtx_lock(&tx_sched_list_lock); + + if ((netif = STAILQ_FIRST(&tx_sched_list))) { + STAILQ_REMOVE(&tx_sched_list, netif, netback_info, next_tx); + STAILQ_NEXT(netif, next_tx) = NULL; + netif->on_tx_sched_list = 0; + } + + 
mtx_unlock(&tx_sched_list_lock); + + return netif; +} + +/* Adds netif to end of list and calls netif_get() */ +static void +add_to_tx_schedule_list_tail(netif_t *netif) +{ + if (netif->on_tx_sched_list) + return; + + mtx_lock(&tx_sched_list_lock); + if (!netif->on_tx_sched_list && (netif->ifp->if_drv_flags & IFF_DRV_RUNNING)) { + netif_get(netif); + STAILQ_INSERT_TAIL(&tx_sched_list, netif, next_tx); + netif->on_tx_sched_list = 1; + } + mtx_unlock(&tx_sched_list_lock); +} + +/* + * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER: + * If this driver is pipelining transmit requests then we can be very + * aggressive in avoiding new-packet notifications -- frontend only needs to + * send a notification if there are no outstanding unreceived responses. + * If we may be buffer transmit buffers for any reason then we must be rather + * more conservative and treat this as the final check for pending work. + */ +static void +netif_schedule_tx_work(netif_t *netif) +{ + int more_to_do; + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx); +#else + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); +#endif + + if (more_to_do) { + DDPRINTF("Adding %s to tx sched list\n", IFNAME(netif)); + add_to_tx_schedule_list_tail(netif); + maybe_schedule_tx_action(); + } +} + +static struct mtx dealloc_lock; +MTX_SYSINIT(netback_dealloc, &dealloc_lock, "DEALLOC LOCK", MTX_SPIN | MTX_NOWITNESS); + +static void +netif_idx_release(uint16_t pending_idx) +{ + mtx_lock_spin(&dealloc_lock); + dealloc_ring[MASK_PEND_IDX(dealloc_prod++)] = pending_idx; + mtx_unlock_spin(&dealloc_lock); + + taskqueue_enqueue(taskqueue_swi, &net_tx_task); +} + +static void +make_tx_response(netif_t *netif, + uint16_t id, + int8_t st) +{ + RING_IDX i = netif->tx.rsp_prod_pvt; + netif_tx_response_t *resp; + int notify; + + resp = RING_GET_RESPONSE(&netif->tx, i); + resp->id = id; + resp->status = st; + + netif->tx.rsp_prod_pvt = ++i; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); + if (notify) + notify_remote_via_irq(netif->irq); + +#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER + if (i == netif->tx.req_cons) { + int more_to_do; + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); + if (more_to_do) + add_to_tx_schedule_list_tail(netif); + } +#endif +} + +inline static void +net_tx_action_dealloc(void) +{ + gnttab_unmap_grant_ref_t *gop; + uint16_t pending_idx; + PEND_RING_IDX dc, dp; + netif_t *netif; + int ret; + + dc = dealloc_cons; + dp = dealloc_prod; + + /* + * Free up any grants we have finished using + */ + gop = tx_unmap_ops; + while (dc != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; + gop->host_addr = MMAP_VADDR(pending_idx); + gop->dev_bus_addr = 0; + gop->handle = grant_tx_handle[pending_idx]; + gop++; + } + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops); + BUG_ON(ret); + + while (dealloc_cons != dp) { + pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; + + netif = pending_tx_info[pending_idx].netif; + + make_tx_response(netif, pending_tx_info[pending_idx].req.id, + NETIF_RSP_OKAY); + + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + + netif_put(netif); + } +} + +static void +netif_page_release(void *buf, void *args) +{ + uint16_t pending_idx = (unsigned int)args; + + DDPRINTF("pending_idx=%u\n", pending_idx); + + KASSERT(pending_idx < MAX_PENDING_REQS, ("%s: bad index %u", __func__, pending_idx)); + + netif_idx_release(pending_idx); +} + +static void +net_tx_action(void 
*context, int pending) +{ + struct mbuf *m; + netif_t *netif; + netif_tx_request_t txreq; + uint16_t pending_idx; + RING_IDX i; + gnttab_map_grant_ref_t *mop; + int ret, work_to_do; + struct mbuf *txq = NULL, *txq_last = NULL; + + if (dealloc_cons != dealloc_prod) + net_tx_action_dealloc(); + + mop = tx_map_ops; + while ((NR_PENDING_REQS < MAX_PENDING_REQS) && !STAILQ_EMPTY(&tx_sched_list)) { + + /* Get a netif from the list with work to do. */ + netif = remove_from_tx_schedule_list(); + + DDPRINTF("Processing %s (prod=%u, cons=%u)\n", + IFNAME(netif), netif->tx.sring->req_prod, netif->tx.req_cons); + + RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); + if (!work_to_do) { + netif_put(netif); + continue; + } + + i = netif->tx.req_cons; + rmb(); /* Ensure that we see the request before we copy it. */ + memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); + + /* If we want credit-based scheduling, coud add it here - WORK */ + + netif->tx.req_cons++; + + netif_schedule_tx_work(netif); + + if (unlikely(txreq.size < ETHER_HDR_LEN) || + unlikely(txreq.size > (ETHER_MAX_LEN-ETHER_CRC_LEN))) { + WPRINTF("Bad packet size: %d\n", txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + /* No crossing a page as the payload mustn't fragment. */ + if (unlikely((txreq.offset + txreq.size) >= PAGE_SIZE)) { + WPRINTF("txreq.offset: %x, size: %u, end: %u\n", + txreq.offset, txreq.size, + (txreq.offset & PAGE_MASK) + txreq.size); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + continue; + } + + pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; + + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (!m) { + WPRINTF("Failed to allocate mbuf\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + m->m_pkthdr.rcvif = netif->ifp; + + if ((m->m_pkthdr.len = txreq.size) > PKT_PROT_LEN) { + struct mbuf *n; + MGET(n, M_DONTWAIT, MT_DATA); + if (!(m->m_next = n)) { + m_freem(m); + WPRINTF("Failed to allocate second mbuf\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + break; + } + n->m_len = txreq.size - PKT_PROT_LEN; + m->m_len = PKT_PROT_LEN; + } else + m->m_len = txreq.size; + + mop->host_addr = MMAP_VADDR(pending_idx); + mop->dom = netif->domid; + mop->ref = txreq.gref; + mop->flags = GNTMAP_host_map | GNTMAP_readonly; + mop++; + + memcpy(&pending_tx_info[pending_idx].req, + &txreq, sizeof(txreq)); + pending_tx_info[pending_idx].netif = netif; + *((uint16_t *)m->m_data) = pending_idx; + + if (txq_last) + txq_last->m_nextpkt = m; + else + txq = m; + txq_last = m; + + pending_cons++; + + if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) + break; + } + + if (!txq) + return; + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops); + BUG_ON(ret); + + mop = tx_map_ops; + while ((m = txq) != NULL) { + caddr_t data; + + txq = m->m_nextpkt; + m->m_nextpkt = NULL; + + pending_idx = *((uint16_t *)m->m_data); + netif = pending_tx_info[pending_idx].netif; + memcpy(&txreq, &pending_tx_info[pending_idx].req, sizeof(txreq)); + + /* Check the remap error code. 
*/ + if (unlikely(mop->status)) { + WPRINTF("#### netback grant fails\n"); + make_tx_response(netif, txreq.id, NETIF_RSP_ERROR); + netif_put(netif); + m_freem(m); + mop++; + pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; + continue; + } + +#if 0 + /* Can't do this in FreeBSD since vtophys() returns the pfn */ + /* of the remote domain who loaned us the machine page - DPT */ + xen_phys_machine[(vtophys(MMAP_VADDR(pending_idx)) >> PAGE_SHIFT)] = + mop->dev_bus_addr >> PAGE_SHIFT; +#endif + grant_tx_handle[pending_idx] = mop->handle; + + /* Setup data in mbuf (lengths are already set) */ + data = (caddr_t)(MMAP_VADDR(pending_idx)|txreq.offset); + bcopy(data, m->m_data, m->m_len); + if (m->m_next) { + struct mbuf *n = m->m_next; + MEXTADD(n, MMAP_VADDR(pending_idx), PAGE_SIZE, netif_page_release, + (void *)(unsigned int)pending_idx, M_RDONLY, EXT_NET_DRV); + n->m_data = &data[PKT_PROT_LEN]; + } else { + /* Schedule a response immediately. */ + netif_idx_release(pending_idx); + } + + if ((txreq.flags & NETTXF_data_validated)) { + /* Tell the stack the checksums are okay */ + m->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + } + + /* If necessary, inform stack to compute the checksums if it forwards the packet */ + if ((txreq.flags & NETTXF_csum_blank)) { + struct ether_header *eh = mtod(m, struct ether_header *); + if (ntohs(eh->ether_type) == ETHERTYPE_IP) { + struct ip *ip = (struct ip *)&m->m_data[14]; + if (ip->ip_p == IPPROTO_TCP) + m->m_pkthdr.csum_flags |= CSUM_TCP; + else if (ip->ip_p == IPPROTO_UDP) + m->m_pkthdr.csum_flags |= CSUM_UDP; + } + } + + netif->ifp->if_ibytes += m->m_pkthdr.len; + netif->ifp->if_ipackets++; + + DDPRINTF("RECV %d bytes from %s (cflags=%x)\n", + m->m_pkthdr.len, IFNAME(netif), m->m_pkthdr.csum_flags); + DPRINTF_MBUF_LEN(m, 128); + + (*netif->ifp->if_input)(netif->ifp, m); + + mop++; + } +} + +/* Handle interrupt from a frontend */ +static void +netback_intr(void *arg) +{ + netif_t *netif = arg; + DDPRINTF("%s\n", IFNAME(netif)); + add_to_tx_schedule_list_tail(netif); + maybe_schedule_tx_action(); +} + +/* Removes netif from front of list and does not call netif_put() (caller must) */ +static netif_t * +remove_from_rx_schedule_list(void) +{ + netif_t *netif; + + mtx_lock(&rx_sched_list_lock); + + if ((netif = STAILQ_FIRST(&rx_sched_list))) { + STAILQ_REMOVE(&rx_sched_list, netif, netback_info, next_rx); + STAILQ_NEXT(netif, next_rx) = NULL; + netif->on_rx_sched_list = 0; + } + + mtx_unlock(&rx_sched_list_lock); + + return netif; +} + +/* Adds netif to end of list and calls netif_get() */ +static void +add_to_rx_schedule_list_tail(netif_t *netif) +{ + if (netif->on_rx_sched_list) + return; + + mtx_lock(&rx_sched_list_lock); + if (!netif->on_rx_sched_list && (netif->ifp->if_drv_flags & IFF_DRV_RUNNING)) { + netif_get(netif); + STAILQ_INSERT_TAIL(&rx_sched_list, netif, next_rx); + netif->on_rx_sched_list = 1; + } + mtx_unlock(&rx_sched_list_lock); +} + +static int +make_rx_response(netif_t *netif, uint16_t id, int8_t st, + uint16_t offset, uint16_t size, uint16_t flags) +{ + RING_IDX i = netif->rx.rsp_prod_pvt; + netif_rx_response_t *resp; + int notify; + + resp = RING_GET_RESPONSE(&netif->rx, i); + resp->offset = offset; + resp->flags = flags; + resp->id = id; + resp->status = (int16_t)size; + if (st < 0) + resp->status = (int16_t)st; + + DDPRINTF("rx resp(%d): off=%x fl=%x id=%x stat=%d\n", + i, resp->offset, resp->flags, resp->id, resp->status); + + 
netif->rx.rsp_prod_pvt = ++i; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, notify); + + return notify; +} + +static int +netif_rx(netif_t *netif) +{ + struct ifnet *ifp = netif->ifp; + struct mbuf *m; + multicall_entry_t *mcl; + mmu_update_t *mmu; + gnttab_transfer_t *gop; + unsigned long vdata, old_mfn, new_mfn; + struct mbuf *rxq = NULL, *rxq_last = NULL; + int ret, notify = 0, pkts_dequeued = 0; + + DDPRINTF("%s\n", IFNAME(netif)); + + mcl = rx_mcl; + mmu = rx_mmu; + gop = grant_rx_op; + + while (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) { + + /* Quit if the target domain has no receive buffers */ + if (netif->rx.req_cons == netif->rx.sring->req_prod) + break; + + IFQ_DRV_DEQUEUE(&ifp->if_snd, m); + if (m == NULL) + break; + + pkts_dequeued++; + + /* Check if we need to copy the data */ + if (((m->m_flags & (M_RDONLY|M_EXT)) != M_EXT) || + (*m->m_ext.ref_cnt > 1) || m->m_next != NULL) { + struct mbuf *n; + + DDPRINTF("copying mbuf (fl=%x ext=%x rc=%d n=%x)\n", + m->m_flags, + (m->m_flags & M_EXT) ? m->m_ext.ext_type : 0, + (m->m_flags & M_EXT) ? *m->m_ext.ref_cnt : 0, + (unsigned int)m->m_next); + + /* Make copy */ + MGETHDR(n, M_DONTWAIT, MT_DATA); + if (!n) + goto drop; + + MCLGET(n, M_DONTWAIT); + if (!(n->m_flags & M_EXT)) { + m_freem(n); + goto drop; + } + + /* Leave space at front and keep current alignment */ + n->m_data += 16 + ((unsigned int)m->m_data & 0x3); + + if (m->m_pkthdr.len > M_TRAILINGSPACE(n)) { + WPRINTF("pkt to big %d\n", m->m_pkthdr.len); + m_freem(n); + goto drop; + } + m_copydata(m, 0, m->m_pkthdr.len, n->m_data); + n->m_pkthdr.len = n->m_len = m->m_pkthdr.len; + n->m_pkthdr.csum_flags = (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA); + m_freem(m); + m = n; + } + + vdata = (unsigned long)m->m_data; + old_mfn = vtomach(vdata) >> PAGE_SHIFT; + + if ((new_mfn = alloc_mfn()) == 0) + goto drop; + +#ifdef XEN_NETBACK_FIXUP_CSUM + /* Check if we need to compute a checksum. This happens */ + /* when bridging from one domain to another. */ + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) + fixup_checksum(m); +#endif + + xen_phys_machine[(vtophys(vdata) >> PAGE_SHIFT)] = new_mfn; + + mcl->op = __HYPERVISOR_update_va_mapping; + mcl->args[0] = vdata; + mcl->args[1] = (new_mfn << PAGE_SHIFT) | PG_V | PG_RW | PG_M | PG_A; + mcl->args[2] = 0; + mcl->args[3] = 0; + mcl++; + + gop->mfn = old_mfn; + gop->domid = netif->domid; + gop->ref = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons)->gref; + netif->rx.req_cons++; + gop++; + + mmu->ptr = (new_mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE; + mmu->val = vtophys(vdata) >> PAGE_SHIFT; + mmu++; + + if (rxq_last) + rxq_last->m_nextpkt = m; + else + rxq = m; + rxq_last = m; + + DDPRINTF("XMIT %d bytes to %s\n", m->m_pkthdr.len, IFNAME(netif)); + DPRINTF_MBUF_LEN(m, 128); + + /* Filled the batch queue? 
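 * Once the grant_rx_op array is full we stop dequeuing; the batch is pushed
 * to the hypervisor below and any remaining packets stay on the interface
 * send queue for a later pass.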
*/ + if ((gop - grant_rx_op) == ARRAY_SIZE(grant_rx_op)) + break; + + continue; + drop: + DDPRINTF("dropping pkt\n"); + ifp->if_oerrors++; + m_freem(m); + } + + if (mcl == rx_mcl) + return pkts_dequeued; + + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (unsigned long)rx_mmu; + mcl->args[1] = mmu - rx_mmu; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + mcl++; + + mcl[-2].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; + ret = HYPERVISOR_multicall(rx_mcl, mcl - rx_mcl); + BUG_ON(ret != 0); + + ret = HYPERVISOR_grant_table_op(GNTTABOP_transfer, grant_rx_op, gop - grant_rx_op); + BUG_ON(ret != 0); + + mcl = rx_mcl; + gop = grant_rx_op; + + while ((m = rxq) != NULL) { + int8_t status; + uint16_t id, flags = 0; + + rxq = m->m_nextpkt; + m->m_nextpkt = NULL; + + /* Rederive the machine addresses. */ + new_mfn = mcl->args[1] >> PAGE_SHIFT; + old_mfn = gop->mfn; + + ifp->if_obytes += m->m_pkthdr.len; + ifp->if_opackets++; + + /* The update_va_mapping() must not fail. */ + BUG_ON(mcl->result != 0); + + /* Setup flags */ + if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)) + flags |= NETRXF_csum_blank | NETRXF_data_validated; + else if ((m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) + flags |= NETRXF_data_validated; + + /* Check the reassignment error code. */ + status = NETIF_RSP_OKAY; + if (gop->status != 0) { + DPRINTF("Bad status %d from grant transfer to DOM%u\n", + gop->status, netif->domid); + /* + * Page no longer belongs to us unless GNTST_bad_page, + * but that should be a fatal error anyway. + */ + BUG_ON(gop->status == GNTST_bad_page); + status = NETIF_RSP_ERROR; + } + id = RING_GET_REQUEST(&netif->rx, netif->rx.rsp_prod_pvt)->id; + notify |= make_rx_response(netif, id, status, + (unsigned long)m->m_data & PAGE_MASK, + m->m_pkthdr.len, flags); + + m_freem(m); + mcl++; + gop++; + } + + if (notify) + notify_remote_via_irq(netif->irq); + + return pkts_dequeued; +} + +static void +rx_task_timer(void *arg) +{ + DDPRINTF("\n"); + taskqueue_enqueue(taskqueue_swi, &net_rx_task); +} + +static void +net_rx_action(void *context, int pending) +{ + netif_t *netif, *last_zero_work = NULL; + + DDPRINTF("\n"); + + while ((netif = remove_from_rx_schedule_list())) { + struct ifnet *ifp = netif->ifp; + + if (netif == last_zero_work) { + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + add_to_rx_schedule_list_tail(netif); + netif_put(netif); + if (!STAILQ_EMPTY(&rx_sched_list)) + callout_reset(&rx_task_callout, 1, rx_task_timer, NULL); + break; + } + + if ((ifp->if_drv_flags & IFF_DRV_RUNNING)) { + if (netif_rx(netif)) + last_zero_work = NULL; + else if (!last_zero_work) + last_zero_work = netif; + if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + add_to_rx_schedule_list_tail(netif); + } + + netif_put(netif); + } +} + +static void +netback_start(struct ifnet *ifp) +{ + netif_t *netif = (netif_t *)ifp->if_softc; + + DDPRINTF("%s\n", IFNAME(netif)); + + add_to_rx_schedule_list_tail(netif); + taskqueue_enqueue(taskqueue_swi, &net_rx_task); +} + +/* Map a grant ref to a ring */ +static int +map_ring(grant_ref_t ref, domid_t dom, struct ring_ref *ring) +{ + struct gnttab_map_grant_ref op; + + ring->va = kmem_alloc_nofault(kernel_map, PAGE_SIZE); + if (ring->va == 0) + return ENOMEM; + + op.host_addr = ring->va; + op.flags = GNTMAP_host_map; + op.ref = ref; + op.dom = dom; + HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1); + if (op.status) { + WPRINTF("grant table op err=%d\n", op.status); + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; + return EACCES; + } + + ring->handle = op.handle; + 
ring->bus_addr = op.dev_bus_addr; + + return 0; +} + +/* Unmap grant ref for a ring */ +static void +unmap_ring(struct ring_ref *ring) +{ + struct gnttab_unmap_grant_ref op; + + op.host_addr = ring->va; + op.dev_bus_addr = ring->bus_addr; + op.handle = ring->handle; + HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1); + if (op.status) + WPRINTF("grant table op err=%d\n", op.status); + + kmem_free(kernel_map, ring->va, PAGE_SIZE); + ring->va = 0; +} + +static int +connect_rings(netif_t *netif) +{ + struct xenbus_device *xdev = netif->xdev; + netif_tx_sring_t *txs; + netif_rx_sring_t *rxs; + unsigned long tx_ring_ref, rx_ring_ref; + evtchn_port_t evtchn; + evtchn_op_t op = { .cmd = EVTCHNOP_bind_interdomain }; + int err; + + // Grab FE data and map his memory + err = xenbus_gather(NULL, xdev->otherend, + "tx-ring-ref", "%lu", &tx_ring_ref, + "rx-ring-ref", "%lu", &rx_ring_ref, + "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(xdev, err, + "reading %s/ring-ref and event-channel", + xdev->otherend); + return err; + } + + err = map_ring(tx_ring_ref, netif->domid, &netif->tx_ring_ref); + if (err) { + xenbus_dev_fatal(xdev, err, "mapping tx ring"); + return err; + } + txs = (netif_tx_sring_t *)netif->tx_ring_ref.va; + BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE); + + err = map_ring(rx_ring_ref, netif->domid, &netif->rx_ring_ref); + if (err) { + unmap_ring(&netif->tx_ring_ref); + xenbus_dev_fatal(xdev, err, "mapping rx ring"); + return err; + } + rxs = (netif_rx_sring_t *)netif->rx_ring_ref.va; + BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE); + + op.u.bind_interdomain.remote_dom = netif->domid; + op.u.bind_interdomain.remote_port = evtchn; + err = HYPERVISOR_event_channel_op(&op); + if (err) { + unmap_ring(&netif->tx_ring_ref); + unmap_ring(&netif->rx_ring_ref); + xenbus_dev_fatal(xdev, err, "binding event channel"); + return err; + } + netif->evtchn = op.u.bind_interdomain.local_port; + + /* bind evtchn to irq handler */ + netif->irq = + bind_evtchn_to_irqhandler(netif->evtchn, "netback", + netback_intr, netif, INTR_TYPE_NET|INTR_MPSAFE, &netif->irq_cookie); + + netif->rings_connected = 1; + + DPRINTF("%s connected! evtchn=%d irq=%d\n", + IFNAME(netif), netif->evtchn, netif->irq); + + return 0; +} + +static void +disconnect_rings(netif_t *netif) +{ + DPRINTF("\n"); + + if (netif->rings_connected) { + unbind_from_irqhandler(netif->irq, netif->irq_cookie); + netif->irq = 0; + unmap_ring(&netif->tx_ring_ref); + unmap_ring(&netif->rx_ring_ref); + netif->rings_connected = 0; + } +} + +static void +connect(netif_t *netif) +{ + if (!netif->xdev || + !netif->attached || + netif->frontend_state != XenbusStateConnected) { + return; + } + + if (!connect_rings(netif)) { + xenbus_switch_state(netif->xdev, NULL, XenbusStateConnected); + + /* Turn on interface */ + netif->ifp->if_drv_flags |= IFF_DRV_RUNNING; + netif->ifp->if_flags |= IFF_UP; + } +} + +static int +netback_remove(struct xenbus_device *xdev) +{ + netif_t *netif = xdev->data; + device_t ndev; + + DPRINTF("remove %s\n", xdev->nodename); + + if ((ndev = netif->ndev)) { + netif->ndev = NULL; + mtx_lock(&Giant); + device_detach(ndev); + mtx_unlock(&Giant); + } + + xdev->data = NULL; + netif->xdev = NULL; + netif_put(netif); + + return 0; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffers for communication with the frontend. + * Switch to Connected state. 
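+ * (The probe itself only moves the xenbus state to InitWait; the switch to
+ * Connected happens later in connect(), once the frontend rings are mapped.)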
+ */ +static int +netback_probe(struct xenbus_device *xdev, const struct xenbus_device_id *id) +{ + int err; + long handle; + char *bridge; + + DPRINTF("node=%s\n", xdev->nodename); + + /* Grab the handle */ + err = xenbus_scanf(NULL, xdev->nodename, "handle", "%li", &handle); + if (err != 1) { + xenbus_dev_fatal(xdev, err, "reading handle"); + return err; + } + + /* Check for bridge */ + bridge = xenbus_read(NULL, xdev->nodename, "bridge", NULL); + if (IS_ERR(bridge)) + bridge = NULL; + + err = xenbus_switch_state(xdev, NULL, XenbusStateInitWait); + if (err) { + xenbus_dev_fatal(xdev, err, "writing switch state"); + return err; + } + + err = netif_create(handle, xdev, bridge); + if (err) { + xenbus_dev_fatal(xdev, err, "creating netif"); + return err; + } + + err = vif_add_dev(xdev); + if (err) { + netif_put((netif_t *)xdev->data); + xenbus_dev_fatal(xdev, err, "adding vif device"); + return err; + } + + return 0; +} + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our netif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int netback_resume(struct xenbus_device *xdev) +{ + DPRINTF("node=%s\n", xdev->nodename); + return 0; +} + + +/** + * Callback received when the frontend's state changes. + */ +static void frontend_changed(struct xenbus_device *xdev, + XenbusState frontend_state) +{ + netif_t *netif = xdev->data; + + DPRINTF("state=%d\n", frontend_state); + + netif->frontend_state = frontend_state; + + switch (frontend_state) { + case XenbusStateInitialising: + case XenbusStateInitialised: + break; + case XenbusStateConnected: + connect(netif); + break; + case XenbusStateClosing: + xenbus_switch_state(xdev, NULL, XenbusStateClosing); + break; + case XenbusStateClosed: + xenbus_remove_device(xdev); + break; + case XenbusStateUnknown: + case XenbusStateInitWait: + xenbus_dev_fatal(xdev, EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + +/* ** Driver registration ** */ + +static struct xenbus_device_id netback_ids[] = { + { "vif" }, + { "" } +}; + +static struct xenbus_driver netback = { + .name = "netback", + .ids = netback_ids, + .probe = netback_probe, + .remove = netback_remove, + .resume= netback_resume, + .otherend_changed = frontend_changed, +}; + +static void +netback_init(void *unused) +{ + callout_init(&rx_task_callout, CALLOUT_MPSAFE); + + mmap_vstart = alloc_empty_page_range(MAX_PENDING_REQS); + BUG_ON(!mmap_vstart); + + pending_cons = 0; + for (pending_prod = 0; pending_prod < MAX_PENDING_REQS; pending_prod++) + pending_ring[pending_prod] = pending_prod; + + TASK_INIT(&net_tx_task, 0, net_tx_action, NULL); + TASK_INIT(&net_rx_task, 0, net_rx_action, NULL); + mtx_init(&tx_sched_list_lock, "nb_tx_sched_lock", "netback tx sched lock", MTX_DEF); + mtx_init(&rx_sched_list_lock, "nb_rx_sched_lock", "netback rx sched lock", MTX_DEF); + + DPRINTF("registering %s\n", netback.name); + + xenbus_register_backend(&netback); +} + +SYSINIT(xnbedev, SI_SUB_PSEUDO, SI_ORDER_ANY, netback_init, NULL) + +static int +vif_add_dev(struct xenbus_device *xdev) +{ + netif_t *netif = xdev->data; + device_t nexus, ndev; + devclass_t dc; + int err = 0; + + mtx_lock(&Giant); + + /* We will add a vif device as a child of nexus0 (for now) */ + if (!(dc = devclass_find("nexus")) || + !(nexus = devclass_get_device(dc, 0))) { + WPRINTF("could not find nexus0!\n"); + err = ENOENT; + goto done; + } + + + /* Create a newbus 
device representing the vif */ + ndev = BUS_ADD_CHILD(nexus, 0, "vif", netif->ifp->if_dunit); + if (!ndev) { + WPRINTF("could not create newbus device %s!\n", IFNAME(netif)); + err = EFAULT; + goto done; + } + + netif_get(netif); + device_set_ivars(ndev, netif); + netif->ndev = ndev; + + device_probe_and_attach(ndev); + + done: + + mtx_unlock(&Giant); + + return err; +} + +enum { + VIF_SYSCTL_DOMID, + VIF_SYSCTL_HANDLE, + VIF_SYSCTL_TXRING, + VIF_SYSCTL_RXRING, +}; + +static char * +vif_sysctl_ring_info(netif_t *netif, int cmd) +{ + char *buf = malloc(256, M_DEVBUF, M_WAITOK); + if (buf) { + if (!netif->rings_connected) + sprintf(buf, "rings not connected\n"); + else if (cmd == VIF_SYSCTL_TXRING) { + netif_tx_back_ring_t *tx = &netif->tx; + sprintf(buf, "nr_ents=%x req_cons=%x" + " req_prod=%x req_event=%x" + " rsp_prod=%x rsp_event=%x", + tx->nr_ents, tx->req_cons, + tx->sring->req_prod, tx->sring->req_event, + tx->sring->rsp_prod, tx->sring->rsp_event); + } else { + netif_rx_back_ring_t *rx = &netif->rx; + sprintf(buf, "nr_ents=%x req_cons=%x" + " req_prod=%x req_event=%x" + " rsp_prod=%x rsp_event=%x", + rx->nr_ents, rx->req_cons, + rx->sring->req_prod, rx->sring->req_event, + rx->sring->rsp_prod, rx->sring->rsp_event); + } + } + return buf; +} + +static int +vif_sysctl_handler(SYSCTL_HANDLER_ARGS) +{ + device_t dev = (device_t)arg1; + netif_t *netif = (netif_t *)device_get_ivars(dev); + const char *value; + char *buf = NULL; + int err; + + switch (arg2) { + case VIF_SYSCTL_DOMID: + return sysctl_handle_int(oidp, NULL, netif->domid, req); + case VIF_SYSCTL_HANDLE: + return sysctl_handle_int(oidp, NULL, netif->handle, req); + case VIF_SYSCTL_TXRING: + case VIF_SYSCTL_RXRING: + value = buf = vif_sysctl_ring_info(netif, arg2); + break; + default: + return (EINVAL); + } + + err = SYSCTL_OUT(req, value, strlen(value)); + if (buf != NULL) + free(buf, M_DEVBUF); + + return err; +} + +/* Newbus vif device driver probe */ +static int +vif_probe(device_t dev) +{ + DDPRINTF("vif%d\n", device_get_unit(dev)); + return 0; +} + +/* Newbus vif device driver attach */ +static int +vif_attach(device_t dev) +{ + netif_t *netif = (netif_t *)device_get_ivars(dev); + uint8_t mac[ETHER_ADDR_LEN]; + + DDPRINTF("%s\n", IFNAME(netif)); + + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "domid", CTLTYPE_INT|CTLFLAG_RD, + dev, VIF_SYSCTL_DOMID, vif_sysctl_handler, "I", + "domid of frontend"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "handle", CTLTYPE_INT|CTLFLAG_RD, + dev, VIF_SYSCTL_HANDLE, vif_sysctl_handler, "I", + "handle of frontend"); +#ifdef XEN_NETBACK_DEBUG + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "txring", CTLFLAG_RD, + dev, VIF_SYSCTL_TXRING, vif_sysctl_handler, "A", + "tx ring info"); + SYSCTL_ADD_PROC(device_get_sysctl_ctx(dev), SYSCTL_CHILDREN(device_get_sysctl_tree(dev)), + OID_AUTO, "rxring", CTLFLAG_RD, + dev, VIF_SYSCTL_RXRING, vif_sysctl_handler, "A", + "rx ring info"); +#endif + + memset(mac, 0xff, sizeof(mac)); + mac[0] &= ~0x01; + + ether_ifattach(netif->ifp, mac); + netif->attached = 1; + + connect(netif); + + if (netif->bridge) { + DPRINTF("Adding %s to bridge %s\n", IFNAME(netif), netif->bridge); + int err = add_to_bridge(netif->ifp, netif->bridge); + if (err) { + WPRINTF("Error adding %s to %s; err=%d\n", + IFNAME(netif), netif->bridge, err); + } + } + + return bus_generic_attach(dev); +} + +/* Newbus vif 
device driver detach */ +static int +vif_detach(device_t dev) +{ + netif_t *netif = (netif_t *)device_get_ivars(dev); + struct ifnet *ifp = netif->ifp; + + DDPRINTF("%s\n", IFNAME(netif)); + + /* Tell the stack that the interface is no longer active */ + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); + + ether_ifdetach(ifp); + + bus_generic_detach(dev); + + netif->attached = 0; + + netif_put(netif); + + return 0; +} + +static device_method_t vif_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, vif_probe), + DEVMETHOD(device_attach, vif_attach), + DEVMETHOD(device_detach, vif_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + {0, 0} +}; + +static devclass_t vif_devclass; + +static driver_t vif_driver = { + "vif", + vif_methods, + 0, +}; + +DRIVER_MODULE(vif, nexus, vif_driver, vif_devclass, 0, 0); + + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/sys/dev/xen/netfront/mbufq.h b/sys/dev/xen/netfront/mbufq.h new file mode 100644 index 0000000..0d6c604 --- /dev/null +++ b/sys/dev/xen/netfront/mbufq.h @@ -0,0 +1,123 @@ +/************************************************************************** + +Copyright (c) 2007, Chelsio Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + 1. Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + + 2. Neither the name of the Chelsio Corporation nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +$FreeBSD$ + +***************************************************************************/ + +#ifndef CXGB_MBUFQ_H_ +#define CXGB_MBUFQ_H_ + +struct mbuf_head { + struct mbuf *head; + struct mbuf *tail; + uint32_t qlen; + uint32_t qsize; + struct mtx lock; +}; + +static __inline void +mbufq_init(struct mbuf_head *l) +{ + l->head = l->tail = NULL; + l->qlen = l->qsize = 0; +} + +static __inline int +mbufq_empty(struct mbuf_head *l) +{ + return (l->head == NULL); +} + +static __inline int +mbufq_len(struct mbuf_head *l) +{ + return (l->qlen); +} + +static __inline int +mbufq_size(struct mbuf_head *l) +{ + return (l->qsize); +} + +static __inline int +mbufq_head_size(struct mbuf_head *l) +{ + return (l->head ? 
l->head->m_pkthdr.len : 0); +} + +static __inline void +mbufq_tail(struct mbuf_head *l, struct mbuf *m) +{ + l->qlen++; + if (l->head == NULL) + l->head = m; + else + l->tail->m_nextpkt = m; + l->tail = m; + l->qsize += m->m_pkthdr.len; +} + +static __inline struct mbuf * +mbufq_dequeue(struct mbuf_head *l) +{ + struct mbuf *m; + + m = l->head; + if (m) { + if (m == l->tail) + l->head = l->tail = NULL; + else + l->head = m->m_nextpkt; + m->m_nextpkt = NULL; + l->qlen--; + l->qsize -= m->m_pkthdr.len; + } + + return (m); +} + +static __inline struct mbuf * +mbufq_peek(struct mbuf_head *l) +{ + return (l->head); +} + +static __inline void +mbufq_append(struct mbuf_head *a, struct mbuf_head *b) +{ + if (a->tail) + a->tail->m_nextpkt = b->head; + if (b->tail) + a->tail = b->tail; + a->qlen += b->qlen; + a->qsize += b->qsize; + + +} +#endif /* CXGB_MBUFQ_H_ */ diff --git a/sys/dev/xen/netfront/netfront.c b/sys/dev/xen/netfront/netfront.c new file mode 100644 index 0000000..fd174b6 --- /dev/null +++ b/sys/dev/xen/netfront/netfront.c @@ -0,0 +1,1829 @@ +/* + * + * Copyright (c) 2004-2006 Kip Macy + * All rights reserved. + * + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sockio.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/queue.h> +#include <sys/sx.h> + +#include <net/if.h> +#include <net/if_arp.h> +#include <net/ethernet.h> +#include <net/if_dl.h> +#include <net/if_media.h> + +#include <net/bpf.h> + +#include <net/if_types.h> +#include <net/if.h> + +#include <netinet/in_systm.h> +#include <netinet/in.h> +#include <netinet/ip.h> +#include <netinet/if_ether.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/clock.h> /* for DELAY */ +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/frame.h> + + +#include <sys/bus.h> +#include <sys/rman.h> + +#include <machine/intr_machdep.h> + +#include <machine/xen/xen-os.h> +#include <machine/xen/hypervisor.h> +#include <machine/xen/xen_intr.h> +#include <machine/xen/evtchn.h> +#include <machine/xen/xenbus.h> +#include <xen/gnttab.h> +#include <xen/interface/memory.h> +#include <dev/xen/netfront/mbufq.h> +#include <machine/xen/features.h> +#include <xen/interface/io/netif.h> + + +#define GRANT_INVALID_REF 0 + +#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) +#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) + +#ifdef CONFIG_XEN +static int MODPARM_rx_copy = 0; +module_param_named(rx_copy, MODPARM_rx_copy, bool, 0); +MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)"); +static int MODPARM_rx_flip = 0; +module_param_named(rx_flip, MODPARM_rx_flip, bool, 0); +MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)"); +#else +static const int MODPARM_rx_copy = 1; +static const int MODPARM_rx_flip = 0; +#endif + +#define RX_COPY_THRESHOLD 256 + +#define net_ratelimit() 0 + +struct netfront_info; +struct netfront_rx_info; + +static void xn_txeof(struct netfront_info *); +static void xn_rxeof(struct netfront_info *); +static void network_alloc_rx_buffers(struct netfront_info *); + +static void xn_tick_locked(struct netfront_info *); +static void xn_tick(void *); + +static void xn_intr(void *); +static void xn_start_locked(struct ifnet *); +static void xn_start(struct ifnet *); +static int xn_ioctl(struct ifnet *, u_long, caddr_t); +static void xn_ifinit_locked(struct netfront_info *); +static void xn_ifinit(void *); +static void xn_stop(struct netfront_info *); +#ifdef notyet +static void xn_watchdog(struct ifnet *); +#endif + +static void show_device(struct netfront_info *sc); +#ifdef notyet +static void netfront_closing(struct xenbus_device *dev); +#endif +static void netif_free(struct netfront_info *info); +static int netfront_remove(struct xenbus_device *dev); + +static int talk_to_backend(struct xenbus_device *dev, struct netfront_info *info); +static int create_netdev(struct xenbus_device *dev, struct ifnet **ifp); +static void netif_disconnect_backend(struct netfront_info *info); +static int setup_device(struct xenbus_device *dev, struct netfront_info *info); +static void end_access(int ref, void *page); + +/* Xenolinux helper functions */ +static int network_connect(struct ifnet *ifp); + +static void xn_free_rx_ring(struct netfront_info *); + +static void xn_free_tx_ring(struct netfront_info *); + +static int xennet_get_responses(struct netfront_info *np, + struct netfront_rx_info *rinfo, RING_IDX rp, struct mbuf_head *list, + int *pages_flipped_p); + +#define virt_to_mfn(x) (vtomach(x) >> 
PAGE_SHIFT) + +#define INVALID_P2M_ENTRY (~0UL) + +/* + * Mbuf pointers. We need these to keep track of the virtual addresses + * of our mbuf chains since we can only convert from virtual to physical, + * not the other way around. The size must track the free index arrays. + */ +struct xn_chain_data { + struct mbuf *xn_tx_chain[NET_TX_RING_SIZE+1]; + struct mbuf *xn_rx_chain[NET_RX_RING_SIZE+1]; +}; + + +struct net_device_stats +{ + u_long rx_packets; /* total packets received */ + u_long tx_packets; /* total packets transmitted */ + u_long rx_bytes; /* total bytes received */ + u_long tx_bytes; /* total bytes transmitted */ + u_long rx_errors; /* bad packets received */ + u_long tx_errors; /* packet transmit problems */ + u_long rx_dropped; /* no space in linux buffers */ + u_long tx_dropped; /* no space available in linux */ + u_long multicast; /* multicast packets received */ + u_long collisions; + + /* detailed rx_errors: */ + u_long rx_length_errors; + u_long rx_over_errors; /* receiver ring buff overflow */ + u_long rx_crc_errors; /* recved pkt with crc error */ + u_long rx_frame_errors; /* recv'd frame alignment error */ + u_long rx_fifo_errors; /* recv'r fifo overrun */ + u_long rx_missed_errors; /* receiver missed packet */ + + /* detailed tx_errors */ + u_long tx_aborted_errors; + u_long tx_carrier_errors; + u_long tx_fifo_errors; + u_long tx_heartbeat_errors; + u_long tx_window_errors; + + /* for cslip etc */ + u_long rx_compressed; + u_long tx_compressed; +}; + +struct netfront_info { + + struct ifnet *xn_ifp; + + struct net_device_stats stats; + u_int tx_full; + + netif_tx_front_ring_t tx; + netif_rx_front_ring_t rx; + + struct mtx tx_lock; + struct mtx rx_lock; + struct sx sc_lock; + + u_int handle; + u_int irq; + u_int copying_receiver; + u_int carrier; + + /* Receive-ring batched refills. */ +#define RX_MIN_TARGET 32 +#define RX_MAX_TARGET NET_RX_RING_SIZE + int rx_min_target, rx_max_target, rx_target; + + /* + * {tx,rx}_skbs store outstanding skbuffs. The first entry in each + * array is an index into a chain of free entries. 
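+ * Entry 0 holds the index of the first free slot and every free slot holds
+ * the index of the next one, cast to a pointer; see add_id_to_freelist()
+ * and get_id_from_freelist() below.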
+ */ + + grant_ref_t gref_tx_head; + grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1]; + grant_ref_t gref_rx_head; + grant_ref_t grant_rx_ref[NET_TX_RING_SIZE + 1]; + +#define TX_MAX_TARGET min(NET_RX_RING_SIZE, 256) + struct xenbus_device *xbdev; + int tx_ring_ref; + int rx_ring_ref; + uint8_t mac[ETHER_ADDR_LEN]; + struct xn_chain_data xn_cdata; /* mbufs */ + struct mbuf_head xn_rx_batch; /* head of the batch queue */ + + int xn_if_flags; + struct callout xn_stat_ch; + + u_long rx_pfn_array[NET_RX_RING_SIZE]; + multicall_entry_t rx_mcl[NET_RX_RING_SIZE+1]; + mmu_update_t rx_mmu[NET_RX_RING_SIZE]; +}; + +#define rx_mbufs xn_cdata.xn_rx_chain +#define tx_mbufs xn_cdata.xn_tx_chain + +#define XN_LOCK_INIT(_sc, _name) \ + mtx_init(&(_sc)->tx_lock, #_name"_tx", "network transmit lock", MTX_DEF); \ + mtx_init(&(_sc)->rx_lock, #_name"_rx", "network receive lock", MTX_DEF); \ + sx_init(&(_sc)->sc_lock, #_name"_rx") + +#define XN_RX_LOCK(_sc) mtx_lock(&(_sc)->rx_lock) +#define XN_RX_UNLOCK(_sc) mtx_unlock(&(_sc)->rx_lock) + +#define XN_TX_LOCK(_sc) mtx_lock(&(_sc)->tx_lock) +#define XN_TX_UNLOCK(_sc) mtx_unlock(&(_sc)->tx_lock) + +#define XN_LOCK(_sc) sx_xlock(&(_sc)->sc_lock); +#define XN_UNLOCK(_sc) sx_xunlock(&(_sc)->sc_lock); + +#define XN_LOCK_ASSERT(_sc) sx_assert(&(_sc)->sc_lock, SX_LOCKED); +#define XN_RX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->rx_lock, MA_OWNED); +#define XN_TX_LOCK_ASSERT(_sc) mtx_assert(&(_sc)->tx_lock, MA_OWNED); +#define XN_LOCK_DESTROY(_sc) mtx_destroy(&(_sc)->rx_lock); \ + mtx_destroy(&(_sc)->tx_lock); \ + sx_destroy(&(_sc)->sc_lock); + +struct netfront_rx_info { + struct netif_rx_response rx; + struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; +}; + +#define netfront_carrier_on(netif) ((netif)->carrier = 1) +#define netfront_carrier_off(netif) ((netif)->carrier = 0) +#define netfront_carrier_ok(netif) ((netif)->carrier) + +/* Access macros for acquiring freeing slots in xn_free_{tx,rx}_idxs[]. */ + + + +/* + * Access macros for acquiring freeing slots in tx_skbs[]. + */ + +static inline void +add_id_to_freelist(struct mbuf **list, unsigned short id) +{ + list[id] = list[0]; + list[0] = (void *)(u_long)id; +} + +static inline unsigned short +get_id_from_freelist(struct mbuf **list) +{ + u_int id = (u_int)(u_long)list[0]; + list[0] = list[id]; + return (id); +} + +static inline int +xennet_rxidx(RING_IDX idx) +{ + return idx & (NET_RX_RING_SIZE - 1); +} + +static inline struct mbuf * +xennet_get_rx_mbuf(struct netfront_info *np, + RING_IDX ri) +{ + int i = xennet_rxidx(ri); + struct mbuf *m; + + m = np->rx_mbufs[i]; + np->rx_mbufs[i] = NULL; + return (m); +} + +static inline grant_ref_t +xennet_get_rx_ref(struct netfront_info *np, RING_IDX ri) +{ + int i = xennet_rxidx(ri); + grant_ref_t ref = np->grant_rx_ref[i]; + np->grant_rx_ref[i] = GRANT_INVALID_REF; + return ref; +} + +#ifdef DEBUG + +#endif +#define IPRINTK(fmt, args...) \ + printf("[XEN] " fmt, ##args) +#define WPRINTK(fmt, args...) \ + printf("[XEN] " fmt, ##args) +#define DPRINTK(fmt, args...) \ + printf("[XEN] " fmt, ##args) + + +static __inline struct mbuf* +makembuf (struct mbuf *buf) +{ + struct mbuf *m = NULL; + + MGETHDR (m, M_DONTWAIT, MT_DATA); + + if (! 
m) + return 0; + + M_MOVE_PKTHDR(m, buf); + + m_cljget(m, M_DONTWAIT, MJUMPAGESIZE); + m->m_pkthdr.len = buf->m_pkthdr.len; + m->m_len = buf->m_len; + m_copydata(buf, 0, buf->m_pkthdr.len, mtod(m,caddr_t) ); + + m->m_ext.ext_arg1 = (caddr_t *)(uintptr_t)(vtophys(mtod(m,caddr_t)) >> PAGE_SHIFT); + + return m; +} + +/** + * Read the 'mac' node at the given device's node in the store, and parse that + * as colon-separated octets, placing result the given mac array. mac must be + * a preallocated array of length ETH_ALEN (as declared in linux/if_ether.h). + * Return 0 on success, or errno on error. + */ +static int +xen_net_read_mac(struct xenbus_device *dev, uint8_t mac[]) +{ + char *s; + int i; + char *e; + char *macstr = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); + if (IS_ERR(macstr)) { + return PTR_ERR(macstr); + } + s = macstr; + for (i = 0; i < ETHER_ADDR_LEN; i++) { + mac[i] = strtoul(s, &e, 16); + if (s == e || (e[0] != ':' && e[0] != 0)) { + free(macstr, M_DEVBUF); + return ENOENT; + } + s = &e[1]; + } + free(macstr, M_DEVBUF); + return 0; +} + +/** + * Entry point to this code when a new device is created. Allocate the basic + * structures and the ring buffers for communication with the backend, and + * inform the backend of the appropriate details for those. Switch to + * Connected state. + */ +static int +netfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) +{ + int err; + struct ifnet *ifp; + struct netfront_info *info; + + printf("netfront_probe() \n"); + + err = create_netdev(dev, &ifp); + if (err) { + xenbus_dev_fatal(dev, err, "creating netdev"); + return err; + } + + info = ifp->if_softc; + dev->dev_driver_data = info; + + + return 0; +} + + +/** + * We are reconnecting to the backend, due to a suspend/resume, or a backend + * driver restart. We tear down our netif structure and recreate it, but + * leave the device-layer structures intact so that this is transparent to the + * rest of the kernel. + */ +static int +netfront_resume(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->dev_driver_data; + + DPRINTK("%s\n", dev->nodename); + + netif_disconnect_backend(info); + return (0); +} + + +/* Common code used when first setting up, and when resuming. */ +static int +talk_to_backend(struct xenbus_device *dev, struct netfront_info *info) +{ + const char *message; + struct xenbus_transaction xbt; + int err; + + err = xen_net_read_mac(dev, info->mac); + if (err) { + xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); + goto out; + } + + /* Create shared ring, alloc event channel. 
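 * setup_device() grants both rings to the backend and binds the event
 * channel to the xn_intr() handler.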
*/ + err = setup_device(dev, info); + if (err) + goto out; + + again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + goto destroy_ring; + } + err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u", + info->tx_ring_ref); + if (err) { + message = "writing tx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u", + info->rx_ring_ref); + if (err) { + message = "writing rx ring-ref"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, + "event-channel", "%u", irq_to_evtchn_port(info->irq)); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u", + info->copying_receiver); + if (err) { + message = "writing request-rx-copy"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1); + if (err) { + message = "writing feature-rx-notify"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1); + if (err) { + message = "writing feature-no-csum-offload"; + goto abort_transaction; + } + err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1); + if (err) { + message = "writing feature-sg"; + goto abort_transaction; + } +#ifdef HAVE_TSO + err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1); + if (err) { + message = "writing feature-gso-tcpv4"; + goto abort_transaction; + } +#endif + + err = xenbus_transaction_end(xbt, 0); + if (err) { + if (err == EAGAIN) + goto again; + xenbus_dev_fatal(dev, err, "completing transaction"); + goto destroy_ring; + } + + return 0; + + abort_transaction: + xenbus_transaction_end(xbt, 1); + xenbus_dev_fatal(dev, err, "%s", message); + destroy_ring: + netif_free(info); + out: + return err; +} + + +static int +setup_device(struct xenbus_device *dev, struct netfront_info *info) +{ + netif_tx_sring_t *txs; + netif_rx_sring_t *rxs; + int err; + struct ifnet *ifp; + + ifp = info->xn_ifp; + + info->tx_ring_ref = GRANT_INVALID_REF; + info->rx_ring_ref = GRANT_INVALID_REF; + info->rx.sring = NULL; + info->tx.sring = NULL; + info->irq = 0; + + txs = (netif_tx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!txs) { + err = ENOMEM; + xenbus_dev_fatal(dev, err, "allocating tx ring page"); + goto fail; + } + SHARED_RING_INIT(txs); + FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE); + err = xenbus_grant_ring(dev, virt_to_mfn(txs)); + if (err < 0) + goto fail; + info->tx_ring_ref = err; + + rxs = (netif_rx_sring_t *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT|M_ZERO); + if (!rxs) { + err = ENOMEM; + xenbus_dev_fatal(dev, err, "allocating rx ring page"); + goto fail; + } + SHARED_RING_INIT(rxs); + FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); + + err = xenbus_grant_ring(dev, virt_to_mfn(rxs)); + if (err < 0) + goto fail; + info->rx_ring_ref = err; + +#if 0 + network_connect(ifp); +#endif + err = bind_listening_port_to_irqhandler(dev->otherend_id, + "xn", xn_intr, info, INTR_TYPE_NET | INTR_MPSAFE, NULL); + + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_evtchn_to_irqhandler failed"); + goto fail; + } + info->irq = err; + + show_device(info); + + return 0; + + fail: + netif_free(info); + return err; +} + +/** + * Callback received when the backend's state changes. 
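+ * InitWait is the interesting transition: it triggers network_connect() and,
+ * on success, the move to Connected.  Closing is acknowledged via
+ * xenbus_frontend_closed().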
+ */ +static void +backend_changed(struct xenbus_device *dev, + XenbusState backend_state) +{ + struct netfront_info *sc = dev->dev_driver_data; + + DPRINTK("\n"); + + switch (backend_state) { + case XenbusStateInitialising: + case XenbusStateInitialised: + case XenbusStateConnected: + case XenbusStateUnknown: + case XenbusStateClosed: + break; + case XenbusStateInitWait: + if (dev->state != XenbusStateInitialising) + break; + if (network_connect(sc->xn_ifp) != 0) + break; + xenbus_switch_state(dev, XenbusStateConnected); +#ifdef notyet + (void)send_fake_arp(netdev); +#endif + break; break; + case XenbusStateClosing: + xenbus_frontend_closed(dev); + break; + } +} + +static void +xn_free_rx_ring(struct netfront_info *sc) +{ +#if 0 + int i; + + for (i = 0; i < NET_RX_RING_SIZE; i++) { + if (sc->xn_cdata.xn_rx_chain[i] != NULL) { + m_freem(sc->xn_cdata.xn_rx_chain[i]); + sc->xn_cdata.xn_rx_chain[i] = NULL; + } + } + + sc->rx.rsp_cons = 0; + sc->xn_rx_if->req_prod = 0; + sc->xn_rx_if->event = sc->rx.rsp_cons ; +#endif +} + +static void +xn_free_tx_ring(struct netfront_info *sc) +{ +#if 0 + int i; + + for (i = 0; i < NET_TX_RING_SIZE; i++) { + if (sc->xn_cdata.xn_tx_chain[i] != NULL) { + m_freem(sc->xn_cdata.xn_tx_chain[i]); + sc->xn_cdata.xn_tx_chain[i] = NULL; + } + } + + return; +#endif +} + +static inline int +netfront_tx_slot_available(struct netfront_info *np) +{ + return ((np->tx.req_prod_pvt - np->tx.rsp_cons) < + (TX_MAX_TARGET - /* MAX_SKB_FRAGS */ 24 - 2)); +} +static void +netif_release_tx_bufs(struct netfront_info *np) +{ + struct mbuf *m; + int i; + + for (i = 1; i <= NET_TX_RING_SIZE; i++) { + m = np->xn_cdata.xn_tx_chain[i]; + + if (((u_long)m) < KERNBASE) + continue; + gnttab_grant_foreign_access_ref(np->grant_tx_ref[i], + np->xbdev->otherend_id, virt_to_mfn(mtod(m, vm_offset_t)), + GNTMAP_readonly); + gnttab_release_grant_reference(&np->gref_tx_head, + np->grant_tx_ref[i]); + np->grant_tx_ref[i] = GRANT_INVALID_REF; + add_id_to_freelist(np->tx_mbufs, i); + m_freem(m); + } +} + +static void +network_alloc_rx_buffers(struct netfront_info *sc) +{ + unsigned short id; + struct mbuf *m_new; + int i, batch_target, notify; + RING_IDX req_prod; + struct xen_memory_reservation reservation; + grant_ref_t ref; + int nr_flips; + netif_rx_request_t *req; + vm_offset_t vaddr; + u_long pfn; + + req_prod = sc->rx.req_prod_pvt; + + if (unlikely(sc->carrier == 0)) + return; + + /* + * Allocate skbuffs greedily, even though we batch updates to the + * receive ring. This creates a less bursty demand on the memory + * allocator, so should reduce the chance of failed allocation + * requests both for ourself and for other kernel subsystems. + */ + batch_target = sc->rx_target - (req_prod - sc->rx.rsp_cons); + for (i = mbufq_len(&sc->xn_rx_batch); i < batch_target; i++) { + MGETHDR(m_new, M_DONTWAIT, MT_DATA); + if (m_new == NULL) + goto no_mbuf; + + m_cljget(m_new, M_DONTWAIT, MJUMPAGESIZE); + if ((m_new->m_flags & M_EXT) == 0) { + m_freem(m_new); + +no_mbuf: + if (i != 0) + goto refill; + /* + * XXX set timer + */ + break; + } + m_new->m_len = m_new->m_pkthdr.len = MJUMPAGESIZE; + + /* queue the mbufs allocated */ + mbufq_tail(&sc->xn_rx_batch, m_new); + } + + /* Is the batch large enough to be worthwhile? */ + if (i < (sc->rx_target/2)) { + if (req_prod >sc->rx.sring->req_prod) + goto push; + return; + } + /* Adjust floating fill target if we risked running out of buffers. 
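 * The target is doubled when the ring ran low, but never grows beyond
 * rx_max_target.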
*/ + if ( ((req_prod - sc->rx.sring->rsp_prod) < (sc->rx_target / 4)) && + ((sc->rx_target *= 2) > sc->rx_max_target) ) + sc->rx_target = sc->rx_max_target; + +refill: + for (nr_flips = i = 0; ; i++) { + if ((m_new = mbufq_dequeue(&sc->xn_rx_batch)) == NULL) + break; + + m_new->m_ext.ext_arg1 = (vm_paddr_t *)(uintptr_t)( + vtophys(m_new->m_ext.ext_buf) >> PAGE_SHIFT); + + id = xennet_rxidx(req_prod + i); + + KASSERT(sc->xn_cdata.xn_rx_chain[id] == NULL, + ("non-NULL xm_rx_chain")); + sc->xn_cdata.xn_rx_chain[id] = m_new; + + ref = gnttab_claim_grant_reference(&sc->gref_rx_head); + KASSERT((short)ref >= 0, ("negative ref")); + sc->grant_rx_ref[id] = ref; + + vaddr = mtod(m_new, vm_offset_t); + pfn = vtophys(vaddr) >> PAGE_SHIFT; + req = RING_GET_REQUEST(&sc->rx, req_prod + i); + + if (sc->copying_receiver == 0) { + gnttab_grant_foreign_transfer_ref(ref, + sc->xbdev->otherend_id, pfn); + sc->rx_pfn_array[nr_flips] = PFNTOMFN(pfn); + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + /* Remove this page before passing + * back to Xen. + */ + set_phys_to_machine(pfn, INVALID_P2M_ENTRY); + MULTI_update_va_mapping(&sc->rx_mcl[i], + vaddr, 0, 0); + } + nr_flips++; + } else { + gnttab_grant_foreign_access_ref(ref, + sc->xbdev->otherend_id, + PFNTOMFN(pfn), 0); + } + req->id = id; + req->gref = ref; + + sc->rx_pfn_array[i] = + vtomach(mtod(m_new,vm_offset_t)) >> PAGE_SHIFT; + } + + KASSERT(i, ("no mbufs processed")); /* should have returned earlier */ + KASSERT(mbufq_len(&sc->xn_rx_batch) == 0, ("not all mbufs processed")); + /* + * We may have allocated buffers which have entries outstanding + * in the page * update queue -- make sure we flush those first! + */ + PT_UPDATES_FLUSH(); + if (nr_flips != 0) { +#ifdef notyet + /* Tell the ballon driver what is going on. */ + balloon_update_driver_allowance(i); +#endif + set_xen_guest_handle(reservation.extent_start,sc->rx_pfn_array); + reservation.nr_extents = i; + reservation.extent_order = 0; + reservation.address_bits = 0; + reservation.domid = DOMID_SELF; + + if (!xen_feature(XENFEAT_auto_translated_physmap)) { + + /* After all PTEs have been zapped, flush the TLB. */ + sc->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] = + UVMF_TLB_FLUSH|UVMF_ALL; + + /* Give away a batch of pages. */ + sc->rx_mcl[i].op = __HYPERVISOR_memory_op; + sc->rx_mcl[i].args[0] = XENMEM_decrease_reservation; + sc->rx_mcl[i].args[1] = (u_long)&reservation; + /* Zap PTEs and give away pages in one big multicall. */ + (void)HYPERVISOR_multicall(sc->rx_mcl, i+1); + + /* Check return status of HYPERVISOR_dom_mem_op(). */ + if (unlikely(sc->rx_mcl[i].result != i)) + panic("Unable to reduce memory reservation\n"); + } else { + if (HYPERVISOR_memory_op( + XENMEM_decrease_reservation, &reservation) + != i) + panic("Unable to reduce memory " + "reservation\n"); + } + } else { + wmb(); + } + + /* Above is a suitable barrier to ensure backend will see requests. 
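 * Either the hypercall issued above or the explicit wmb() orders the new
 * requests before req_prod is published to the backend.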
*/ + sc->rx.req_prod_pvt = req_prod + i; +push: + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->rx, notify); + if (notify) + notify_remote_via_irq(sc->irq); +} + +static void +xn_rxeof(struct netfront_info *np) +{ + struct ifnet *ifp; + struct netfront_rx_info rinfo; + struct netif_rx_response *rx = &rinfo.rx; + struct netif_extra_info *extras = rinfo.extras; + RING_IDX i, rp; + multicall_entry_t *mcl; + struct mbuf *m; + struct mbuf_head rxq, errq, tmpq; + int err, pages_flipped = 0; + + XN_RX_LOCK_ASSERT(np); + if (!netfront_carrier_ok(np)) + return; + + mbufq_init(&tmpq); + mbufq_init(&errq); + mbufq_init(&rxq); + + ifp = np->xn_ifp; + + rp = np->rx.sring->rsp_prod; + rmb(); /* Ensure we see queued responses up to 'rp'. */ + + i = np->rx.rsp_cons; + while ((i != rp)) { + memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx)); + memset(extras, 0, sizeof(rinfo.extras)); + + err = xennet_get_responses(np, &rinfo, rp, &tmpq, + &pages_flipped); + + if (unlikely(err)) { + while ((m = mbufq_dequeue(&tmpq))) + mbufq_tail(&errq, m); + np->stats.rx_errors++; + i = np->rx.rsp_cons; + continue; + } + + m = mbufq_dequeue(&tmpq); + + m->m_data += rx->offset;/* (rx->addr & PAGE_MASK); */ + m->m_pkthdr.len = m->m_len = rx->status; + m->m_pkthdr.rcvif = ifp; + + if ( rx->flags & NETRXF_data_validated ) { + /* Tell the stack the checksums are okay */ + /* + * XXX this isn't necessarily the case - need to add + * check + */ + + m->m_pkthdr.csum_flags |= + (CSUM_IP_CHECKED | CSUM_IP_VALID | CSUM_DATA_VALID + | CSUM_PSEUDO_HDR); + m->m_pkthdr.csum_data = 0xffff; + } + + np->stats.rx_packets++; + np->stats.rx_bytes += rx->status; + + mbufq_tail(&rxq, m); + np->rx.rsp_cons = ++i; + } + + if (pages_flipped) { + /* Some pages are no longer absent... */ +#ifdef notyet + balloon_update_driver_allowance(-pages_flipped); +#endif + /* Do all the remapping work, and M->P updates, in one big + * hypercall. + */ + if (!!xen_feature(XENFEAT_auto_translated_physmap)) { + mcl = np->rx_mcl + pages_flipped; + mcl->op = __HYPERVISOR_mmu_update; + mcl->args[0] = (u_long)np->rx_mmu; + mcl->args[1] = pages_flipped; + mcl->args[2] = 0; + mcl->args[3] = DOMID_SELF; + (void)HYPERVISOR_multicall(np->rx_mcl, + pages_flipped + 1); + } + } + + while ((m = mbufq_dequeue(&errq))) + m_freem(m); + + /* + * Process all the mbufs after the remapping is complete. + * Break the mbuf chain first though. + */ + while ((m = mbufq_dequeue(&rxq)) != NULL) { + ifp->if_ipackets++; + + /* + * Do we really need to drop the rx lock? + */ + XN_RX_UNLOCK(np); + /* Pass it up. */ + (*ifp->if_input)(ifp, m); + XN_RX_LOCK(np); + } + + np->rx.rsp_cons = i; + +#if 0 + /* If we get a callback with very few responses, reduce fill target. */ + /* NB. Note exponential increase, linear decrease. */ + if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > + ((3*np->rx_target) / 4)) && (--np->rx_target < np->rx_min_target)) + np->rx_target = np->rx_min_target; +#endif + + network_alloc_rx_buffers(np); + + np->rx.sring->rsp_event = i + 1; +} + +static void +xn_txeof(struct netfront_info *np) +{ + RING_IDX i, prod; + unsigned short id; + struct ifnet *ifp; + struct mbuf *m; + + XN_TX_LOCK_ASSERT(np); + + if (!netfront_carrier_ok(np)) + return; + + ifp = np->xn_ifp; + ifp->if_timer = 0; + + do { + prod = np->tx.sring->rsp_prod; + rmb(); /* Ensure we see responses up to 'rp'. 
*/ + + for (i = np->tx.rsp_cons; i != prod; i++) { + id = RING_GET_RESPONSE(&np->tx, i)->id; + m = np->xn_cdata.xn_tx_chain[id]; + + ifp->if_opackets++; + KASSERT(m != NULL, ("mbuf not found in xn_tx_chain")); + M_ASSERTVALID(m); + if (unlikely(gnttab_query_foreign_access( + np->grant_tx_ref[id]) != 0)) { + printf("network_tx_buf_gc: warning " + "-- grant still in use by backend " + "domain.\n"); + goto out; + } + gnttab_end_foreign_access_ref( + np->grant_tx_ref[id], GNTMAP_readonly); + gnttab_release_grant_reference( + &np->gref_tx_head, np->grant_tx_ref[id]); + np->grant_tx_ref[id] = GRANT_INVALID_REF; + + np->xn_cdata.xn_tx_chain[id] = NULL; + add_id_to_freelist(np->xn_cdata.xn_tx_chain, id); + m_freem(m); + } + np->tx.rsp_cons = prod; + + /* + * Set a new event, then check for race with update of + * tx_cons. Note that it is essential to schedule a + * callback, no matter how few buffers are pending. Even if + * there is space in the transmit ring, higher layers may + * be blocked because too much data is outstanding: in such + * cases notification from Xen is likely to be the only kick + * that we'll get. + */ + np->tx.sring->rsp_event = + prod + ((np->tx.sring->req_prod - prod) >> 1) + 1; + + mb(); + + } while (prod != np->tx.sring->rsp_prod); + + out: + if (np->tx_full && + ((np->tx.sring->req_prod - prod) < NET_TX_RING_SIZE)) { + np->tx_full = 0; +#if 0 + if (np->user_state == UST_OPEN) + netif_wake_queue(dev); +#endif + } + +} + +static void +xn_intr(void *xsc) +{ + struct netfront_info *np = xsc; + struct ifnet *ifp = np->xn_ifp; + +#if 0 + if (!(np->rx.rsp_cons != np->rx.sring->rsp_prod && + likely(netfront_carrier_ok(np)) && + ifp->if_drv_flags & IFF_DRV_RUNNING)) + return; +#endif + if (np->tx.rsp_cons != np->tx.sring->rsp_prod) { + XN_TX_LOCK(np); + xn_txeof(np); + XN_TX_UNLOCK(np); + } + + XN_RX_LOCK(np); + xn_rxeof(np); + XN_RX_UNLOCK(np); + + if (ifp->if_drv_flags & IFF_DRV_RUNNING && + !IFQ_DRV_IS_EMPTY(&ifp->if_snd)) + xn_start(ifp); +} + + +static void +xennet_move_rx_slot(struct netfront_info *np, struct mbuf *m, + grant_ref_t ref) +{ + int new = xennet_rxidx(np->rx.req_prod_pvt); + + KASSERT(np->rx_mbufs[new] == NULL, ("rx_mbufs != NULL")); + np->rx_mbufs[new] = m; + np->grant_rx_ref[new] = ref; + RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new; + RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref; + np->rx.req_prod_pvt++; +} + +static int +xennet_get_extras(struct netfront_info *np, + struct netif_extra_info *extras, RING_IDX rp) +{ + struct netif_extra_info *extra; + RING_IDX cons = np->rx.rsp_cons; + + int err = 0; + + do { + struct mbuf *m; + grant_ref_t ref; + + if (unlikely(cons + 1 == rp)) { +#if 0 + if (net_ratelimit()) + WPRINTK("Missing extra info\n"); +#endif + err = -EINVAL; + break; + } + + extra = (struct netif_extra_info *) + RING_GET_RESPONSE(&np->rx, ++cons); + + if (unlikely(!extra->type || + extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { +#if 0 + if (net_ratelimit()) + WPRINTK("Invalid extra type: %d\n", + extra->type); +#endif + err = -EINVAL; + } else { + memcpy(&extras[extra->type - 1], extra, sizeof(*extra)); + } + + m = xennet_get_rx_mbuf(np, cons); + ref = xennet_get_rx_ref(np, cons); + xennet_move_rx_slot(np, m, ref); + } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); + + np->rx.rsp_cons = cons; + return err; +} + +static int +xennet_get_responses(struct netfront_info *np, + struct netfront_rx_info *rinfo, RING_IDX rp, + struct mbuf_head *list, + int *pages_flipped_p) +{ + int pages_flipped = *pages_flipped_p; + struct 
mmu_update *mmu; + struct multicall_entry *mcl; + struct netif_rx_response *rx = &rinfo->rx; + struct netif_extra_info *extras = rinfo->extras; + RING_IDX cons = np->rx.rsp_cons; + struct mbuf *m = xennet_get_rx_mbuf(np, cons); + grant_ref_t ref = xennet_get_rx_ref(np, cons); + int max = 24 /* MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD) */; + int frags = 1; + int err = 0; + u_long ret; + + if (rx->flags & NETRXF_extra_info) { + err = xennet_get_extras(np, extras, rp); + cons = np->rx.rsp_cons; + } + + for (;;) { + u_long mfn; + + if (unlikely(rx->status < 0 || + rx->offset + rx->status > PAGE_SIZE)) { +#if 0 + if (net_ratelimit()) + WPRINTK("rx->offset: %x, size: %u\n", + rx->offset, rx->status); +#endif + xennet_move_rx_slot(np, m, ref); + err = -EINVAL; + goto next; + } + + /* + * This definitely indicates a bug, either in this driver or in + * the backend driver. In future this should flag the bad + * situation to the system controller to reboot the backed. + */ + if (ref == GRANT_INVALID_REF) { +#if 0 + if (net_ratelimit()) + WPRINTK("Bad rx response id %d.\n", rx->id); +#endif + err = -EINVAL; + goto next; + } + + if (!np->copying_receiver) { + /* Memory pressure, insufficient buffer + * headroom, ... + */ + if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) { + if (net_ratelimit()) + WPRINTK("Unfulfilled rx req " + "(id=%d, st=%d).\n", + rx->id, rx->status); + xennet_move_rx_slot(np, m, ref); + err = -ENOMEM; + goto next; + } + + if (!xen_feature( XENFEAT_auto_translated_physmap)) { + /* Remap the page. */ + void *vaddr = mtod(m, void *); + uint32_t pfn; + + mcl = np->rx_mcl + pages_flipped; + mmu = np->rx_mmu + pages_flipped; + + MULTI_update_va_mapping(mcl, (u_long)vaddr, + (mfn << PAGE_SHIFT) | PG_RW | + PG_V | PG_M | PG_A, 0); + pfn = (uint32_t)m->m_ext.ext_arg1; + mmu->ptr = ((vm_paddr_t)mfn << PAGE_SHIFT) | + MMU_MACHPHYS_UPDATE; + mmu->val = pfn; + + set_phys_to_machine(pfn, mfn); + } + pages_flipped++; + } else { + ret = gnttab_end_foreign_access_ref(ref, 0); + KASSERT(ret, ("ret != 0")); + } + + gnttab_release_grant_reference(&np->gref_rx_head, ref); + mbufq_tail(list, m); + +next: + if (!(rx->flags & NETRXF_more_data)) + break; + + if (cons + frags == rp) { + if (net_ratelimit()) + WPRINTK("Need more frags\n"); + err = -ENOENT; + break; + } + + rx = RING_GET_RESPONSE(&np->rx, cons + frags); + m = xennet_get_rx_mbuf(np, cons + frags); + ref = xennet_get_rx_ref(np, cons + frags); + frags++; + } + + if (unlikely(frags > max)) { + if (net_ratelimit()) + WPRINTK("Too many frags\n"); + err = -E2BIG; + } + + if (unlikely(err)) + np->rx.rsp_cons = cons + frags; + + *pages_flipped_p = pages_flipped; + + return err; +} + +static void +xn_tick_locked(struct netfront_info *sc) +{ + XN_RX_LOCK_ASSERT(sc); + callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); + + /* XXX placeholder for printing debug information */ + +} + + +static void +xn_tick(void *xsc) +{ + struct netfront_info *sc; + + sc = xsc; + XN_RX_LOCK(sc); + xn_tick_locked(sc); + XN_RX_UNLOCK(sc); + +} +static void +xn_start_locked(struct ifnet *ifp) +{ + unsigned short id; + struct mbuf *m_head, *new_m; + struct netfront_info *sc; + netif_tx_request_t *tx; + RING_IDX i; + grant_ref_t ref; + u_long mfn, tx_bytes; + int notify; + + sc = ifp->if_softc; + tx_bytes = 0; + + if (!netfront_carrier_ok(sc)) + return; + + for (i = sc->tx.req_prod_pvt; TRUE; i++) { + IF_DEQUEUE(&ifp->if_snd, m_head); + if (m_head == NULL) + break; + + if (!netfront_tx_slot_available(sc)) { + IF_PREPEND(&ifp->if_snd, m_head); + ifp->if_drv_flags |= 
IFF_DRV_OACTIVE; + break; + } + + id = get_id_from_freelist(sc->xn_cdata.xn_tx_chain); + + /* + * Start packing the mbufs in this chain into + * the fragment pointers. Stop when we run out + * of fragments or hit the end of the mbuf chain. + */ + new_m = makembuf(m_head); + tx = RING_GET_REQUEST(&sc->tx, i); + tx->id = id; + ref = gnttab_claim_grant_reference(&sc->gref_tx_head); + KASSERT((short)ref >= 0, ("Negative ref")); + mfn = virt_to_mfn(mtod(new_m, vm_offset_t)); + gnttab_grant_foreign_access_ref(ref, sc->xbdev->otherend_id, + mfn, GNTMAP_readonly); + tx->gref = sc->grant_tx_ref[id] = ref; + tx->size = new_m->m_pkthdr.len; +#if 0 + tx->flags = (skb->ip_summed == CHECKSUM_HW) ? NETTXF_csum_blank : 0; +#endif + tx->flags = 0; + new_m->m_next = NULL; + new_m->m_nextpkt = NULL; + + m_freem(m_head); + + sc->xn_cdata.xn_tx_chain[id] = new_m; + BPF_MTAP(ifp, new_m); + + sc->stats.tx_bytes += new_m->m_pkthdr.len; + sc->stats.tx_packets++; + } + + sc->tx.req_prod_pvt = i; + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&sc->tx, notify); + if (notify) + notify_remote_via_irq(sc->irq); + + xn_txeof(sc); + + if (RING_FULL(&sc->tx)) { + sc->tx_full = 1; +#if 0 + netif_stop_queue(dev); +#endif + } + + return; +} + +static void +xn_start(struct ifnet *ifp) +{ + struct netfront_info *sc; + sc = ifp->if_softc; + XN_TX_LOCK(sc); + xn_start_locked(ifp); + XN_TX_UNLOCK(sc); +} + +/* equivalent of network_open() in Linux */ +static void +xn_ifinit_locked(struct netfront_info *sc) +{ + struct ifnet *ifp; + + XN_LOCK_ASSERT(sc); + + ifp = sc->xn_ifp; + + if (ifp->if_drv_flags & IFF_DRV_RUNNING) + return; + + xn_stop(sc); + + network_alloc_rx_buffers(sc); + sc->rx.sring->rsp_event = sc->rx.rsp_cons + 1; + + ifp->if_drv_flags |= IFF_DRV_RUNNING; + ifp->if_drv_flags &= ~IFF_DRV_OACTIVE; + + callout_reset(&sc->xn_stat_ch, hz, xn_tick, sc); + +} + + +static void +xn_ifinit(void *xsc) +{ + struct netfront_info *sc = xsc; + + XN_LOCK(sc); + xn_ifinit_locked(sc); + XN_UNLOCK(sc); + +} + + +static int +xn_ioctl(struct ifnet *ifp, u_long cmd, caddr_t data) +{ + struct netfront_info *sc = ifp->if_softc; + struct ifreq *ifr = (struct ifreq *) data; + struct ifaddr *ifa = (struct ifaddr *)data; + + int mask, error = 0; + switch(cmd) { + case SIOCSIFADDR: + case SIOCGIFADDR: + XN_LOCK(sc); + if (ifa->ifa_addr->sa_family == AF_INET) { + ifp->if_flags |= IFF_UP; + if (!(ifp->if_drv_flags & IFF_DRV_RUNNING)) + xn_ifinit_locked(sc); + arp_ifinit(ifp, ifa); + } else + error = ether_ioctl(ifp, cmd, data); + XN_UNLOCK(sc); + break; + case SIOCSIFMTU: + /* XXX can we alter the MTU on a VN ?*/ +#ifdef notyet + if (ifr->ifr_mtu > XN_JUMBO_MTU) + error = EINVAL; + else +#endif + { + ifp->if_mtu = ifr->ifr_mtu; + ifp->if_drv_flags &= ~IFF_DRV_RUNNING; + xn_ifinit(sc); + } + break; + case SIOCSIFFLAGS: + XN_LOCK(sc); + if (ifp->if_flags & IFF_UP) { + /* + * If only the state of the PROMISC flag changed, + * then just use the 'set promisc mode' command + * instead of reinitializing the entire NIC. Doing + * a full re-init means reloading the firmware and + * waiting for it to start up, which may take a + * second or two. 
+ */ +#ifdef notyet + /* No promiscuous mode with Xen */ + if (ifp->if_drv_flags & IFF_DRV_RUNNING && + ifp->if_flags & IFF_PROMISC && + !(sc->xn_if_flags & IFF_PROMISC)) { + XN_SETBIT(sc, XN_RX_MODE, + XN_RXMODE_RX_PROMISC); + } else if (ifp->if_drv_flags & IFF_DRV_RUNNING && + !(ifp->if_flags & IFF_PROMISC) && + sc->xn_if_flags & IFF_PROMISC) { + XN_CLRBIT(sc, XN_RX_MODE, + XN_RXMODE_RX_PROMISC); + } else +#endif + xn_ifinit_locked(sc); + } else { + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + xn_stop(sc); + } + } + sc->xn_if_flags = ifp->if_flags; + XN_UNLOCK(sc); + error = 0; + break; + case SIOCSIFCAP: + mask = ifr->ifr_reqcap ^ ifp->if_capenable; + if (mask & IFCAP_HWCSUM) { + if (IFCAP_HWCSUM & ifp->if_capenable) + ifp->if_capenable &= ~IFCAP_HWCSUM; + else + ifp->if_capenable |= IFCAP_HWCSUM; + } + error = 0; + break; + case SIOCADDMULTI: + case SIOCDELMULTI: +#ifdef notyet + if (ifp->if_drv_flags & IFF_DRV_RUNNING) { + XN_LOCK(sc); + xn_setmulti(sc); + XN_UNLOCK(sc); + error = 0; + } +#endif + /* FALLTHROUGH */ + case SIOCSIFMEDIA: + case SIOCGIFMEDIA: + error = EINVAL; + break; + default: + error = ether_ioctl(ifp, cmd, data); + } + + return (error); +} + +static void +xn_stop(struct netfront_info *sc) +{ + struct ifnet *ifp; + + XN_LOCK_ASSERT(sc); + + ifp = sc->xn_ifp; + + callout_stop(&sc->xn_stat_ch); + + xn_free_rx_ring(sc); + xn_free_tx_ring(sc); + + ifp->if_drv_flags &= ~(IFF_DRV_RUNNING | IFF_DRV_OACTIVE); +} + +/* START of Xenolinux helper functions adapted to FreeBSD */ +static int +network_connect(struct ifnet *ifp) +{ + struct netfront_info *np; + int i, requeue_idx, err; + grant_ref_t ref; + netif_rx_request_t *req; + u_int feature_rx_copy, feature_rx_flip; + + printf("network_connect\n"); + + np = ifp->if_softc; + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-copy", "%u", &feature_rx_copy); + if (err != 1) + feature_rx_copy = 0; + err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, + "feature-rx-flip", "%u", &feature_rx_flip); + if (err != 1) + feature_rx_flip = 1; + + /* + * Copy packets on receive path if: + * (a) This was requested by user, and the backend supports it; or + * (b) Flipping was requested, but this is unsupported by the backend. + */ + np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || + (MODPARM_rx_flip && !feature_rx_flip)); + + XN_LOCK(np); + /* Recovery procedure: */ + err = talk_to_backend(np->xbdev, np); + if (err) + return (err); + + /* Step 1: Reinitialise variables. */ + netif_release_tx_bufs(np); + + /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */ + for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { + struct mbuf *m; + + if (np->rx_mbufs[i] == NULL) + continue; + + m = np->rx_mbufs[requeue_idx] = xennet_get_rx_mbuf(np, i); + ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); + req = RING_GET_REQUEST(&np->rx, requeue_idx); + + if (!np->copying_receiver) { + gnttab_grant_foreign_transfer_ref(ref, + np->xbdev->otherend_id, + vtophys(mtod(m, vm_offset_t))); + } else { + gnttab_grant_foreign_access_ref(ref, + np->xbdev->otherend_id, + vtophys(mtod(m, vm_offset_t)), 0); + } + req->gref = ref; + req->id = requeue_idx; + + requeue_idx++; + } + + np->rx.req_prod_pvt = requeue_idx; + + /* Step 3: All public and private state should now be sane. Get + * ready to start sending and receiving packets and give the driver + * domain a kick because we've probably just requeued some + * packets. 
+ */ + netfront_carrier_on(np); + notify_remote_via_irq(np->irq); + XN_TX_LOCK(np); + xn_txeof(np); + XN_TX_UNLOCK(np); + network_alloc_rx_buffers(np); + XN_UNLOCK(np); + + return (0); +} + + +static void +show_device(struct netfront_info *sc) +{ +#ifdef DEBUG + if (sc) { + IPRINTK("<vif handle=%u %s(%s) evtchn=%u irq=%u tx=%p rx=%p>\n", + sc->xn_ifno, + be_state_name[sc->xn_backend_state], + sc->xn_user_state ? "open" : "closed", + sc->xn_evtchn, + sc->xn_irq, + sc->xn_tx_if, + sc->xn_rx_if); + } else { + IPRINTK("<vif NULL>\n"); + } +#endif +} + +static int ifno = 0; + +/** Create a network device. + * @param handle device handle + */ +static int +create_netdev(struct xenbus_device *dev, struct ifnet **ifpp) +{ + int i; + struct netfront_info *np; + int err; + struct ifnet *ifp; + + np = (struct netfront_info *)malloc(sizeof(struct netfront_info), + M_DEVBUF, M_NOWAIT); + if (np == NULL) + return (ENOMEM); + + memset(np, 0, sizeof(struct netfront_info)); + + np->xbdev = dev; + + XN_LOCK_INIT(np, xennetif); + np->rx_target = RX_MIN_TARGET; + np->rx_min_target = RX_MIN_TARGET; + np->rx_max_target = RX_MAX_TARGET; + + /* Initialise {tx,rx}_skbs to be a free chain containing every entry. */ + for (i = 0; i <= NET_TX_RING_SIZE; i++) { + np->tx_mbufs[i] = (void *) ((u_long) i+1); + np->grant_tx_ref[i] = GRANT_INVALID_REF; + } + for (i = 0; i <= NET_RX_RING_SIZE; i++) { + np->rx_mbufs[i] = NULL; + np->grant_rx_ref[i] = GRANT_INVALID_REF; + } + /* A grant for every tx ring slot */ + if (gnttab_alloc_grant_references(TX_MAX_TARGET, + &np->gref_tx_head) < 0) { + printf("#### netfront can't alloc tx grant refs\n"); + err = ENOMEM; + goto exit; + } + /* A grant for every rx ring slot */ + if (gnttab_alloc_grant_references(RX_MAX_TARGET, + &np->gref_rx_head) < 0) { + printf("#### netfront can't alloc rx grant refs\n"); + gnttab_free_grant_references(np->gref_tx_head); + err = ENOMEM; + goto exit; + } + + err = xen_net_read_mac(dev, np->mac); + if (err) { + xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); + goto out; + } + + /* Set up ifnet structure */ + *ifpp = ifp = np->xn_ifp = if_alloc(IFT_ETHER); + ifp->if_softc = np; + if_initname(ifp, "xn", ifno++/* ifno */); + ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX; + ifp->if_ioctl = xn_ioctl; + ifp->if_output = ether_output; + ifp->if_start = xn_start; +#ifdef notyet + ifp->if_watchdog = xn_watchdog; +#endif + ifp->if_init = xn_ifinit; + ifp->if_mtu = ETHERMTU; + ifp->if_snd.ifq_maxlen = NET_TX_RING_SIZE - 1; + +#ifdef notyet + ifp->if_hwassist = XN_CSUM_FEATURES; + ifp->if_capabilities = IFCAP_HWCSUM; + ifp->if_capenable = ifp->if_capabilities; +#endif + + ether_ifattach(ifp, np->mac); + callout_init(&np->xn_stat_ch, CALLOUT_MPSAFE); + netfront_carrier_off(np); + + return (0); + +exit: + gnttab_free_grant_references(np->gref_tx_head); +out: + panic("do something smart"); + +} + +/** + * Handle the change of state of the backend to Closing. We must delete our + * device-layer structures now, to ensure that writes are flushed through to + * the backend. Once is this done, we can switch to Closed in + * acknowledgement. 
+ */ +#if 0 +static void netfront_closing(struct xenbus_device *dev) +{ +#if 0 + struct netfront_info *info = dev->dev_driver_data; + + DPRINTK("netfront_closing: %s removed\n", dev->nodename); + + close_netdev(info); +#endif + xenbus_switch_state(dev, XenbusStateClosed); +} +#endif + +static int netfront_remove(struct xenbus_device *dev) +{ + struct netfront_info *info = dev->dev_driver_data; + + DPRINTK("%s\n", dev->nodename); + + netif_free(info); + free(info, M_DEVBUF); + + return 0; +} + + +static void netif_free(struct netfront_info *info) +{ + netif_disconnect_backend(info); +#if 0 + close_netdev(info); +#endif +} + + + +static void netif_disconnect_backend(struct netfront_info *info) +{ + xn_stop(info); + end_access(info->tx_ring_ref, info->tx.sring); + end_access(info->rx_ring_ref, info->rx.sring); + info->tx_ring_ref = GRANT_INVALID_REF; + info->rx_ring_ref = GRANT_INVALID_REF; + info->tx.sring = NULL; + info->rx.sring = NULL; + +#if 0 + if (info->irq) + unbind_from_irqhandler(info->irq, info->netdev); +#else + panic("FIX ME"); +#endif + info->irq = 0; +} + + +static void end_access(int ref, void *page) +{ + if (ref != GRANT_INVALID_REF) + gnttab_end_foreign_access(ref, 0, page); +} + + +/* ** Driver registration ** */ + + +static struct xenbus_device_id netfront_ids[] = { + { "vif" }, + { "" } +}; + + +static struct xenbus_driver netfront = { + .name = "vif", + .ids = netfront_ids, + .probe = netfront_probe, + .remove = netfront_remove, + .resume = netfront_resume, + .otherend_changed = backend_changed, +}; + +static void +netif_init(void *unused) +{ + if (!is_running_on_xen()) + return; + + if (is_initial_xendomain()) + return; + + IPRINTK("Initialising virtual ethernet driver.\n"); + + xenbus_register_frontend(&netfront); +} + +SYSINIT(xennetif, SI_SUB_PSEUDO, SI_ORDER_ANY, netif_init, NULL) + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 8 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ diff --git a/sys/dev/xen/pcifront/pcifront.c b/sys/dev/xen/pcifront/pcifront.c new file mode 100644 index 0000000..e6c498b --- /dev/null +++ b/sys/dev/xen/pcifront/pcifront.c @@ -0,0 +1,688 @@ +/* + * Copyright (c) 2006, Cisco Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of Cisco Systems, Inc. nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/module.h> +#include <sys/systm.h> +#include <sys/mbuf.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/socket.h> +#include <sys/queue.h> + +#include <machine/vmparam.h> +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/bus.h> +#include <machine/resource.h> +#include <machine/frame.h> + +#include <sys/bus.h> +#include <sys/rman.h> + +#include <machine/intr_machdep.h> + +#include <machine/xen-os.h> +#include <machine/hypervisor.h> +#include <machine/hypervisor-ifs.h> +#include <machine/xen_intr.h> +#include <machine/evtchn.h> +#include <machine/xenbus.h> +#include <machine/gnttab.h> +#include <machine/xen-public/memory.h> +#include <machine/xen-public/io/pciif.h> + +#include <sys/pciio.h> +#include <dev/pci/pcivar.h> +#include "pcib_if.h" + +#ifdef XEN_PCIDEV_FE_DEBUG +#define DPRINTF(fmt, args...) \ + printf("pcifront (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) +#else +#define DPRINTF(fmt, args...) ((void)0) +#endif +#define WPRINTF(fmt, args...) \ + printf("pcifront (%s:%d): " fmt, __FUNCTION__, __LINE__, ##args) + +#define INVALID_GRANT_REF (0) +#define INVALID_EVTCHN (-1) +#define virt_to_mfn(x) (vtomach(x) >> PAGE_SHIFT) + +struct pcifront_device { + STAILQ_ENTRY(pcifront_device) next; + + struct xenbus_device *xdev; + + int unit; + int evtchn; + int gnt_ref; + + /* Lock this when doing any operations in sh_info */ + struct mtx sh_info_lock; + struct xen_pci_sharedinfo *sh_info; + + device_t ndev; + + int ref_cnt; +}; + +static STAILQ_HEAD(pcifront_dlist, pcifront_device) pdev_list = STAILQ_HEAD_INITIALIZER(pdev_list); + +struct xpcib_softc { + int domain; + int bus; + struct pcifront_device *pdev; +}; + +/* Allocate a PCI device structure */ +static struct pcifront_device * +alloc_pdev(struct xenbus_device *xdev) +{ + struct pcifront_device *pdev = NULL; + int err, unit; + + err = sscanf(xdev->nodename, "device/pci/%d", &unit); + if (err != 1) { + if (err == 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, "Error scanning pci device instance number"); + goto out; + } + + pdev = (struct pcifront_device *)malloc(sizeof(struct pcifront_device), M_DEVBUF, M_NOWAIT); + if (pdev == NULL) { + err = -ENOMEM; + xenbus_dev_fatal(xdev, err, "Error allocating pcifront_device struct"); + goto out; + } + pdev->unit = unit; + pdev->xdev = xdev; + pdev->ref_cnt = 1; + + pdev->sh_info = (struct xen_pci_sharedinfo *)malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT); + if (pdev->sh_info == NULL) { + free(pdev, M_DEVBUF); + pdev = NULL; + err = -ENOMEM; + xenbus_dev_fatal(xdev, err, "Error allocating sh_info struct"); + goto out; + } + pdev->sh_info->flags = 0; + + xdev->data = pdev; + + mtx_init(&pdev->sh_info_lock, "info_lock", "pci shared dev info lock", MTX_DEF); + + pdev->evtchn = INVALID_EVTCHN; + pdev->gnt_ref = INVALID_GRANT_REF; + + STAILQ_INSERT_TAIL(&pdev_list, pdev, next); + + DPRINTF("Allocated pdev @ 0x%p (unit=%d)\n", pdev, unit); + + 
out: + return pdev; +} + +/* Hold a reference to a pcifront device */ +static void +get_pdev(struct pcifront_device *pdev) +{ + pdev->ref_cnt++; +} + +/* Release a reference to a pcifront device */ +static void +put_pdev(struct pcifront_device *pdev) +{ + if (--pdev->ref_cnt > 0) + return; + + DPRINTF("freeing pdev @ 0x%p (ref_cnt=%d)\n", pdev, pdev->ref_cnt); + + if (pdev->evtchn != INVALID_EVTCHN) + xenbus_free_evtchn(pdev->xdev, pdev->evtchn); + + if (pdev->gnt_ref != INVALID_GRANT_REF) + gnttab_end_foreign_access(pdev->gnt_ref, 0, (void *)pdev->sh_info); + + pdev->xdev->data = NULL; + + free(pdev, M_DEVBUF); +} + + +/* Write to the xenbus info needed by backend */ +static int +pcifront_publish_info(struct pcifront_device *pdev) +{ + int err = 0; + struct xenbus_transaction *trans; + + err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info)); + if (err < 0) { + WPRINTF("error granting access to ring page\n"); + goto out; + } + + pdev->gnt_ref = err; + + err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn); + if (err) + goto out; + + do_publish: + trans = xenbus_transaction_start(); + if (IS_ERR(trans)) { + xenbus_dev_fatal(pdev->xdev, err, + "Error writing configuration for backend " + "(start transaction)"); + goto out; + } + + err = xenbus_printf(trans, pdev->xdev->nodename, + "pci-op-ref", "%u", pdev->gnt_ref); + if (!err) + err = xenbus_printf(trans, pdev->xdev->nodename, + "event-channel", "%u", pdev->evtchn); + if (!err) + err = xenbus_printf(trans, pdev->xdev->nodename, + "magic", XEN_PCI_MAGIC); + if (!err) + err = xenbus_switch_state(pdev->xdev, trans, + XenbusStateInitialised); + + if (err) { + xenbus_transaction_end(trans, 1); + xenbus_dev_fatal(pdev->xdev, err, + "Error writing configuration for backend"); + goto out; + } else { + err = xenbus_transaction_end(trans, 0); + if (err == -EAGAIN) + goto do_publish; + else if (err) { + xenbus_dev_fatal(pdev->xdev, err, + "Error completing transaction for backend"); + goto out; + } + } + + out: + return err; +} + +/* The backend is now connected so complete the connection process on our side */ +static int +pcifront_connect(struct pcifront_device *pdev) +{ + device_t nexus; + devclass_t nexus_devclass; + + /* We will add our device as a child of the nexus0 device */ + if (!(nexus_devclass = devclass_find("nexus")) || + !(nexus = devclass_get_device(nexus_devclass, 0))) { + WPRINTF("could not find nexus0!\n"); + return -1; + } + + /* Create a newbus device representing this frontend instance */ + pdev->ndev = BUS_ADD_CHILD(nexus, 0, "xpcife", pdev->unit); + if (!pdev->ndev) { + WPRINTF("could not create xpcife%d!\n", pdev->unit); + return -EFAULT; + } + get_pdev(pdev); + device_set_ivars(pdev->ndev, pdev); + + /* Good to go connected now */ + xenbus_switch_state(pdev->xdev, NULL, XenbusStateConnected); + + printf("pcifront: connected to %s\n", pdev->xdev->nodename); + + mtx_lock(&Giant); + device_probe_and_attach(pdev->ndev); + mtx_unlock(&Giant); + + return 0; +} + +/* The backend is closing so process a disconnect */ +static int +pcifront_disconnect(struct pcifront_device *pdev) +{ + int err = 0; + XenbusState prev_state; + + prev_state = xenbus_read_driver_state(pdev->xdev->nodename); + + if (prev_state < XenbusStateClosing) { + err = xenbus_switch_state(pdev->xdev, NULL, XenbusStateClosing); + if (!err && prev_state == XenbusStateConnected) { + /* TODO - need to detach the newbus devices */ + } + } + + return err; +} + +/* Process a probe from the xenbus */ +static int +pcifront_probe(struct xenbus_device *xdev, + const 
struct xenbus_device_id *id) +{ + int err = 0; + struct pcifront_device *pdev; + + DPRINTF("xenbus probing\n"); + + if ((pdev = alloc_pdev(xdev)) == NULL) + goto out; + + err = pcifront_publish_info(pdev); + + out: + if (err) + put_pdev(pdev); + return err; +} + +/* Remove the xenbus PCI device */ +static int +pcifront_remove(struct xenbus_device *xdev) +{ + DPRINTF("removing xenbus device node (%s)\n", xdev->nodename); + if (xdev->data) + put_pdev(xdev->data); + return 0; +} + +/* Called by xenbus when our backend node changes state */ +static void +pcifront_backend_changed(struct xenbus_device *xdev, + XenbusState be_state) +{ + struct pcifront_device *pdev = xdev->data; + + switch (be_state) { + case XenbusStateClosing: + DPRINTF("backend closing (%s)\n", xdev->nodename); + pcifront_disconnect(pdev); + break; + + case XenbusStateClosed: + DPRINTF("backend closed (%s)\n", xdev->nodename); + pcifront_disconnect(pdev); + break; + + case XenbusStateConnected: + DPRINTF("backend connected (%s)\n", xdev->nodename); + pcifront_connect(pdev); + break; + + default: + break; + } +} + +/* Process PCI operation */ +static int +do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op) +{ + int err = 0; + struct xen_pci_op *active_op = &pdev->sh_info->op; + evtchn_port_t port = pdev->evtchn; + time_t timeout; + + mtx_lock(&pdev->sh_info_lock); + + memcpy(active_op, op, sizeof(struct xen_pci_op)); + + /* Go */ + wmb(); + set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); + notify_remote_via_evtchn(port); + + timeout = time_uptime + 2; + + clear_evtchn(port); + + /* Spin while waiting for the answer */ + while (test_bit + (_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags)) { + int err = HYPERVISOR_poll(&port, 1, 3 * hz); + if (err) + panic("Failed HYPERVISOR_poll: err=%d", err); + clear_evtchn(port); + if (time_uptime > timeout) { + WPRINTF("pciback not responding!!!\n"); + clear_bit(_XEN_PCIF_active, + (unsigned long *)&pdev->sh_info->flags); + err = XEN_PCI_ERR_dev_not_found; + goto out; + } + } + + memcpy(op, active_op, sizeof(struct xen_pci_op)); + + err = op->err; + out: + mtx_unlock(&pdev->sh_info_lock); + return err; +} + +/* ** XenBus Driver registration ** */ + +static struct xenbus_device_id pcifront_ids[] = { + { "pci" }, + { "" } +}; + +static struct xenbus_driver pcifront = { + .name = "pcifront", + .ids = pcifront_ids, + .probe = pcifront_probe, + .remove = pcifront_remove, + .otherend_changed = pcifront_backend_changed, +}; + +/* Register the driver with xenbus during sys init */ +static void +pcifront_init(void *unused) +{ + if ((xen_start_info->flags & SIF_INITDOMAIN)) + return; + + DPRINTF("xenbus registering\n"); + + xenbus_register_frontend(&pcifront); +} + +SYSINIT(pciif, SI_SUB_PSEUDO, SI_ORDER_ANY, pcifront_init, NULL) + + +/* Newbus xpcife device driver probe */ +static int +xpcife_probe(device_t dev) +{ +#ifdef XEN_PCIDEV_FE_DEBUG + struct pcifront_device *pdev = (struct pcifront_device *)device_get_ivars(dev); + DPRINTF("xpcife probe (unit=%d)\n", pdev->unit); +#endif + return 0; +} + +/* Newbus xpcife device driver attach */ +static int +xpcife_attach(device_t dev) +{ + struct pcifront_device *pdev = (struct pcifront_device *)device_get_ivars(dev); + int i, num_roots, len, err; + char str[64]; + unsigned int domain, bus; + + DPRINTF("xpcife attach (unit=%d)\n", pdev->unit); + + err = xenbus_scanf(NULL, pdev->xdev->otherend, + "root_num", "%d", &num_roots); + if (err != 1) { + if (err == 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + 
"Error reading number of PCI roots"); + goto out; + } + + /* Add a pcib device for each root */ + for (i = 0; i < num_roots; i++) { + device_t child; + + len = snprintf(str, sizeof(str), "root-%d", i); + if (unlikely(len >= (sizeof(str) - 1))) { + err = -ENOMEM; + goto out; + } + + err = xenbus_scanf(NULL, pdev->xdev->otherend, str, + "%x:%x", &domain, &bus); + if (err != 2) { + if (err >= 0) + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Error reading PCI root %d", i); + goto out; + } + err = 0; + if (domain != pdev->xdev->otherend_id) { + err = -EINVAL; + xenbus_dev_fatal(pdev->xdev, err, + "Domain mismatch %d != %d", domain, pdev->xdev->otherend_id); + goto out; + } + + child = device_add_child(dev, "pcib", bus); + if (!child) { + err = -ENOMEM; + xenbus_dev_fatal(pdev->xdev, err, + "Unable to create pcib%d", bus); + goto out; + } + } + + out: + return bus_generic_attach(dev); +} + +static devclass_t xpcife_devclass; + +static device_method_t xpcife_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xpcife_probe), + DEVMETHOD(device_attach, xpcife_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + /* Bus interface */ + DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + {0, 0} +}; + +static driver_t xpcife_driver = { + "xpcife", + xpcife_methods, + 0, +}; + +DRIVER_MODULE(xpcife, nexus, xpcife_driver, xpcife_devclass, 0, 0); + + +/* Newbus xen pcib device driver probe */ +static int +xpcib_probe(device_t dev) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + struct pcifront_device *pdev = (struct pcifront_device *)device_get_ivars(device_get_parent(dev)); + + DPRINTF("xpcib probe (bus=%d)\n", device_get_unit(dev)); + + sc->domain = pdev->xdev->otherend_id; + sc->bus = device_get_unit(dev); + sc->pdev = pdev; + + return 0; +} + +/* Newbus xen pcib device driver attach */ +static int +xpcib_attach(device_t dev) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + + DPRINTF("xpcib attach (bus=%d)\n", sc->bus); + + device_add_child(dev, "pci", sc->bus); + return bus_generic_attach(dev); +} + +static int +xpcib_read_ivar(device_t dev, device_t child, int which, uintptr_t *result) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + switch (which) { + case PCIB_IVAR_BUS: + *result = sc->bus; + return 0; + } + return ENOENT; +} + +/* Return the number of slots supported */ +static int +xpcib_maxslots(device_t dev) +{ + return 31; +} + +#define PCI_DEVFN(slot,func) ((((slot) & 0x1f) << 3) | ((func) & 0x07)) + +/* Read configuration space register */ +static u_int32_t +xpcib_read_config(device_t dev, int bus, int slot, int func, + int reg, int bytes) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_conf_read, + .domain = sc->domain, + .bus = sc->bus, + .devfn = PCI_DEVFN(slot, func), + .offset = reg, + .size = bytes, + }; + int err; + + err = do_pci_op(sc->pdev, &op); + + DPRINTF("read config (b=%d, 
s=%d, f=%d, reg=%d, len=%d, val=%x, err=%d)\n", + bus, slot, func, reg, bytes, op.value, err); + + if (err) + op.value = ~0; + + return op.value; +} + +/* Write configuration space register */ +static void +xpcib_write_config(device_t dev, int bus, int slot, int func, + int reg, u_int32_t data, int bytes) +{ + struct xpcib_softc *sc = (struct xpcib_softc *)device_get_softc(dev); + struct xen_pci_op op = { + .cmd = XEN_PCI_OP_conf_write, + .domain = sc->domain, + .bus = sc->bus, + .devfn = PCI_DEVFN(slot, func), + .offset = reg, + .size = bytes, + .value = data, + }; + int err; + + err = do_pci_op(sc->pdev, &op); + + DPRINTF("write config (b=%d, s=%d, f=%d, reg=%d, len=%d, val=%x, err=%d)\n", + bus, slot, func, reg, bytes, data, err); +} + +static int +xpcib_route_interrupt(device_t pcib, device_t dev, int pin) +{ + struct pci_devinfo *dinfo = device_get_ivars(dev); + pcicfgregs *cfg = &dinfo->cfg; + + DPRINTF("route intr (pin=%d, line=%d)\n", pin, cfg->intline); + + return cfg->intline; +} + +static device_method_t xpcib_methods[] = { + /* Device interface */ + DEVMETHOD(device_probe, xpcib_probe), + DEVMETHOD(device_attach, xpcib_attach), + DEVMETHOD(device_detach, bus_generic_detach), + DEVMETHOD(device_shutdown, bus_generic_shutdown), + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_print_child, bus_generic_print_child), + DEVMETHOD(bus_read_ivar, xpcib_read_ivar), + DEVMETHOD(bus_alloc_resource, bus_generic_alloc_resource), + DEVMETHOD(bus_activate_resource, bus_generic_activate_resource), + DEVMETHOD(bus_release_resource, bus_generic_release_resource), + DEVMETHOD(bus_deactivate_resource, bus_generic_deactivate_resource), + DEVMETHOD(bus_setup_intr, bus_generic_setup_intr), + DEVMETHOD(bus_teardown_intr, bus_generic_teardown_intr), + + /* pcib interface */ + DEVMETHOD(pcib_maxslots, xpcib_maxslots), + DEVMETHOD(pcib_read_config, xpcib_read_config), + DEVMETHOD(pcib_write_config, xpcib_write_config), + DEVMETHOD(pcib_route_interrupt, xpcib_route_interrupt), + { 0, 0 } +}; + +static devclass_t xpcib_devclass; + +DEFINE_CLASS_0(pcib, xpcib_driver, xpcib_methods, sizeof(struct xpcib_softc)); +DRIVER_MODULE(pcib, xpcife, xpcib_driver, xpcib_devclass, 0, 0); + +/* + * Local variables: + * mode: C + * c-set-style: "BSD" + * c-basic-offset: 4 + * tab-width: 4 + * indent-tabs-mode: t + * End: + */ |
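
The config-space accessors above (xpcib_read_config() and xpcib_write_config()) funnel every access through do_pci_op(): the slot/function pair is packed into a devfn byte with PCI_DEVFN(), the request is copied into the shared xen_pci_sharedinfo page, the backend is notified over the event channel, and the frontend then spins until the backend clears _XEN_PCIF_active. The stand-alone sketch below only illustrates the devfn packing and the request fields a read uses; struct fake_pci_op, PCI_SLOT() and PCI_FUNC() are local stand-ins introduced here for illustration, and nothing in it talks to Xen, the shared page, or the event channel.

/*
 * Minimal user-space sketch of how xpcib_read_config() packs a request.
 * "struct fake_pci_op" mirrors only the xen_pci_op fields used by a
 * config-space read; compile with: cc -o devfn devfn.c
 */
#include <stdio.h>
#include <stdint.h>

#define PCI_DEVFN(slot, func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
#define PCI_FUNC(devfn)		((devfn) & 0x07)

struct fake_pci_op {
	uint32_t domain;	/* PCI domain/segment of the root */
	uint32_t bus;		/* bus number exposed by the backend */
	uint32_t devfn;		/* packed slot/function */
	int32_t  offset;	/* config-space register offset */
	int32_t  size;		/* access width: 1, 2 or 4 bytes */
};

int
main(void)
{
	/* Read the 16-bit vendor ID (offset 0) of slot 3, function 0, bus 0. */
	struct fake_pci_op op = {
		.domain = 0,
		.bus    = 0,
		.devfn  = PCI_DEVFN(3, 0),
		.offset = 0,
		.size   = 2,
	};

	printf("devfn=0x%02x -> slot=%u func=%u, offset=%d, size=%d\n",
	    op.devfn, PCI_SLOT(op.devfn), PCI_FUNC(op.devfn),
	    op.offset, op.size);
	return (0);
}

In the driver itself the filled-in op is handed to do_pci_op(), which serializes on sh_info_lock, sets _XEN_PCIF_active, and polls with a timeout before copying the backend's answer (including op.value and op.err) back out, as shown in the pcifront.c hunk above.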