diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-20 13:28:18 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-03-20 13:28:18 -0700 |
commit | f0691533b756931089902464ca15afc218a49d70 (patch) | |
tree | 7d72b43866be5ae5507efb9c2976059d5d5cc0f1 /drivers/virtio | |
parent | 2b2f72d8ce59acce95ceb156f0f31b848e32e6d4 (diff) | |
parent | c67f5db82027ba6d2ea4ac9176bc45996a03ae6a (diff) | |
download | op-kernel-dev-f0691533b756931089902464ca15afc218a49d70.zip op-kernel-dev-f0691533b756931089902464ca15afc218a49d70.tar.gz |
Merge tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost
Pull virtio/vhost updates from Michael Tsirkin:
"New features, performance improvements, cleanups:
- basic polling support for vhost
- rework virtio to optionally use DMA API, fixing it on Xen
- balloon stats gained a new entry
- using the new napi_alloc_skb speeds up virtio net
- virtio blk stats can now be read while another VCPU is busy
inflating or deflating the balloon
plus misc cleanups in various places"
* tag 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost:
virtio_net: replace netdev_alloc_skb_ip_align() with napi_alloc_skb()
vhost_net: basic polling support
vhost: introduce vhost_vq_avail_empty()
vhost: introduce vhost_has_work()
virtio_balloon: Allow to resize and update the balloon stats in parallel
virtio_balloon: Use a workqueue instead of "vballoon" kthread
virtio/s390: size of SET_IND payload
virtio/s390: use dev_to_virtio
vhost: rename vhost_init_used()
vhost: rename cross-endian helpers
virtio_blk: VIRTIO_BLK_F_WCE->VIRTIO_BLK_F_FLUSH
vring: Use the DMA API on Xen
virtio_pci: Use the DMA API if enabled
virtio_mmio: Use the DMA API if enabled
virtio: Add improved queue allocation API
virtio_ring: Support DMA APIs
vring: Introduce vring_use_dma_api()
s390/dma: Allow per device dma ops
alpha/dma: use common noop dma ops
dma: Provide simple noop dma ops
Diffstat (limited to 'drivers/virtio')
-rw-r--r-- | drivers/virtio/Kconfig | 2 | ||||
-rw-r--r-- | drivers/virtio/virtio_balloon.c | 122 | ||||
-rw-r--r-- | drivers/virtio/virtio_mmio.c | 67 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_common.h | 6 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_legacy.c | 42 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci_modern.c | 61 | ||||
-rw-r--r-- | drivers/virtio/virtio_ring.c | 439 |
7 files changed, 487 insertions, 252 deletions
diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index cab9f3f..7759032 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -60,7 +60,7 @@ config VIRTIO_INPUT config VIRTIO_MMIO tristate "Platform bus driver for memory mapped virtio devices" - depends on HAS_IOMEM + depends on HAS_IOMEM && HAS_DMA select VIRTIO ---help--- This drivers provides support for memory mapped virtio diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index f2b77de..7b6d74f 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -22,8 +22,7 @@ #include <linux/virtio.h> #include <linux/virtio_balloon.h> #include <linux/swap.h> -#include <linux/kthread.h> -#include <linux/freezer.h> +#include <linux/workqueue.h> #include <linux/delay.h> #include <linux/slab.h> #include <linux/module.h> @@ -50,11 +49,13 @@ struct virtio_balloon { struct virtio_device *vdev; struct virtqueue *inflate_vq, *deflate_vq, *stats_vq; - /* Where the ballooning thread waits for config to change. */ - wait_queue_head_t config_change; + /* The balloon servicing is delegated to a freezable workqueue. */ + struct work_struct update_balloon_stats_work; + struct work_struct update_balloon_size_work; - /* The thread servicing the balloon. */ - struct task_struct *thread; + /* Prevent updating balloon when it is being canceled. */ + spinlock_t stop_update_lock; + bool stop_update; /* Waiting for host to ack the pages we released. */ wait_queue_head_t acked; @@ -77,7 +78,6 @@ struct virtio_balloon { u32 pfns[VIRTIO_BALLOON_ARRAY_PFNS_MAX]; /* Memory statistics */ - int need_stats_update; struct virtio_balloon_stat stats[VIRTIO_BALLOON_S_NR]; /* To register callback in oom notifier call chain */ @@ -124,6 +124,7 @@ static void tell_host(struct virtio_balloon *vb, struct virtqueue *vq) /* When host has read buffer, this completes via balloon_ack */ wait_event(vb->acked, virtqueue_get_buf(vq, &len)); + } static void set_page_pfns(u32 pfns[], struct page *page) @@ -136,9 +137,10 @@ static void set_page_pfns(u32 pfns[], struct page *page) pfns[i] = page_to_balloon_pfn(page) + i; } -static void fill_balloon(struct virtio_balloon *vb, size_t num) +static unsigned fill_balloon(struct virtio_balloon *vb, size_t num) { struct balloon_dev_info *vb_dev_info = &vb->vb_dev_info; + unsigned num_allocated_pages; /* We can only do one array worth at a time. */ num = min(num, ARRAY_SIZE(vb->pfns)); @@ -163,10 +165,13 @@ static void fill_balloon(struct virtio_balloon *vb, size_t num) adjust_managed_page_count(page, -1); } + num_allocated_pages = vb->num_pfns; /* Did we get any? */ if (vb->num_pfns != 0) tell_host(vb, vb->inflate_vq); mutex_unlock(&vb->balloon_lock); + + return num_allocated_pages; } static void release_pages_balloon(struct virtio_balloon *vb) @@ -257,14 +262,17 @@ static void update_balloon_stats(struct virtio_balloon *vb) * with a single buffer. From that point forward, all conversations consist of * a hypervisor request (a call to this function) which directs us to refill * the virtqueue with a fresh stats buffer. Since stats collection can sleep, - * we notify our kthread which does the actual work via stats_handle_request(). + * we delegate the job to a freezable workqueue that will do the actual work via + * stats_handle_request(). */ static void stats_request(struct virtqueue *vq) { struct virtio_balloon *vb = vq->vdev->priv; - vb->need_stats_update = 1; - wake_up(&vb->config_change); + spin_lock(&vb->stop_update_lock); + if (!vb->stop_update) + queue_work(system_freezable_wq, &vb->update_balloon_stats_work); + spin_unlock(&vb->stop_update_lock); } static void stats_handle_request(struct virtio_balloon *vb) @@ -273,7 +281,6 @@ static void stats_handle_request(struct virtio_balloon *vb) struct scatterlist sg; unsigned int len; - vb->need_stats_update = 0; update_balloon_stats(vb); vq = vb->stats_vq; @@ -287,8 +294,12 @@ static void stats_handle_request(struct virtio_balloon *vb) static void virtballoon_changed(struct virtio_device *vdev) { struct virtio_balloon *vb = vdev->priv; + unsigned long flags; - wake_up(&vb->config_change); + spin_lock_irqsave(&vb->stop_update_lock, flags); + if (!vb->stop_update) + queue_work(system_freezable_wq, &vb->update_balloon_size_work); + spin_unlock_irqrestore(&vb->stop_update_lock, flags); } static inline s64 towards_target(struct virtio_balloon *vb) @@ -351,43 +362,32 @@ static int virtballoon_oom_notify(struct notifier_block *self, return NOTIFY_OK; } -static int balloon(void *_vballoon) +static void update_balloon_stats_func(struct work_struct *work) { - struct virtio_balloon *vb = _vballoon; - DEFINE_WAIT_FUNC(wait, woken_wake_function); - - set_freezable(); - while (!kthread_should_stop()) { - s64 diff; - - try_to_freeze(); - - add_wait_queue(&vb->config_change, &wait); - for (;;) { - if ((diff = towards_target(vb)) != 0 || - vb->need_stats_update || - kthread_should_stop() || - freezing(current)) - break; - wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); - } - remove_wait_queue(&vb->config_change, &wait); + struct virtio_balloon *vb; - if (vb->need_stats_update) - stats_handle_request(vb); - if (diff > 0) - fill_balloon(vb, diff); - else if (diff < 0) - leak_balloon(vb, -diff); - update_balloon_size(vb); + vb = container_of(work, struct virtio_balloon, + update_balloon_stats_work); + stats_handle_request(vb); +} - /* - * For large balloon changes, we could spend a lot of time - * and always have work to do. Be nice if preempt disabled. - */ - cond_resched(); - } - return 0; +static void update_balloon_size_func(struct work_struct *work) +{ + struct virtio_balloon *vb; + s64 diff; + + vb = container_of(work, struct virtio_balloon, + update_balloon_size_work); + diff = towards_target(vb); + + if (diff > 0) + diff -= fill_balloon(vb, diff); + else if (diff < 0) + diff += leak_balloon(vb, -diff); + update_balloon_size(vb); + + if (diff) + queue_work(system_freezable_wq, work); } static int init_vqs(struct virtio_balloon *vb) @@ -505,12 +505,14 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out; } + INIT_WORK(&vb->update_balloon_stats_work, update_balloon_stats_func); + INIT_WORK(&vb->update_balloon_size_work, update_balloon_size_func); + spin_lock_init(&vb->stop_update_lock); + vb->stop_update = false; vb->num_pages = 0; mutex_init(&vb->balloon_lock); - init_waitqueue_head(&vb->config_change); init_waitqueue_head(&vb->acked); vb->vdev = vdev; - vb->need_stats_update = 0; balloon_devinfo_init(&vb->vb_dev_info); #ifdef CONFIG_BALLOON_COMPACTION @@ -529,16 +531,8 @@ static int virtballoon_probe(struct virtio_device *vdev) virtio_device_ready(vdev); - vb->thread = kthread_run(balloon, vb, "vballoon"); - if (IS_ERR(vb->thread)) { - err = PTR_ERR(vb->thread); - goto out_del_vqs; - } - return 0; -out_del_vqs: - unregister_oom_notifier(&vb->nb); out_oom_notify: vdev->config->del_vqs(vdev); out_free_vb: @@ -565,7 +559,13 @@ static void virtballoon_remove(struct virtio_device *vdev) struct virtio_balloon *vb = vdev->priv; unregister_oom_notifier(&vb->nb); - kthread_stop(vb->thread); + + spin_lock_irq(&vb->stop_update_lock); + vb->stop_update = true; + spin_unlock_irq(&vb->stop_update_lock); + cancel_work_sync(&vb->update_balloon_size_work); + cancel_work_sync(&vb->update_balloon_stats_work); + remove_common(vb); kfree(vb); } @@ -576,10 +576,9 @@ static int virtballoon_freeze(struct virtio_device *vdev) struct virtio_balloon *vb = vdev->priv; /* - * The kthread is already frozen by the PM core before this + * The workqueue is already frozen by the PM core before this * function is called. */ - remove_common(vb); return 0; } @@ -595,7 +594,8 @@ static int virtballoon_restore(struct virtio_device *vdev) virtio_device_ready(vdev); - fill_balloon(vb, towards_target(vb)); + if (towards_target(vb)) + virtballoon_changed(vdev); update_balloon_size(vb); return 0; } diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 745c6ee..48bfea9 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -99,12 +99,6 @@ struct virtio_mmio_vq_info { /* the actual virtqueue */ struct virtqueue *vq; - /* the number of entries in the queue */ - unsigned int num; - - /* the virtual address of the ring queue */ - void *queue; - /* the list node for the virtqueues list */ struct list_head node; }; @@ -322,15 +316,13 @@ static void vm_del_vq(struct virtqueue *vq) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vq->vdev); struct virtio_mmio_vq_info *info = vq->priv; - unsigned long flags, size; + unsigned long flags; unsigned int index = vq->index; spin_lock_irqsave(&vm_dev->lock, flags); list_del(&info->node); spin_unlock_irqrestore(&vm_dev->lock, flags); - vring_del_virtqueue(vq); - /* Select and deactivate the queue */ writel(index, vm_dev->base + VIRTIO_MMIO_QUEUE_SEL); if (vm_dev->version == 1) { @@ -340,8 +332,8 @@ static void vm_del_vq(struct virtqueue *vq) WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } - size = PAGE_ALIGN(vring_size(info->num, VIRTIO_MMIO_VRING_ALIGN)); - free_pages_exact(info->queue, size); + vring_del_virtqueue(vq); + kfree(info); } @@ -356,8 +348,6 @@ static void vm_del_vqs(struct virtio_device *vdev) free_irq(platform_get_irq(vm_dev->pdev, 0), vm_dev); } - - static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name) @@ -365,7 +355,8 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtio_mmio_vq_info *info; struct virtqueue *vq; - unsigned long flags, size; + unsigned long flags; + unsigned int num; int err; if (!name) @@ -388,66 +379,40 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, goto error_kmalloc; } - /* Allocate pages for the queue - start with a queue as big as - * possible (limited by maximum size allowed by device), drop down - * to a minimal size, just big enough to fit descriptor table - * and two rings (which makes it "alignment_size * 2") - */ - info->num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); - - /* If the device reports a 0 entry queue, we won't be able to - * use it to perform I/O, and vring_new_virtqueue() can't create - * empty queues anyway, so don't bother to set up the device. - */ - if (info->num == 0) { + num = readl(vm_dev->base + VIRTIO_MMIO_QUEUE_NUM_MAX); + if (num == 0) { err = -ENOENT; - goto error_alloc_pages; - } - - while (1) { - size = PAGE_ALIGN(vring_size(info->num, - VIRTIO_MMIO_VRING_ALIGN)); - /* Did the last iter shrink the queue below minimum size? */ - if (size < VIRTIO_MMIO_VRING_ALIGN * 2) { - err = -ENOMEM; - goto error_alloc_pages; - } - - info->queue = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO); - if (info->queue) - break; - - info->num /= 2; + goto error_new_virtqueue; } /* Create the vring */ - vq = vring_new_virtqueue(index, info->num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, info->queue, vm_notify, callback, name); + vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, + true, true, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; } /* Activate the queue */ - writel(info->num, vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); + writel(virtqueue_get_vring_size(vq), vm_dev->base + VIRTIO_MMIO_QUEUE_NUM); if (vm_dev->version == 1) { writel(PAGE_SIZE, vm_dev->base + VIRTIO_MMIO_QUEUE_ALIGN); - writel(virt_to_phys(info->queue) >> PAGE_SHIFT, + writel(virtqueue_get_desc_addr(vq) >> PAGE_SHIFT, vm_dev->base + VIRTIO_MMIO_QUEUE_PFN); } else { u64 addr; - addr = virt_to_phys(info->queue); + addr = virtqueue_get_desc_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_DESC_HIGH); - addr = virt_to_phys(virtqueue_get_avail(vq)); + addr = virtqueue_get_avail_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_AVAIL_HIGH); - addr = virt_to_phys(virtqueue_get_used(vq)); + addr = virtqueue_get_used_addr(vq); writel((u32)addr, vm_dev->base + VIRTIO_MMIO_QUEUE_USED_LOW); writel((u32)(addr >> 32), vm_dev->base + VIRTIO_MMIO_QUEUE_USED_HIGH); @@ -471,8 +436,6 @@ error_new_virtqueue: writel(0, vm_dev->base + VIRTIO_MMIO_QUEUE_READY); WARN_ON(readl(vm_dev->base + VIRTIO_MMIO_QUEUE_READY)); } - free_pages_exact(info->queue, size); -error_alloc_pages: kfree(info); error_kmalloc: error_available: diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 2cc2522..2826320 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -35,12 +35,6 @@ struct virtio_pci_vq_info { /* the actual virtqueue */ struct virtqueue *vq; - /* the number of entries in the queue */ - int num; - - /* the virtual address of the ring queue */ - void *queue; - /* the list node for the virtqueues list */ struct list_head node; diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index 48bc979..8c4e617 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -119,7 +119,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, u16 msix_vec) { struct virtqueue *vq; - unsigned long size; u16 num; int err; @@ -131,27 +130,19 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (!num || ioread32(vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN)) return ERR_PTR(-ENOENT); - info->num = num; info->msix_vector = msix_vec; - size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); - info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); - if (info->queue == NULL) + /* create the vring */ + vq = vring_create_virtqueue(index, num, + VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, + true, false, vp_notify, callback, name); + if (!vq) return ERR_PTR(-ENOMEM); /* activate the queue */ - iowrite32(virt_to_phys(info->queue) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, + iowrite32(virtqueue_get_desc_addr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - /* create the vring */ - vq = vring_new_virtqueue(index, info->num, - VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, - true, info->queue, vp_notify, callback, name); - if (!vq) { - err = -ENOMEM; - goto out_activate_queue; - } - vq->priv = (void __force *)vp_dev->ioaddr + VIRTIO_PCI_QUEUE_NOTIFY; if (msix_vec != VIRTIO_MSI_NO_VECTOR) { @@ -159,17 +150,15 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, msix_vec = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); if (msix_vec == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; - goto out_assign; + goto out_deactivate; } } return vq; -out_assign: - vring_del_virtqueue(vq); -out_activate_queue: +out_deactivate: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - free_pages_exact(info->queue, size); + vring_del_virtqueue(vq); return ERR_PTR(err); } @@ -177,7 +166,6 @@ static void del_vq(struct virtio_pci_vq_info *info) { struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - unsigned long size; iowrite16(vq->index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); @@ -188,13 +176,10 @@ static void del_vq(struct virtio_pci_vq_info *info) ioread8(vp_dev->ioaddr + VIRTIO_PCI_ISR); } - vring_del_virtqueue(vq); - /* Select and deactivate the queue */ iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); - size = PAGE_ALIGN(vring_size(info->num, VIRTIO_PCI_VRING_ALIGN)); - free_pages_exact(info->queue, size); + vring_del_virtqueue(vq); } static const struct virtio_config_ops virtio_pci_config_ops = { @@ -227,6 +212,13 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) return -ENODEV; } + rc = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); + if (rc) + rc = dma_set_mask_and_coherent(&pci_dev->dev, + DMA_BIT_MASK(32)); + if (rc) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy"); if (rc) return rc; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 7760fc1..f6f28cc 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -287,31 +287,6 @@ static u16 vp_config_vector(struct virtio_pci_device *vp_dev, u16 vector) return vp_ioread16(&vp_dev->common->msix_config); } -static size_t vring_pci_size(u16 num) -{ - /* We only need a cacheline separation. */ - return PAGE_ALIGN(vring_size(num, SMP_CACHE_BYTES)); -} - -static void *alloc_virtqueue_pages(int *num) -{ - void *pages; - - /* TODO: allocate each queue chunk individually */ - for (; *num && vring_pci_size(*num) > PAGE_SIZE; *num /= 2) { - pages = alloc_pages_exact(vring_pci_size(*num), - GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); - if (pages) - return pages; - } - - if (!*num) - return NULL; - - /* Try to get a single page. You are my only hope! */ - return alloc_pages_exact(vring_pci_size(*num), GFP_KERNEL|__GFP_ZERO); -} - static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, struct virtio_pci_vq_info *info, unsigned index, @@ -343,29 +318,22 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* get offset of notification word for this vq */ off = vp_ioread16(&cfg->queue_notify_off); - info->num = num; info->msix_vector = msix_vec; - info->queue = alloc_virtqueue_pages(&info->num); - if (info->queue == NULL) - return ERR_PTR(-ENOMEM); - /* create the vring */ - vq = vring_new_virtqueue(index, info->num, - SMP_CACHE_BYTES, &vp_dev->vdev, - true, info->queue, vp_notify, callback, name); - if (!vq) { - err = -ENOMEM; - goto err_new_queue; - } + vq = vring_create_virtqueue(index, num, + SMP_CACHE_BYTES, &vp_dev->vdev, + true, true, vp_notify, callback, name); + if (!vq) + return ERR_PTR(-ENOMEM); /* activate the queue */ - vp_iowrite16(num, &cfg->queue_size); - vp_iowrite64_twopart(virt_to_phys(info->queue), + vp_iowrite16(virtqueue_get_vring_size(vq), &cfg->queue_size); + vp_iowrite64_twopart(virtqueue_get_desc_addr(vq), &cfg->queue_desc_lo, &cfg->queue_desc_hi); - vp_iowrite64_twopart(virt_to_phys(virtqueue_get_avail(vq)), + vp_iowrite64_twopart(virtqueue_get_avail_addr(vq), &cfg->queue_avail_lo, &cfg->queue_avail_hi); - vp_iowrite64_twopart(virt_to_phys(virtqueue_get_used(vq)), + vp_iowrite64_twopart(virtqueue_get_used_addr(vq), &cfg->queue_used_lo, &cfg->queue_used_hi); if (vp_dev->notify_base) { @@ -410,8 +378,6 @@ err_assign_vector: pci_iounmap(vp_dev->pci_dev, (void __iomem __force *)vq->priv); err_map_notify: vring_del_virtqueue(vq); -err_new_queue: - free_pages_exact(info->queue, vring_pci_size(info->num)); return ERR_PTR(err); } @@ -456,8 +422,6 @@ static void del_vq(struct virtio_pci_vq_info *info) pci_iounmap(vp_dev->pci_dev, (void __force __iomem *)vq->priv); vring_del_virtqueue(vq); - - free_pages_exact(info->queue, vring_pci_size(info->num)); } static const struct virtio_config_ops virtio_pci_config_nodev_ops = { @@ -641,6 +605,13 @@ int virtio_pci_modern_probe(struct virtio_pci_device *vp_dev) return -EINVAL; } + err = dma_set_mask_and_coherent(&pci_dev->dev, DMA_BIT_MASK(64)); + if (err) + err = dma_set_mask_and_coherent(&pci_dev->dev, + DMA_BIT_MASK(32)); + if (err) + dev_warn(&pci_dev->dev, "Failed to enable 64-bit or 32-bit DMA. Trying to continue, but this might not work.\n"); + /* Device capability is only mandatory for devices that have * device-specific configuration. */ diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index e12e385..5c802d4 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -24,6 +24,8 @@ #include <linux/module.h> #include <linux/hrtimer.h> #include <linux/kmemleak.h> +#include <linux/dma-mapping.h> +#include <xen/xen.h> #ifdef DEBUG /* For development, we want to crash whenever the ring is screwed. */ @@ -54,6 +56,11 @@ #define END_USE(vq) #endif +struct vring_desc_state { + void *data; /* Data for callback. */ + struct vring_desc *indir_desc; /* Indirect descriptor, if any. */ +}; + struct vring_virtqueue { struct virtqueue vq; @@ -89,6 +96,11 @@ struct vring_virtqueue { /* How to notify other side. FIXME: commonalize hcalls! */ bool (*notify)(struct virtqueue *vq); + /* DMA, allocation, and size information */ + bool we_own_ring; + size_t queue_size_in_bytes; + dma_addr_t queue_dma_addr; + #ifdef DEBUG /* They're supposed to lock for us. */ unsigned int in_use; @@ -98,12 +110,120 @@ struct vring_virtqueue { ktime_t last_add_time; #endif - /* Tokens for callbacks. */ - void *data[]; + /* Per-descriptor state. */ + struct vring_desc_state desc_state[]; }; #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq) +/* + * The interaction between virtio and a possible IOMMU is a mess. + * + * On most systems with virtio, physical addresses match bus addresses, + * and it doesn't particularly matter whether we use the DMA API. + * + * On some systems, including Xen and any system with a physical device + * that speaks virtio behind a physical IOMMU, we must use the DMA API + * for virtio DMA to work at all. + * + * On other systems, including SPARC and PPC64, virtio-pci devices are + * enumerated as though they are behind an IOMMU, but the virtio host + * ignores the IOMMU, so we must either pretend that the IOMMU isn't + * there or somehow map everything as the identity. + * + * For the time being, we preserve historic behavior and bypass the DMA + * API. + */ + +static bool vring_use_dma_api(struct virtio_device *vdev) +{ + /* + * In theory, it's possible to have a buggy QEMU-supposed + * emulated Q35 IOMMU and Xen enabled at the same time. On + * such a configuration, virtio has never worked and will + * not work without an even larger kludge. Instead, enable + * the DMA API if we're a Xen guest, which at least allows + * all of the sensible Xen configurations to work correctly. + */ + if (xen_domain()) + return true; + + return false; +} + +/* + * The DMA ops on various arches are rather gnarly right now, and + * making all of the arch DMA ops work on the vring device itself + * is a mess. For now, we use the parent device for DMA ops. + */ +struct device *vring_dma_dev(const struct vring_virtqueue *vq) +{ + return vq->vq.vdev->dev.parent; +} + +/* Map one sg entry. */ +static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, + struct scatterlist *sg, + enum dma_data_direction direction) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return (dma_addr_t)sg_phys(sg); + + /* + * We can't use dma_map_sg, because we don't use scatterlists in + * the way it expects (we don't guarantee that the scatterlist + * will exist for the lifetime of the mapping). + */ + return dma_map_page(vring_dma_dev(vq), + sg_page(sg), sg->offset, sg->length, + direction); +} + +static dma_addr_t vring_map_single(const struct vring_virtqueue *vq, + void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return (dma_addr_t)virt_to_phys(cpu_addr); + + return dma_map_single(vring_dma_dev(vq), + cpu_addr, size, direction); +} + +static void vring_unmap_one(const struct vring_virtqueue *vq, + struct vring_desc *desc) +{ + u16 flags; + + if (!vring_use_dma_api(vq->vq.vdev)) + return; + + flags = virtio16_to_cpu(vq->vq.vdev, desc->flags); + + if (flags & VRING_DESC_F_INDIRECT) { + dma_unmap_single(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } else { + dma_unmap_page(vring_dma_dev(vq), + virtio64_to_cpu(vq->vq.vdev, desc->addr), + virtio32_to_cpu(vq->vq.vdev, desc->len), + (flags & VRING_DESC_F_WRITE) ? + DMA_FROM_DEVICE : DMA_TO_DEVICE); + } +} + +static int vring_mapping_error(const struct vring_virtqueue *vq, + dma_addr_t addr) +{ + if (!vring_use_dma_api(vq->vq.vdev)) + return 0; + + return dma_mapping_error(vring_dma_dev(vq), addr); +} + static struct vring_desc *alloc_indirect(struct virtqueue *_vq, unsigned int total_sg, gfp_t gfp) { @@ -137,7 +257,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, struct vring_virtqueue *vq = to_vvq(_vq); struct scatterlist *sg; struct vring_desc *desc; - unsigned int i, n, avail, descs_used, uninitialized_var(prev); + unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx; int head; bool indirect; @@ -177,21 +297,15 @@ static inline int virtqueue_add(struct virtqueue *_vq, if (desc) { /* Use a single buffer which doesn't continue */ - vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); - vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, virt_to_phys(desc)); - /* avoid kmemleak false positive (hidden by virt_to_phys) */ - kmemleak_ignore(desc); - vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); - + indirect = true; /* Set up rest to use this indirect table. */ i = 0; descs_used = 1; - indirect = true; } else { + indirect = false; desc = vq->vring.desc; i = head; descs_used = total_sg; - indirect = false; } if (vq->vq.num_free < descs_used) { @@ -206,13 +320,14 @@ static inline int virtqueue_add(struct virtqueue *_vq, return -ENOSPC; } - /* We're about to use some buffers from the free list. */ - vq->vq.num_free -= descs_used; - for (n = 0; n < out_sgs; n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT); - desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; i = virtio16_to_cpu(_vq->vdev, desc[i].next); @@ -220,8 +335,12 @@ static inline int virtqueue_add(struct virtqueue *_vq, } for (; n < (out_sgs + in_sgs); n++) { for (sg = sgs[n]; sg; sg = sg_next(sg)) { + dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE); - desc[i].addr = cpu_to_virtio64(_vq->vdev, sg_phys(sg)); + desc[i].addr = cpu_to_virtio64(_vq->vdev, addr); desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length); prev = i; i = virtio16_to_cpu(_vq->vdev, desc[i].next); @@ -230,14 +349,33 @@ static inline int virtqueue_add(struct virtqueue *_vq, /* Last one doesn't continue. */ desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT); + if (indirect) { + /* Now that the indirect table is filled in, map it. */ + dma_addr_t addr = vring_map_single( + vq, desc, total_sg * sizeof(struct vring_desc), + DMA_TO_DEVICE); + if (vring_mapping_error(vq, addr)) + goto unmap_release; + + vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT); + vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr); + + vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc)); + } + + /* We're using some buffers from the free list. */ + vq->vq.num_free -= descs_used; + /* Update free pointer */ if (indirect) vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next); else vq->free_head = i; - /* Set token. */ - vq->data[head] = data; + /* Store token and indirect buffer state. */ + vq->desc_state[head].data = data; + if (indirect) + vq->desc_state[head].indir_desc = desc; /* Put entry in available array (but don't update avail->idx until they * do sync). */ @@ -260,6 +398,24 @@ static inline int virtqueue_add(struct virtqueue *_vq, virtqueue_kick(_vq); return 0; + +unmap_release: + err_idx = i; + i = head; + + for (n = 0; n < total_sg; n++) { + if (i == err_idx) + break; + vring_unmap_one(vq, &desc[i]); + i = vq->vring.desc[i].next; + } + + vq->vq.num_free += total_sg; + + if (indirect) + kfree(desc); + + return -EIO; } /** @@ -430,27 +586,43 @@ EXPORT_SYMBOL_GPL(virtqueue_kick); static void detach_buf(struct vring_virtqueue *vq, unsigned int head) { - unsigned int i; + unsigned int i, j; + u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); /* Clear data ptr. */ - vq->data[head] = NULL; + vq->desc_state[head].data = NULL; - /* Put back on free list: find end */ + /* Put back on free list: unmap first-level descriptors and find end */ i = head; - /* Free the indirect table */ - if (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)) - kfree(phys_to_virt(virtio64_to_cpu(vq->vq.vdev, vq->vring.desc[i].addr))); - - while (vq->vring.desc[i].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT)) { + while (vq->vring.desc[i].flags & nextflag) { + vring_unmap_one(vq, &vq->vring.desc[i]); i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next); vq->vq.num_free++; } + vring_unmap_one(vq, &vq->vring.desc[i]); vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head); vq->free_head = head; + /* Plus final descriptor */ vq->vq.num_free++; + + /* Free the indirect table, if any, now that it's unmapped. */ + if (vq->desc_state[head].indir_desc) { + struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; + u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); + + BUG_ON(!(vq->vring.desc[head].flags & + cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); + BUG_ON(len == 0 || len % sizeof(struct vring_desc)); + + for (j = 0; j < len / sizeof(struct vring_desc); j++) + vring_unmap_one(vq, &indir_desc[j]); + + kfree(vq->desc_state[head].indir_desc); + vq->desc_state[head].indir_desc = NULL; + } } static inline bool more_used(const struct vring_virtqueue *vq) @@ -505,13 +677,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) BAD_RING(vq, "id %u out of range\n", i); return NULL; } - if (unlikely(!vq->data[i])) { + if (unlikely(!vq->desc_state[i].data)) { BAD_RING(vq, "id %u is not a head!\n", i); return NULL; } /* detach_buf clears data, so grab it now. */ - ret = vq->data[i]; + ret = vq->desc_state[i].data; detach_buf(vq, i); vq->last_used_idx++; /* If we expect an interrupt for the next entry, tell host @@ -685,10 +857,10 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) START_USE(vq); for (i = 0; i < vq->vring.num; i++) { - if (!vq->data[i]) + if (!vq->desc_state[i].data) continue; /* detach_buf clears data, so grab it now. */ - buf = vq->data[i]; + buf = vq->desc_state[i].data; detach_buf(vq, i); vq->avail_idx_shadow--; vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); @@ -723,35 +895,31 @@ irqreturn_t vring_interrupt(int irq, void *_vq) } EXPORT_SYMBOL_GPL(vring_interrupt); -struct virtqueue *vring_new_virtqueue(unsigned int index, - unsigned int num, - unsigned int vring_align, - struct virtio_device *vdev, - bool weak_barriers, - void *pages, - bool (*notify)(struct virtqueue *), - void (*callback)(struct virtqueue *), - const char *name) +struct virtqueue *__vring_new_virtqueue(unsigned int index, + struct vring vring, + struct virtio_device *vdev, + bool weak_barriers, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) { - struct vring_virtqueue *vq; unsigned int i; + struct vring_virtqueue *vq; - /* We assume num is a power of 2. */ - if (num & (num - 1)) { - dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); - return NULL; - } - - vq = kmalloc(sizeof(*vq) + sizeof(void *)*num, GFP_KERNEL); + vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state), + GFP_KERNEL); if (!vq) return NULL; - vring_init(&vq->vring, num, pages, vring_align); + vq->vring = vring; vq->vq.callback = callback; vq->vq.vdev = vdev; vq->vq.name = name; - vq->vq.num_free = num; + vq->vq.num_free = vring.num; vq->vq.index = index; + vq->we_own_ring = false; + vq->queue_dma_addr = 0; + vq->queue_size_in_bytes = 0; vq->notify = notify; vq->weak_barriers = weak_barriers; vq->broken = false; @@ -776,20 +944,145 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, /* Put everything in free lists. */ vq->free_head = 0; - for (i = 0; i < num-1; i++) { + for (i = 0; i < vring.num-1; i++) vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1); - vq->data[i] = NULL; - } - vq->data[i] = NULL; + memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state)); return &vq->vq; } +EXPORT_SYMBOL_GPL(__vring_new_virtqueue); + +static void *vring_alloc_queue(struct virtio_device *vdev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + if (vring_use_dma_api(vdev)) { + return dma_alloc_coherent(vdev->dev.parent, size, + dma_handle, flag); + } else { + void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag); + if (queue) { + phys_addr_t phys_addr = virt_to_phys(queue); + *dma_handle = (dma_addr_t)phys_addr; + + /* + * Sanity check: make sure we dind't truncate + * the address. The only arches I can find that + * have 64-bit phys_addr_t but 32-bit dma_addr_t + * are certain non-highmem MIPS and x86 + * configurations, but these configurations + * should never allocate physical pages above 32 + * bits, so this is fine. Just in case, throw a + * warning and abort if we end up with an + * unrepresentable address. + */ + if (WARN_ON_ONCE(*dma_handle != phys_addr)) { + free_pages_exact(queue, PAGE_ALIGN(size)); + return NULL; + } + } + return queue; + } +} + +static void vring_free_queue(struct virtio_device *vdev, size_t size, + void *queue, dma_addr_t dma_handle) +{ + if (vring_use_dma_api(vdev)) { + dma_free_coherent(vdev->dev.parent, size, queue, dma_handle); + } else { + free_pages_exact(queue, PAGE_ALIGN(size)); + } +} + +struct virtqueue *vring_create_virtqueue( + unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + bool may_reduce_num, + bool (*notify)(struct virtqueue *), + void (*callback)(struct virtqueue *), + const char *name) +{ + struct virtqueue *vq; + void *queue; + dma_addr_t dma_addr; + size_t queue_size_in_bytes; + struct vring vring; + + /* We assume num is a power of 2. */ + if (num & (num - 1)) { + dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num); + return NULL; + } + + /* TODO: allocate each queue chunk individually */ + for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) { + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), + &dma_addr, + GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO); + if (queue) + break; + } + + if (!num) + return NULL; + + if (!queue) { + /* Try to get a single page. You are my only hope! */ + queue = vring_alloc_queue(vdev, vring_size(num, vring_align), + &dma_addr, GFP_KERNEL|__GFP_ZERO); + } + if (!queue) + return NULL; + + queue_size_in_bytes = vring_size(num, vring_align); + vring_init(&vring, num, queue, vring_align); + + vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, + notify, callback, name); + if (!vq) { + vring_free_queue(vdev, queue_size_in_bytes, queue, + dma_addr); + return NULL; + } + + to_vvq(vq)->queue_dma_addr = dma_addr; + to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes; + to_vvq(vq)->we_own_ring = true; + + return vq; +} +EXPORT_SYMBOL_GPL(vring_create_virtqueue); + +struct virtqueue *vring_new_virtqueue(unsigned int index, + unsigned int num, + unsigned int vring_align, + struct virtio_device *vdev, + bool weak_barriers, + void *pages, + bool (*notify)(struct virtqueue *vq), + void (*callback)(struct virtqueue *vq), + const char *name) +{ + struct vring vring; + vring_init(&vring, num, pages, vring_align); + return __vring_new_virtqueue(index, vring, vdev, weak_barriers, + notify, callback, name); +} EXPORT_SYMBOL_GPL(vring_new_virtqueue); -void vring_del_virtqueue(struct virtqueue *vq) +void vring_del_virtqueue(struct virtqueue *_vq) { - list_del(&vq->list); - kfree(to_vvq(vq)); + struct vring_virtqueue *vq = to_vvq(_vq); + + if (vq->we_own_ring) { + vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes, + vq->vring.desc, vq->queue_dma_addr); + } + list_del(&_vq->list); + kfree(vq); } EXPORT_SYMBOL_GPL(vring_del_virtqueue); @@ -853,20 +1146,42 @@ void virtio_break_device(struct virtio_device *dev) } EXPORT_SYMBOL_GPL(virtio_break_device); -void *virtqueue_get_avail(struct virtqueue *_vq) +dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->vring.avail; + BUG_ON(!vq->we_own_ring); + + return vq->queue_dma_addr; } -EXPORT_SYMBOL_GPL(virtqueue_get_avail); +EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr); -void *virtqueue_get_used(struct virtqueue *_vq) +dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); - return vq->vring.used; + BUG_ON(!vq->we_own_ring); + + return vq->queue_dma_addr + + ((char *)vq->vring.avail - (char *)vq->vring.desc); +} +EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr); + +dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + + BUG_ON(!vq->we_own_ring); + + return vq->queue_dma_addr + + ((char *)vq->vring.used - (char *)vq->vring.desc); +} +EXPORT_SYMBOL_GPL(virtqueue_get_used_addr); + +const struct vring *virtqueue_get_vring(struct virtqueue *vq) +{ + return &to_vvq(vq)->vring; } -EXPORT_SYMBOL_GPL(virtqueue_get_used); +EXPORT_SYMBOL_GPL(virtqueue_get_vring); MODULE_LICENSE("GPL"); |