diff options
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 21 |
-rw-r--r-- | drivers/vfio/pci/vfio_pci_intrs.c | 60 |
-rw-r--r-- | drivers/vfio/pci/vfio_pci_private.h | 1 |
-rw-r--r-- | drivers/vfio/vfio.c | 119 |
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 80 |
-rw-r--r-- | include/linux/vfio.h | 2 |
-rw-r--r-- | include/uapi/linux/vfio.h | 1 |
7 files changed, 242 insertions, 42 deletions
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 7cc0122..f8a1863 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -239,9 +239,12 @@ static int vfio_pci_get_irq_count(struct vfio_pci_device *vdev, int irq_type) return (flags & PCI_MSIX_FLAGS_QSIZE) + 1; } - } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) + } else if (irq_type == VFIO_PCI_ERR_IRQ_INDEX) { if (pci_is_pcie(vdev->pdev)) return 1; + } else if (irq_type == VFIO_PCI_REQ_IRQ_INDEX) { + return 1; + } return 0; } @@ -464,6 +467,7 @@ static long vfio_pci_ioctl(void *device_data, switch (info.index) { case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSIX_IRQ_INDEX: + case VFIO_PCI_REQ_IRQ_INDEX: break; case VFIO_PCI_ERR_IRQ_INDEX: if (pci_is_pcie(vdev->pdev)) @@ -828,6 +832,20 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) req_len, vma->vm_page_prot); } +static void vfio_pci_request(void *device_data, unsigned int count) +{ + struct vfio_pci_device *vdev = device_data; + + mutex_lock(&vdev->igate); + + if (vdev->req_trigger) { + dev_dbg(&vdev->pdev->dev, "Requesting device from user\n"); + eventfd_signal(vdev->req_trigger, 1); + } + + mutex_unlock(&vdev->igate); +} + static const struct vfio_device_ops vfio_pci_ops = { .name = "vfio-pci", .open = vfio_pci_open, @@ -836,6 +854,7 @@ static const struct vfio_device_ops vfio_pci_ops = { .read = vfio_pci_read, .write = vfio_pci_write, .mmap = vfio_pci_mmap, + .request = vfio_pci_request, }; static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index e8d695b..f88bfdf 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -763,46 +763,70 @@ static int vfio_pci_set_msi_trigger(struct vfio_pci_device *vdev, return 0; } -static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, - unsigned index, unsigned start, - unsigned count, uint32_t flags, 
void *data) +static int vfio_pci_set_ctx_trigger_single(struct eventfd_ctx **ctx, + uint32_t flags, void *data) { int32_t fd = *(int32_t *)data; - if ((index != VFIO_PCI_ERR_IRQ_INDEX) || - !(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) + if (!(flags & VFIO_IRQ_SET_DATA_TYPE_MASK)) return -EINVAL; /* DATA_NONE/DATA_BOOL enables loopback testing */ if (flags & VFIO_IRQ_SET_DATA_NONE) { - if (vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); + if (*ctx) + eventfd_signal(*ctx, 1); return 0; } else if (flags & VFIO_IRQ_SET_DATA_BOOL) { uint8_t trigger = *(uint8_t *)data; - if (trigger && vdev->err_trigger) - eventfd_signal(vdev->err_trigger, 1); + if (trigger && *ctx) + eventfd_signal(*ctx, 1); return 0; } /* Handle SET_DATA_EVENTFD */ if (fd == -1) { - if (vdev->err_trigger) - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = NULL; + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = NULL; return 0; } else if (fd >= 0) { struct eventfd_ctx *efdctx; efdctx = eventfd_ctx_fdget(fd); if (IS_ERR(efdctx)) return PTR_ERR(efdctx); - if (vdev->err_trigger) - eventfd_ctx_put(vdev->err_trigger); - vdev->err_trigger = efdctx; + if (*ctx) + eventfd_ctx_put(*ctx); + *ctx = efdctx; return 0; } else return -EINVAL; } + +static int vfio_pci_set_err_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_ERR_IRQ_INDEX) + return -EINVAL; + + /* + * We should sanitize start & count, but that wasn't caught + * originally, so this IRQ index must forever ignore them :-( + */ + + return vfio_pci_set_ctx_trigger_single(&vdev->err_trigger, flags, data); +} + +static int vfio_pci_set_req_trigger(struct vfio_pci_device *vdev, + unsigned index, unsigned start, + unsigned count, uint32_t flags, void *data) +{ + if (index != VFIO_PCI_REQ_IRQ_INDEX || start != 0 || count != 1) + return -EINVAL; + + return vfio_pci_set_ctx_trigger_single(&vdev->req_trigger, flags, data); +} + int vfio_pci_set_irqs_ioctl(struct 
vfio_pci_device *vdev, uint32_t flags, unsigned index, unsigned start, unsigned count, void *data) @@ -844,6 +868,12 @@ int vfio_pci_set_irqs_ioctl(struct vfio_pci_device *vdev, uint32_t flags, func = vfio_pci_set_err_trigger; break; } + case VFIO_PCI_REQ_IRQ_INDEX: + switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) { + case VFIO_IRQ_SET_ACTION_TRIGGER: + func = vfio_pci_set_req_trigger; + break; + } } if (!func) diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 671c17a..c9f9b32 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -58,6 +58,7 @@ struct vfio_pci_device { struct pci_saved_state *pci_saved_state; int refcnt; struct eventfd_ctx *err_trigger; + struct eventfd_ctx *req_trigger; }; #define is_intx(vdev) (vdev->irq_type == VFIO_PCI_INTX_IRQ_INDEX) diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index f018d8d..4cde855 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -63,6 +63,11 @@ struct vfio_container { void *iommu_data; }; +struct vfio_unbound_dev { + struct device *dev; + struct list_head unbound_next; +}; + struct vfio_group { struct kref kref; int minor; @@ -75,6 +80,8 @@ struct vfio_group { struct notifier_block nb; struct list_head vfio_next; struct list_head container_next; + struct list_head unbound_list; + struct mutex unbound_lock; atomic_t opened; }; @@ -204,6 +211,8 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) kref_init(&group->kref); INIT_LIST_HEAD(&group->device_list); mutex_init(&group->device_lock); + INIT_LIST_HEAD(&group->unbound_list); + mutex_init(&group->unbound_lock); atomic_set(&group->container_users, 0); atomic_set(&group->opened, 0); group->iommu_group = iommu_group; @@ -264,13 +273,22 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) static void vfio_group_release(struct kref *kref) { struct vfio_group *group = container_of(kref, struct vfio_group, kref); + struct 
vfio_unbound_dev *unbound, *tmp; + struct iommu_group *iommu_group = group->iommu_group; WARN_ON(!list_empty(&group->device_list)); + list_for_each_entry_safe(unbound, tmp, + &group->unbound_list, unbound_next) { + list_del(&unbound->unbound_next); + kfree(unbound); + } + device_destroy(vfio.class, MKDEV(MAJOR(vfio.group_devt), group->minor)); list_del(&group->vfio_next); vfio_free_group_minor(group->minor); vfio_group_unlock_and_free(group); + iommu_group_put(iommu_group); } static void vfio_group_put(struct vfio_group *group) @@ -440,17 +458,36 @@ static bool vfio_whitelisted_driver(struct device_driver *drv) } /* - * A vfio group is viable for use by userspace if all devices are either - * driver-less or bound to a vfio or whitelisted driver. We test the - * latter by the existence of a struct vfio_device matching the dev. + * A vfio group is viable for use by userspace if all devices are in + * one of the following states: + * - driver-less + * - bound to a vfio driver + * - bound to a whitelisted driver + * + * We use two methods to determine whether a device is bound to a vfio + * driver. The first is to test whether the device exists in the vfio + * group. The second is to test if the device exists on the group + * unbound_list, indicating it's in the middle of transitioning from + * a vfio driver to driver-less. 
*/ static int vfio_dev_viable(struct device *dev, void *data) { struct vfio_group *group = data; struct vfio_device *device; struct device_driver *drv = ACCESS_ONCE(dev->driver); + struct vfio_unbound_dev *unbound; + int ret = -EINVAL; - if (!drv || vfio_whitelisted_driver(drv)) + mutex_lock(&group->unbound_lock); + list_for_each_entry(unbound, &group->unbound_list, unbound_next) { + if (dev == unbound->dev) { + ret = 0; + break; + } + } + mutex_unlock(&group->unbound_lock); + + if (!ret || !drv || vfio_whitelisted_driver(drv)) return 0; device = vfio_group_get_device(group, dev); @@ -459,7 +496,7 @@ static int vfio_dev_viable(struct device *dev, void *data) return 0; } - return -EINVAL; + return ret; } /** @@ -501,6 +538,7 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, { struct vfio_group *group = container_of(nb, struct vfio_group, nb); struct device *dev = data; + struct vfio_unbound_dev *unbound; /* * Need to go through a group_lock lookup to get a reference or we @@ -550,6 +588,17 @@ static int vfio_iommu_group_notifier(struct notifier_block *nb, * stop the system to maintain isolation. At a minimum, we'd * want a toggle to disable driver auto probe for this device. */ + + mutex_lock(&group->unbound_lock); + list_for_each_entry(unbound, + &group->unbound_list, unbound_next) { + if (dev == unbound->dev) { + list_del(&unbound->unbound_next); + kfree(unbound); + break; + } + } + mutex_unlock(&group->unbound_lock); break; } @@ -578,6 +627,12 @@ int vfio_add_group_dev(struct device *dev, iommu_group_put(iommu_group); return PTR_ERR(group); } + } else { + /* + * A found vfio_group already holds a reference to the + * iommu_group. A created vfio_group keeps the reference. 
+ */ + iommu_group_put(iommu_group); } device = vfio_group_get_device(group, dev); @@ -586,21 +641,19 @@ int vfio_add_group_dev(struct device *dev, dev_name(dev), iommu_group_id(iommu_group)); vfio_device_put(device); vfio_group_put(group); - iommu_group_put(iommu_group); return -EBUSY; } device = vfio_group_create_device(group, dev, ops, device_data); if (IS_ERR(device)) { vfio_group_put(group); - iommu_group_put(iommu_group); return PTR_ERR(device); } /* - * Added device holds reference to iommu_group and vfio_device - * (which in turn holds reference to vfio_group). Drop extra - * group reference used while acquiring device. + * Drop all but the vfio_device reference. The vfio_device holds + * a reference to the vfio_group, which holds a reference to the + * iommu_group. */ vfio_group_put(group); @@ -655,8 +708,9 @@ void *vfio_del_group_dev(struct device *dev) { struct vfio_device *device = dev_get_drvdata(dev); struct vfio_group *group = device->group; - struct iommu_group *iommu_group = group->iommu_group; void *device_data = device->device_data; + struct vfio_unbound_dev *unbound; + unsigned int i = 0; /* * The group exists so long as we have a device reference. Get @@ -664,14 +718,49 @@ void *vfio_del_group_dev(struct device *dev) */ vfio_group_get(group); + /* + * When the device is removed from the group, the group suddenly + * becomes non-viable; the device has a driver (until the unbind + * completes), but it's not present in the group. This is bad news + * for any external users that need to re-acquire a group reference + * in order to match and release their existing reference. To + * solve this, we track such devices on the unbound_list to bridge + * the gap until they're fully unbound. 
+ */ + unbound = kzalloc(sizeof(*unbound), GFP_KERNEL); + if (unbound) { + unbound->dev = dev; + mutex_lock(&group->unbound_lock); + list_add(&unbound->unbound_next, &group->unbound_list); + mutex_unlock(&group->unbound_lock); + } + WARN_ON(!unbound); + vfio_device_put(device); - /* TODO send a signal to encourage this to be released */ - wait_event(vfio.release_q, !vfio_dev_present(group, dev)); + /* + * If the device is still present in the group after the above + * 'put', then it is in use and we need to request it from the + * bus driver. The driver may in turn need to request the + * device from the user. We send the request on an arbitrary + * interval with counter to allow the driver to take escalating + * measures to release the device if it has the ability to do so. + */ + do { + device = vfio_group_get_device(group, dev); + if (!device) + break; - vfio_group_put(group); + if (device->ops->request) + device->ops->request(device_data, i++); - iommu_group_put(iommu_group); + vfio_device_put(device); + + } while (wait_event_interruptible_timeout(vfio.release_q, + !vfio_dev_present(group, dev), + HZ * 10) <= 0); + + vfio_group_put(group); return device_data; } diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 4a9d666..57d8c37 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -66,6 +66,7 @@ struct vfio_domain { struct list_head next; struct list_head group_list; int prot; /* IOMMU_CACHE */ + bool fgsp; /* Fine-grained super pages */ }; struct vfio_dma { @@ -264,6 +265,7 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; bool lock_cap = capable(CAP_IPC_LOCK); long ret, i; + bool rsvd; if (!current->mm) return -ENODEV; @@ -272,10 +274,9 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, if (ret) return ret; - if (is_invalid_reserved_pfn(*pfn_base)) - return 1; + rsvd = is_invalid_reserved_pfn(*pfn_base); - if 
(!lock_cap && current->mm->locked_vm + 1 > limit) { + if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { put_pfn(*pfn_base, prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); @@ -283,7 +284,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, } if (unlikely(disable_hugepages)) { - vfio_lock_acct(1); + if (!rsvd) + vfio_lock_acct(1); return 1; } @@ -295,12 +297,14 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, if (ret) break; - if (pfn != *pfn_base + i || is_invalid_reserved_pfn(pfn)) { + if (pfn != *pfn_base + i || + rsvd != is_invalid_reserved_pfn(pfn)) { put_pfn(pfn, prot); break; } - if (!lock_cap && current->mm->locked_vm + i + 1 > limit) { + if (!rsvd && !lock_cap && + current->mm->locked_vm + i + 1 > limit) { put_pfn(pfn, prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); @@ -308,7 +312,8 @@ static long vfio_pin_pages(unsigned long vaddr, long npage, } } - vfio_lock_acct(i); + if (!rsvd) + vfio_lock_acct(i); return i; } @@ -346,12 +351,14 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) domain = d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); - list_for_each_entry_continue(d, &iommu->domain_list, next) + list_for_each_entry_continue(d, &iommu->domain_list, next) { iommu_unmap(d->domain, dma->iova, dma->size); + cond_resched(); + } while (iova < end) { - size_t unmapped; - phys_addr_t phys; + size_t unmapped, len; + phys_addr_t phys, next; phys = iommu_iova_to_phys(domain->domain, iova); if (WARN_ON(!phys)) { @@ -359,7 +366,19 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) continue; } - unmapped = iommu_unmap(domain->domain, iova, PAGE_SIZE); + /* + * To optimize for fewer iommu_unmap() calls, each of which + * may require hardware cache flushing, try to find the + * largest contiguous physical memory chunk to unmap. 
+ */ + for (len = PAGE_SIZE; + !domain->fgsp && iova + len < end; len += PAGE_SIZE) { + next = iommu_iova_to_phys(domain->domain, iova + len); + if (next != phys + len) + break; + } + + unmapped = iommu_unmap(domain->domain, iova, len); if (WARN_ON(!unmapped)) break; @@ -367,6 +386,8 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) unmapped >> PAGE_SHIFT, dma->prot, false); iova += unmapped; + + cond_resched(); } vfio_lock_acct(-unlocked); @@ -511,6 +532,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova, map_try_harder(d, iova, pfn, npage, prot)) goto unwind; } + + cond_resched(); } return 0; @@ -665,6 +688,39 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, return 0; } +/* + * We change our unmap behavior slightly depending on whether the IOMMU + * supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage + * for practically any contiguous power-of-two mapping we give it. This means + * we don't need to look for contiguous chunks ourselves to make unmapping + * more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d + * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks + * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when + * hugetlbfs is in use. 
+ */ +static void vfio_test_domain_fgsp(struct vfio_domain *domain) +{ + struct page *pages; + int ret, order = get_order(PAGE_SIZE * 2); + + pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!pages) + return; + + ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2, + IOMMU_READ | IOMMU_WRITE | domain->prot); + if (!ret) { + size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE); + + if (unmapped == PAGE_SIZE) + iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE); + else + domain->fgsp = true; + } + + __free_pages(pages, order); +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -758,6 +814,8 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, } } + vfio_test_domain_fgsp(domain); + /* replay mappings on new domains */ ret = vfio_iommu_replay(iommu, domain); if (ret) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index d320411..2d67b89 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -26,6 +26,7 @@ * @ioctl: Perform ioctl(2) on device file descriptor, supporting VFIO_DEVICE_* * operations documented below * @mmap: Perform mmap(2) on a region of the device file descriptor + * @request: Request for the bus driver to release the device */ struct vfio_device_ops { char *name; @@ -38,6 +39,7 @@ struct vfio_device_ops { long (*ioctl)(void *device_data, unsigned int cmd, unsigned long arg); int (*mmap)(void *device_data, struct vm_area_struct *vma); + void (*request)(void *device_data, unsigned int count); }; extern int vfio_add_group_dev(struct device *dev, diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 29715d2..82889c3 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -333,6 +333,7 @@ enum { VFIO_PCI_MSI_IRQ_INDEX, VFIO_PCI_MSIX_IRQ_INDEX, VFIO_PCI_ERR_IRQ_INDEX, + VFIO_PCI_REQ_IRQ_INDEX, VFIO_PCI_NUM_IRQS }; |