diff options
Diffstat (limited to 'drivers/vfio')
-rw-r--r-- | drivers/vfio/Kconfig | 1 | ||||
-rw-r--r-- | drivers/vfio/Makefile | 1 | ||||
-rw-r--r-- | drivers/vfio/mdev/Kconfig | 17 | ||||
-rw-r--r-- | drivers/vfio/mdev/Makefile | 5 | ||||
-rw-r--r-- | drivers/vfio/mdev/mdev_core.c | 385 | ||||
-rw-r--r-- | drivers/vfio/mdev/mdev_driver.c | 119 | ||||
-rw-r--r-- | drivers/vfio/mdev/mdev_private.h | 41 | ||||
-rw-r--r-- | drivers/vfio/mdev/mdev_sysfs.c | 286 | ||||
-rw-r--r-- | drivers/vfio/mdev/vfio_mdev.c | 148 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 83 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 12 | ||||
-rw-r--r-- | drivers/vfio/platform/vfio_platform_common.c | 31 | ||||
-rw-r--r-- | drivers/vfio/vfio.c | 461 | ||||
-rw-r--r-- | drivers/vfio/vfio_iommu_type1.c | 885 |
14 files changed, 2209 insertions, 266 deletions
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig index da6e2ce..23eced0 100644 --- a/drivers/vfio/Kconfig +++ b/drivers/vfio/Kconfig @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU source "drivers/vfio/pci/Kconfig" source "drivers/vfio/platform/Kconfig" +source "drivers/vfio/mdev/Kconfig" source "virt/lib/Kconfig" diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile index 7b8a31f..4a23c13 100644 --- a/drivers/vfio/Makefile +++ b/drivers/vfio/Makefile @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o obj-$(CONFIG_VFIO_PCI) += pci/ obj-$(CONFIG_VFIO_PLATFORM) += platform/ +obj-$(CONFIG_VFIO_MDEV) += mdev/ diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig new file mode 100644 index 0000000..14fdb10 --- /dev/null +++ b/drivers/vfio/mdev/Kconfig @@ -0,0 +1,17 @@ + +config VFIO_MDEV + tristate "Mediated device driver framework" + depends on VFIO + default n + help + Provides a framework to virtualize devices. + See Documentation/vfio-mediated-device.txt for more details. + + If you don't know what do here, say N. + +config VFIO_MDEV_DEVICE + tristate "VFIO driver for Mediated devices" + depends on VFIO && VFIO_MDEV + default n + help + VFIO based driver for Mediated devices. diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile new file mode 100644 index 0000000..fa2d5ea --- /dev/null +++ b/drivers/vfio/mdev/Makefile @@ -0,0 +1,5 @@ + +mdev-y := mdev_core.o mdev_sysfs.o mdev_driver.o + +obj-$(CONFIG_VFIO_MDEV) += mdev.o +obj-$(CONFIG_VFIO_MDEV_DEVICE) += vfio_mdev.o diff --git a/drivers/vfio/mdev/mdev_core.c b/drivers/vfio/mdev/mdev_core.c new file mode 100644 index 0000000..be1ee89 --- /dev/null +++ b/drivers/vfio/mdev/mdev_core.c @@ -0,0 +1,385 @@ +/* + * Mediated device Core Driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/sysfs.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "Mediated device Core Driver" + +static LIST_HEAD(parent_list); +static DEFINE_MUTEX(parent_list_lock); +static struct class_compat *mdev_bus_compat_class; + +static int _find_mdev_device(struct device *dev, void *data) +{ + struct mdev_device *mdev; + + if (!dev_is_mdev(dev)) + return 0; + + mdev = to_mdev_device(dev); + + if (uuid_le_cmp(mdev->uuid, *(uuid_le *)data) == 0) + return 1; + + return 0; +} + +static bool mdev_device_exist(struct parent_device *parent, uuid_le uuid) +{ + struct device *dev; + + dev = device_find_child(parent->dev, &uuid, _find_mdev_device); + if (dev) { + put_device(dev); + return true; + } + + return false; +} + +/* Should be called holding parent_list_lock */ +static struct parent_device *__find_parent_device(struct device *dev) +{ + struct parent_device *parent; + + list_for_each_entry(parent, &parent_list, next) { + if (parent->dev == dev) + return parent; + } + return NULL; +} + +static void mdev_release_parent(struct kref *kref) +{ + struct parent_device *parent = container_of(kref, struct parent_device, + ref); + struct device *dev = parent->dev; + + kfree(parent); + put_device(dev); +} + +static +inline struct parent_device *mdev_get_parent(struct parent_device *parent) +{ + if (parent) + kref_get(&parent->ref); + + return parent; +} + +static inline void mdev_put_parent(struct parent_device *parent) +{ + if (parent) + kref_put(&parent->ref, mdev_release_parent); +} + +static int mdev_device_create_ops(struct kobject *kobj, + struct mdev_device *mdev) +{ + struct parent_device *parent = mdev->parent; + int ret; + + ret = parent->ops->create(kobj, mdev); + if (ret) + return ret; + + ret = sysfs_create_groups(&mdev->dev.kobj, + parent->ops->mdev_attr_groups); + if (ret) + parent->ops->remove(mdev); + + return ret; +} + +/* + * mdev_device_remove_ops gets called from sysfs's 'remove' and when parent + * device is being unregistered from mdev device framework. + * - 'force_remove' is set to 'false' when called from sysfs's 'remove' which + * indicates that if the mdev device is active, used by VMM or userspace + * application, vendor driver could return error then don't remove the device. + * - 'force_remove' is set to 'true' when called from mdev_unregister_device() + * which indicate that parent device is being removed from mdev device + * framework so remove mdev device forcefully. + */ +static int mdev_device_remove_ops(struct mdev_device *mdev, bool force_remove) +{ + struct parent_device *parent = mdev->parent; + int ret; + + /* + * Vendor driver can return error if VMM or userspace application is + * using this mdev device. + */ + ret = parent->ops->remove(mdev); + if (ret && !force_remove) + return -EBUSY; + + sysfs_remove_groups(&mdev->dev.kobj, parent->ops->mdev_attr_groups); + return 0; +} + +static int mdev_device_remove_cb(struct device *dev, void *data) +{ + if (!dev_is_mdev(dev)) + return 0; + + return mdev_device_remove(dev, data ? *(bool *)data : true); +} + +/* + * mdev_register_device : Register a device + * @dev: device structure representing parent device. + * @ops: Parent device operation structure to be registered. + * + * Add device to list of registered parent devices. + * Returns a negative value on error, otherwise 0. + */ +int mdev_register_device(struct device *dev, const struct parent_ops *ops) +{ + int ret; + struct parent_device *parent; + + /* check for mandatory ops */ + if (!ops || !ops->create || !ops->remove || !ops->supported_type_groups) + return -EINVAL; + + dev = get_device(dev); + if (!dev) + return -EINVAL; + + mutex_lock(&parent_list_lock); + + /* Check for duplicate */ + parent = __find_parent_device(dev); + if (parent) { + ret = -EEXIST; + goto add_dev_err; + } + + parent = kzalloc(sizeof(*parent), GFP_KERNEL); + if (!parent) { + ret = -ENOMEM; + goto add_dev_err; + } + + kref_init(&parent->ref); + mutex_init(&parent->lock); + + parent->dev = dev; + parent->ops = ops; + + if (!mdev_bus_compat_class) { + mdev_bus_compat_class = class_compat_register("mdev_bus"); + if (!mdev_bus_compat_class) { + ret = -ENOMEM; + goto add_dev_err; + } + } + + ret = parent_create_sysfs_files(parent); + if (ret) + goto add_dev_err; + + ret = class_compat_create_link(mdev_bus_compat_class, dev, NULL); + if (ret) + dev_warn(dev, "Failed to create compatibility class link\n"); + + list_add(&parent->next, &parent_list); + mutex_unlock(&parent_list_lock); + + dev_info(dev, "MDEV: Registered\n"); + return 0; + +add_dev_err: + mutex_unlock(&parent_list_lock); + if (parent) + mdev_put_parent(parent); + else + put_device(dev); + return ret; +} +EXPORT_SYMBOL(mdev_register_device); + +/* + * mdev_unregister_device : Unregister a parent device + * @dev: device structure representing parent device. + * + * Remove device from list of registered parent devices. Give a chance to free + * existing mediated devices for given device. + */ + +void mdev_unregister_device(struct device *dev) +{ + struct parent_device *parent; + bool force_remove = true; + + mutex_lock(&parent_list_lock); + parent = __find_parent_device(dev); + + if (!parent) { + mutex_unlock(&parent_list_lock); + return; + } + dev_info(dev, "MDEV: Unregistering\n"); + + list_del(&parent->next); + class_compat_remove_link(mdev_bus_compat_class, dev, NULL); + + device_for_each_child(dev, (void *)&force_remove, + mdev_device_remove_cb); + + parent_remove_sysfs_files(parent); + + mutex_unlock(&parent_list_lock); + mdev_put_parent(parent); +} +EXPORT_SYMBOL(mdev_unregister_device); + +static void mdev_device_release(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + dev_dbg(&mdev->dev, "MDEV: destroying\n"); + kfree(mdev); +} + +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid) +{ + int ret; + struct mdev_device *mdev; + struct parent_device *parent; + struct mdev_type *type = to_mdev_type(kobj); + + parent = mdev_get_parent(type->parent); + if (!parent) + return -EINVAL; + + mutex_lock(&parent->lock); + + /* Check for duplicate */ + if (mdev_device_exist(parent, uuid)) { + ret = -EEXIST; + goto create_err; + } + + mdev = kzalloc(sizeof(*mdev), GFP_KERNEL); + if (!mdev) { + ret = -ENOMEM; + goto create_err; + } + + memcpy(&mdev->uuid, &uuid, sizeof(uuid_le)); + mdev->parent = parent; + kref_init(&mdev->ref); + + mdev->dev.parent = dev; + mdev->dev.bus = &mdev_bus_type; + mdev->dev.release = mdev_device_release; + dev_set_name(&mdev->dev, "%pUl", uuid.b); + + ret = device_register(&mdev->dev); + if (ret) { + put_device(&mdev->dev); + goto create_err; + } + + ret = mdev_device_create_ops(kobj, mdev); + if (ret) + goto create_failed; + + ret = mdev_create_sysfs_files(&mdev->dev, type); + if (ret) { + mdev_device_remove_ops(mdev, true); + goto create_failed; + } + + mdev->type_kobj = kobj; + dev_dbg(&mdev->dev, "MDEV: created\n"); + + mutex_unlock(&parent->lock); + return ret; + +create_failed: + device_unregister(&mdev->dev); + +create_err: + mutex_unlock(&parent->lock); + mdev_put_parent(parent); + return ret; +} + +int mdev_device_remove(struct device *dev, bool force_remove) +{ + struct mdev_device *mdev; + struct parent_device *parent; + struct mdev_type *type; + int ret; + + mdev = to_mdev_device(dev); + type = to_mdev_type(mdev->type_kobj); + parent = mdev->parent; + mutex_lock(&parent->lock); + + ret = mdev_device_remove_ops(mdev, force_remove); + if (ret) { + mutex_unlock(&parent->lock); + return ret; + } + + mdev_remove_sysfs_files(dev, type); + device_unregister(dev); + mutex_unlock(&parent->lock); + mdev_put_parent(parent); + return ret; +} + +static int __init mdev_init(void) +{ + int ret; + + ret = mdev_bus_register(); + + /* + * Attempt to load known vfio_mdev. This gives us a working environment + * without the user needing to explicitly load vfio_mdev driver. + */ + if (!ret) + request_module_nowait("vfio_mdev"); + + return ret; +} + +static void __exit mdev_exit(void) +{ + if (mdev_bus_compat_class) + class_compat_unregister(mdev_bus_compat_class); + + mdev_bus_unregister(); +} + +module_init(mdev_init) +module_exit(mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/mdev/mdev_driver.c b/drivers/vfio/mdev/mdev_driver.c new file mode 100644 index 0000000..6f0391f --- /dev/null +++ b/drivers/vfio/mdev/mdev_driver.c @@ -0,0 +1,119 @@ +/* + * MDEV driver + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +static int mdev_attach_iommu(struct mdev_device *mdev) +{ + int ret; + struct iommu_group *group; + + group = iommu_group_alloc(); + if (IS_ERR(group)) + return PTR_ERR(group); + + ret = iommu_group_add_device(group, &mdev->dev); + if (!ret) + dev_info(&mdev->dev, "MDEV: group_id = %d\n", + iommu_group_id(group)); + + iommu_group_put(group); + return ret; +} + +static void mdev_detach_iommu(struct mdev_device *mdev) +{ + iommu_group_remove_device(&mdev->dev); + dev_info(&mdev->dev, "MDEV: detaching iommu\n"); +} + +static int mdev_probe(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + int ret; + + ret = mdev_attach_iommu(mdev); + if (ret) + return ret; + + if (drv && drv->probe) { + ret = drv->probe(dev); + if (ret) + mdev_detach_iommu(mdev); + } + + return ret; +} + +static int mdev_remove(struct device *dev) +{ + struct mdev_driver *drv = to_mdev_driver(dev->driver); + struct mdev_device *mdev = to_mdev_device(dev); + + if (drv && drv->remove) + drv->remove(dev); + + mdev_detach_iommu(mdev); + + return 0; +} + +struct bus_type mdev_bus_type = { + .name = "mdev", + .probe = mdev_probe, + .remove = mdev_remove, +}; +EXPORT_SYMBOL_GPL(mdev_bus_type); + +/** + * mdev_register_driver - register a new MDEV driver + * @drv: the driver to register + * @owner: module owner of driver to be registered + * + * Returns a negative value on error, otherwise 0. + **/ +int mdev_register_driver(struct mdev_driver *drv, struct module *owner) +{ + /* initialize common driver fields */ + drv->driver.name = drv->name; + drv->driver.bus = &mdev_bus_type; + drv->driver.owner = owner; + + /* register with core */ + return driver_register(&drv->driver); +} +EXPORT_SYMBOL(mdev_register_driver); + +/* + * mdev_unregister_driver - unregister MDEV driver + * @drv: the driver to unregister + */ +void mdev_unregister_driver(struct mdev_driver *drv) +{ + driver_unregister(&drv->driver); +} +EXPORT_SYMBOL(mdev_unregister_driver); + +int mdev_bus_register(void) +{ + return bus_register(&mdev_bus_type); +} + +void mdev_bus_unregister(void) +{ + bus_unregister(&mdev_bus_type); +} diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h new file mode 100644 index 0000000..d35097c --- /dev/null +++ b/drivers/vfio/mdev/mdev_private.h @@ -0,0 +1,41 @@ +/* + * Mediated device interal definitions + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MDEV_PRIVATE_H +#define MDEV_PRIVATE_H + +int mdev_bus_register(void); +void mdev_bus_unregister(void); + +struct mdev_type { + struct kobject kobj; + struct kobject *devices_kobj; + struct parent_device *parent; + struct list_head next; + struct attribute_group *group; +}; + +#define to_mdev_type_attr(_attr) \ + container_of(_attr, struct mdev_type_attribute, attr) +#define to_mdev_type(_kobj) \ + container_of(_kobj, struct mdev_type, kobj) + +int parent_create_sysfs_files(struct parent_device *parent); +void parent_remove_sysfs_files(struct parent_device *parent); + +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type); +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type); + +int mdev_device_create(struct kobject *kobj, struct device *dev, uuid_le uuid); +int mdev_device_remove(struct device *dev, bool force_remove); + +#endif /* MDEV_PRIVATE_H */ diff --git a/drivers/vfio/mdev/mdev_sysfs.c b/drivers/vfio/mdev/mdev_sysfs.c new file mode 100644 index 0000000..1a53deb --- /dev/null +++ b/drivers/vfio/mdev/mdev_sysfs.c @@ -0,0 +1,286 @@ +/* + * File attributes for Mediated devices + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/sysfs.h> +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/slab.h> +#include <linux/uuid.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +/* Static functions */ + +static ssize_t mdev_type_attr_show(struct kobject *kobj, + struct attribute *__attr, char *buf) +{ + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr); + struct mdev_type *type = to_mdev_type(kobj); + ssize_t ret = -EIO; + + if (attr->show) + ret = attr->show(kobj, type->parent->dev, buf); + return ret; +} + +static ssize_t mdev_type_attr_store(struct kobject *kobj, + struct attribute *__attr, + const char *buf, size_t count) +{ + struct mdev_type_attribute *attr = to_mdev_type_attr(__attr); + struct mdev_type *type = to_mdev_type(kobj); + ssize_t ret = -EIO; + + if (attr->store) + ret = attr->store(&type->kobj, type->parent->dev, buf, count); + return ret; +} + +static const struct sysfs_ops mdev_type_sysfs_ops = { + .show = mdev_type_attr_show, + .store = mdev_type_attr_store, +}; + +static ssize_t create_store(struct kobject *kobj, struct device *dev, + const char *buf, size_t count) +{ + char *str; + uuid_le uuid; + int ret; + + if ((count < UUID_STRING_LEN) || (count > UUID_STRING_LEN + 1)) + return -EINVAL; + + str = kstrndup(buf, count, GFP_KERNEL); + if (!str) + return -ENOMEM; + + ret = uuid_le_to_bin(str, &uuid); + kfree(str); + if (ret) + return ret; + + ret = mdev_device_create(kobj, dev, uuid); + if (ret) + return ret; + + return count; +} + +MDEV_TYPE_ATTR_WO(create); + +static void mdev_type_release(struct kobject *kobj) +{ + struct mdev_type *type = to_mdev_type(kobj); + + pr_debug("Releasing group %s\n", kobj->name); + kfree(type); +} + +static struct kobj_type mdev_type_ktype = { + .sysfs_ops = &mdev_type_sysfs_ops, + .release = mdev_type_release, +}; + +struct mdev_type *add_mdev_supported_type(struct parent_device *parent, + struct attribute_group *group) +{ + struct mdev_type *type; + int ret; + + if (!group->name) { + pr_err("%s: Type name empty!\n", __func__); + return ERR_PTR(-EINVAL); + } + + type = kzalloc(sizeof(*type), GFP_KERNEL); + if (!type) + return ERR_PTR(-ENOMEM); + + type->kobj.kset = parent->mdev_types_kset; + + ret = kobject_init_and_add(&type->kobj, &mdev_type_ktype, NULL, + "%s-%s", dev_driver_string(parent->dev), + group->name); + if (ret) { + kfree(type); + return ERR_PTR(ret); + } + + ret = sysfs_create_file(&type->kobj, &mdev_type_attr_create.attr); + if (ret) + goto attr_create_failed; + + type->devices_kobj = kobject_create_and_add("devices", &type->kobj); + if (!type->devices_kobj) { + ret = -ENOMEM; + goto attr_devices_failed; + } + + ret = sysfs_create_files(&type->kobj, + (const struct attribute **)group->attrs); + if (ret) { + ret = -ENOMEM; + goto attrs_failed; + } + + type->group = group; + type->parent = parent; + return type; + +attrs_failed: + kobject_put(type->devices_kobj); +attr_devices_failed: + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); +attr_create_failed: + kobject_del(&type->kobj); + kobject_put(&type->kobj); + return ERR_PTR(ret); +} + +static void remove_mdev_supported_type(struct mdev_type *type) +{ + sysfs_remove_files(&type->kobj, + (const struct attribute **)type->group->attrs); + kobject_put(type->devices_kobj); + sysfs_remove_file(&type->kobj, &mdev_type_attr_create.attr); + kobject_del(&type->kobj); + kobject_put(&type->kobj); +} + +static int add_mdev_supported_type_groups(struct parent_device *parent) +{ + int i; + + for (i = 0; parent->ops->supported_type_groups[i]; i++) { + struct mdev_type *type; + + type = add_mdev_supported_type(parent, + parent->ops->supported_type_groups[i]); + if (IS_ERR(type)) { + struct mdev_type *ltype, *tmp; + + list_for_each_entry_safe(ltype, tmp, &parent->type_list, + next) { + list_del(<ype->next); + remove_mdev_supported_type(ltype); + } + return PTR_ERR(type); + } + list_add(&type->next, &parent->type_list); + } + return 0; +} + +/* mdev sysfs functions */ +void parent_remove_sysfs_files(struct parent_device *parent) +{ + struct mdev_type *type, *tmp; + + list_for_each_entry_safe(type, tmp, &parent->type_list, next) { + list_del(&type->next); + remove_mdev_supported_type(type); + } + + sysfs_remove_groups(&parent->dev->kobj, parent->ops->dev_attr_groups); + kset_unregister(parent->mdev_types_kset); +} + +int parent_create_sysfs_files(struct parent_device *parent) +{ + int ret; + + parent->mdev_types_kset = kset_create_and_add("mdev_supported_types", + NULL, &parent->dev->kobj); + + if (!parent->mdev_types_kset) + return -ENOMEM; + + INIT_LIST_HEAD(&parent->type_list); + + ret = sysfs_create_groups(&parent->dev->kobj, + parent->ops->dev_attr_groups); + if (ret) + goto create_err; + + ret = add_mdev_supported_type_groups(parent); + if (ret) + sysfs_remove_groups(&parent->dev->kobj, + parent->ops->dev_attr_groups); + else + return ret; + +create_err: + kset_unregister(parent->mdev_types_kset); + return ret; +} + +static ssize_t remove_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val; + + if (kstrtoul(buf, 0, &val) < 0) + return -EINVAL; + + if (val && device_remove_file_self(dev, attr)) { + int ret; + + ret = mdev_device_remove(dev, false); + if (ret) { + device_create_file(dev, attr); + return ret; + } + } + + return count; +} + +static DEVICE_ATTR_WO(remove); + +static const struct attribute *mdev_device_attrs[] = { + &dev_attr_remove.attr, + NULL, +}; + +int mdev_create_sysfs_files(struct device *dev, struct mdev_type *type) +{ + int ret; + + ret = sysfs_create_files(&dev->kobj, mdev_device_attrs); + if (ret) + return ret; + + ret = sysfs_create_link(type->devices_kobj, &dev->kobj, dev_name(dev)); + if (ret) + goto device_link_failed; + + ret = sysfs_create_link(&dev->kobj, &type->kobj, "mdev_type"); + if (ret) + goto type_link_failed; + + return ret; + +type_link_failed: + sysfs_remove_link(type->devices_kobj, dev_name(dev)); +device_link_failed: + sysfs_remove_files(&dev->kobj, mdev_device_attrs); + return ret; +} + +void mdev_remove_sysfs_files(struct device *dev, struct mdev_type *type) +{ + sysfs_remove_link(&dev->kobj, "mdev_type"); + sysfs_remove_link(type->devices_kobj, dev_name(dev)); + sysfs_remove_files(&dev->kobj, mdev_device_attrs); +} diff --git a/drivers/vfio/mdev/vfio_mdev.c b/drivers/vfio/mdev/vfio_mdev.c new file mode 100644 index 0000000..ffc3675 --- /dev/null +++ b/drivers/vfio/mdev/vfio_mdev.c @@ -0,0 +1,148 @@ +/* + * VFIO based driver for Mediated device + * + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved. + * Author: Neo Jia <cjia@nvidia.com> + * Kirti Wankhede <kwankhede@nvidia.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/device.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/vfio.h> +#include <linux/mdev.h> + +#include "mdev_private.h" + +#define DRIVER_VERSION "0.1" +#define DRIVER_AUTHOR "NVIDIA Corporation" +#define DRIVER_DESC "VFIO based driver for Mediated device" + +static int vfio_mdev_open(void *device_data) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + int ret; + + if (unlikely(!parent->ops->open)) + return -EINVAL; + + if (!try_module_get(THIS_MODULE)) + return -ENODEV; + + ret = parent->ops->open(mdev); + if (ret) + module_put(THIS_MODULE); + + return ret; +} + +static void vfio_mdev_release(void *device_data) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (likely(parent->ops->release)) + parent->ops->release(mdev); + + module_put(THIS_MODULE); +} + +static long vfio_mdev_unlocked_ioctl(void *device_data, + unsigned int cmd, unsigned long arg) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->ioctl)) + return -EINVAL; + + return parent->ops->ioctl(mdev, cmd, arg); +} + +static ssize_t vfio_mdev_read(void *device_data, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->read)) + return -EINVAL; + + return parent->ops->read(mdev, buf, count, ppos); +} + +static ssize_t vfio_mdev_write(void *device_data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->write)) + return -EINVAL; + + return parent->ops->write(mdev, buf, count, ppos); +} + +static int vfio_mdev_mmap(void *device_data, struct vm_area_struct *vma) +{ + struct mdev_device *mdev = device_data; + struct parent_device *parent = mdev->parent; + + if (unlikely(!parent->ops->mmap)) + return -EINVAL; + + return parent->ops->mmap(mdev, vma); +} + +static const struct vfio_device_ops vfio_mdev_dev_ops = { + .name = "vfio-mdev", + .open = vfio_mdev_open, + .release = vfio_mdev_release, + .ioctl = vfio_mdev_unlocked_ioctl, + .read = vfio_mdev_read, + .write = vfio_mdev_write, + .mmap = vfio_mdev_mmap, +}; + +int vfio_mdev_probe(struct device *dev) +{ + struct mdev_device *mdev = to_mdev_device(dev); + + return vfio_add_group_dev(dev, &vfio_mdev_dev_ops, mdev); +} + +void vfio_mdev_remove(struct device *dev) +{ + vfio_del_group_dev(dev); +} + +struct mdev_driver vfio_mdev_driver = { + .name = "vfio_mdev", + .probe = vfio_mdev_probe, + .remove = vfio_mdev_remove, +}; + +static int __init vfio_mdev_init(void) +{ + return mdev_register_driver(&vfio_mdev_driver, THIS_MODULE); +} + +static void __exit vfio_mdev_exit(void) +{ + mdev_unregister_driver(&vfio_mdev_driver); +} + +module_init(vfio_mdev_init) +module_exit(vfio_mdev_exit) + +MODULE_VERSION(DRIVER_VERSION); +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESC); diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 031bc08..dcd7c2a 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -558,10 +558,9 @@ static int vfio_pci_for_each_slot_or_bus(struct pci_dev *pdev, static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, struct vfio_info_cap *caps) { - struct vfio_info_cap_header *header; struct vfio_region_info_cap_sparse_mmap *sparse; size_t end, size; - int nr_areas = 2, i = 0; + int nr_areas = 2, i = 0, ret; end = pci_resource_len(vdev->pdev, vdev->msix_bar); @@ -572,13 +571,10 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, size = sizeof(*sparse) + (nr_areas * sizeof(*sparse->areas)); - header = vfio_info_cap_add(caps, size, - VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); - if (IS_ERR(header)) - return PTR_ERR(header); + sparse = kzalloc(size, GFP_KERNEL); + if (!sparse) + return -ENOMEM; - sparse = container_of(header, - struct vfio_region_info_cap_sparse_mmap, header); sparse->nr_areas = nr_areas; if (vdev->msix_offset & PAGE_MASK) { @@ -594,26 +590,11 @@ static int msix_sparse_mmap_cap(struct vfio_pci_device *vdev, i++; } - return 0; -} - -static int region_type_cap(struct vfio_pci_device *vdev, - struct vfio_info_cap *caps, - unsigned int type, unsigned int subtype) -{ - struct vfio_info_cap_header *header; - struct vfio_region_info_cap_type *cap; - - header = vfio_info_cap_add(caps, sizeof(*cap), - VFIO_REGION_INFO_CAP_TYPE, 1); - if (IS_ERR(header)) - return PTR_ERR(header); - - cap = container_of(header, struct vfio_region_info_cap_type, header); - cap->type = type; - cap->subtype = subtype; + ret = vfio_info_add_capability(caps, VFIO_REGION_INFO_CAP_SPARSE_MMAP, + sparse); + kfree(sparse); - return 0; + return ret; } int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, @@ -752,6 +733,9 @@ static long vfio_pci_ioctl(void *device_data, break; default: + { + struct vfio_region_info_cap_type cap_type; + if (info.index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; @@ -762,11 +746,16 @@ static long vfio_pci_ioctl(void *device_data, info.size = vdev->region[i].size; info.flags = vdev->region[i].flags; - ret = region_type_cap(vdev, &caps, - vdev->region[i].type, - vdev->region[i].subtype); + cap_type.type = vdev->region[i].type; + cap_type.subtype = vdev->region[i].subtype; + + ret = vfio_info_add_capability(&caps, + VFIO_REGION_INFO_CAP_TYPE, + &cap_type); if (ret) return ret; + + } } if (caps.size) { @@ -829,45 +818,25 @@ static long vfio_pci_ioctl(void *device_data, } else if (cmd == VFIO_DEVICE_SET_IRQS) { struct vfio_irq_set hdr; - size_t size; u8 *data = NULL; int max, ret = 0; + size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; - if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS || - hdr.count >= (U32_MAX - hdr.start) || - hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | - VFIO_IRQ_SET_ACTION_TYPE_MASK)) - return -EINVAL; - max = vfio_pci_get_irq_count(vdev, hdr.index); - if (hdr.start >= max || hdr.start + hdr.count > max) - return -EINVAL; - switch (hdr.flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { - case VFIO_IRQ_SET_DATA_NONE: - size = 0; - break; - case VFIO_IRQ_SET_DATA_BOOL: - size = sizeof(uint8_t); - break; - case VFIO_IRQ_SET_DATA_EVENTFD: - size = sizeof(int32_t); - break; - default: - return -EINVAL; - } - - if (size) { - if (hdr.argsz - minsz < hdr.count * size) - return -EINVAL; + ret = vfio_set_irqs_validate_and_prepare(&hdr, max, + VFIO_PCI_NUM_IRQS, &data_size); + if (ret) + return ret; + if (data_size) { data = memdup_user((void __user *)(arg + minsz), - hdr.count * size); + data_size); if (IS_ERR(data)) return PTR_ERR(data); } diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 65d4a30..330a570 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -31,8 +31,6 @@ #include "vfio_pci_private.h" -#define PCI_CFG_SPACE_SIZE 256 - /* Fake capability ID for standard config space */ #define PCI_CAP_ID_BASIC 0 @@ -152,7 +150,7 @@ static int vfio_user_config_read(struct pci_dev *pdev, int offset, *val = cpu_to_le32(tmp_val); - return pcibios_err_to_errno(ret); + return ret; } static int vfio_user_config_write(struct pci_dev *pdev, int offset, @@ -173,7 +171,7 @@ static int vfio_user_config_write(struct pci_dev *pdev, int offset, break; } - return pcibios_err_to_errno(ret); + return ret; } static int vfio_default_config_read(struct vfio_pci_device *vdev, int pos, @@ -257,7 +255,7 @@ static int vfio_direct_config_read(struct vfio_pci_device *vdev, int pos, ret = vfio_user_config_read(vdev->pdev, pos, val, count); if (ret) - return pcibios_err_to_errno(ret); + return ret; if (pos >= PCI_CFG_SPACE_SIZE) { /* Extended cap header mangling */ if (offset < 4) @@ -295,7 +293,7 @@ static int vfio_raw_config_read(struct vfio_pci_device *vdev, int pos, ret = vfio_user_config_read(vdev->pdev, pos, val, count); if (ret) - return pcibios_err_to_errno(ret); + return ret; return count; } @@ -1089,7 +1087,7 @@ static int vfio_msi_config_write(struct vfio_pci_device *vdev, int pos, start + PCI_MSI_FLAGS, flags); if (ret) - return pcibios_err_to_errno(ret); + return ret; } return count; diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index d781428..4c27f4b 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -364,36 +364,21 @@ static long vfio_platform_ioctl(void *device_data, struct vfio_irq_set hdr; u8 *data = NULL; int ret = 0; + size_t data_size = 0; minsz = offsetofend(struct vfio_irq_set, count); if (copy_from_user(&hdr, (void __user *)arg, minsz)) return -EFAULT; - if (hdr.argsz < minsz) - return -EINVAL; - - if (hdr.index >= vdev->num_irqs) - return -EINVAL; - - if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | - VFIO_IRQ_SET_ACTION_TYPE_MASK)) - return -EINVAL; - - if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) { - size_t size; - - if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL) - size = sizeof(uint8_t); - else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD) - size = sizeof(int32_t); - else - return -EINVAL; - - if (hdr.argsz - minsz < size) - return -EINVAL; + ret = vfio_set_irqs_validate_and_prepare(&hdr, vdev->num_irqs, + vdev->num_irqs, &data_size); + if (ret) + return ret; - data = memdup_user((void __user *)(arg + minsz), size); + if (data_size) { + data = memdup_user((void __user *)(arg + minsz), + data_size); if (IS_ERR(data)) return PTR_ERR(data); } diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c index d1d70e0..9901c46 100644 --- a/drivers/vfio/vfio.c +++ b/drivers/vfio/vfio.c @@ -86,6 +86,8 @@ struct vfio_group { struct mutex unbound_lock; atomic_t opened; bool noiommu; + struct kvm *kvm; + struct blocking_notifier_head notifier; }; struct vfio_device { @@ -339,6 +341,7 @@ static struct vfio_group *vfio_create_group(struct iommu_group *iommu_group) #ifdef CONFIG_VFIO_NOIOMMU group->noiommu = (iommu_group_get_iommudata(iommu_group) == &noiommu); #endif + BLOCKING_INIT_NOTIFIER_HEAD(&group->notifier); group->nb.notifier_call = vfio_iommu_group_notifier; @@ -480,6 +483,21 @@ static struct vfio_group *vfio_group_get_from_minor(int minor) return group; } +static struct vfio_group *vfio_group_get_from_dev(struct device *dev) +{ + struct iommu_group *iommu_group; + struct vfio_group *group; + + iommu_group = iommu_group_get(dev); + if (!iommu_group) + return NULL; + + group = vfio_group_get_from_iommu(iommu_group); + iommu_group_put(iommu_group); + + return group; +} + /** * Device objects - create, release, get, put, search */ @@ -811,16 +829,10 @@ EXPORT_SYMBOL_GPL(vfio_add_group_dev); */ struct vfio_device *vfio_device_get_from_dev(struct device *dev) { - struct iommu_group *iommu_group; struct vfio_group *group; struct vfio_device *device; - iommu_group = iommu_group_get(dev); - if (!iommu_group) - return NULL; - - group = vfio_group_get_from_iommu(iommu_group); - iommu_group_put(iommu_group); + group = vfio_group_get_from_dev(dev); if (!group) return NULL; @@ -1376,6 +1388,23 @@ static bool vfio_group_viable(struct vfio_group *group) group, vfio_dev_viable) == 0); } +static int vfio_group_add_container_user(struct vfio_group *group) +{ + if (!atomic_inc_not_zero(&group->container_users)) + return -EINVAL; + + if (group->noiommu) { + atomic_dec(&group->container_users); + return -EPERM; + } + if (!group->container->iommu_driver || !vfio_group_viable(group)) { + atomic_dec(&group->container_users); + return -EINVAL; + } + + return 0; +} + static const struct file_operations vfio_device_fops; static int vfio_group_get_device_fd(struct vfio_group *group, char *buf) @@ -1555,6 +1584,9 @@ static int vfio_group_fops_release(struct inode *inode, struct file *filep) filep->private_data = NULL; + /* Any user didn't unregister? */ + WARN_ON(group->notifier.head); + vfio_group_try_dissolve_container(group); atomic_dec(&group->opened); @@ -1685,23 +1717,14 @@ static const struct file_operations vfio_device_fops = { struct vfio_group *vfio_group_get_external_user(struct file *filep) { struct vfio_group *group = filep->private_data; + int ret; if (filep->f_op != &vfio_group_fops) return ERR_PTR(-EINVAL); - if (!atomic_inc_not_zero(&group->container_users)) - return ERR_PTR(-EINVAL); - - if (group->noiommu) { - atomic_dec(&group->container_users); - return ERR_PTR(-EPERM); - } - - if (!group->container->iommu_driver || - !vfio_group_viable(group)) { - atomic_dec(&group->container_users); - return ERR_PTR(-EINVAL); - } + ret = vfio_group_add_container_user(group); + if (ret) + return ERR_PTR(ret); vfio_group_get(group); @@ -1763,7 +1786,7 @@ struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps, header->version = version; /* Add to the end of the capability chain */ - for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next) + for (tmp = buf; tmp->next; tmp = buf + tmp->next) ; /* nothing */ tmp->next = caps->size; @@ -1776,11 +1799,403 @@ EXPORT_SYMBOL_GPL(vfio_info_cap_add); void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset) { struct vfio_info_cap_header *tmp; + void *buf = (void *)caps->buf; - for (tmp = caps->buf; tmp->next; tmp = (void *)tmp + tmp->next - offset) + for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset) tmp->next += offset; } -EXPORT_SYMBOL_GPL(vfio_info_cap_shift); +EXPORT_SYMBOL(vfio_info_cap_shift); + +static int sparse_mmap_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_sparse_mmap *sparse_cap, *sparse = cap_type; + size_t size; + + size = sizeof(*sparse) + sparse->nr_areas * sizeof(*sparse->areas); + header = vfio_info_cap_add(caps, size, + VFIO_REGION_INFO_CAP_SPARSE_MMAP, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + sparse_cap = container_of(header, + struct vfio_region_info_cap_sparse_mmap, header); + sparse_cap->nr_areas = sparse->nr_areas; + memcpy(sparse_cap->areas, sparse->areas, + sparse->nr_areas * sizeof(*sparse->areas)); + return 0; +} + +static int region_type_cap(struct vfio_info_cap *caps, void *cap_type) +{ + struct vfio_info_cap_header *header; + struct vfio_region_info_cap_type *type_cap, *cap = cap_type; + + header = vfio_info_cap_add(caps, sizeof(*cap), + VFIO_REGION_INFO_CAP_TYPE, 1); + if (IS_ERR(header)) + return PTR_ERR(header); + + type_cap = container_of(header, struct vfio_region_info_cap_type, + header); + type_cap->type = cap->type; + type_cap->subtype = cap->subtype; + return 0; +} + +int vfio_info_add_capability(struct vfio_info_cap *caps, int cap_type_id, + void *cap_type) +{ + int ret = -EINVAL; + + if (!cap_type) + return 0; + + switch (cap_type_id) { + case VFIO_REGION_INFO_CAP_SPARSE_MMAP: + ret = sparse_mmap_cap(caps, cap_type); + break; + + case VFIO_REGION_INFO_CAP_TYPE: + ret = region_type_cap(caps, cap_type); + break; + } + + return ret; +} +EXPORT_SYMBOL(vfio_info_add_capability); + +int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs, + int max_irq_type, size_t *data_size) +{ + unsigned long minsz; + size_t size; + + minsz = offsetofend(struct vfio_irq_set, count); + + if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) || + (hdr->count >= (U32_MAX - hdr->start)) || + (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK | + VFIO_IRQ_SET_ACTION_TYPE_MASK))) + return -EINVAL; + + if (data_size) + *data_size = 0; + + if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs) + return -EINVAL; + + switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) { + case VFIO_IRQ_SET_DATA_NONE: + size = 0; + break; + case VFIO_IRQ_SET_DATA_BOOL: + size = sizeof(uint8_t); + break; + case VFIO_IRQ_SET_DATA_EVENTFD: + size = sizeof(int32_t); + break; + default: + return -EINVAL; + } + + if (size) { + if (hdr->argsz - minsz < hdr->count * size) + return -EINVAL; + + if (!data_size) + return -EINVAL; + + *data_size = hdr->count * size; + } + + return 0; +} +EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare); + +/* + * Pin a set of guest PFNs and return their associated host PFNs for local + * domain only. + * @dev [in] : device + * @user_pfn [in]: array of user/guest PFNs to be unpinned. + * @npage [in] : count of elements in user_pfn array. This count should not + * be greater VFIO_PIN_PAGES_MAX_ENTRIES. + * @prot [in] : protection flags + * @phys_pfn[out]: array of host PFNs + * Return error or number of pages pinned. + */ +int vfio_pin_pages(struct device *dev, unsigned long *user_pfn, int npage, + int prot, unsigned long *phys_pfn) +{ + struct vfio_container *container; + struct vfio_group *group; + struct vfio_iommu_driver *driver; + int ret; + + if (!dev || !user_pfn || !phys_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + ret = vfio_group_add_container_user(group); + if (ret) + goto err_pin_pages; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->pin_pages)) + ret = driver->ops->pin_pages(container->iommu_data, user_pfn, + npage, prot, phys_pfn); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + +err_pin_pages: + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_pin_pages); + +/* + * Unpin set of host PFNs for local domain only. + * @dev [in] : device + * @user_pfn [in]: array of user/guest PFNs to be unpinned. Number of user/guest + * PFNs should not be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * @npage [in] : count of elements in user_pfn array. This count should not + * be greater than VFIO_PIN_PAGES_MAX_ENTRIES. + * Return error or number of pages unpinned. + */ +int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn, int npage) +{ + struct vfio_container *container; + struct vfio_group *group; + struct vfio_iommu_driver *driver; + int ret; + + if (!dev || !user_pfn || !npage) + return -EINVAL; + + if (npage > VFIO_PIN_PAGES_MAX_ENTRIES) + return -E2BIG; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + ret = vfio_group_add_container_user(group); + if (ret) + goto err_unpin_pages; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unpin_pages)) + ret = driver->ops->unpin_pages(container->iommu_data, user_pfn, + npage); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + +err_unpin_pages: + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_unpin_pages); + +static int vfio_register_iommu_notifier(struct vfio_group *group, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->register_notifier)) + ret = driver->ops->register_notifier(container->iommu_data, + events, nb); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +static int vfio_unregister_iommu_notifier(struct vfio_group *group, + struct notifier_block *nb) +{ + struct vfio_container *container; + struct vfio_iommu_driver *driver; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + driver = container->iommu_driver; + if (likely(driver && driver->ops->unregister_notifier)) + ret = driver->ops->unregister_notifier(container->iommu_data, + nb); + else + ret = -ENOTTY; + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +void vfio_group_set_kvm(struct vfio_group *group, struct kvm *kvm) +{ + group->kvm = kvm; + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, kvm); +} +EXPORT_SYMBOL_GPL(vfio_group_set_kvm); + +static int vfio_register_group_notifier(struct vfio_group *group, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_container *container; + int ret; + bool set_kvm = false; + + if (*events & VFIO_GROUP_NOTIFY_SET_KVM) + set_kvm = true; + + /* clear known events */ + *events &= ~VFIO_GROUP_NOTIFY_SET_KVM; + + /* refuse to continue if still events remaining */ + if (*events) + return -EINVAL; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + ret = blocking_notifier_chain_register(&group->notifier, nb); + + /* + * The attaching of kvm and vfio_group might already happen, so + * here we replay once upon registration. + */ + if (!ret && set_kvm && group->kvm) + blocking_notifier_call_chain(&group->notifier, + VFIO_GROUP_NOTIFY_SET_KVM, group->kvm); + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +static int vfio_unregister_group_notifier(struct vfio_group *group, + struct notifier_block *nb) +{ + struct vfio_container *container; + int ret; + + ret = vfio_group_add_container_user(group); + if (ret) + return -EINVAL; + + container = group->container; + down_read(&container->group_lock); + + ret = blocking_notifier_chain_unregister(&group->notifier, nb); + + up_read(&container->group_lock); + vfio_group_try_dissolve_container(group); + + return ret; +} + +int vfio_register_notifier(struct device *dev, enum vfio_notify_type type, + unsigned long *events, struct notifier_block *nb) +{ + struct vfio_group *group; + int ret; + + if (!dev || !nb || !events || (*events == 0)) + return -EINVAL; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + switch (type) { + case VFIO_IOMMU_NOTIFY: + ret = vfio_register_iommu_notifier(group, events, nb); + break; + case VFIO_GROUP_NOTIFY: + ret = vfio_register_group_notifier(group, events, nb); + break; + default: + ret = -EINVAL; + } + + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_register_notifier); + +int vfio_unregister_notifier(struct device *dev, enum vfio_notify_type type, + struct notifier_block *nb) +{ + struct vfio_group *group; + int ret; + + if (!dev || !nb) + return -EINVAL; + + group = vfio_group_get_from_dev(dev); + if (!group) + return -ENODEV; + + switch (type) { + case VFIO_IOMMU_NOTIFY: + ret = vfio_unregister_iommu_notifier(group, nb); + break; + case VFIO_GROUP_NOTIFY: + ret = vfio_unregister_group_notifier(group, nb); + break; + default: + ret = -EINVAL; + } + + vfio_group_put(group); + return ret; +} +EXPORT_SYMBOL(vfio_unregister_notifier); /** * Module/class support diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 2ba1942..f3726ba 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -36,6 +36,9 @@ #include <linux/uaccess.h> #include <linux/vfio.h> #include <linux/workqueue.h> +#include <linux/pid_namespace.h> +#include <linux/mdev.h> +#include <linux/notifier.h> #define DRIVER_VERSION "0.2" #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>" @@ -55,8 +58,10 @@ MODULE_PARM_DESC(disable_hugepages, struct vfio_iommu { struct list_head domain_list; + struct vfio_domain *external_domain; /* domain for external user */ struct mutex lock; struct rb_root dma_list; + struct blocking_notifier_head notifier; bool v2; bool nesting; }; @@ -75,6 +80,9 @@ struct vfio_dma { unsigned long vaddr; /* Process virtual addr */ size_t size; /* Map size (bytes) */ int prot; /* IOMMU_READ/WRITE */ + bool iommu_mapped; + struct task_struct *task; + struct rb_root pfn_list; /* Ex-user pinned pfn list */ }; struct vfio_group { @@ -83,6 +91,21 @@ struct vfio_group { }; /* + * Guest RAM pinning working set or DMA target + */ +struct vfio_pfn { + struct rb_node node; + dma_addr_t iova; /* Device address */ + unsigned long pfn; /* Host pfn */ + atomic_t ref_count; +}; + +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \ + (!list_empty(&iommu->domain_list)) + +static int put_pfn(unsigned long pfn, int prot); + +/* * This code handles mapping and unmapping of user data buffers * into DMA'ble space using the IOMMU */ @@ -130,6 +153,97 @@ static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old) rb_erase(&old->node, &iommu->dma_list); } +/* + * Helper Functions for host iova-pfn list + */ +static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova) +{ + struct vfio_pfn *vpfn; + struct rb_node *node = dma->pfn_list.rb_node; + + while (node) { + vpfn = rb_entry(node, struct vfio_pfn, node); + + if (iova < vpfn->iova) + node = node->rb_left; + else if (iova > vpfn->iova) + node = node->rb_right; + else + return vpfn; + } + return NULL; +} + +static void vfio_link_pfn(struct vfio_dma *dma, + struct vfio_pfn *new) +{ + struct rb_node **link, *parent = NULL; + struct vfio_pfn *vpfn; + + link = &dma->pfn_list.rb_node; + while (*link) { + parent = *link; + vpfn = rb_entry(parent, struct vfio_pfn, node); + + if (new->iova < vpfn->iova) + link = &(*link)->rb_left; + else + link = &(*link)->rb_right; + } + + rb_link_node(&new->node, parent, link); + rb_insert_color(&new->node, &dma->pfn_list); +} + +static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old) +{ + rb_erase(&old->node, &dma->pfn_list); +} + +static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova, + unsigned long pfn) +{ + struct vfio_pfn *vpfn; + + vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL); + if (!vpfn) + return -ENOMEM; + + vpfn->iova = iova; + vpfn->pfn = pfn; + atomic_set(&vpfn->ref_count, 1); + vfio_link_pfn(dma, vpfn); + return 0; +} + +static void vfio_remove_from_pfn_list(struct vfio_dma *dma, + struct vfio_pfn *vpfn) +{ + vfio_unlink_pfn(dma, vpfn); + kfree(vpfn); +} + +static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma, + unsigned long iova) +{ + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); + + if (vpfn) + atomic_inc(&vpfn->ref_count); + return vpfn; +} + +static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) +{ + int ret = 0; + + if (atomic_dec_and_test(&vpfn->ref_count)) { + ret = put_pfn(vpfn->pfn, dma->prot); + vfio_remove_from_pfn_list(dma, vpfn); + } + return ret; +} + struct vwork { struct mm_struct *mm; long npage; @@ -150,17 +264,22 @@ static void vfio_lock_acct_bg(struct work_struct *work) kfree(vwork); } -static void vfio_lock_acct(long npage) +static void vfio_lock_acct(struct task_struct *task, long npage) { struct vwork *vwork; struct mm_struct *mm; - if (!current->mm || !npage) + if (!npage) + return; + + mm = get_task_mm(task); + if (!mm) return; /* process exited or nothing to do */ - if (down_write_trylock(¤t->mm->mmap_sem)) { - current->mm->locked_vm += npage; - up_write(¤t->mm->mmap_sem); + if (down_write_trylock(&mm->mmap_sem)) { + mm->locked_vm += npage; + up_write(&mm->mmap_sem); + mmput(mm); return; } @@ -170,11 +289,8 @@ static void vfio_lock_acct(long npage) * wouldn't need this silliness */ vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); - if (!vwork) - return; - mm = get_task_mm(current); - if (!mm) { - kfree(vwork); + if (!vwork) { + mmput(mm); return; } INIT_WORK(&vwork->work, vfio_lock_acct_bg); @@ -228,20 +344,36 @@ static int put_pfn(unsigned long pfn, int prot) return 0; } -static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) +static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, + int prot, unsigned long *pfn) { struct page *page[1]; struct vm_area_struct *vma; - int ret = -EFAULT; + int ret; + + if (mm == current->mm) { + ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), + page); + } else { + unsigned int flags = 0; + + if (prot & IOMMU_WRITE) + flags |= FOLL_WRITE; + + down_read(&mm->mmap_sem); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, + NULL, NULL); + up_read(&mm->mmap_sem); + } - if (get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE), page) == 1) { + if (ret == 1) { *pfn = page_to_pfn(page[0]); return 0; } - down_read(¤t->mm->mmap_sem); + down_read(&mm->mmap_sem); - vma = find_vma_intersection(current->mm, vaddr, vaddr + 1); + vma = find_vma_intersection(mm, vaddr, vaddr + 1); if (vma && vma->vm_flags & VM_PFNMAP) { *pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; @@ -249,8 +381,7 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) ret = 0; } - up_read(¤t->mm->mmap_sem); - + up_read(&mm->mmap_sem); return ret; } @@ -259,88 +390,299 @@ static int vaddr_get_pfn(unsigned long vaddr, int prot, unsigned long *pfn) * the iommu can only map chunks of consecutive pfns anyway, so get the * first page and all consecutive pages with the same locking. */ -static long vfio_pin_pages(unsigned long vaddr, long npage, - int prot, unsigned long *pfn_base) +static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, + long npage, unsigned long *pfn_base) { - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - bool lock_cap = capable(CAP_IPC_LOCK); - long ret, i; + unsigned long limit; + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, + CAP_IPC_LOCK); + struct mm_struct *mm; + long ret, i = 0, lock_acct = 0; bool rsvd; + dma_addr_t iova = vaddr - dma->vaddr + dma->iova; - if (!current->mm) + mm = get_task_mm(dma->task); + if (!mm) return -ENODEV; - ret = vaddr_get_pfn(vaddr, prot, pfn_base); + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); if (ret) - return ret; + goto pin_pg_remote_exit; rsvd = is_invalid_reserved_pfn(*pfn_base); + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (!rsvd && !lock_cap && current->mm->locked_vm + 1 > limit) { - put_pfn(*pfn_base, prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, - limit << PAGE_SHIFT); - return -ENOMEM; + /* + * Reserved pages aren't counted against the user, externally pinned + * pages are already counted against the user. + */ + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, + limit << PAGE_SHIFT); + ret = -ENOMEM; + goto pin_pg_remote_exit; + } + lock_acct++; } - if (unlikely(disable_hugepages)) { - if (!rsvd) - vfio_lock_acct(1); - return 1; - } + i++; + if (likely(!disable_hugepages)) { + /* Lock all the consecutive pages from pfn_base */ + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage; + i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { + unsigned long pfn = 0; - /* Lock all the consecutive pages from pfn_base */ - for (i = 1, vaddr += PAGE_SIZE; i < npage; i++, vaddr += PAGE_SIZE) { - unsigned long pfn = 0; + ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn); + if (ret) + break; - ret = vaddr_get_pfn(vaddr, prot, &pfn); - if (ret) - break; + if (pfn != *pfn_base + i || + rsvd != is_invalid_reserved_pfn(pfn)) { + put_pfn(pfn, dma->prot); + break; + } - if (pfn != *pfn_base + i || - rsvd != is_invalid_reserved_pfn(pfn)) { - put_pfn(pfn, prot); - break; + if (!rsvd && !vfio_find_vpfn(dma, iova)) { + if (!lock_cap && + mm->locked_vm + lock_acct + 1 > limit) { + put_pfn(pfn, dma->prot); + pr_warn("%s: RLIMIT_MEMLOCK (%ld) " + "exceeded\n", __func__, + limit << PAGE_SHIFT); + break; + } + lock_acct++; + } } + } - if (!rsvd && !lock_cap && - current->mm->locked_vm + i + 1 > limit) { - put_pfn(pfn, prot); - pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", - __func__, limit << PAGE_SHIFT); - break; + vfio_lock_acct(dma->task, lock_acct); + ret = i; + +pin_pg_remote_exit: + mmput(mm); + return ret; +} + +static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, + unsigned long pfn, long npage, + bool do_accounting) +{ + long unlocked = 0, locked = 0; + long i; + + for (i = 0; i < npage; i++) { + if (put_pfn(pfn++, dma->prot)) { + unlocked++; + if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT))) + locked++; } } - if (!rsvd) - vfio_lock_acct(i); + if (do_accounting) + vfio_lock_acct(dma->task, locked - unlocked); - return i; + return unlocked; } -static long vfio_unpin_pages(unsigned long pfn, long npage, - int prot, bool do_accounting) +static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, + unsigned long *pfn_base, bool do_accounting) { - unsigned long unlocked = 0; - long i; + unsigned long limit; + bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns, + CAP_IPC_LOCK); + struct mm_struct *mm; + int ret; + bool rsvd; - for (i = 0; i < npage; i++) - unlocked += put_pfn(pfn++, prot); + mm = get_task_mm(dma->task); + if (!mm) + return -ENODEV; + + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); + if (ret) + goto pin_page_exit; + + rsvd = is_invalid_reserved_pfn(*pfn_base); + limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) { + put_pfn(*pfn_base, dma->prot); + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n", + __func__, dma->task->comm, task_pid_nr(dma->task), + limit << PAGE_SHIFT); + ret = -ENOMEM; + goto pin_page_exit; + } + + if (!rsvd && do_accounting) + vfio_lock_acct(dma->task, 1); + ret = 1; + +pin_page_exit: + mmput(mm); + return ret; +} + +static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, + bool do_accounting) +{ + int unlocked; + struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova); + + if (!vpfn) + return 0; + + unlocked = vfio_iova_put_vfio_pfn(dma, vpfn); if (do_accounting) - vfio_lock_acct(-unlocked); + vfio_lock_acct(dma->task, -unlocked); return unlocked; } -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) +static int vfio_iommu_type1_pin_pages(void *iommu_data, + unsigned long *user_pfn, + int npage, int prot, + unsigned long *phys_pfn) +{ + struct vfio_iommu *iommu = iommu_data; + int i, j, ret; + unsigned long remote_vaddr; + struct vfio_dma *dma; + bool do_accounting; + + if (!iommu || !user_pfn || !phys_pfn) + return -EINVAL; + + /* Supported for v2 version only */ + if (!iommu->v2) + return -EACCES; + + mutex_lock(&iommu->lock); + + /* Fail if notifier list is empty */ + if ((!iommu->external_domain) || (!iommu->notifier.head)) { + ret = -EINVAL; + goto pin_done; + } + + /* + * If iommu capable domain exist in the container then all pages are + * already pinned and accounted. Accouting should be done if there is no + * iommu capable domain in the container. + */ + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); + + for (i = 0; i < npage; i++) { + dma_addr_t iova; + struct vfio_pfn *vpfn; + + iova = user_pfn[i] << PAGE_SHIFT; + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + if (!dma) { + ret = -EINVAL; + goto pin_unwind; + } + + if ((dma->prot & prot) != prot) { + ret = -EPERM; + goto pin_unwind; + } + + vpfn = vfio_iova_get_vfio_pfn(dma, iova); + if (vpfn) { + phys_pfn[i] = vpfn->pfn; + continue; + } + + remote_vaddr = dma->vaddr + iova - dma->iova; + ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i], + do_accounting); + if (ret <= 0) { + WARN_ON(!ret); + goto pin_unwind; + } + + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); + if (ret) { + vfio_unpin_page_external(dma, iova, do_accounting); + goto pin_unwind; + } + } + + ret = i; + goto pin_done; + +pin_unwind: + phys_pfn[i] = 0; + for (j = 0; j < i; j++) { + dma_addr_t iova; + + iova = user_pfn[j] << PAGE_SHIFT; + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + vfio_unpin_page_external(dma, iova, do_accounting); + phys_pfn[j] = 0; + } +pin_done: + mutex_unlock(&iommu->lock); + return ret; +} + +static int vfio_iommu_type1_unpin_pages(void *iommu_data, + unsigned long *user_pfn, + int npage) +{ + struct vfio_iommu *iommu = iommu_data; + bool do_accounting; + int i; + + if (!iommu || !user_pfn) + return -EINVAL; + + /* Supported for v2 version only */ + if (!iommu->v2) + return -EACCES; + + mutex_lock(&iommu->lock); + + if (!iommu->external_domain) { + mutex_unlock(&iommu->lock); + return -EINVAL; + } + + do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu); + for (i = 0; i < npage; i++) { + struct vfio_dma *dma; + dma_addr_t iova; + + iova = user_pfn[i] << PAGE_SHIFT; + dma = vfio_find_dma(iommu, iova, PAGE_SIZE); + if (!dma) + goto unpin_exit; + vfio_unpin_page_external(dma, iova, do_accounting); + } + +unpin_exit: + mutex_unlock(&iommu->lock); + return i > npage ? npage : (i > 0 ? i : -EINVAL); +} + +static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, + bool do_accounting) { dma_addr_t iova = dma->iova, end = dma->iova + dma->size; struct vfio_domain *domain, *d; long unlocked = 0; if (!dma->size) - return; + return 0; + + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) + return 0; + /* * We use the IOMMU to track the physical addresses, otherwise we'd * need a much more complicated tracking system. Unfortunately that @@ -382,21 +724,28 @@ static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma) if (WARN_ON(!unmapped)) break; - unlocked += vfio_unpin_pages(phys >> PAGE_SHIFT, - unmapped >> PAGE_SHIFT, - dma->prot, false); + unlocked += vfio_unpin_pages_remote(dma, iova, + phys >> PAGE_SHIFT, + unmapped >> PAGE_SHIFT, + false); iova += unmapped; cond_resched(); } - vfio_lock_acct(-unlocked); + dma->iommu_mapped = false; + if (do_accounting) { + vfio_lock_acct(dma->task, -unlocked); + return 0; + } + return unlocked; } static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma) { - vfio_unmap_unpin(iommu, dma); + vfio_unmap_unpin(iommu, dma, true); vfio_unlink_dma(iommu, dma); + put_task_struct(dma->task); kfree(dma); } @@ -430,9 +779,9 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_unmap *unmap) { uint64_t mask; - struct vfio_dma *dma; + struct vfio_dma *dma, *dma_last = NULL; size_t unmapped = 0; - int ret = 0; + int ret = 0, retries = 0; mask = ((uint64_t)1 << __ffs(vfio_pgsize_bitmap(iommu))) - 1; @@ -442,7 +791,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, return -EINVAL; WARN_ON(mask & PAGE_MASK); - +again: mutex_lock(&iommu->lock); /* @@ -477,7 +826,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, * mappings within the range. */ if (iommu->v2) { - dma = vfio_find_dma(iommu, unmap->iova, 0); + dma = vfio_find_dma(iommu, unmap->iova, 1); if (dma && dma->iova != unmap->iova) { ret = -EINVAL; goto unlock; @@ -492,6 +841,38 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu, while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) { if (!iommu->v2 && unmap->iova > dma->iova) break; + /* + * Task with same address space who mapped this iova range is + * allowed to unmap the iova range. + */ + if (dma->task->mm != current->mm) + break; + + if (!RB_EMPTY_ROOT(&dma->pfn_list)) { + struct vfio_iommu_type1_dma_unmap nb_unmap; + + if (dma_last == dma) { + BUG_ON(++retries > 10); + } else { + dma_last = dma; + retries = 0; + } + + nb_unmap.iova = dma->iova; + nb_unmap.size = dma->size; + + /* + * Notify anyone (mdev vendor drivers) to invalidate and + * unmap iovas within the range we're about to unmap. + * Vendor drivers MUST unpin pages in response to an + * invalidation. + */ + mutex_unlock(&iommu->lock); + blocking_notifier_call_chain(&iommu->notifier, + VFIO_IOMMU_NOTIFY_DMA_UNMAP, + &nb_unmap); + goto again; + } unmapped += dma->size; vfio_remove_dma(iommu, dma); } @@ -558,17 +939,56 @@ unwind: return ret; } +static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, + size_t map_size) +{ + dma_addr_t iova = dma->iova; + unsigned long vaddr = dma->vaddr; + size_t size = map_size; + long npage; + unsigned long pfn; + int ret = 0; + + while (size) { + /* Pin a contiguous chunk of memory */ + npage = vfio_pin_pages_remote(dma, vaddr + dma->size, + size >> PAGE_SHIFT, &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + break; + } + + /* Map it! */ + ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, + dma->prot); + if (ret) { + vfio_unpin_pages_remote(dma, iova + dma->size, pfn, + npage, true); + break; + } + + size -= npage << PAGE_SHIFT; + dma->size += npage << PAGE_SHIFT; + } + + dma->iommu_mapped = true; + + if (ret) + vfio_remove_dma(iommu, dma); + + return ret; +} + static int vfio_dma_do_map(struct vfio_iommu *iommu, struct vfio_iommu_type1_dma_map *map) { dma_addr_t iova = map->iova; unsigned long vaddr = map->vaddr; size_t size = map->size; - long npage; int ret = 0, prot = 0; uint64_t mask; struct vfio_dma *dma; - unsigned long pfn; /* Verify that none of our __u64 fields overflow */ if (map->size != size || map->vaddr != vaddr || map->iova != iova) @@ -594,47 +1014,33 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu, mutex_lock(&iommu->lock); if (vfio_find_dma(iommu, iova, size)) { - mutex_unlock(&iommu->lock); - return -EEXIST; + ret = -EEXIST; + goto out_unlock; } dma = kzalloc(sizeof(*dma), GFP_KERNEL); if (!dma) { - mutex_unlock(&iommu->lock); - return -ENOMEM; + ret = -ENOMEM; + goto out_unlock; } dma->iova = iova; dma->vaddr = vaddr; dma->prot = prot; + get_task_struct(current); + dma->task = current; + dma->pfn_list = RB_ROOT; /* Insert zero-sized and grow as we map chunks of it */ vfio_link_dma(iommu, dma); - while (size) { - /* Pin a contiguous chunk of memory */ - npage = vfio_pin_pages(vaddr + dma->size, - size >> PAGE_SHIFT, prot, &pfn); - if (npage <= 0) { - WARN_ON(!npage); - ret = (int)npage; - break; - } - - /* Map it! */ - ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage, prot); - if (ret) { - vfio_unpin_pages(pfn, npage, prot, true); - break; - } - - size -= npage << PAGE_SHIFT; - dma->size += npage << PAGE_SHIFT; - } - - if (ret) - vfio_remove_dma(iommu, dma); + /* Don't pin and map if container doesn't contain IOMMU capable domain*/ + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) + dma->size = size; + else + ret = vfio_pin_map_dma(iommu, dma, size); +out_unlock: mutex_unlock(&iommu->lock); return ret; } @@ -662,10 +1068,6 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, d = list_first_entry(&iommu->domain_list, struct vfio_domain, next); n = rb_first(&iommu->dma_list); - /* If there's not a domain, there better not be any mappings */ - if (WARN_ON(n && !d)) - return -EINVAL; - for (; n; n = rb_next(n)) { struct vfio_dma *dma; dma_addr_t iova; @@ -674,21 +1076,49 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, iova = dma->iova; while (iova < dma->iova + dma->size) { - phys_addr_t phys = iommu_iova_to_phys(d->domain, iova); + phys_addr_t phys; size_t size; - if (WARN_ON(!phys)) { - iova += PAGE_SIZE; - continue; + if (dma->iommu_mapped) { + phys_addr_t p; + dma_addr_t i; + + phys = iommu_iova_to_phys(d->domain, iova); + + if (WARN_ON(!phys)) { + iova += PAGE_SIZE; + continue; + } + + size = PAGE_SIZE; + p = phys + size; + i = iova + size; + while (i < dma->iova + dma->size && + p == iommu_iova_to_phys(d->domain, i)) { + size += PAGE_SIZE; + p += PAGE_SIZE; + i += PAGE_SIZE; + } + } else { + unsigned long pfn; + unsigned long vaddr = dma->vaddr + + (iova - dma->iova); + size_t n = dma->iova + dma->size - iova; + long npage; + + npage = vfio_pin_pages_remote(dma, vaddr, + n >> PAGE_SHIFT, + &pfn); + if (npage <= 0) { + WARN_ON(!npage); + ret = (int)npage; + return ret; + } + + phys = pfn << PAGE_SHIFT; + size = npage << PAGE_SHIFT; } - size = PAGE_SIZE; - - while (iova + size < dma->iova + dma->size && - phys + size == iommu_iova_to_phys(d->domain, - iova + size)) - size += PAGE_SIZE; - ret = iommu_map(domain->domain, iova, phys, size, dma->prot | domain->prot); if (ret) @@ -696,8 +1126,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, iova += size; } + dma->iommu_mapped = true; } - return 0; } @@ -734,22 +1164,39 @@ static void vfio_test_domain_fgsp(struct vfio_domain *domain) __free_pages(pages, order); } +static struct vfio_group *find_iommu_group(struct vfio_domain *domain, + struct iommu_group *iommu_group) +{ + struct vfio_group *g; + + list_for_each_entry(g, &domain->group_list, next) { + if (g->iommu_group == iommu_group) + return g; + } + + return NULL; +} + static int vfio_iommu_type1_attach_group(void *iommu_data, struct iommu_group *iommu_group) { struct vfio_iommu *iommu = iommu_data; - struct vfio_group *group, *g; + struct vfio_group *group; struct vfio_domain *domain, *d; - struct bus_type *bus = NULL; + struct bus_type *bus = NULL, *mdev_bus; int ret; mutex_lock(&iommu->lock); list_for_each_entry(d, &iommu->domain_list, next) { - list_for_each_entry(g, &d->group_list, next) { - if (g->iommu_group != iommu_group) - continue; + if (find_iommu_group(d, iommu_group)) { + mutex_unlock(&iommu->lock); + return -EINVAL; + } + } + if (iommu->external_domain) { + if (find_iommu_group(iommu->external_domain, iommu_group)) { mutex_unlock(&iommu->lock); return -EINVAL; } @@ -769,6 +1216,25 @@ static int vfio_iommu_type1_attach_group(void *iommu_data, if (ret) goto out_free; + mdev_bus = symbol_get(mdev_bus_type); + + if (mdev_bus) { + if ((bus == mdev_bus) && !iommu_present(bus)) { + symbol_put(mdev_bus_type); + if (!iommu->external_domain) { + INIT_LIST_HEAD(&domain->group_list); + iommu->external_domain = domain; + } else + kfree(domain); + + list_add(&group->next, + &iommu->external_domain->group_list); + mutex_unlock(&iommu->lock); + return 0; + } + symbol_put(mdev_bus_type); + } + domain->domain = iommu_domain_alloc(bus); if (!domain->domain) { ret = -EIO; @@ -859,6 +1325,46 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu) vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node)); } +static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu) +{ + struct rb_node *n, *p; + + n = rb_first(&iommu->dma_list); + for (; n; n = rb_next(n)) { + struct vfio_dma *dma; + long locked = 0, unlocked = 0; + + dma = rb_entry(n, struct vfio_dma, node); + unlocked += vfio_unmap_unpin(iommu, dma, false); + p = rb_first(&dma->pfn_list); + for (; p; p = rb_next(p)) { + struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, + node); + + if (!is_invalid_reserved_pfn(vpfn->pfn)) + locked++; + } + vfio_lock_acct(dma->task, locked - unlocked); + } +} + +static void vfio_sanity_check_pfn_list(struct vfio_iommu *iommu) +{ + struct rb_node *n; + + n = rb_first(&iommu->dma_list); + for (; n; n = rb_next(n)) { + struct vfio_dma *dma; + + dma = rb_entry(n, struct vfio_dma, node); + + if (WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list))) + break; + } + /* mdev vendor driver must unregister notifier */ + WARN_ON(iommu->notifier.head); +} + static void vfio_iommu_type1_detach_group(void *iommu_data, struct iommu_group *iommu_group) { @@ -868,31 +1374,55 @@ static void vfio_iommu_type1_detach_group(void *iommu_data, mutex_lock(&iommu->lock); - list_for_each_entry(domain, &iommu->domain_list, next) { - list_for_each_entry(group, &domain->group_list, next) { - if (group->iommu_group != iommu_group) - continue; - - iommu_detach_group(domain->domain, iommu_group); + if (iommu->external_domain) { + group = find_iommu_group(iommu->external_domain, iommu_group); + if (group) { list_del(&group->next); kfree(group); - /* - * Group ownership provides privilege, if the group - * list is empty, the domain goes away. If it's the - * last domain, then all the mappings go away too. - */ - if (list_empty(&domain->group_list)) { - if (list_is_singular(&iommu->domain_list)) + + if (list_empty(&iommu->external_domain->group_list)) { + vfio_sanity_check_pfn_list(iommu); + + if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) vfio_iommu_unmap_unpin_all(iommu); - iommu_domain_free(domain->domain); - list_del(&domain->next); - kfree(domain); + + kfree(iommu->external_domain); + iommu->external_domain = NULL; + } + goto detach_group_done; + } + } + + list_for_each_entry(domain, &iommu->domain_list, next) { + group = find_iommu_group(domain, iommu_group); + if (!group) + continue; + + iommu_detach_group(domain->domain, iommu_group); + list_del(&group->next); + kfree(group); + /* + * Group ownership provides privilege, if the group list is + * empty, the domain goes away. If it's the last domain with + * iommu and external domain doesn't exist, then all the + * mappings go away too. If it's the last domain with iommu and + * external domain exist, update accounting + */ + if (list_empty(&domain->group_list)) { + if (list_is_singular(&iommu->domain_list)) { + if (!iommu->external_domain) + vfio_iommu_unmap_unpin_all(iommu); + else + vfio_iommu_unmap_unpin_reaccount(iommu); } - goto done; + iommu_domain_free(domain->domain); + list_del(&domain->next); + kfree(domain); } + break; } -done: +detach_group_done: mutex_unlock(&iommu->lock); } @@ -920,31 +1450,46 @@ static void *vfio_iommu_type1_open(unsigned long arg) INIT_LIST_HEAD(&iommu->domain_list); iommu->dma_list = RB_ROOT; mutex_init(&iommu->lock); + BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier); return iommu; } +static void vfio_release_domain(struct vfio_domain *domain, bool external) +{ + struct vfio_group *group, *group_tmp; + + list_for_each_entry_safe(group, group_tmp, + &domain->group_list, next) { + if (!external) + iommu_detach_group(domain->domain, group->iommu_group); + list_del(&group->next); + kfree(group); + } + + if (!external) + iommu_domain_free(domain->domain); +} + static void vfio_iommu_type1_release(void *iommu_data) { struct vfio_iommu *iommu = iommu_data; struct vfio_domain *domain, *domain_tmp; - struct vfio_group *group, *group_tmp; + + if (iommu->external_domain) { + vfio_release_domain(iommu->external_domain, true); + vfio_sanity_check_pfn_list(iommu); + kfree(iommu->external_domain); + } vfio_iommu_unmap_unpin_all(iommu); list_for_each_entry_safe(domain, domain_tmp, &iommu->domain_list, next) { - list_for_each_entry_safe(group, group_tmp, - &domain->group_list, next) { - iommu_detach_group(domain->domain, group->iommu_group); - list_del(&group->next); - kfree(group); - } - iommu_domain_free(domain->domain); + vfio_release_domain(domain, false); list_del(&domain->next); kfree(domain); } - kfree(iommu); } @@ -1040,14 +1585,42 @@ static long vfio_iommu_type1_ioctl(void *iommu_data, return -ENOTTY; } +static int vfio_iommu_type1_register_notifier(void *iommu_data, + unsigned long *events, + struct notifier_block *nb) +{ + struct vfio_iommu *iommu = iommu_data; + + /* clear known events */ + *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP; + + /* refuse to register if still events remaining */ + if (*events) + return -EINVAL; + + return blocking_notifier_chain_register(&iommu->notifier, nb); +} + +static int vfio_iommu_type1_unregister_notifier(void *iommu_data, + struct notifier_block *nb) +{ + struct vfio_iommu *iommu = iommu_data; + + return blocking_notifier_chain_unregister(&iommu->notifier, nb); +} + static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = { - .name = "vfio-iommu-type1", - .owner = THIS_MODULE, - .open = vfio_iommu_type1_open, - .release = vfio_iommu_type1_release, - .ioctl = vfio_iommu_type1_ioctl, - .attach_group = vfio_iommu_type1_attach_group, - .detach_group = vfio_iommu_type1_detach_group, + .name = "vfio-iommu-type1", + .owner = THIS_MODULE, + .open = vfio_iommu_type1_open, + .release = vfio_iommu_type1_release, + .ioctl = vfio_iommu_type1_ioctl, + .attach_group = vfio_iommu_type1_attach_group, + .detach_group = vfio_iommu_type1_detach_group, + .pin_pages = vfio_iommu_type1_pin_pages, + .unpin_pages = vfio_iommu_type1_unpin_pages, + .register_notifier = vfio_iommu_type1_register_notifier, + .unregister_notifier = vfio_iommu_type1_unregister_notifier, }; static int __init vfio_iommu_type1_init(void) |