Message ID | 20120312223254.30321.74405.stgit@bling.home |
---|---|
State | New |
Headers | show |
On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > --- > > drivers/base/Kconfig | 10 + > drivers/base/Makefile | 1 > drivers/base/base.h | 5 > drivers/base/isolation.c | 798 +++++++++++++++++++++++++++++++++++++++++++++ > include/linux/device.h | 4 > include/linux/isolation.h | 138 ++++++++ > 6 files changed, 956 insertions(+), 0 deletions(-) > create mode 100644 drivers/base/isolation.c > create mode 100644 include/linux/isolation.h > > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig > index 7be9f79..e98a5f3 100644 > --- a/drivers/base/Kconfig > +++ b/drivers/base/Kconfig > @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER > APIs extension; the file's descriptor can then be passed on to other > driver. > > +config ISOLATION_GROUPS > + bool "Enable Isolation Group API" > + default n > + depends on EXPERIMENTAL && IOMMU_API > + help > + This option enables grouping of devices into Isolation Groups > + which may be used by other subsystems to perform quirks across > + sets of devices as well as userspace drivers for guaranteeing > + devices are isolated from the rest of the system. > + > endmenu > diff --git a/drivers/base/Makefile b/drivers/base/Makefile > index 610f999..047b5f9 100644 > --- a/drivers/base/Makefile > +++ b/drivers/base/Makefile > @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES) += module.o > endif > obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o > obj-$(CONFIG_REGMAP) += regmap/ > +obj-$(CONFIG_ISOLATION_GROUPS) += isolation.o > > ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG > > diff --git a/drivers/base/base.h b/drivers/base/base.h > index b858dfd..376758a 100644 > --- a/drivers/base/base.h > +++ b/drivers/base/base.h > @@ -1,4 +1,5 @@ > #include <linux/notifier.h> > +#include <linux/isolation.h> > > /** > * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. 
> @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev); > static inline int driver_match_device(struct device_driver *drv, > struct device *dev) > { > + if (isolation_group_managed(to_isolation_group(dev)) && > + !isolation_group_manager_driver(to_isolation_group(dev), drv)) > + return 0; > + > return drv->bus->match ? drv->bus->match(dev, drv) : 1; > } > > diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c > new file mode 100644 > index 0000000..c01365c > --- /dev/null > +++ b/drivers/base/isolation.c > @@ -0,0 +1,798 @@ > +/* > + * Isolation group interface > + * > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. > + * Author: Alex Williamson <alex.williamson@redhat.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. > + * > + */ > + > +#include <linux/device.h> > +#include <linux/iommu.h> > +#include <linux/isolation.h> > +#include <linux/list.h> > +#include <linux/mutex.h> > +#include <linux/notifier.h> > +#include <linux/slab.h> > +#include <linux/uuid.h> > + > +static struct kset *isolation_kset; > +/* XXX add more complete locking, maybe rcu */ > +static DEFINE_MUTEX(isolation_lock); > +static LIST_HEAD(isolation_groups); > +static LIST_HEAD(isolation_notifiers); > + > +/* Keep these private */ > +struct isolation_manager_driver { > + struct device_driver *drv; > + struct list_head list; > +}; > + > +struct isolation_manager { > + struct list_head drivers; > + /* Likely need managers to register some callbacks */ > +}; > + > +struct isolation_group { > + struct list_head list; > + struct list_head devices; > + struct kobject kobj; > + struct kobject *devices_kobj; > + struct iommu_domain *iommu_domain; > + struct iommu_ops *iommu_ops; > + struct isolation_manager *manager; > + uuid_le uuid; > +}; > + > +struct isolation_device { > + 
struct device *dev; > + struct list_head list; > +}; > + > +struct isolation_notifier { > + struct bus_type *bus; > + struct notifier_block nb; > + struct blocking_notifier_head notifier; > + struct list_head list; > + int refcnt; > +}; > + > +struct iso_attribute { > + struct attribute attr; > + ssize_t (*show)(struct isolation_group *group, char *buf); > + ssize_t (*store)(struct isolation_group *group, > + const char *buf, size_t count); > +}; > + > +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr) > +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj) > + > +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr, > + char *buf) > +{ > + struct iso_attribute *iso_attr = to_iso_attr(attr); > + struct isolation_group *group = to_iso_group(kobj); > + ssize_t ret = -EIO; > + > + if (iso_attr->show) > + ret = iso_attr->show(group, buf); > + return ret; > +} > + > +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr, > + const char *buf, size_t count) > +{ > + struct iso_attribute *iso_attr = to_iso_attr(attr); > + struct isolation_group *group = to_iso_group(kobj); > + ssize_t ret = -EIO; > + > + if (iso_attr->store) > + ret = iso_attr->store(group, buf, count); > + return ret; > +} > + > +static void iso_release(struct kobject *kobj) > +{ > + struct isolation_group *group = to_iso_group(kobj); > + kfree(group); > +} > + > +static const struct sysfs_ops iso_sysfs_ops = { > + .show = iso_attr_show, > + .store = iso_attr_store, > +}; > + > +static struct kobj_type iso_ktype = { > + .sysfs_ops = &iso_sysfs_ops, > + .release = iso_release, > +}; > + > +/* > + * Create an isolation group. Isolation group "providers" register a > + * notifier for their bus(es) and call this to create a new isolation > + * group. 
> + */ > +struct isolation_group *isolation_create_group(void) > +{ > + struct isolation_group *group, *tmp; > + int ret; > + > + if (unlikely(!isolation_kset)) > + return ERR_PTR(-ENODEV); > + > + group = kzalloc(sizeof(*group), GFP_KERNEL); > + if (!group) > + return ERR_PTR(-ENOMEM); > + > + mutex_lock(&isolation_lock); > + > + /* > + * Use a UUID for group identification simply because it seems wrong > + * to expose it as a kernel pointer (%p). Neither is user friendly. > + * Mostly only expect userspace to do anything with this. > + */ So, my plan was to require the isolation provider to give a unique name - it can construct something that's actually meaningful (and with luck, stable across boots). For Power we'd use the PE number, for VT-d, I was thinking the PCI device address would do it (of the "base" device for the non-separable bridge and broken multifunction cases). > +new_uuid: > + uuid_le_gen(&group->uuid); > + > + /* Unlikely, but nothing prevents duplicates */ > + list_for_each_entry(tmp, &isolation_groups, list) > + if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0) > + goto new_uuid; > + > + INIT_LIST_HEAD(&group->devices); > + group->kobj.kset = isolation_kset; > + > + ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL, > + "%pUl", &group->uuid); Um.. isn't this setting the kobject name to the address of the uuid plus "Ul", not the content of the uuid? > + if (ret) { > + kfree(group); > + mutex_unlock(&isolation_lock); > + return ERR_PTR(ret); > + } > + > + /* Add a subdirectory to save links for devices within the group. */ > + group->devices_kobj = kobject_create_and_add("devices", &group->kobj); > + if (!group->devices_kobj) { > + kobject_put(&group->kobj); > + mutex_unlock(&isolation_lock); > + return ERR_PTR(-ENOMEM); > + } > + > + list_add(&group->list, &isolation_groups); > + > + mutex_unlock(&isolation_lock); > + > + return group; > +} > + > +/* > + * Free isolation group. 
XXX need to cleanup/reject based on manager status. > + */ > +int isolation_free_group(struct isolation_group *group) > +{ > + mutex_lock(&isolation_lock); > + > + if (!list_empty(&group->devices)) { > + mutex_unlock(&isolation_lock); > + return -EBUSY; > + } > + > + list_del(&group->list); > + kobject_put(group->devices_kobj); > + kobject_put(&group->kobj); > + > + mutex_unlock(&isolation_lock); > + return 0; > +} > + > +/* > + * Add a device to an isolation group. Isolation groups start empty and > + * must be told about the devices they contain. Expect this to be called > + * from isolation group providers via notifier. > + */ Doesn't necessarily have to be from a notifier, particularly if the provider is integrated into host bridge code. > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev) > +{ > + struct isolation_device *device; > + int ret = 0; > + > + mutex_lock(&isolation_lock); > + > + if (dev->isolation_group) { > + ret = -EBUSY; > + goto out; This should probably be a BUG_ON() - the isolation provider shouldn't be putting the same device into two different groups. > + } > + > + device = kzalloc(sizeof(*device), GFP_KERNEL); > + if (!device) { > + ret = -ENOMEM; > + goto out; > + } > + > + device->dev = dev; > + > + /* Cross link the device in sysfs */ > + ret = sysfs_create_link(&dev->kobj, &group->kobj, > + "isolation_group"); > + if (ret) { > + kfree(device); > + goto out; > + } > + > + ret = sysfs_create_link(group->devices_kobj, &dev->kobj, > + kobject_name(&dev->kobj)); So, a problem both here and in my version is what to name the device links. Because they could be potentially be quite widely scattered, I'm not sure that kobject_name() is guaranteed to be sufficiently unique. 
> + if (ret) { > + sysfs_remove_link(&dev->kobj, "isolation_group"); > + kfree(device); > + goto out; > + } > + > + list_add(&device->list, &group->devices); > + > + dev->isolation_group = group; > + > + if (group->iommu_domain) { > + ret = group->iommu_ops->attach_dev(group->iommu_domain, dev); > + if (ret) { > + /* XXX fail device? */ > + } So, I presume the idea is that this is a stop-gap until iommu_ops is converted to be in terms of groups rather than individual devices? > + } > + > + /* XXX signal manager? */ Uh, what did you have in mind here? > +out: > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * Remove a device from an isolation group, likely because the device > + * went away. > + */ > +int isolation_group_del_dev(struct device *dev) > +{ > + struct isolation_group *group = dev->isolation_group; > + struct isolation_device *device; > + > + if (!group) > + return -ENODEV; > + > + mutex_lock(&isolation_lock); > + > + list_for_each_entry(device, &group->devices, list) { > + if (device->dev == dev) { > + /* XXX signal manager? */ > + > + if (group->iommu_domain) > + group->iommu_ops->detach_dev( > + group->iommu_domain, dev); > + list_del(&device->list); > + kfree(device); > + dev->isolation_group = NULL; > + sysfs_remove_link(group->devices_kobj, > + kobject_name(&dev->kobj)); > + sysfs_remove_link(&dev->kobj, "isolation_group"); > + break; > + } > + } > + > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * Filter and call out to our own notifier chains > + */ So, I haven't quite worked out what this isolation notifier infrastructure gives you, as opposed to having isolation providers directly register bus notifiers. 
> +static int isolation_bus_notify(struct notifier_block *nb, > + unsigned long action, void *data) > +{ > + struct isolation_notifier *notifier = > + container_of(nb, struct isolation_notifier, nb); > + struct device *dev = data; > + > + switch (action) { > + case BUS_NOTIFY_ADD_DEVICE: > + if (!dev->isolation_group) > + blocking_notifier_call_chain(¬ifier->notifier, > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > + break; > + case BUS_NOTIFY_DEL_DEVICE: > + if (dev->isolation_group) > + blocking_notifier_call_chain(¬ifier->notifier, > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > + break; > + } > + > + return NOTIFY_DONE; > +} > + > +/* > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add > + * a new notifier. > + */ > +static int isolation_do_add_notify(struct device *dev, void *data) > +{ > + struct notifier_block *nb = data; > + > + if (!dev->isolation_group) > + nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev); > + > + return 0; > +} > + > +/* > + * Register a notifier. This is for isolation group providers. It's > + * possible we could have multiple isolation group providers for the same > + * bus type, So the bit above doesn't seem to connect to the bit below. We can indeed have multiple isolation providers for one bus type, but the below seems to be covering the (also possible, but different) case of one isolation provider covering multiple bus types. > so we create a struct isolation_notifier per bus_type, each > + * with a notifier block. Providers are therefore welcome to register > + * as many buses as they can handle. 
> + */ > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb) > +{ > + struct isolation_notifier *notifier; > + bool found = false; > + int ret; > + > + mutex_lock(&isolation_lock); > + list_for_each_entry(notifier, &isolation_notifiers, list) { > + if (notifier->bus == bus) { > + found = true; > + break; > + } > + } > + > + if (!found) { > + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); > + if (!notifier) { > + mutex_unlock(&isolation_lock); > + return -ENOMEM; > + } > + notifier->bus = bus; > + notifier->nb.notifier_call = isolation_bus_notify; > + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->notifier); > + bus_register_notifier(bus, ¬ifier->nb); > + list_add(¬ifier->list, &isolation_notifiers); > + } > + > + ret = blocking_notifier_chain_register(¬ifier->notifier, nb); > + if (ret) { > + if (notifier->refcnt == 0) { > + bus_unregister_notifier(bus, ¬ifier->nb); > + list_del(¬ifier->list); > + kfree(notifier); > + } > + mutex_unlock(&isolation_lock); > + return ret; > + } > + notifier->refcnt++; > + > + mutex_unlock(&isolation_lock); > + > + bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify); > + > + return 0; > +} So, somewhere, I think we need a fallback path, but I'm not sure exactly where. If an isolation provider doesn't explicitly put a device into a group, the device should go into the group of its parent bridge. This covers the case of a bus with IOMMU which has below it a bridge to a different type of DMA capable bus (which the IOMMU isn't explicitly aware of). DMAs from devices on the subordinate bus can be translated by the top-level IOMMU (assuming it sees them as coming from the bridge), but they all need to be treated as one group. > +/* > + * Unregister... 
> + */ > +int isolation_unregister_notifier(struct bus_type *bus, > + struct notifier_block *nb) > +{ > + struct isolation_notifier *notifier; > + bool found = false; > + int ret; > + > + mutex_lock(&isolation_lock); > + list_for_each_entry(notifier, &isolation_notifiers, list) { > + if (notifier->bus == bus) { > + found = true; > + break; > + } > + } > + > + if (!found) { > + mutex_unlock(&isolation_lock); > + return -ENODEV; > + } > + > + ret = blocking_notifier_chain_unregister(¬ifier->notifier, nb); > + if (ret) { > + mutex_unlock(&isolation_lock); > + return ret; > + } > + > + /* Free at nobody left in the notifier block */ > + if (--notifier->refcnt == 0) { > + bus_unregister_notifier(bus, ¬ifier->nb); > + list_del(¬ifier->list); > + kfree(notifier); > + } > + > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * Set iommu_ops on an isolation group. Hopefully all DMA will eventually > + * be done through isolation groups, for now, we just provide another > + * interface to the iommu api. > + */ > +int isolation_group_set_iommu_ops(struct isolation_group *group, > + struct iommu_ops *iommu_ops) > +{ > + mutex_lock(&isolation_lock); > + > + if (group->iommu_ops) { > + mutex_unlock(&isolation_lock); > + return -EBUSY; > + } > + > + group->iommu_ops = iommu_ops; > + > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * Attach all the devices for a group to the specified iommu domain. 
> + */ > +static int __isolation_group_domain_attach_devs(struct iommu_domain *domain, > + struct list_head *devices) > +{ > + struct isolation_device *device; > + struct device *dev; > + int ret = 0; > + > + list_for_each_entry(device, devices, list) { > + dev = device->dev; > + > + ret = domain->ops->attach_dev(domain, dev); > + if (ret) { > + list_for_each_entry_continue_reverse(device, > + devices, list) { > + dev = device->dev; > + domain->ops->detach_dev(domain, dev); > + } > + break; > + } > + } > + > + return ret; > +} > + > +/* > + * And detach... > + */ > +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain, > + struct list_head *devices) > +{ > + struct isolation_device *device; > + struct device *dev; > + > + list_for_each_entry(device, devices, list) { > + dev = device->dev; > + > + domain->ops->detach_dev(domain, dev); > + } > +} > + > +/* > + * Initialize the iommu domain for an isolation group. This is to ease the > + * transition to using isolation groups and expected to be used by current > + * users of the iommu api for now. > + */ > +int isolation_group_init_iommu_domain(struct isolation_group *group) > +{ > + int ret; > + > + mutex_lock(&isolation_lock); > + > + if (!group->iommu_ops || list_empty(&group->devices)) { > + mutex_unlock(&isolation_lock); > + return -EINVAL; > + } > + > + if (group->iommu_domain) { > + mutex_unlock(&isolation_lock); > + return -EBUSY; > + } > + > + group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL); > + if (!group->iommu_domain) { > + mutex_unlock(&isolation_lock); > + return -ENOMEM; > + } > + > + group->iommu_domain->ops = group->iommu_ops; > + > + ret = group->iommu_ops->domain_init(group->iommu_domain); > + if (ret) { > + kfree(group->iommu_domain); > + group->iommu_domain = NULL; > + mutex_unlock(&isolation_lock); > + return ret; > + } > + > + /* Automatically attach all the devices in the group. 
*/ > + ret = __isolation_group_domain_attach_devs(group->iommu_domain, > + &group->devices); > + if (ret) { > + group->iommu_ops->domain_destroy(group->iommu_domain); > + kfree(group->iommu_domain); > + group->iommu_domain = NULL; > + mutex_unlock(&isolation_lock); > + return ret; > + } > + > + mutex_unlock(&isolation_lock); > + return 0; > +} > + > +/* > + * And free... > + * Note just below, we add the ability to add another group to an iommu > + * domain, so we don't always destroy and free the domain here. > + */ > +void isolation_group_free_iommu_domain(struct isolation_group *group) > +{ > + struct isolation_group *tmp; > + bool inuse = false; > + > + if (!group->iommu_domain || !group->iommu_ops) > + return; > + > + mutex_lock(&isolation_lock); > + > + __isolation_group_domain_detach_devs(group->iommu_domain, > + &group->devices); > + > + list_for_each_entry(tmp, &isolation_groups, list) { > + if (tmp == group) > + continue; > + if (tmp->iommu_domain == group->iommu_domain) { > + inuse = true; > + break; > + } > + } > + > + if (!inuse) { > + group->iommu_ops->domain_destroy(group->iommu_domain); > + kfree(group->iommu_domain); > + } > + > + group->iommu_domain = NULL; > + > + mutex_unlock(&isolation_lock); > +} > + > +/* > + * This handles the VFIO "merge" interface, allowing us to manage multiple > + * isolation groups with a single domain. We just rely on attach_dev to > + * tell us whether this is ok. 
> + */ > +int isolation_group_iommu_domain_add_group(struct isolation_group *groupa, > + struct isolation_group *groupb) > +{ > + int ret; > + > + if (!groupa->iommu_domain || > + groupb->iommu_domain || list_empty(&groupb->devices)) > + return -EINVAL; > + > + if (groupa->iommu_ops != groupb->iommu_ops) > + return -EIO; > + > + mutex_lock(&isolation_lock); > + > + ret = __isolation_group_domain_attach_devs(groupa->iommu_domain, > + &groupb->devices); > + if (ret) { > + mutex_unlock(&isolation_lock); > + return ret; > + } > + > + groupb->iommu_domain = groupa->iommu_domain; > + > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * XXX page mapping/unmapping and more iommu api wrappers > + */ > + > +/* > + * Do we have a group manager? Group managers restrict what drivers are > + * allowed to attach to devices. A group is "locked" when all of the devices > + * for a group belong to group manager drivers (or no driver at all). At > + * that point, the group manager can initialize the iommu. vfio is an example > + * of a group manager and vfio-pci is an example of a driver that a group > + * manager may add as a "managed" driver. Note that we don't gate iommu > + * manipulation on being managed because we want to use it for regular DMA > + * at some point as well. > + */ > +bool isolation_group_managed(struct isolation_group *group) > +{ > + return group->manager != NULL; > +} > + > +/* > + * Initialize the group manager struct. At this point the isolation group > + * becomes "managed". > + */ This assumes a separate manager struct for each group. I would have thought the VFIO merge case would be more obviously represented by having a single manager struct for all the groups in the VFIO instance (== iommu domain). 
> +int isolation_group_init_manager(struct isolation_group *group) > +{ > + mutex_lock(&isolation_lock); > + > + if (group->manager) { > + mutex_unlock(&isolation_lock); > + return -EBUSY; > + } > + > + group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL); > + if (!group->manager) { > + mutex_unlock(&isolation_lock); > + return -ENOMEM; > + } > + > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * And free... cleanup registered managed drivers too. > + */ > +void isolation_group_free_manager(struct isolation_group *group) > +{ > + struct isolation_manager_driver *driver, *tmp; > + > + if (!group->manager) > + return; > + > + mutex_lock(&isolation_lock); > + > + list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) { > + list_del(&driver->list); > + kfree(driver); > + } > + > + kfree(group->manager); > + group->manager = NULL; > + mutex_unlock(&isolation_lock); > +} > + > +/* > + * Add a managed driver. Drivers added here are the only ones that will > + * be allowed to attach to a managed isolation group. > + */ > +int isolation_group_manager_add_driver(struct isolation_group *group, > + struct device_driver *drv) > +{ > + struct isolation_manager_driver *driver; > + > + if (!group->manager) > + return -EINVAL; > + > + driver = kzalloc(sizeof(*driver), GFP_KERNEL); > + if (!driver) > + return -ENOMEM; > + > + driver->drv = drv; > + > + mutex_lock(&isolation_lock); > + > + list_add(&driver->list, &group->manager->drivers); > + > + mutex_unlock(&isolation_lock); > + > + return 0; > +} > + > +/* > + * And remove a driver. Don't really expect to need this much. 
> + */ > +void isolation_group_manager_del_driver(struct isolation_group *group, > + struct device_driver *drv) > +{ > + struct isolation_manager_driver *driver; > + > + if (!group->manager) > + return; > + > + mutex_lock(&isolation_lock); > + > + list_for_each_entry(driver, &group->manager->drivers, list) { > + if (driver->drv == drv) { > + list_del(&driver->list); > + kfree(driver); > + break; > + } > + } > + > + mutex_unlock(&isolation_lock); > +} > + > +/* > + * Test whether a driver is a "managed" driver for the group. This allows > + * us to preempt normal driver matching and only let our drivers in. > + */ > +bool isolation_group_manager_driver(struct isolation_group *group, > + struct device_driver *drv) > +{ > + struct isolation_manager_driver *driver; > + bool found = false; > + > + if (!group->manager) > + return found; > + > + mutex_lock(&isolation_lock); > + > + list_for_each_entry(driver, &group->manager->drivers, list) { > + if (driver->drv == drv) { > + found = true; > + break; > + } > + } > + > + mutex_unlock(&isolation_lock); > + > + return found; > +} > + > +/* > + * Does the group manager have control of all of the devices in the group? > + * We consider driver-less devices to be ok here as they don't do DMA and > + * don't present any interfaces to subvert the rest of the group. > + */ > +bool isolation_group_manager_locked(struct isolation_group *group) > +{ > + struct isolation_device *device; > + struct isolation_manager_driver *driver; > + bool found, locked = true; > + > + if (!group->manager) > + return false; > + > + mutex_lock(&isolation_lock); > + > + list_for_each_entry(device, &group->devices, list) { > + found = false; > + > + if (!device->dev->driver) > + continue; > + You could simplify this using isolation_group_manager_driver(), couldn't you? 
> + list_for_each_entry(driver, &group->manager->drivers, list) { > + if (device->dev->driver == driver->drv) { > + found = true; > + break; > + } > + } > + > + if (!found) { > + locked = false; > + break; > + } > + } > + > + mutex_unlock(&isolation_lock); > + > + return locked; > +} > + > +static int __init isolation_init(void) > +{ > + isolation_kset = kset_create_and_add("isolation", NULL, NULL); > + > + WARN(!isolation_kset, "Failed to initialize isolation group kset\n"); > + > + return isolation_kset ? 0 : -1; I'd be tempted to just BUG() here if you can't add the kset - I can't see any reason it would fail unless you're so short of RAM that you have bigger problems. Making this a fatal fail would save having to double check if the kset is around in the later paths. > +} > +subsys_initcall(isolation_init); > diff --git a/include/linux/device.h b/include/linux/device.h > index b63fb39..5805c56 100644 > --- a/include/linux/device.h > +++ b/include/linux/device.h > @@ -663,6 +663,10 @@ struct device { > > struct device_dma_parameters *dma_parms; > > +#ifdef CONFIG_ISOLATION_GROUPS > + struct isolation_group *isolation_group; > +#endif > + > struct list_head dma_pools; /* dma pools (if dma'ble) */ > > struct dma_coherent_mem *dma_mem; /* internal for coherent mem > diff --git a/include/linux/isolation.h b/include/linux/isolation.h > new file mode 100644 > index 0000000..1d87caf > --- /dev/null > +++ b/include/linux/isolation.h > @@ -0,0 +1,138 @@ > +/* > + * Isolation group interface > + * > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. > + * Author: Alex Williamson <alex.williamson@redhat.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation. 
> + * > + */ > + > +#ifndef _LINUX_ISOLATION_H > +#define _LINUX_ISOLATION_H > + > +#define ISOLATION_NOTIFY_ADD_DEVICE 1 > +#define ISOLATION_NOTIFY_DEL_DEVICE 2 > + > +#ifdef CONFIG_ISOLATION_GROUPS > + > +extern struct isolation_group *isolation_create_group(void); > +extern int isolation_free_group(struct isolation_group *group); > +extern int isolation_group_add_dev(struct isolation_group *group, > + struct device *dev); > +extern int isolation_group_del_dev(struct device *dev); > +extern int isolation_register_notifier(struct bus_type *bus, > + struct notifier_block *nb); > +extern int isolation_unregister_notifier(struct bus_type *bus, > + struct notifier_block *nb); > +extern int isolation_group_set_iommu_ops(struct isolation_group *group, > + struct iommu_ops *iommu_ops); > +extern int isolation_group_init_iommu_domain(struct isolation_group *group); > +extern void isolation_group_free_iommu_domain(struct isolation_group *group); > +extern int isolation_group_iommu_domain_add_group( > + struct isolation_group *groupa, struct isolation_group *groupb); > +extern bool isolation_group_managed(struct isolation_group *group); > +extern int isolation_group_init_manager(struct isolation_group *group); > +extern void isolation_group_free_manager(struct isolation_group *group); > +extern int isolation_group_manager_add_driver(struct isolation_group *group, > + struct device_driver *drv); > +extern void isolation_group_manager_del_driver(struct isolation_group *group, > + struct device_driver *drv); > +extern bool isolation_group_manager_driver(struct isolation_group *group, > + struct device_driver *drv); > +extern bool isolation_group_manager_locked(struct isolation_group *group); > + > +#define to_isolation_group(dev) ((dev)->isolation_group) > + > +#else /* CONFIG_ISOLATION_GROUPS */ > + > +static inline struct isolation_group *isolation_create_group(void) > +{ > + return NULL; > +} > + > +static inline int isolation_free_group(struct isolation_group *group) > +{ 
> + return 0; > +} > + > +static inline int isolation_group_add_dev(struct isolation_group *group, > + struct device *dev) > +{ > + return 0; > +} > + > +static inline int isolation_group_del_dev(struct device *dev) > +{ > + return 0; > +} > + > +static int isolation_register_notifier(struct bus_type *bus, > + struct notifier_block *nb) > +{ > + return 0; > +} > + > +static int isolation_unregister_notifier(struct bus_type *bus, > + struct notifier_block *nb) > +{ > + return 0; > +} > + > +static int isolation_group_set_iommu_ops(struct isolation_group *group, > + struct iommu_ops *iommu_ops) > +{ > + return -ENOSYS; > +} > + > +static int isolation_group_init_iommu_domain(struct isolation_group *group) > +{ > + return -ENOSYS; > +} > + > +static void isolation_group_free_iommu_domain(struct isolation_group *group) > +{ > +} > + > +static int isolation_group_iommu_domain_add_group( > + struct isolation_group *groupa, struct isolation_group *groupb) > +{ > + return -ENOSYS; > +} > + > +static int isolation_group_init_manager(struct isolation_group *group) > +{ > + return -ENOSYS; > +} > + > +static void isolation_group_free_manager(struct isolation_group *group) > +{ > +} > + > +static int isolation_group_manager_add_driver(struct isolation_group *group, > + struct device_driver *drv) > +{ > + return -ENOSYS; > +} > + > +static void isolation_group_manager_del_driver(struct isolation_group *group, > + struct device_driver *drv) > +{ > +} > + > +static bool isolation_group_manager_locked(struct isolation_group *group) > +{ > + return false; > +} > + > +#define to_isolation_group(dev) (NULL) > + > +#define isolation_group_managed(group) (false) > + > +#define isolation_group_manager_driver(group, drv) (false) > + > +#endif /* CONFIG_ISOLATION_GROUPS */ > + > +#endif /* _LINUX_ISOLATION_H */ >
On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > > --- > > > > drivers/base/Kconfig | 10 + > > drivers/base/Makefile | 1 > > drivers/base/base.h | 5 > > drivers/base/isolation.c | 798 +++++++++++++++++++++++++++++++++++++++++++++ > > include/linux/device.h | 4 > > include/linux/isolation.h | 138 ++++++++ > > 6 files changed, 956 insertions(+), 0 deletions(-) > > create mode 100644 drivers/base/isolation.c > > create mode 100644 include/linux/isolation.h > > > > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig > > index 7be9f79..e98a5f3 100644 > > --- a/drivers/base/Kconfig > > +++ b/drivers/base/Kconfig > > @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER > > APIs extension; the file's descriptor can then be passed on to other > > driver. > > > > +config ISOLATION_GROUPS > > + bool "Enable Isolation Group API" > > + default n > > + depends on EXPERIMENTAL && IOMMU_API > > + help > > + This option enables grouping of devices into Isolation Groups > > + which may be used by other subsystems to perform quirks across > > + sets of devices as well as userspace drivers for guaranteeing > > + devices are isolated from the rest of the system. 
> > + > > endmenu > > diff --git a/drivers/base/Makefile b/drivers/base/Makefile > > index 610f999..047b5f9 100644 > > --- a/drivers/base/Makefile > > +++ b/drivers/base/Makefile > > @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES) += module.o > > endif > > obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o > > obj-$(CONFIG_REGMAP) += regmap/ > > +obj-$(CONFIG_ISOLATION_GROUPS) += isolation.o > > > > ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG > > > > diff --git a/drivers/base/base.h b/drivers/base/base.h > > index b858dfd..376758a 100644 > > --- a/drivers/base/base.h > > +++ b/drivers/base/base.h > > @@ -1,4 +1,5 @@ > > #include <linux/notifier.h> > > +#include <linux/isolation.h> > > > > /** > > * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. > > @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev); > > static inline int driver_match_device(struct device_driver *drv, > > struct device *dev) > > { > > + if (isolation_group_managed(to_isolation_group(dev)) && > > + !isolation_group_manager_driver(to_isolation_group(dev), drv)) > > + return 0; > > + > > return drv->bus->match ? drv->bus->match(dev, drv) : 1; > > } > > > > diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c > > new file mode 100644 > > index 0000000..c01365c > > --- /dev/null > > +++ b/drivers/base/isolation.c > > @@ -0,0 +1,798 @@ > > +/* > > + * Isolation group interface > > + * > > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. > > + * Author: Alex Williamson <alex.williamson@redhat.com> > > + * > > + * This program is free software; you can redistribute it and/or modify > > + * it under the terms of the GNU General Public License version 2 as > > + * published by the Free Software Foundation. 
> > + * > > + */ > > + > > +#include <linux/device.h> > > +#include <linux/iommu.h> > > +#include <linux/isolation.h> > > +#include <linux/list.h> > > +#include <linux/mutex.h> > > +#include <linux/notifier.h> > > +#include <linux/slab.h> > > +#include <linux/uuid.h> > > + > > +static struct kset *isolation_kset; > > +/* XXX add more complete locking, maybe rcu */ > > +static DEFINE_MUTEX(isolation_lock); > > +static LIST_HEAD(isolation_groups); > > +static LIST_HEAD(isolation_notifiers); > > + > > +/* Keep these private */ > > +struct isolation_manager_driver { > > + struct device_driver *drv; > > + struct list_head list; > > +}; > > + > > +struct isolation_manager { > > + struct list_head drivers; > > + /* Likely need managers to register some callbacks */ > > +}; > > + > > +struct isolation_group { > > + struct list_head list; > > + struct list_head devices; > > + struct kobject kobj; > > + struct kobject *devices_kobj; > > + struct iommu_domain *iommu_domain; > > + struct iommu_ops *iommu_ops; > > + struct isolation_manager *manager; > > + uuid_le uuid; > > +}; > > + > > +struct isolation_device { > > + struct device *dev; > > + struct list_head list; > > +}; > > + > > +struct isolation_notifier { > > + struct bus_type *bus; > > + struct notifier_block nb; > > + struct blocking_notifier_head notifier; > > + struct list_head list; > > + int refcnt; > > +}; > > + > > +struct iso_attribute { > > + struct attribute attr; > > + ssize_t (*show)(struct isolation_group *group, char *buf); > > + ssize_t (*store)(struct isolation_group *group, > > + const char *buf, size_t count); > > +}; > > + > > +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr) > > +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj) > > + > > +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr, > > + char *buf) > > +{ > > + struct iso_attribute *iso_attr = to_iso_attr(attr); > > + struct isolation_group *group = 
to_iso_group(kobj); > > + ssize_t ret = -EIO; > > + > > + if (iso_attr->show) > > + ret = iso_attr->show(group, buf); > > + return ret; > > +} > > + > > +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr, > > + const char *buf, size_t count) > > +{ > > + struct iso_attribute *iso_attr = to_iso_attr(attr); > > + struct isolation_group *group = to_iso_group(kobj); > > + ssize_t ret = -EIO; > > + > > + if (iso_attr->store) > > + ret = iso_attr->store(group, buf, count); > > + return ret; > > +} > > + > > +static void iso_release(struct kobject *kobj) > > +{ > > + struct isolation_group *group = to_iso_group(kobj); > > + kfree(group); > > +} > > + > > +static const struct sysfs_ops iso_sysfs_ops = { > > + .show = iso_attr_show, > > + .store = iso_attr_store, > > +}; > > + > > +static struct kobj_type iso_ktype = { > > + .sysfs_ops = &iso_sysfs_ops, > > + .release = iso_release, > > +}; > > + > > +/* > > + * Create an isolation group. Isolation group "providers" register a > > + * notifier for their bus(es) and call this to create a new isolation > > + * group. > > + */ > > +struct isolation_group *isolation_create_group(void) > > +{ > > + struct isolation_group *group, *tmp; > > + int ret; > > + > > + if (unlikely(!isolation_kset)) > > + return ERR_PTR(-ENODEV); > > + > > + group = kzalloc(sizeof(*group), GFP_KERNEL); > > + if (!group) > > + return ERR_PTR(-ENOMEM); > > + > > + mutex_lock(&isolation_lock); > > + > > + /* > > + * Use a UUID for group identification simply because it seems wrong > > + * to expose it as a kernel pointer (%p). Neither is user friendly. > > + * Mostly only expect userspace to do anything with this. > > + */ > > So, my plan was to require the isolation provider to give a unique > name - it can construct something that's actually meaningful (and with > luck, stable across boots). 
For Power we'd use the PE number, for > VT-d, I was thinking the PCI device address would do it (of the "base" > device for the non-separable bridge and broken multifunction cases). Naming always seems to end in "that's not unique" or "that doesn't apply to us", so I figured I'd just avoid the problem by using random numbers. We can allow providers to specify a name, but that still presents challenges to uniqueness and consistency if we intend to generically allow heterogeneous providers on a system. In the VFIO use case I expect userspace will get the name from readlink on the isolation_group sysfs entry and open a vfio provided device file of the same name. So in the end, it doesn't matter what the name is. > > +new_uuid: > > + uuid_le_gen(&group->uuid); > > + > > + /* Unlikely, but nothing prevents duplicates */ > > + list_for_each_entry(tmp, &isolation_groups, list) > > + if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0) > > + goto new_uuid; > > + > > + INIT_LIST_HEAD(&group->devices); > > + group->kobj.kset = isolation_kset; > > + > > + ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL, > > + "%pUl", &group->uuid); > > Um.. isn't this setting the kobject name to the address of the uuid > plus "Ul", not the content of the uuid? We have kernel extensions for %p, %pUl is a little endian UUID. It prints in the right format, but I'll have to check if I'm getting the actual UUID or a UUID-ified pointer address. > > + if (ret) { > > + kfree(group); > > + mutex_unlock(&isolation_lock); > > + return ERR_PTR(ret); > > + } > > + > > + /* Add a subdirectory to save links for devices withing the group. 
*/ > > + group->devices_kobj = kobject_create_and_add("devices", &group->kobj); > > + if (!group->devices_kobj) { > > + kobject_put(&group->kobj); > > + mutex_unlock(&isolation_lock); > > + return ERR_PTR(-ENOMEM); > > + } > > + > > + list_add(&group->list, &isolation_groups); > > + > > + mutex_unlock(&isolation_lock); > > + > > + return group; > > +} > > + > > +/* > > + * Free isolation group. XXX need to cleanup/reject based on manager status. > > + */ > > +int isolation_free_group(struct isolation_group *group) > > +{ > > + mutex_lock(&isolation_lock); > > + > > + if (!list_empty(&group->devices)) { > > + mutex_unlock(&isolation_lock); > > + return -EBUSY; > > + } > > + > > + list_del(&group->list); > > + kobject_put(group->devices_kobj); > > + kobject_put(&group->kobj); > > + > > + mutex_unlock(&isolation_lock); > > + return 0; > > +} > > + > > +/* > > + * Add a device to an isolation group. Isolation groups start empty and > > + * must be told about the devices they contain. Expect this to be called > > + * from isolation group providers via notifier. > > + */ > > Doesn't necessarily have to be from a notifier, particularly if the > provider is integrated into host bridge code. Sure, a provider could do this on its own if it wants. This just provides some infrastructure for a common path. Also note that this helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet to be seen whether that can reasonably be the case once isolation groups are added to streaming DMA paths. > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev) > > +{ > > + struct isolation_device *device; > > + int ret = 0; > > + > > + mutex_lock(&isolation_lock); > > + > > + if (dev->isolation_group) { > > + ret = -EBUSY; > > + goto out; > > This should probably be a BUG_ON() - the isolation provider shouldn't > be putting the same device into two different groups. Yeah, probably. 
> > + } > > + > > + device = kzalloc(sizeof(*device), GFP_KERNEL); > > + if (!device) { > > + ret = -ENOMEM; > > + goto out; > > + } > > + > > + device->dev = dev; > > + > > + /* Cross link the device in sysfs */ > > + ret = sysfs_create_link(&dev->kobj, &group->kobj, > > + "isolation_group"); > > + if (ret) { > > + kfree(device); > > + goto out; > > + } > > + > > + ret = sysfs_create_link(group->devices_kobj, &dev->kobj, > > + kobject_name(&dev->kobj)); > > So, a problem both here and in my version is what to name the device > links. Because they could be potentially be quite widely scattered, > I'm not sure that kobject_name() is guaranteed to be sufficiently > unique. Even if the name is not, we're pointing to a unique sysfs location. I expect the primary user of this will be when userspace translates a device to a group (via the isolation_group link below), then tries to walk all the devices in the group to determine affected host devices and manager driver status. So it would probably be traversing the link back rather than relying solely on the name of the link. Right? > > + if (ret) { > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > + kfree(device); > > + goto out; > > + } > > + > > + list_add(&device->list, &group->devices); > > + > > + dev->isolation_group = group; > > + > > + if (group->iommu_domain) { > > + ret = group->iommu_ops->attach_dev(group->iommu_domain, dev); > > + if (ret) { > > + /* XXX fail device? */ > > + } > > So, I presume the idea is that this is a stop-gap until iommu_ops is > converted to be in terms of groups rather than indivdual devices? Yes, I thought we could just back it by the iommu api for now so we can implement managers, but eventually domain_init and attach_dev would probably be a single callback in some kind of group aware iommu api. > > + } > > + > > + /* XXX signal manager? */ > > Uh, what did you have in mind here? Another notifier? Maybe just a callback struct registered when a manager is added to a group. 
On one hand, maybe it's not necessary because adding a device to a managed group already restricts driver matching, but on the other, the manager may want to be informed about new devices and try to actively bind a driver to it. For instance, if assigning an entire PE to a guest, which might include empty slots, would the manager want to be informed about the addition of a device so it could mirror the addition to the guest assigned the PE? > > +out: > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * Remove a device from an isolation group, likely because the device > > + * went away. > > + */ > > +int isolation_group_del_dev(struct device *dev) > > +{ > > + struct isolation_group *group = dev->isolation_group; > > + struct isolation_device *device; > > + > > + if (!group) > > + return -ENODEV; > > + > > + mutex_lock(&isolation_lock); > > + > > + list_for_each_entry(device, &group->devices, list) { > > + if (device->dev == dev) { > > + /* XXX signal manager? */ > > + > > + if (group->iommu_domain) > > + group->iommu_ops->detach_dev( > > + group->iommu_domain, dev); > > + list_del(&device->list); > > + kfree(device); > > + dev->isolation_group = NULL; > > + sysfs_remove_link(group->devices_kobj, > > + kobject_name(&dev->kobj)); > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > + break; > > + } > > + } > > + > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * Filter and call out to our own notifier chains > > + */ > > So, I haven't quite worked out what this isolation notifier > infrastructure gives you, as opposed to having isolation providers > directly register bus notifiers. For now, it nicely separates CONFIG_ISOLATION code so I don't litter intel-iommu with #ifdefs. It also embraces that devices are always being added and removed and supports that with very little change to existing code paths. 
> > +static int isolation_bus_notify(struct notifier_block *nb, > > + unsigned long action, void *data) > > +{ > > + struct isolation_notifier *notifier = > > + container_of(nb, struct isolation_notifier, nb); > > + struct device *dev = data; > > + > > + switch (action) { > > + case BUS_NOTIFY_ADD_DEVICE: > > + if (!dev->isolation_group) > > + blocking_notifier_call_chain(¬ifier->notifier, > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > + break; > > + case BUS_NOTIFY_DEL_DEVICE: > > + if (dev->isolation_group) > > + blocking_notifier_call_chain(¬ifier->notifier, > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > + break; > > + } > > + > > + return NOTIFY_DONE; > > +} > > + > > +/* > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add > > + * a new notifier. > > + */ > > +static int isolation_do_add_notify(struct device *dev, void *data) > > +{ > > + struct notifier_block *nb = data; > > + > > + if (!dev->isolation_group) > > + nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev); > > + > > + return 0; > > +} > > + > > +/* > > + * Register a notifier. This is for isolation group providers. It's > > + * possible we could have multiple isolation group providers for the same > > + * bus type, > > So the bit above doesn't seem to connect to the bit below. We can > indeed have multiple isolation providers for one bus type, but the > below seems to be covering the (also possible, but different) case of > one isolation provider covering multiple bus types. It covers both. If a provider covers multiple buses it will call this function once for each bus. Each new bus creates a new struct isolation_notifier and does a bus_register_notifier (using isolation_bus_notify). The provider notifier block is added to our own notifier call chain for that struct isolation_notifier. This way we only ever register a single notifier per bus, but support multiple providers for the bus. 
> > so we create a struct isolation_notifier per bus_type, each > > + * with a notifier block. Providers are therefore welcome to register > > + * as many buses as they can handle. > > + */ > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb) > > +{ > > + struct isolation_notifier *notifier; > > + bool found = false; > > + int ret; > > + > > + mutex_lock(&isolation_lock); > > + list_for_each_entry(notifier, &isolation_notifiers, list) { > > + if (notifier->bus == bus) { > > + found = true; > > + break; > > + } > > + } > > + > > + if (!found) { > > + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); > > + if (!notifier) { > > + mutex_unlock(&isolation_lock); > > + return -ENOMEM; > > + } > > + notifier->bus = bus; > > + notifier->nb.notifier_call = isolation_bus_notify; > > + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->notifier); > > + bus_register_notifier(bus, ¬ifier->nb); > > + list_add(¬ifier->list, &isolation_notifiers); > > + } > > + > > + ret = blocking_notifier_chain_register(¬ifier->notifier, nb); > > + if (ret) { > > + if (notifier->refcnt == 0) { > > + bus_unregister_notifier(bus, ¬ifier->nb); > > + list_del(¬ifier->list); > > + kfree(notifier); > > + } > > + mutex_unlock(&isolation_lock); > > + return ret; > > + } > > + notifier->refcnt++; > > + > > + mutex_unlock(&isolation_lock); > > + > > + bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify); > > + > > + return 0; > > +} > > So, somewhere, I think we need a fallback path, but I'm not sure > exactly where. If an isolation provider doesn't explicitly put a > device into a group, the device should go into the group of its parent > bridge. This covers the case of a bus with IOMMU which has below it a > bridge to a different type of DMA capable bus (which the IOMMU isn't > explicitly aware of). DMAs from devices on the subordinate bus can be > translated by the top-level IOMMU (assuming it sees them as coming > from the bridge), but they all need to be treated as one group. 
Why would the top level IOMMU provider not set the isolation group in this case. I agree there's a gap here, but it's fuzzy when and how it can occur and if it matters (devices without an isolation group can't be used by managers, and apparently don't make use of any of the services of a provider...). > > +/* > > + * Unregister... > > + */ > > +int isolation_unregister_notifier(struct bus_type *bus, > > + struct notifier_block *nb) > > +{ > > + struct isolation_notifier *notifier; > > + bool found = false; > > + int ret; > > + > > + mutex_lock(&isolation_lock); > > + list_for_each_entry(notifier, &isolation_notifiers, list) { > > + if (notifier->bus == bus) { > > + found = true; > > + break; > > + } > > + } > > + > > + if (!found) { > > + mutex_unlock(&isolation_lock); > > + return -ENODEV; > > + } > > + > > + ret = blocking_notifier_chain_unregister(¬ifier->notifier, nb); > > + if (ret) { > > + mutex_unlock(&isolation_lock); > > + return ret; > > + } > > + > > + /* Free at nobody left in the notifier block */ > > + if (--notifier->refcnt == 0) { > > + bus_unregister_notifier(bus, ¬ifier->nb); > > + list_del(¬ifier->list); > > + kfree(notifier); > > + } > > + > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * Set iommu_ops on an isolation group. Hopefully all DMA will eventually > > + * be done through isolation groups, for now, we just provide another > > + * interface to the iommu api. > > + */ > > +int isolation_group_set_iommu_ops(struct isolation_group *group, > > + struct iommu_ops *iommu_ops) > > +{ > > + mutex_lock(&isolation_lock); > > + > > + if (group->iommu_ops) { > > + mutex_unlock(&isolation_lock); > > + return -EBUSY; > > + } > > + > > + group->iommu_ops = iommu_ops; > > + > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * Attach all the devices for a group to the specified iommu domain. 
> > + */ > > +static int __isolation_group_domain_attach_devs(struct iommu_domain *domain, > > + struct list_head *devices) > > +{ > > + struct isolation_device *device; > > + struct device *dev; > > + int ret = 0; > > + > > + list_for_each_entry(device, devices, list) { > > + dev = device->dev; > > + > > + ret = domain->ops->attach_dev(domain, dev); > > + if (ret) { > > + list_for_each_entry_continue_reverse(device, > > + devices, list) { > > + dev = device->dev; > > + domain->ops->detach_dev(domain, dev); > > + } > > + break; > > + } > > + } > > + > > + return ret; > > +} > > + > > +/* > > + * And detach... > > + */ > > +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain, > > + struct list_head *devices) > > +{ > > + struct isolation_device *device; > > + struct device *dev; > > + > > + list_for_each_entry(device, devices, list) { > > + dev = device->dev; > > + > > + domain->ops->detach_dev(domain, dev); > > + } > > +} > > + > > +/* > > + * Initialize the iommu domain for an isolation group. This is to ease the > > + * transition to using isolation groups and expected to be used by current > > + * users of the iommu api for now. 
> > + */ > > +int isolation_group_init_iommu_domain(struct isolation_group *group) > > +{ > > + int ret; > > + > > + mutex_lock(&isolation_lock); > > + > > + if (!group->iommu_ops || list_empty(&group->devices)) { > > + mutex_unlock(&isolation_lock); > > + return -EINVAL; > > + } > > + > > + if (group->iommu_domain) { > > + mutex_unlock(&isolation_lock); > > + return -EBUSY; > > + } > > + > > + group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL); > > + if (!group->iommu_domain) { > > + mutex_unlock(&isolation_lock); > > + return -ENOMEM; > > + } > > + > > + group->iommu_domain->ops = group->iommu_ops; > > + > > + ret = group->iommu_ops->domain_init(group->iommu_domain); > > + if (ret) { > > + kfree(group->iommu_domain); > > + group->iommu_domain = NULL; > > + mutex_unlock(&isolation_lock); > > + return ret; > > + } > > + > > + /* Automatically attach all the devices in the group. */ > > + ret = __isolation_group_domain_attach_devs(group->iommu_domain, > > + &group->devices); > > + if (ret) { > > + group->iommu_ops->domain_destroy(group->iommu_domain); > > + kfree(group->iommu_domain); > > + group->iommu_domain = NULL; > > + mutex_unlock(&isolation_lock); > > + return ret; > > + } > > + > > + mutex_unlock(&isolation_lock); > > + return 0; > > +} > > + > > +/* > > + * And free... > > + * Note just below, we add the ability to add another group to an iommu > > + * domain, so we don't always destroy and free the domain here. 
> > + */ > > +void isolation_group_free_iommu_domain(struct isolation_group *group) > > +{ > > + struct isolation_group *tmp; > > + bool inuse = false; > > + > > + if (!group->iommu_domain || !group->iommu_ops) > > + return; > > + > > + mutex_lock(&isolation_lock); > > + > > + __isolation_group_domain_detach_devs(group->iommu_domain, > > + &group->devices); > > + > > + list_for_each_entry(tmp, &isolation_groups, list) { > > + if (tmp == group) > > + continue; > > + if (tmp->iommu_domain == group->iommu_domain) { > > + inuse = true; > > + break; > > + } > > + } > > + > > + if (!inuse) { > > + group->iommu_ops->domain_destroy(group->iommu_domain); > > + kfree(group->iommu_domain); > > + } > > + > > + group->iommu_domain = NULL; > > + > > + mutex_unlock(&isolation_lock); > > +} > > + > > +/* > > + * This handles the VFIO "merge" interface, allowing us to manage multiple > > + * isolation groups with a single domain. We just rely on attach_dev to > > + * tell us whether this is ok. > > + */ > > +int isolation_group_iommu_domain_add_group(struct isolation_group *groupa, > > + struct isolation_group *groupb) > > +{ > > + int ret; > > + > > + if (!groupa->iommu_domain || > > + groupb->iommu_domain || list_empty(&groupb->devices)) > > + return -EINVAL; > > + > > + if (groupa->iommu_ops != groupb->iommu_ops) > > + return -EIO; > > + > > + mutex_lock(&isolation_lock); > > + > > + ret = __isolation_group_domain_attach_devs(groupa->iommu_domain, > > + &groupb->devices); > > + if (ret) { > > + mutex_unlock(&isolation_lock); > > + return ret; > > + } > > + > > + groupb->iommu_domain = groupa->iommu_domain; > > + > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * XXX page mapping/unmapping and more iommu api wrappers > > + */ > > + > > +/* > > + * Do we have a group manager? Group managers restrict what drivers are > > + * allowed to attach to devices. 
A group is "locked" when all of the devices > > + * for a group belong to group manager drivers (or no driver at all). At > > + * that point, the group manager can initialize the iommu. vfio is an example > > + * of a group manager and vfio-pci is an exanple of a driver that a group > > + * manager may add as a "managed" driver. Note that we don't gate iommu > > + * manipulation on being managed because we want to use it for regular DMA > > + * at some point as well. > > + */ > > +bool isolation_group_managed(struct isolation_group *group) > > +{ > > + return group->manager != NULL; > > +} > > + > > +/* > > + * Initialize the group manager struct. At this point the isolation group > > + * becomes "managed". > > + */ > > This assumes a separate manager struct for each group. I would have > thought the VFIO merge case would be more obviously represented by > having a single manager struct for all the groups in the VFIO instance > (== iommu domain). Given the objections to merging groups, I thought perhaps it would be safer to explicitly make it pertain to the iommu domain here. My vision of managers and merging is still solidifying, but the manager init seems like a first step to claiming a group while attempting to share resources comes a while later. I don't see much overhead on either the grouping code or the manager code for keeping these separate. Do you? > > +int isolation_group_init_manager(struct isolation_group *group) > > +{ > > + mutex_lock(&isolation_lock); > > + > > + if (group->manager) { > > + mutex_unlock(&isolation_lock); > > + return -EBUSY; > > + } > > + > > + group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL); > > + if (!group->manager) { > > + mutex_unlock(&isolation_lock); > > + return -ENOMEM; > > + } > > + > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * And free... cleanup registerd managed drivers too. 
> > + */ > > +void isolation_group_free_manager(struct isolation_group *group) > > +{ > > + struct isolation_manager_driver *driver, *tmp; > > + > > + if (!group->manager) > > + return; > > + > > + mutex_lock(&isolation_lock); > > + > > + list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) { > > + list_del(&driver->list); > > + kfree(driver); > > + } > > + > > + kfree(group->manager); > > + group->manager = NULL; > > + mutex_unlock(&isolation_lock); > > +} > > + > > +/* > > + * Add a managed driver. Drivers added here are the only ones that will > > + * be allowed to attach to a managed isolation group. > > + */ > > +int isolation_group_manager_add_driver(struct isolation_group *group, > > + struct device_driver *drv) > > +{ > > + struct isolation_manager_driver *driver; > > + > > + if (!group->manager) > > + return -EINVAL; > > + > > + driver = kzalloc(sizeof(*driver), GFP_KERNEL); > > + if (!driver) > > + return -ENOMEM; > > + > > + driver->drv = drv; > > + > > + mutex_lock(&isolation_lock); > > + > > + list_add(&driver->list, &group->manager->drivers); > > + > > + mutex_unlock(&isolation_lock); > > + > > + return 0; > > +} > > + > > +/* > > + * And remove a driver. Don't really expect to need this much. > > + */ > > +void isolation_group_manager_del_driver(struct isolation_group *group, > > + struct device_driver *drv) > > +{ > > + struct isolation_manager_driver *driver; > > + > > + if (!group->manager) > > + return; > > + > > + mutex_lock(&isolation_lock); > > + > > + list_for_each_entry(driver, &group->manager->drivers, list) { > > + if (driver->drv == drv) { > > + list_del(&driver->list); > > + kfree(driver); > > + break; > > + } > > + } > > + > > + mutex_unlock(&isolation_lock); > > +} > > + > > +/* > > + * Test whether a driver is a "managed" driver for the group. This allows > > + * is to preempt normal driver matching and only let our drivers in. 
> > + */ > > +bool isolation_group_manager_driver(struct isolation_group *group, > > + struct device_driver *drv) > > +{ > > + struct isolation_manager_driver *driver; > > + bool found = false; > > + > > + if (!group->manager) > > + return found; > > + > > + mutex_lock(&isolation_lock); > > + > > + list_for_each_entry(driver, &group->manager->drivers, list) { > > + if (driver->drv == drv) { > > + found = true; > > + break; > > + } > > + } > > + > > + mutex_unlock(&isolation_lock); > > + > > + return found; > > +} > > + > > +/* > > + * Does the group manager have control of all of the devices in the group? > > + * We consider driver-less devices to be ok here as they don't do DMA and > > + * don't present any interfaces to subvert the rest of the group. > > + */ > > +bool isolation_group_manager_locked(struct isolation_group *group) > > +{ > > + struct isolation_device *device; > > + struct isolation_manager_driver *driver; > > + bool found, locked = true; > > + > > + if (!group->manager) > > + return false; > > + > > + mutex_lock(&isolation_lock); > > + > > + list_for_each_entry(device, &group->devices, list) { > > + found = false; > > + > > + if (!device->dev->driver) > > + continue; > > + > > You could simplify this using isolation_group_manager_driver(), > couldn't you? Yeah, probably. > > + list_for_each_entry(driver, &group->manager->drivers, list) { > > + if (device->dev->driver == driver->drv) { > > + found = true; > > + break; > > + } > > + } > > + > > + if (!found) { > > + locked = false; > > + break; > > + } > > + } > > + > > + mutex_unlock(&isolation_lock); > > + > > + return locked; > > +} > > + > > +static int __init isolation_init(void) > > +{ > > + isolation_kset = kset_create_and_add("isolation", NULL, NULL); > > + > > + WARN(!isolation_kset, "Failed to initialize isolation group kset\n"); > > + > > + return isolation_kset ? 
0 : -1; > > I'd be tempted to just BUG() here if you can't add the kset - I can't > see any reason it would fail unless you're so short of RAM that you > have bigger problems. Making this a fatal fail would save having to > double check if the kset is around in the later paths. Yep. Thanks for the comments, Alex
On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > Signed-off-by: Alex Williamson <alex.williamson@redhat.com> > > > --- > > > > > > drivers/base/Kconfig | 10 + > > > drivers/base/Makefile | 1 > > > drivers/base/base.h | 5 > > > drivers/base/isolation.c | 798 +++++++++++++++++++++++++++++++++++++++++++++ > > > include/linux/device.h | 4 > > > include/linux/isolation.h | 138 ++++++++ > > > 6 files changed, 956 insertions(+), 0 deletions(-) > > > create mode 100644 drivers/base/isolation.c > > > create mode 100644 include/linux/isolation.h > > > > > > diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig > > > index 7be9f79..e98a5f3 100644 > > > --- a/drivers/base/Kconfig > > > +++ b/drivers/base/Kconfig > > > @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER > > > APIs extension; the file's descriptor can then be passed on to other > > > driver. > > > > > > +config ISOLATION_GROUPS > > > + bool "Enable Isolation Group API" > > > + default n > > > + depends on EXPERIMENTAL && IOMMU_API > > > + help > > > + This option enables grouping of devices into Isolation Groups > > > + which may be used by other subsystems to perform quirks across > > > + sets of devices as well as userspace drivers for guaranteeing > > > + devices are isolated from the rest of the system. 
> > > + > > > endmenu > > > diff --git a/drivers/base/Makefile b/drivers/base/Makefile > > > index 610f999..047b5f9 100644 > > > --- a/drivers/base/Makefile > > > +++ b/drivers/base/Makefile > > > @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES) += module.o > > > endif > > > obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o > > > obj-$(CONFIG_REGMAP) += regmap/ > > > +obj-$(CONFIG_ISOLATION_GROUPS) += isolation.o > > > > > > ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG > > > > > > diff --git a/drivers/base/base.h b/drivers/base/base.h > > > index b858dfd..376758a 100644 > > > --- a/drivers/base/base.h > > > +++ b/drivers/base/base.h > > > @@ -1,4 +1,5 @@ > > > #include <linux/notifier.h> > > > +#include <linux/isolation.h> > > > > > > /** > > > * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. > > > @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev); > > > static inline int driver_match_device(struct device_driver *drv, > > > struct device *dev) > > > { > > > + if (isolation_group_managed(to_isolation_group(dev)) && > > > + !isolation_group_manager_driver(to_isolation_group(dev), drv)) > > > + return 0; > > > + > > > return drv->bus->match ? drv->bus->match(dev, drv) : 1; > > > } > > > > > > diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c > > > new file mode 100644 > > > index 0000000..c01365c > > > --- /dev/null > > > +++ b/drivers/base/isolation.c > > > @@ -0,0 +1,798 @@ > > > +/* > > > + * Isolation group interface > > > + * > > > + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. > > > + * Author: Alex Williamson <alex.williamson@redhat.com> > > > + * > > > + * This program is free software; you can redistribute it and/or modify > > > + * it under the terms of the GNU General Public License version 2 as > > > + * published by the Free Software Foundation. 
> > > + * > > > + */ > > > + > > > +#include <linux/device.h> > > > +#include <linux/iommu.h> > > > +#include <linux/isolation.h> > > > +#include <linux/list.h> > > > +#include <linux/mutex.h> > > > +#include <linux/notifier.h> > > > +#include <linux/slab.h> > > > +#include <linux/uuid.h> > > > + > > > +static struct kset *isolation_kset; > > > +/* XXX add more complete locking, maybe rcu */ > > > +static DEFINE_MUTEX(isolation_lock); > > > +static LIST_HEAD(isolation_groups); > > > +static LIST_HEAD(isolation_notifiers); > > > + > > > +/* Keep these private */ > > > +struct isolation_manager_driver { > > > + struct device_driver *drv; > > > + struct list_head list; > > > +}; > > > + > > > +struct isolation_manager { > > > + struct list_head drivers; > > > + /* Likely need managers to register some callbacks */ > > > +}; > > > + > > > +struct isolation_group { > > > + struct list_head list; > > > + struct list_head devices; > > > + struct kobject kobj; > > > + struct kobject *devices_kobj; > > > + struct iommu_domain *iommu_domain; > > > + struct iommu_ops *iommu_ops; > > > + struct isolation_manager *manager; > > > + uuid_le uuid; > > > +}; > > > + > > > +struct isolation_device { > > > + struct device *dev; > > > + struct list_head list; > > > +}; > > > + > > > +struct isolation_notifier { > > > + struct bus_type *bus; > > > + struct notifier_block nb; > > > + struct blocking_notifier_head notifier; > > > + struct list_head list; > > > + int refcnt; > > > +}; > > > + > > > +struct iso_attribute { > > > + struct attribute attr; > > > + ssize_t (*show)(struct isolation_group *group, char *buf); > > > + ssize_t (*store)(struct isolation_group *group, > > > + const char *buf, size_t count); > > > +}; > > > + > > > +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr) > > > +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj) > > > + > > > +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr, > > > 
+ char *buf) > > > +{ > > > + struct iso_attribute *iso_attr = to_iso_attr(attr); > > > + struct isolation_group *group = to_iso_group(kobj); > > > + ssize_t ret = -EIO; > > > + > > > + if (iso_attr->show) > > > + ret = iso_attr->show(group, buf); > > > + return ret; > > > +} > > > + > > > +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr, > > > + const char *buf, size_t count) > > > +{ > > > + struct iso_attribute *iso_attr = to_iso_attr(attr); > > > + struct isolation_group *group = to_iso_group(kobj); > > > + ssize_t ret = -EIO; > > > + > > > + if (iso_attr->store) > > > + ret = iso_attr->store(group, buf, count); > > > + return ret; > > > +} > > > + > > > +static void iso_release(struct kobject *kobj) > > > +{ > > > + struct isolation_group *group = to_iso_group(kobj); > > > + kfree(group); > > > +} > > > + > > > +static const struct sysfs_ops iso_sysfs_ops = { > > > + .show = iso_attr_show, > > > + .store = iso_attr_store, > > > +}; > > > + > > > +static struct kobj_type iso_ktype = { > > > + .sysfs_ops = &iso_sysfs_ops, > > > + .release = iso_release, > > > +}; > > > + > > > +/* > > > + * Create an isolation group. Isolation group "providers" register a > > > + * notifier for their bus(es) and call this to create a new isolation > > > + * group. > > > + */ > > > +struct isolation_group *isolation_create_group(void) > > > +{ > > > + struct isolation_group *group, *tmp; > > > + int ret; > > > + > > > + if (unlikely(!isolation_kset)) > > > + return ERR_PTR(-ENODEV); > > > + > > > + group = kzalloc(sizeof(*group), GFP_KERNEL); > > > + if (!group) > > > + return ERR_PTR(-ENOMEM); > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + /* > > > + * Use a UUID for group identification simply because it seems wrong > > > + * to expose it as a kernel pointer (%p). Neither is user friendly. > > > + * Mostly only expect userspace to do anything with this. 
> > > + */ > > > > So, my plan was to require the isolation provider to give a unique > > name - it can construct something that's actually meaningful (and with > > luck, stable across boots). For Power we'd use the PE number, for > > VT-d, I was thinking the PCI device address would do it (of the "base" > > device for the non-separable bridge and broken multifunction cases). > > Naming always seems to end in "that's not unique" or "that doesn't apply > to us", so I figured I'd just avoid the problem by using random numbers. > We can allow providers to specify a name, but that still presents > challenges to uniqueness and consistency if we intend to generically > allow heterogeneous providers on a system. In the VFIO use case I > expect userspace will get the name from readlink on the isolation_group > sysfs entry and open a vfio provided device file of the same name. So > in the end, it doesn't matter what the name is. Hrm, maybe. > > > +new_uuid: > > > + uuid_le_gen(&group->uuid); > > > + > > > + /* Unlikely, but nothing prevents duplicates */ > > > + list_for_each_entry(tmp, &isolation_groups, list) > > > + if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0) > > > + goto new_uuid; > > > + > > > + INIT_LIST_HEAD(&group->devices); > > > + group->kobj.kset = isolation_kset; > > > + > > > + ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL, > > > + "%pUl", &group->uuid); > > > > Um.. isn't this setting the kobject name to the address of the uuid > > plus "Ul", not the content of the uuid? > > We have kernel extensions for %p, %pUl is a little endian UUID. It > prints in the right format, but I'll have to check if I'm getting the > actual UUID or a UUID-ified pointer address. Heh, and the modifiers bizarrely go after the 'p'. Wow, ok. > > > + if (ret) { > > > + kfree(group); > > > + mutex_unlock(&isolation_lock); > > > + return ERR_PTR(ret); > > > + } > > > + > > > + /* Add a subdirectory to save links for devices within the group.
*/ > > > + group->devices_kobj = kobject_create_and_add("devices", &group->kobj); > > > + if (!group->devices_kobj) { > > > + kobject_put(&group->kobj); > > > + mutex_unlock(&isolation_lock); > > > + return ERR_PTR(-ENOMEM); > > > + } > > > + > > > + list_add(&group->list, &isolation_groups); > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return group; > > > +} > > > + > > > +/* > > > + * Free isolation group. XXX need to cleanup/reject based on manager status. > > > + */ > > > +int isolation_free_group(struct isolation_group *group) > > > +{ > > > + mutex_lock(&isolation_lock); > > > + > > > + if (!list_empty(&group->devices)) { > > > + mutex_unlock(&isolation_lock); > > > + return -EBUSY; > > > + } > > > + > > > + list_del(&group->list); > > > + kobject_put(group->devices_kobj); > > > + kobject_put(&group->kobj); > > > + > > > + mutex_unlock(&isolation_lock); > > > + return 0; > > > +} > > > + > > > +/* > > > + * Add a device to an isolation group. Isolation groups start empty and > > > + * must be told about the devices they contain. Expect this to be called > > > + * from isolation group providers via notifier. > > > + */ > > > > Doesn't necessarily have to be from a notifier, particularly if the > > provider is integrated into host bridge code. > > Sure, a provider could do this on it's own if it wants. This just > provides some infrastructure for a common path. Also note that this > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > to be seen whether that can reasonably be the case once isolation groups > are added to streaming DMA paths. Right, but other than the #ifdef safety, which could be achieved more simply, I'm not seeing what benefit the infrastructure provides over directly calling the bus notifier function. 
The infrastructure groups the notifiers by bus type internally, but AFAICT exactly one bus notifier call would become exactly one isolation notifier call, and the notifier callback itself would be almost identical. > > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev) > > > +{ > > > + struct isolation_device *device; > > > + int ret = 0; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + if (dev->isolation_group) { > > > + ret = -EBUSY; > > > + goto out; > > > > This should probably be a BUG_ON() - the isolation provider shouldn't > > be putting the same device into two different groups. > > Yeah, probably. > > > > + } > > > + > > > + device = kzalloc(sizeof(*device), GFP_KERNEL); > > > + if (!device) { > > > + ret = -ENOMEM; > > > + goto out; > > > + } > > > + > > > + device->dev = dev; > > > + > > > + /* Cross link the device in sysfs */ > > > + ret = sysfs_create_link(&dev->kobj, &group->kobj, > > > + "isolation_group"); > > > + if (ret) { > > > + kfree(device); > > > + goto out; > > > + } > > > + > > > + ret = sysfs_create_link(group->devices_kobj, &dev->kobj, > > > + kobject_name(&dev->kobj)); > > > > So, a problem both here and in my version is what to name the device > > links. Because they could be potentially be quite widely scattered, > > I'm not sure that kobject_name() is guaranteed to be sufficiently > > unique. > > Even if the name is not, we're pointing to a unique sysfs location. I > expect the primary user of this will be when userspace translates a > device to a group (via the isolation_group link below), then tries to > walk all the devices in the group to determine effected host devices and > manager driver status. So it would probably be traversing the link back > rather than relying solely on the name of the link. Right? Yes, but if we get a duplicate kobject_name() we could get duplicate link names within the same directory so we're still in trouble. 
We could replace the names with just an index number. > > > + if (ret) { > > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > > + kfree(device); > > > + goto out; > > > + } > > > + > > > + list_add(&device->list, &group->devices); > > > + > > > + dev->isolation_group = group; > > > + > > > + if (group->iommu_domain) { > > > + ret = group->iommu_ops->attach_dev(group->iommu_domain, dev); > > > + if (ret) { > > > + /* XXX fail device? */ > > > + } > > > > So, I presume the idea is that this is a stop-gap until iommu_ops is > > converted to be in terms of groups rather than indivdual devices? > > Yes, I thought we could just back it by the iommu api for now so we can > implement managers, but eventually domain_init and attach_dev would > probably be a single callback in some kind of group aware iommu api. Makes sense. > > > + } > > > + > > > + /* XXX signal manager? */ > > > > Uh, what did you have in mind here? > > Another notifier? Maybe just a callback struct registered when a > manager is added to a group. On one hand, maybe it's not necessary > because adding a device to a managed group already restricts driver > matching, but on the other, the manager may want to be informed about > new devices and try to actively bind a driver to it. For instance, if > assigning an entire PE to a guest, which might include empty slots, > would the manager want to be informed about the addition of a device so > it could mirror the addition to the guest assigned the PE? Oh, right, I was misinterpreting the comment. Instead of reading it as "notify the isolation group manager that something is happening" I read it as "do something to manage when some process is given a Unix signal". Hence my complete confusion. > > > +out: > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * Remove a device from an isolation group, likely because the device > > > + * went away. 
> > > + */ > > > +int isolation_group_del_dev(struct device *dev) > > > +{ > > > + struct isolation_group *group = dev->isolation_group; > > > + struct isolation_device *device; > > > + > > > + if (!group) > > > + return -ENODEV; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + list_for_each_entry(device, &group->devices, list) { > > > + if (device->dev == dev) { > > > + /* XXX signal manager? */ > > > + > > > + if (group->iommu_domain) > > > + group->iommu_ops->detach_dev( > > > + group->iommu_domain, dev); > > > + list_del(&device->list); > > > + kfree(device); > > > + dev->isolation_group = NULL; > > > + sysfs_remove_link(group->devices_kobj, > > > + kobject_name(&dev->kobj)); > > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > > + break; > > > + } > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * Filter and call out to our own notifier chains > > > + */ > > > > So, I haven't quite worked out what this isolation notifier > > infrastructure gives you, as opposed to having isolation providers > > directly register bus notifiers. > > For now, it nicely separates CONFIG_ISOLATION code so I don't litter > intel-iommu with #ifdefs. It also embraces that devices are always > being added and removed and supports that with very little change to > existing code paths. Well, I get the first bit, but it's a long way from the minimum needed to de-#ifdef. I still haven't seen what it's giving the provider that wouldn't be just as simple with direct bus_notifier calls. 
> > > +static int isolation_bus_notify(struct notifier_block *nb, > > > + unsigned long action, void *data) > > > +{ > > > + struct isolation_notifier *notifier = > > > + container_of(nb, struct isolation_notifier, nb); > > > + struct device *dev = data; > > > + > > > + switch (action) { > > > + case BUS_NOTIFY_ADD_DEVICE: > > > + if (!dev->isolation_group) > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > + break; > > > + case BUS_NOTIFY_DEL_DEVICE: > > > + if (dev->isolation_group) > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > > + break; > > > + } > > > + > > > + return NOTIFY_DONE; > > > +} > > > + > > > +/* > > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add > > > + * a new notifier. > > > + */ > > > +static int isolation_do_add_notify(struct device *dev, void *data) > > > +{ > > > + struct notifier_block *nb = data; > > > + > > > + if (!dev->isolation_group) > > > + nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * Register a notifier. This is for isolation group providers. It's > > > + * possible we could have multiple isolation group providers for the same > > > + * bus type, > > > > So the bit above doesn't seem to connect to the bit below. We can > > indeed have multiple isolation providers for one bus type, but the > > below seems to be covering the (also possible, but different) case of > > one isolation provider covering multiple bus types. > > It covers both. If a provider covers multiple buses it will call this > function once for each bus. Each new bus creates a new struct > isolation_notifier and does a bus_register_notifier (using > isolation_bus_notify). The provider notifier block is added to our own > notifier call chain for that struct isolation_notifier. 
This way we > only ever register a single notifier per bus, but support multiple > providers for the bus. Um, right, but what would be the problem with registering multiple notifiers per bus. > > > so we create a struct isolation_notifier per bus_type, each > > > + * with a notifier block. Providers are therefore welcome to register > > > + * as many buses as they can handle. > > > + */ > > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb) > > > +{ > > > + struct isolation_notifier *notifier; > > > + bool found = false; > > > + int ret; > > > + > > > + mutex_lock(&isolation_lock); > > > + list_for_each_entry(notifier, &isolation_notifiers, list) { > > > + if (notifier->bus == bus) { > > > + found = true; > > > + break; > > > + } > > > + } > > > + > > > + if (!found) { > > > + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); > > > + if (!notifier) { > > > + mutex_unlock(&isolation_lock); > > > + return -ENOMEM; > > > + } > > > + notifier->bus = bus; > > > + notifier->nb.notifier_call = isolation_bus_notify; > > > + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->notifier); > > > + bus_register_notifier(bus, ¬ifier->nb); > > > + list_add(¬ifier->list, &isolation_notifiers); > > > + } > > > + > > > + ret = blocking_notifier_chain_register(¬ifier->notifier, nb); > > > + if (ret) { > > > + if (notifier->refcnt == 0) { > > > + bus_unregister_notifier(bus, ¬ifier->nb); > > > + list_del(¬ifier->list); > > > + kfree(notifier); > > > + } > > > + mutex_unlock(&isolation_lock); > > > + return ret; > > > + } > > > + notifier->refcnt++; > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify); > > > + > > > + return 0; > > > +} > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > exactly where. If an isolation provider doesn't explicitly put a > > device into a group, the device should go into the group of its parent > > bridge. 
This covers the case of a bus with IOMMU which has below it a > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > explicitly aware of). DMAs from devices on the subordinate bus can be > > translated by the top-level IOMMU (assuming it sees them as coming > > from the bridge), but they all need to be treated as one group. > > Why would the top level IOMMU provider not set the isolation group in > this case. Because it knows nothing about the subordinate bus. For example imagine a VT-d system, with a wacky PCI card into which you plug some other type of DMA capable device. The PCI card is acting as a bridge from PCI to this, let's call it FooBus. Obviously the VT-d code won't have a FooBus notifier, since it knows nothing about FooBus. But the FooBus devices still need to end up in the group of the PCI bridge device, since their DMA operations will appear as coming from the PCI bridge card to the IOMMU, and can be isolated from the rest of the system (but not each other) on that basis. > I agree there's a gap here, but it's fuzzy when and how it > can occur and if it matters (devices without an isolation group can't be > used by managers, and apparently don't make use of any of the services > of a provider...). > > > +/* > > > + * Unregister... 
> > > + */ > > > +int isolation_unregister_notifier(struct bus_type *bus, > > > + struct notifier_block *nb) > > > +{ > > > + struct isolation_notifier *notifier; > > > + bool found = false; > > > + int ret; > > > + > > > + mutex_lock(&isolation_lock); > > > + list_for_each_entry(notifier, &isolation_notifiers, list) { > > > + if (notifier->bus == bus) { > > > + found = true; > > > + break; > > > + } > > > + } > > > + > > > + if (!found) { > > > + mutex_unlock(&isolation_lock); > > > + return -ENODEV; > > > + } > > > + > > > + ret = blocking_notifier_chain_unregister(¬ifier->notifier, nb); > > > + if (ret) { > > > + mutex_unlock(&isolation_lock); > > > + return ret; > > > + } > > > + > > > + /* Free at nobody left in the notifier block */ > > > + if (--notifier->refcnt == 0) { > > > + bus_unregister_notifier(bus, ¬ifier->nb); > > > + list_del(¬ifier->list); > > > + kfree(notifier); > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * Set iommu_ops on an isolation group. Hopefully all DMA will eventually > > > + * be done through isolation groups, for now, we just provide another > > > + * interface to the iommu api. > > > + */ > > > +int isolation_group_set_iommu_ops(struct isolation_group *group, > > > + struct iommu_ops *iommu_ops) > > > +{ > > > + mutex_lock(&isolation_lock); > > > + > > > + if (group->iommu_ops) { > > > + mutex_unlock(&isolation_lock); > > > + return -EBUSY; > > > + } > > > + > > > + group->iommu_ops = iommu_ops; > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * Attach all the devices for a group to the specified iommu domain. 
> > > + */ > > > +static int __isolation_group_domain_attach_devs(struct iommu_domain *domain, > > > + struct list_head *devices) > > > +{ > > > + struct isolation_device *device; > > > + struct device *dev; > > > + int ret = 0; > > > + > > > + list_for_each_entry(device, devices, list) { > > > + dev = device->dev; > > > + > > > + ret = domain->ops->attach_dev(domain, dev); > > > + if (ret) { > > > + list_for_each_entry_continue_reverse(device, > > > + devices, list) { > > > + dev = device->dev; > > > + domain->ops->detach_dev(domain, dev); > > > + } > > > + break; > > > + } > > > + } > > > + > > > + return ret; > > > +} > > > + > > > +/* > > > + * And detach... > > > + */ > > > +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain, > > > + struct list_head *devices) > > > +{ > > > + struct isolation_device *device; > > > + struct device *dev; > > > + > > > + list_for_each_entry(device, devices, list) { > > > + dev = device->dev; > > > + > > > + domain->ops->detach_dev(domain, dev); > > > + } > > > +} > > > + > > > +/* > > > + * Initialize the iommu domain for an isolation group. This is to ease the > > > + * transition to using isolation groups and expected to be used by current > > > + * users of the iommu api for now. 
> > > + */ > > > +int isolation_group_init_iommu_domain(struct isolation_group *group) > > > +{ > > > + int ret; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + if (!group->iommu_ops || list_empty(&group->devices)) { > > > + mutex_unlock(&isolation_lock); > > > + return -EINVAL; > > > + } > > > + > > > + if (group->iommu_domain) { > > > + mutex_unlock(&isolation_lock); > > > + return -EBUSY; > > > + } > > > + > > > + group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL); > > > + if (!group->iommu_domain) { > > > + mutex_unlock(&isolation_lock); > > > + return -ENOMEM; > > > + } > > > + > > > + group->iommu_domain->ops = group->iommu_ops; > > > + > > > + ret = group->iommu_ops->domain_init(group->iommu_domain); > > > + if (ret) { > > > + kfree(group->iommu_domain); > > > + group->iommu_domain = NULL; > > > + mutex_unlock(&isolation_lock); > > > + return ret; > > > + } > > > + > > > + /* Automatically attach all the devices in the group. */ > > > + ret = __isolation_group_domain_attach_devs(group->iommu_domain, > > > + &group->devices); > > > + if (ret) { > > > + group->iommu_ops->domain_destroy(group->iommu_domain); > > > + kfree(group->iommu_domain); > > > + group->iommu_domain = NULL; > > > + mutex_unlock(&isolation_lock); > > > + return ret; > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > + return 0; > > > +} > > > + > > > +/* > > > + * And free... > > > + * Note just below, we add the ability to add another group to an iommu > > > + * domain, so we don't always destroy and free the domain here. 
> > > + */ > > > +void isolation_group_free_iommu_domain(struct isolation_group *group) > > > +{ > > > + struct isolation_group *tmp; > > > + bool inuse = false; > > > + > > > + if (!group->iommu_domain || !group->iommu_ops) > > > + return; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + __isolation_group_domain_detach_devs(group->iommu_domain, > > > + &group->devices); > > > + > > > + list_for_each_entry(tmp, &isolation_groups, list) { > > > + if (tmp == group) > > > + continue; > > > + if (tmp->iommu_domain == group->iommu_domain) { > > > + inuse = true; > > > + break; > > > + } > > > + } > > > + > > > + if (!inuse) { > > > + group->iommu_ops->domain_destroy(group->iommu_domain); > > > + kfree(group->iommu_domain); > > > + } > > > + > > > + group->iommu_domain = NULL; > > > + > > > + mutex_unlock(&isolation_lock); > > > +} > > > + > > > +/* > > > + * This handles the VFIO "merge" interface, allowing us to manage multiple > > > + * isolation groups with a single domain. We just rely on attach_dev to > > > + * tell us whether this is ok. > > > + */ > > > +int isolation_group_iommu_domain_add_group(struct isolation_group *groupa, > > > + struct isolation_group *groupb) > > > +{ > > > + int ret; > > > + > > > + if (!groupa->iommu_domain || > > > + groupb->iommu_domain || list_empty(&groupb->devices)) > > > + return -EINVAL; > > > + > > > + if (groupa->iommu_ops != groupb->iommu_ops) > > > + return -EIO; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + ret = __isolation_group_domain_attach_devs(groupa->iommu_domain, > > > + &groupb->devices); > > > + if (ret) { > > > + mutex_unlock(&isolation_lock); > > > + return ret; > > > + } > > > + > > > + groupb->iommu_domain = groupa->iommu_domain; > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * XXX page mapping/unmapping and more iommu api wrappers > > > + */ > > > + > > > +/* > > > + * Do we have a group manager? 
Group managers restrict what drivers are > > > + * allowed to attach to devices. A group is "locked" when all of the devices > > > + * for a group belong to group manager drivers (or no driver at all). At > > > + * that point, the group manager can initialize the iommu. vfio is an example > > > + * of a group manager and vfio-pci is an exanple of a driver that a group > > > + * manager may add as a "managed" driver. Note that we don't gate iommu > > > + * manipulation on being managed because we want to use it for regular DMA > > > + * at some point as well. > > > + */ > > > +bool isolation_group_managed(struct isolation_group *group) > > > +{ > > > + return group->manager != NULL; > > > +} > > > + > > > +/* > > > + * Initialize the group manager struct. At this point the isolation group > > > + * becomes "managed". > > > + */ > > > > This assumes a separate manager struct for each group. I would have > > thought the VFIO merge case would be more obviously represented by > > having a single manager struct for all the groups in the VFIO instance > > (== iommu domain). > > Given the objections to merging groups, I thought perhaps it would be > safer to explicitly make it pertain to the iommu domain here. My problem with merging is about the interface to it starting with just group handles. I have no problem with a scheme where you start an instance/context/domain and then attach multiple groups to it. > My vision > of managers and merging is still solidifying, but the manager init seems > like a first step to claiming a group while attempting to share > resources comes a while later. I don't see much overhead on either the > grouping code or the manager code for keeping these separate. Do you? Hrm, not sure yet. 
> > > +int isolation_group_init_manager(struct isolation_group *group) > > > +{ > > > + mutex_lock(&isolation_lock); > > > + > > > + if (group->manager) { > > > + mutex_unlock(&isolation_lock); > > > + return -EBUSY; > > > + } > > > + > > > + group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL); > > > + if (!group->manager) { > > > + mutex_unlock(&isolation_lock); > > > + return -ENOMEM; > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * And free... cleanup registered managed drivers too. > > > + */ > > > +void isolation_group_free_manager(struct isolation_group *group) > > > +{ > > > + struct isolation_manager_driver *driver, *tmp; > > > + > > > + if (!group->manager) > > > + return; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) { > > > + list_del(&driver->list); > > > + kfree(driver); > > > + } > > > + > > > + kfree(group->manager); > > > + group->manager = NULL; > > > + mutex_unlock(&isolation_lock); > > > +} > > > + > > > +/* > > > + * Add a managed driver. Drivers added here are the only ones that will > > > + * be allowed to attach to a managed isolation group. > > > + */ > > > +int isolation_group_manager_add_driver(struct isolation_group *group, > > > + struct device_driver *drv) > > > +{ > > > + struct isolation_manager_driver *driver; > > > + > > > + if (!group->manager) > > > + return -EINVAL; > > > + > > > + driver = kzalloc(sizeof(*driver), GFP_KERNEL); > > > + if (!driver) > > > + return -ENOMEM; > > > + > > > + driver->drv = drv; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + list_add(&driver->list, &group->manager->drivers); > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return 0; > > > +} > > > + > > > +/* > > > + * And remove a driver. Don't really expect to need this much.
> > > + */ > > > +void isolation_group_manager_del_driver(struct isolation_group *group, > > > + struct device_driver *drv) > > > +{ > > > + struct isolation_manager_driver *driver; > > > + > > > + if (!group->manager) > > > + return; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + list_for_each_entry(driver, &group->manager->drivers, list) { > > > + if (driver->drv == drv) { > > > + list_del(&driver->list); > > > + kfree(driver); > > > + break; > > > + } > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > +} > > > + > > > +/* > > > + * Test whether a driver is a "managed" driver for the group. This allows > > > + * us to preempt normal driver matching and only let our drivers in. > > > + */ > > > +bool isolation_group_manager_driver(struct isolation_group *group, > > > + struct device_driver *drv) > > > +{ > > > + struct isolation_manager_driver *driver; > > > + bool found = false; > > > + > > > + if (!group->manager) > > > + return found; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + list_for_each_entry(driver, &group->manager->drivers, list) { > > > + if (driver->drv == drv) { > > > + found = true; > > > + break; > > > + } > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return found; > > > +} > > > + > > > +/* > > > + * Does the group manager have control of all of the devices in the group? > > > + * We consider driver-less devices to be ok here as they don't do DMA and > > > + * don't present any interfaces to subvert the rest of the group.
> > > + */ > > > +bool isolation_group_manager_locked(struct isolation_group *group) > > > +{ > > > + struct isolation_device *device; > > > + struct isolation_manager_driver *driver; > > > + bool found, locked = true; > > > + > > > + if (!group->manager) > > > + return false; > > > + > > > + mutex_lock(&isolation_lock); > > > + > > > + list_for_each_entry(device, &group->devices, list) { > > > + found = false; > > > + > > > + if (!device->dev->driver) > > > + continue; > > > + > > > > You could simplify this using isolation_group_manager_driver(), > > couldn't you? > > Yeah, probably. > > > > + list_for_each_entry(driver, &group->manager->drivers, list) { > > > + if (device->dev->driver == driver->drv) { > > > + found = true; > > > + break; > > > + } > > > + } > > > + > > > + if (!found) { > > > + locked = false; > > > + break; > > > + } > > > + } > > > + > > > + mutex_unlock(&isolation_lock); > > > + > > > + return locked; > > > +} > > > + > > > +static int __init isolation_init(void) > > > +{ > > > + isolation_kset = kset_create_and_add("isolation", NULL, NULL); > > > + > > > + WARN(!isolation_kset, "Failed to initialize isolation group kset\n"); > > > + > > > + return isolation_kset ? 0 : -1; > > > > I'd be tempted to just BUG() here if you can't add the kset - I can't > > see any reason it would fail unless you're so short of RAM that you > > have bigger problems. Making this a fatal fail would save having to > > double check if the kset is around in the later paths. > > Yep. Thanks for the comments, > > Alex > >
On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > +/* > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > + * must be told about the devices they contain. Expect this to be called > > > > + * from isolation group providers via notifier. > > > > + */ > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > provider is integrated into host bridge code. > > > > Sure, a provider could do this on it's own if it wants. This just > > provides some infrastructure for a common path. Also note that this > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > to be seen whether that can reasonably be the case once isolation groups > > are added to streaming DMA paths. > > Right, but other than the #ifdef safety, which could be achieved more > simply, I'm not seeing what benefit the infrastructure provides over > directly calling the bus notifier function. The infrastructure groups > the notifiers by bus type internally, but AFAICT exactly one bus > notifier call would become exactly one isolation notifier call, and > the notifier callback itself would be almost identical. I guess I don't see this as a fundamental design point of the proposal, it's just a convenient way to initialize groups as a side-band addition until isolation groups become a more fundamental part of the iommu infrastructure. If you want to do that level of integration in your provider, do it and make the callbacks w/o the notifier. If nobody ends up using them, we'll remove them. Maybe it will just end up being a bootstrap. In the typical case, yes, one bus notifier is one isolation notifier. 
It does however also allow one bus notifier to become multiple isolation notifiers, and includes some filtering that would just be duplicated if every provider decided to implement their own bus notifier. > > > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev) > > > > +{ > > > > + struct isolation_device *device; > > > > + int ret = 0; > > > > + > > > > + mutex_lock(&isolation_lock); > > > > + > > > > + if (dev->isolation_group) { > > > > + ret = -EBUSY; > > > > + goto out; > > > > > > This should probably be a BUG_ON() - the isolation provider shouldn't > > > be putting the same device into two different groups. > > > > Yeah, probably. > > > > > > + } > > > > + > > > > + device = kzalloc(sizeof(*device), GFP_KERNEL); > > > > + if (!device) { > > > > + ret = -ENOMEM; > > > > + goto out; > > > > + } > > > > + > > > > + device->dev = dev; > > > > + > > > > + /* Cross link the device in sysfs */ > > > > + ret = sysfs_create_link(&dev->kobj, &group->kobj, > > > > + "isolation_group"); > > > > + if (ret) { > > > > + kfree(device); > > > > + goto out; > > > > + } > > > > + > > > > + ret = sysfs_create_link(group->devices_kobj, &dev->kobj, > > > > + kobject_name(&dev->kobj)); > > > > > > So, a problem both here and in my version is what to name the device > > > links. Because they could be potentially be quite widely scattered, > > > I'm not sure that kobject_name() is guaranteed to be sufficiently > > > unique. > > > > Even if the name is not, we're pointing to a unique sysfs location. I > > expect the primary user of this will be when userspace translates a > > device to a group (via the isolation_group link below), then tries to > > walk all the devices in the group to determine effected host devices and > > manager driver status. So it would probably be traversing the link back > > rather than relying solely on the name of the link. Right? 
> > Yes, but if we get a duplicate kobject_name() we could get duplicate > link names within the same directory so we're still in trouble. We > could replace the names with just an index number. This seems exceptionally unlikely, doesn't it? Maybe sysfs_create_link() will fail in such a case as we can append some identifier to the name? > > > > + if (ret) { > > > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > > > + kfree(device); > > > > + goto out; > > > > + } > > > > + > > > > + list_add(&device->list, &group->devices); > > > > + > > > > + dev->isolation_group = group; > > > > + > > > > + if (group->iommu_domain) { > > > > + ret = group->iommu_ops->attach_dev(group->iommu_domain, dev); > > > > + if (ret) { > > > > + /* XXX fail device? */ > > > > + } > > > > > > So, I presume the idea is that this is a stop-gap until iommu_ops is > > > converted to be in terms of groups rather than indivdual devices? > > > > Yes, I thought we could just back it by the iommu api for now so we can > > implement managers, but eventually domain_init and attach_dev would > > probably be a single callback in some kind of group aware iommu api. > > Makes sense. > > > > > + } > > > > + > > > > + /* XXX signal manager? */ > > > > > > Uh, what did you have in mind here? > > > > Another notifier? Maybe just a callback struct registered when a > > manager is added to a group. On one hand, maybe it's not necessary > > because adding a device to a managed group already restricts driver > > matching, but on the other, the manager may want to be informed about > > new devices and try to actively bind a driver to it. For instance, if > > assigning an entire PE to a guest, which might include empty slots, > > would the manager want to be informed about the addition of a device so > > it could mirror the addition to the guest assigned the PE? > > Oh, right, I was misinterpreting the comment. 
Instead of reading it > as "notify the isolation group manager that something is happening" I > read it as "do something to manage when some process is given a Unix > signal". Hence my complete confusion. > > > > > +out: > > > > + mutex_unlock(&isolation_lock); > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +/* > > > > + * Remove a device from an isolation group, likely because the device > > > > + * went away. > > > > + */ > > > > +int isolation_group_del_dev(struct device *dev) > > > > +{ > > > > + struct isolation_group *group = dev->isolation_group; > > > > + struct isolation_device *device; > > > > + > > > > + if (!group) > > > > + return -ENODEV; > > > > + > > > > + mutex_lock(&isolation_lock); > > > > + > > > > + list_for_each_entry(device, &group->devices, list) { > > > > + if (device->dev == dev) { > > > > + /* XXX signal manager? */ > > > > + > > > > + if (group->iommu_domain) > > > > + group->iommu_ops->detach_dev( > > > > + group->iommu_domain, dev); > > > > + list_del(&device->list); > > > > + kfree(device); > > > > + dev->isolation_group = NULL; > > > > + sysfs_remove_link(group->devices_kobj, > > > > + kobject_name(&dev->kobj)); > > > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > > > + break; > > > > + } > > > > + } > > > > + > > > > + mutex_unlock(&isolation_lock); > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +/* > > > > + * Filter and call out to our own notifier chains > > > > + */ > > > > > > So, I haven't quite worked out what this isolation notifier > > > infrastructure gives you, as opposed to having isolation providers > > > directly register bus notifiers. > > > > For now, it nicely separates CONFIG_ISOLATION code so I don't litter > > intel-iommu with #ifdefs. It also embraces that devices are always > > being added and removed and supports that with very little change to > > existing code paths. > > Well, I get the first bit, but it's a long way from the minimum needed > to de-#ifdef. 
I still haven't seen what it's giving the provider that > wouldn't be just as simple with direct bus_notifier calls. I think this is covered above. > > > > +static int isolation_bus_notify(struct notifier_block *nb, > > > > + unsigned long action, void *data) > > > > +{ > > > > + struct isolation_notifier *notifier = > > > > + container_of(nb, struct isolation_notifier, nb); > > > > + struct device *dev = data; > > > > + > > > > + switch (action) { > > > > + case BUS_NOTIFY_ADD_DEVICE: > > > > + if (!dev->isolation_group) > > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > > + break; > > > > + case BUS_NOTIFY_DEL_DEVICE: > > > > + if (dev->isolation_group) > > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > > > + break; > > > > + } > > > > + > > > > + return NOTIFY_DONE; > > > > +} > > > > + > > > > +/* > > > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add > > > > + * a new notifier. > > > > + */ > > > > +static int isolation_do_add_notify(struct device *dev, void *data) > > > > +{ > > > > + struct notifier_block *nb = data; > > > > + > > > > + if (!dev->isolation_group) > > > > + nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > > + > > > > + return 0; > > > > +} > > > > + > > > > +/* > > > > + * Register a notifier. This is for isolation group providers. It's > > > > + * possible we could have multiple isolation group providers for the same > > > > + * bus type, > > > > > > So the bit above doesn't seem to connect to the bit below. We can > > > indeed have multiple isolation providers for one bus type, but the > > > below seems to be covering the (also possible, but different) case of > > > one isolation provider covering multiple bus types. > > > > It covers both. If a provider covers multiple buses it will call this > > function once for each bus. 
Each new bus creates a new struct > > isolation_notifier and does a bus_register_notifier (using > > isolation_bus_notify). The provider notifier block is added to our own > > notifier call chain for that struct isolation_notifier. This way we > > only ever register a single notifier per bus, but support multiple > > providers for the bus. > > Um, right, but what would be the problem with registering multiple > notifiers per bus. It's a minor optimization, but this way we can stop when one of the providers decides it "owns" the device. As noted, I'm not stuck to this notifier and in the end, it doesn't gate creating groups using the isolation functions directly. > > > > so we create a struct isolation_notifier per bus_type, each > > > > + * with a notifier block. Providers are therefore welcome to register > > > > + * as many buses as they can handle. > > > > + */ > > > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb) > > > > +{ > > > > + struct isolation_notifier *notifier; > > > > + bool found = false; > > > > + int ret; > > > > + > > > > + mutex_lock(&isolation_lock); > > > > + list_for_each_entry(notifier, &isolation_notifiers, list) { > > > > + if (notifier->bus == bus) { > > > > + found = true; > > > > + break; > > > > + } > > > > + } > > > > + > > > > + if (!found) { > > > > + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); > > > > + if (!notifier) { > > > > + mutex_unlock(&isolation_lock); > > > > + return -ENOMEM; > > > > + } > > > > + notifier->bus = bus; > > > > + notifier->nb.notifier_call = isolation_bus_notify; > > > > + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->notifier); > > > > + bus_register_notifier(bus, ¬ifier->nb); > > > > + list_add(¬ifier->list, &isolation_notifiers); > > > > + } > > > > + > > > > + ret = blocking_notifier_chain_register(¬ifier->notifier, nb); > > > > + if (ret) { > > > > + if (notifier->refcnt == 0) { > > > > + bus_unregister_notifier(bus, ¬ifier->nb); > > > > + list_del(¬ifier->list); > > > > 
+ kfree(notifier); > > > > + } > > > > + mutex_unlock(&isolation_lock); > > > > + return ret; > > > > + } > > > > + notifier->refcnt++; > > > > + > > > > + mutex_unlock(&isolation_lock); > > > > + > > > > + bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify); > > > > + > > > > + return 0; > > > > +} > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > exactly where. If an isolation provider doesn't explicitly put a > > > device into a group, the device should go into the group of its parent > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > explicitly aware of). DMAs from devices on the subordinate bus can be > > > translated by the top-level IOMMU (assuming it sees them as coming > > > from the bridge), but they all need to be treated as one group. > > > > Why would the top level IOMMU provider not set the isolation group in > > this case. > > Because it knows nothing about the subordinate bus. For example > imagine a VT-d system, with a wacky PCI card into which you plug some > other type of DMA capable device. The PCI card is acting as a bridge > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > have a FooBus notifier, since it knows nothing about FooBus. But the > FooBus devices still need to end up in the group of the PCI bridge > device, since their DMA operations will appear as coming from the PCI > bridge card to the IOMMU, and can be isolated from the rest of the > system (but not each other) on that basis. I guess I was imagining that it's ok to have devices without an isolation group. When that happens we can traverse up the bus to find a higher level isolation group. It would probably make sense to have some kind of top-most isolation group that contains everything as it's an easy proof that if you own everything, you're isolated. 
Potentially though, wackyPCIcard can also register isolation groups for each of its FooBus devices if it's able to provide that capability. Thanks, Alex
On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote: > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > > +/* > > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > > + * must be told about the devices they contain. Expect this to be called > > > > > + * from isolation group providers via notifier. > > > > > + */ > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > > provider is integrated into host bridge code. > > > > > > Sure, a provider could do this on it's own if it wants. This just > > > provides some infrastructure for a common path. Also note that this > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > > to be seen whether that can reasonably be the case once isolation groups > > > are added to streaming DMA paths. > > > > Right, but other than the #ifdef safety, which could be achieved more > > simply, I'm not seeing what benefit the infrastructure provides over > > directly calling the bus notifier function. The infrastructure groups > > the notifiers by bus type internally, but AFAICT exactly one bus > > notifier call would become exactly one isolation notifier call, and > > the notifier callback itself would be almost identical. > > I guess I don't see this as a fundamental design point of the proposal, > it's just a convenient way to initialize groups as a side-band addition > until isolation groups become a more fundamental part of the iommu > infrastructure. If you want to do that level of integration in your > provider, do it and make the callbacks w/o the notifier. If nobody ends > up using them, we'll remove them. Maybe it will just end up being a > bootstrap. 
In the typical case, yes, one bus notifier is one isolation > notifier. It does however also allow one bus notifier to become > multiple isolation notifiers, and includes some filtering that would > just be duplicated if every provider decided to implement their own bus > notifier. Uh.. I didn't notice any filtering? That's why I'm asking. > > > > > +int isolation_group_add_dev(struct isolation_group *group, struct device *dev) > > > > > +{ > > > > > + struct isolation_device *device; > > > > > + int ret = 0; > > > > > + > > > > > + mutex_lock(&isolation_lock); > > > > > + > > > > > + if (dev->isolation_group) { > > > > > + ret = -EBUSY; > > > > > + goto out; > > > > > > > > This should probably be a BUG_ON() - the isolation provider shouldn't > > > > be putting the same device into two different groups. > > > > > > Yeah, probably. > > > > > > > > + } > > > > > + > > > > > + device = kzalloc(sizeof(*device), GFP_KERNEL); > > > > > + if (!device) { > > > > > + ret = -ENOMEM; > > > > > + goto out; > > > > > + } > > > > > + > > > > > + device->dev = dev; > > > > > + > > > > > + /* Cross link the device in sysfs */ > > > > > + ret = sysfs_create_link(&dev->kobj, &group->kobj, > > > > > + "isolation_group"); > > > > > + if (ret) { > > > > > + kfree(device); > > > > > + goto out; > > > > > + } > > > > > + > > > > > + ret = sysfs_create_link(group->devices_kobj, &dev->kobj, > > > > > + kobject_name(&dev->kobj)); > > > > > > > > So, a problem both here and in my version is what to name the device > > > > links. Because they could be potentially be quite widely scattered, > > > > I'm not sure that kobject_name() is guaranteed to be sufficiently > > > > unique. > > > > > > Even if the name is not, we're pointing to a unique sysfs location. 
I > > > expect the primary user of this will be when userspace translates a > > > device to a group (via the isolation_group link below), then tries to > > > walk all the devices in the group to determine effected host devices and > > > manager driver status. So it would probably be traversing the link back > > > rather than relying solely on the name of the link. Right? > > > > Yes, but if we get a duplicate kobject_name() we could get duplicate > > link names within the same directory so we're still in trouble. We > > could replace the names with just an index number. > > This seems exceptionally unlikely, doesn't it? Hmm, I don't think I have enough data to judge it particularly unlikely. Just that it's possible. > Maybe > sysfs_create_link() will fail in such a case as we can append some > identifier to the name? That could work. > > > > > + if (ret) { > > > > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > > > > + kfree(device); > > > > > + goto out; > > > > > + } > > > > > + > > > > > + list_add(&device->list, &group->devices); > > > > > + > > > > > + dev->isolation_group = group; > > > > > + > > > > > + if (group->iommu_domain) { > > > > > + ret = group->iommu_ops->attach_dev(group->iommu_domain, dev); > > > > > + if (ret) { > > > > > + /* XXX fail device? */ > > > > > + } > > > > > > > > So, I presume the idea is that this is a stop-gap until iommu_ops is > > > > converted to be in terms of groups rather than indivdual devices? > > > > > > Yes, I thought we could just back it by the iommu api for now so we can > > > implement managers, but eventually domain_init and attach_dev would > > > probably be a single callback in some kind of group aware iommu api. > > > > Makes sense. > > > > > > > + } > > > > > + > > > > > + /* XXX signal manager? */ > > > > > > > > Uh, what did you have in mind here? > > > > > > Another notifier? Maybe just a callback struct registered when a > > > manager is added to a group. 
On one hand, maybe it's not necessary > > > because adding a device to a managed group already restricts driver > > > matching, but on the other, the manager may want to be informed about > > > new devices and try to actively bind a driver to it. For instance, if > > > assigning an entire PE to a guest, which might include empty slots, > > > would the manager want to be informed about the addition of a device so > > > it could mirror the addition to the guest assigned the PE? > > > > Oh, right, I was misinterpreting the comment. Instead of reading it > > as "notify the isolation group manager that something is happening" I > > read it as "do something to manage when some process is given a Unix > > signal". Hence my complete confusion. > > > > > > > +out: > > > > > + mutex_unlock(&isolation_lock); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +/* > > > > > + * Remove a device from an isolation group, likely because the device > > > > > + * went away. > > > > > + */ > > > > > +int isolation_group_del_dev(struct device *dev) > > > > > +{ > > > > > + struct isolation_group *group = dev->isolation_group; > > > > > + struct isolation_device *device; > > > > > + > > > > > + if (!group) > > > > > + return -ENODEV; > > > > > + > > > > > + mutex_lock(&isolation_lock); > > > > > + > > > > > + list_for_each_entry(device, &group->devices, list) { > > > > > + if (device->dev == dev) { > > > > > + /* XXX signal manager? 
*/ > > > > > + > > > > > + if (group->iommu_domain) > > > > > + group->iommu_ops->detach_dev( > > > > > + group->iommu_domain, dev); > > > > > + list_del(&device->list); > > > > > + kfree(device); > > > > > + dev->isolation_group = NULL; > > > > > + sysfs_remove_link(group->devices_kobj, > > > > > + kobject_name(&dev->kobj)); > > > > > + sysfs_remove_link(&dev->kobj, "isolation_group"); > > > > > + break; > > > > > + } > > > > > + } > > > > > + > > > > > + mutex_unlock(&isolation_lock); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +/* > > > > > + * Filter and call out to our own notifier chains > > > > > + */ > > > > > > > > So, I haven't quite worked out what this isolation notifier > > > > infrastructure gives you, as opposed to having isolation providers > > > > directly register bus notifiers. > > > > > > For now, it nicely separates CONFIG_ISOLATION code so I don't litter > > > intel-iommu with #ifdefs. It also embraces that devices are always > > > being added and removed and supports that with very little change to > > > existing code paths. > > > > Well, I get the first bit, but it's a long way from the minimum needed > > to de-#ifdef. I still haven't seen what it's giving the provider that > > wouldn't be just as simple with direct bus_notifier calls. > > I think this is covered above. 
> > > > > > +static int isolation_bus_notify(struct notifier_block *nb, > > > > > + unsigned long action, void *data) > > > > > +{ > > > > > + struct isolation_notifier *notifier = > > > > > + container_of(nb, struct isolation_notifier, nb); > > > > > + struct device *dev = data; > > > > > + > > > > > + switch (action) { > > > > > + case BUS_NOTIFY_ADD_DEVICE: > > > > > + if (!dev->isolation_group) > > > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > > > + break; > > > > > + case BUS_NOTIFY_DEL_DEVICE: > > > > > + if (dev->isolation_group) > > > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > > > > + break; > > > > > + } > > > > > + > > > > > + return NOTIFY_DONE; > > > > > +} > > > > > + > > > > > +/* > > > > > + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add > > > > > + * a new notifier. > > > > > + */ > > > > > +static int isolation_do_add_notify(struct device *dev, void *data) > > > > > +{ > > > > > + struct notifier_block *nb = data; > > > > > + > > > > > + if (!dev->isolation_group) > > > > > + nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > > > + > > > > > + return 0; > > > > > +} > > > > > + > > > > > +/* > > > > > + * Register a notifier. This is for isolation group providers. It's > > > > > + * possible we could have multiple isolation group providers for the same > > > > > + * bus type, > > > > > > > > So the bit above doesn't seem to connect to the bit below. We can > > > > indeed have multiple isolation providers for one bus type, but the > > > > below seems to be covering the (also possible, but different) case of > > > > one isolation provider covering multiple bus types. > > > > > > It covers both. If a provider covers multiple buses it will call this > > > function once for each bus. 
Each new bus creates a new struct > > > isolation_notifier and does a bus_register_notifier (using > > > isolation_bus_notify). The provider notifier block is added to our own > > > notifier call chain for that struct isolation_notifier. This way we > > > only ever register a single notifier per bus, but support multiple > > > providers for the bus. > > > > Um, right, but what would be the problem with registering multiple > > notifiers per bus. > > It's a minor optimization, but this way we can stop when one of the > providers decides it "owns" the device. As noted, I'm not stuck to this > notifier and in the end, it doesn't gate creating groups using the > isolation functions directly. Hm, ok. > > > > > so we create a struct isolation_notifier per bus_type, each > > > > > + * with a notifier block. Providers are therefore welcome to register > > > > > + * as many buses as they can handle. > > > > > + */ > > > > > +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb) > > > > > +{ > > > > > + struct isolation_notifier *notifier; > > > > > + bool found = false; > > > > > + int ret; > > > > > + > > > > > + mutex_lock(&isolation_lock); > > > > > + list_for_each_entry(notifier, &isolation_notifiers, list) { > > > > > + if (notifier->bus == bus) { > > > > > + found = true; > > > > > + break; > > > > > + } > > > > > + } > > > > > + > > > > > + if (!found) { > > > > > + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); > > > > > + if (!notifier) { > > > > > + mutex_unlock(&isolation_lock); > > > > > + return -ENOMEM; > > > > > + } > > > > > + notifier->bus = bus; > > > > > + notifier->nb.notifier_call = isolation_bus_notify; > > > > > + BLOCKING_INIT_NOTIFIER_HEAD(¬ifier->notifier); > > > > > + bus_register_notifier(bus, ¬ifier->nb); > > > > > + list_add(¬ifier->list, &isolation_notifiers); > > > > > + } > > > > > + > > > > > + ret = blocking_notifier_chain_register(¬ifier->notifier, nb); > > > > > + if (ret) { > > > > > + if (notifier->refcnt 
== 0) { > > > > > + bus_unregister_notifier(bus, ¬ifier->nb); > > > > > + list_del(¬ifier->list); > > > > > + kfree(notifier); > > > > > + } > > > > > + mutex_unlock(&isolation_lock); > > > > > + return ret; > > > > > + } > > > > > + notifier->refcnt++; > > > > > + > > > > > + mutex_unlock(&isolation_lock); > > > > > + > > > > > + bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify); > > > > > + > > > > > + return 0; > > > > > +} > > > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > > exactly where. If an isolation provider doesn't explicitly put a > > > > device into a group, the device should go into the group of its parent > > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > > explicitly aware of). DMAs from devices on the subordinate bus can be > > > > translated by the top-level IOMMU (assuming it sees them as coming > > > > from the bridge), but they all need to be treated as one group. > > > > > > Why would the top level IOMMU provider not set the isolation group in > > > this case. > > > > Because it knows nothing about the subordinate bus. For example > > imagine a VT-d system, with a wacky PCI card into which you plug some > > other type of DMA capable device. The PCI card is acting as a bridge > > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > > have a FooBus notifier, since it knows nothing about FooBus. But the > > FooBus devices still need to end up in the group of the PCI bridge > > device, since their DMA operations will appear as coming from the PCI > > bridge card to the IOMMU, and can be isolated from the rest of the > > system (but not each other) on that basis. > > I guess I was imagining that it's ok to have devices without an > isolation group. 
It is, but having NULL isolation group has a pretty specific meaning - it means it's never safe to give that device to userspace, but it also means that normal kernel driver operation of that device must not interfere with anything in any iso group (otherwise we can never know that those iso groups _are_ safe to hand out). Likewise userspace operation of any isolation group can't mess with no-group devices. None of these conditions is true for the hypothetical FooBus case. The bus as a whole could be safely given to userspace, the devices on it *could* mess with an existing isolation group (suppose the group consisted of a PCI segment with the FooBus bridge plus another PCI card - FooBus DMAs would be bridged onto the PCI segment and might target the other card's MMIO space). And other grouped devices can certainly mess with the FooBus devices (if userspace grabs the bridge and manipulates its IOMMU mappings, that would clearly screw up any kernel drivers doing DMA from FooBus devices behind it). Oh.. and I just thought of an existing-in-the-field case with the same problem. I'm pretty sure for devices that can appear on PCI but also have "dumb-bus" versions used on embedded systems, at least some of the kernel drivers are structured so that there is a separate struct device for the PCI "wrapper" and the device inside. If the inner driver is initiating DMA, as it usually would, it has to be in the same iso group as its PCI device parent. > When that happens we can traverse up the bus to find a > higher level isolation group. Well, that's one possible answer to my "where should the hook be question": rather than an initialization hook, when we look up a device's isolation group, if it doesn't say one explicitly, we try its bridge parent and so forth up the tree. I wonder about the overhead of having to walk all the way up the bus hierarchy before returning NULL whenever we ask about the group of a device that doesn't have one. 
> It would probably make sense to have some > kind of top-most isolation group that contains everything as it's an > easy proof that if you own everything, you're isolated. Hrm, no. Things that have no IOMMU above them will have ungated access to system RAM, and so can never be safely isolated for userspace purposes, even if userspace owned every _device_ in the system (not that you could do that in practice, anyway). > Potentially > though, wackyPCIcard can also register isolation groups for each of its > FooBus devices if it's able to provide that capability. Thanks, It could, but why should it. It doesn't know anything about IOMMUs or isolation, and it doesn't need to. Even less so for PCI devices which create subordinate non-PCI struct devices for internal reasons.
On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote: > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote: > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > > > +/* > > > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > > > + * must be told about the devices they contain. Expect this to be called > > > > > > + * from isolation group providers via notifier. > > > > > > + */ > > > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > > > provider is integrated into host bridge code. > > > > > > > > Sure, a provider could do this on it's own if it wants. This just > > > > provides some infrastructure for a common path. Also note that this > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > > > to be seen whether that can reasonably be the case once isolation groups > > > > are added to streaming DMA paths. > > > > > > Right, but other than the #ifdef safety, which could be achieved more > > > simply, I'm not seeing what benefit the infrastructure provides over > > > directly calling the bus notifier function. The infrastructure groups > > > the notifiers by bus type internally, but AFAICT exactly one bus > > > notifier call would become exactly one isolation notifier call, and > > > the notifier callback itself would be almost identical. > > > > I guess I don't see this as a fundamental design point of the proposal, > > it's just a convenient way to initialize groups as a side-band addition > > until isolation groups become a more fundamental part of the iommu > > infrastructure. If you want to do that level of integration in your > > provider, do it and make the callbacks w/o the notifier. 
If nobody ends > > up using them, we'll remove them. Maybe it will just end up being a > > bootstrap. In the typical case, yes, one bus notifier is one isolation > > notifier. It does however also allow one bus notifier to become > > multiple isolation notifiers, and includes some filtering that would > > just be duplicated if every provider decided to implement their own bus > > notifier. > > Uh.. I didn't notice any filtering? That's why I'm asking. Not much, but a little: + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + if (!dev->isolation_group) + blocking_notifier_call_chain(¬ifier->notifier, + ISOLATION_NOTIFY_ADD_DEVICE, dev); + break; + case BUS_NOTIFY_DEL_DEVICE: + if (dev->isolation_group) + blocking_notifier_call_chain(¬ifier->notifier, + ISOLATION_NOTIFY_DEL_DEVICE, dev); + break; + } ... > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > > > exactly where. If an isolation provider doesn't explicitly put a > > > > > device into a group, the device should go into the group of its parent > > > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > > > explicitly aware of). DMAs from devices on the subordinate bus can be > > > > > translated by the top-level IOMMU (assuming it sees them as coming > > > > > from the bridge), but they all need to be treated as one group. > > > > > > > > Why would the top level IOMMU provider not set the isolation group in > > > > this case. > > > > > > Because it knows nothing about the subordinate bus. For example > > > imagine a VT-d system, with a wacky PCI card into which you plug some > > > other type of DMA capable device. The PCI card is acting as a bridge > > > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > > > have a FooBus notifier, since it knows nothing about FooBus. 
But the > > > FooBus devices still need to end up in the group of the PCI bridge > > > device, since their DMA operations will appear as coming from the PCI > > > bridge card to the IOMMU, and can be isolated from the rest of the > > > system (but not each other) on that basis. > > > > I guess I was imagining that it's ok to have devices without an > > isolation group. > > It is, but having NULL isolation group has a pretty specific meaning - > it means it's never safe to give that device to userspace, but it also > means that normal kernel driver operation of that device must not > interfere with anything in any iso group (otherwise we can never no > that those iso groups _are_ safe to hand out). Likewise userspace > operation of any isolation group can't mess with no-group devices. This is where wanting to use isolation groups as the working unit for an iommu ops layer and also wanting to use iommu ops to replace dma ops seem to collide a bit. Do we want two interfaces for dma, one group based and one for non-isolated devices? Isolation providers like intel-iommu would always use one, non-isolation capable dma paths, like swiotlb or non-isolation hardware iommus, would use another. And how do we factor in the degree of isolation to avoid imposing policy in the kernel? MSI isolation is an example. We should allow userspace to set a policy of whether lack of MSI protection is an acceptable risk. Does that mean we can have isolation groups that provide no isolation and sysfs files indicating capabilities? Perhaps certain classes of groups don't even allow manager binding? > None of these conditions is true for the hypothetical Foobus case. > The bus as a whole could be safely given to userspace, the devices on > it *could* mess with an existing isolation group (suppose the group > consisted of a PCI segment with the FooBus bridge plus another PCI > card - FooBus DMAs would be bridged onto the PCI segment and might > target the other card's MMIO space). 
And other grouped devices can > certainly mess with the FooBus devices (if userspace grabs the bridge > and manipulates its IOMMU mappings, that would clearly screw up any > kernel drivers doing DMA from FooBus devices behind it). ---segment-----+---------------+ | | [whackyPCIcard] [other device] | +---FooBus---+-------+ | | [FooDev1] [FooDev2] This is another example of the quality of the isolation group and what factors we incorporate in judging that. If the bridge/switch port generating the segment does not support or enable PCI ACS then the IOMMU may be able to differentiate whackyPCIcard from the other device but not prevent peer-to-peer traffic between the two (or between FooDevs and the other device - same difference). This would suggest that we might have an isolation group at the root of the segment for which the provider can guarantee isolation (includes everything on and below the bus), and we might also have isolation groups at whackyPCIcard and the other device that have a different quality of isolation. /me pauses for rant about superior hardware... ;) > Oh.. and I just thought of an existing-in-the-field case with the same > problem. I'm pretty sure for devices that can appear on PCI but also > have "dumb-bus" versions used on embedded systems, at least some of > the kernel drivers are structured so that there is a separate struct > device for the PCI "wrapper" and the device inside. If the inner > driver is initiating DMA, as it usually would, it has to be in the > same iso group as its PCI device parent. I don't know that we can ever generically identify such devices, but maybe this introduces the idea that we need to be able to blacklist certain devices as multi-homed and tainting any isolation group that contains them. > > When that happens we can traverse up the bus to find a > > higher level isolation group.
> > Well, that's one possible answer to my "where should the hook be > question": rather than an initialization hook, when we look up a > device's isolation group, if it doesn't say one explicitly, we try > its bridge parent and so forth up the tree. I wonder about the > overhead of having to walk all the way up the bus hierarchy before > returning NULL whenever we ask about the group of a device that > doesn't have one. Yep, that could definitely be a concern for streaming dma. > > It would probably make sense to have some > > kind of top-most isolation group that contains everything as it's an > > easy proof that if you own everything, you're isolated. > > Hrm, no. Things that have no IOMMU above them will have ungated > access to system RAM, and so can never be safely isolated for > userspace purposes, even if userspace owned every _device_ in the > system (not that you could do that in practice, anyway). RAM is a good point, there are "non-devices" to worry about. Potentially that top level group doesn't allow managers. > > Potentially > > though, wackyPCIcard can also register isolation groups for each of its > > FooBus devices if it's able to provide that capability. Thanks, > > It could, but why should it. It doesn't know anything about IOMMUs or > isolation, and it doesn't need to. Even less so for PCI devices which > create subordinate non-PCI struct devices for internal reasons. Sorry I wasn't clear. If wackyPCIcard happens to include an onboard IOMMU of some sort that's capable of isolation, it might be able to register groups for each of the devices below it. Therefore we could end up with a scenario like above that the segment may not have ACS and therefore not be able to isolate wackyPCIcard from otherdevice, but wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice.
I think that means we potentially need to check all devices downstream of an isolation group to allow a manager to lock it, as well as checking the path upstream to make sure it isn't used above... Are you sure we're ready for this kind of infrastructure? Thanks, Alex
On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote: > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote: > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote: > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > > > > +/* > > > > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > > > > + * must be told about the devices they contain. Expect this to be called > > > > > > > + * from isolation group providers via notifier. > > > > > > > + */ > > > > > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > > > > provider is integrated into host bridge code. > > > > > > > > > > Sure, a provider could do this on it's own if it wants. This just > > > > > provides some infrastructure for a common path. Also note that this > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > > > > to be seen whether that can reasonably be the case once isolation groups > > > > > are added to streaming DMA paths. > > > > > > > > Right, but other than the #ifdef safety, which could be achieved more > > > > simply, I'm not seeing what benefit the infrastructure provides over > > > > directly calling the bus notifier function. The infrastructure groups > > > > the notifiers by bus type internally, but AFAICT exactly one bus > > > > notifier call would become exactly one isolation notifier call, and > > > > the notifier callback itself would be almost identical. > > > > > > I guess I don't see this as a fundamental design point of the proposal, > > > it's just a convenient way to initialize groups as a side-band addition > > > until isolation groups become a more fundamental part of the iommu > > > infrastructure. 
If you want to do that level of integration in your > > > provider, do it and make the callbacks w/o the notifier. If nobody ends > > > up using them, we'll remove them. Maybe it will just end up being a > > > bootstrap. In the typical case, yes, one bus notifier is one isolation > > > notifier. It does however also allow one bus notifier to become > > > multiple isolation notifiers, and includes some filtering that would > > > just be duplicated if every provider decided to implement their own bus > > > notifier. > > > > Uh.. I didn't notice any filtering? That's why I'm asking. > > Not much, but a little: > > + switch (action) { > + case BUS_NOTIFY_ADD_DEVICE: > + if (!dev->isolation_group) > + blocking_notifier_call_chain(¬ifier->notifier, > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > + break; > + case BUS_NOTIFY_DEL_DEVICE: > + if (dev->isolation_group) > + blocking_notifier_call_chain(¬ifier->notifier, > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > + break; > + } Ah, I see, fair enough. A couple of tangential observations. First, I suspect using BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug, it might be better to have an unplug callback in the group instead. Second, I don't think aborting the call chain early for hot-plug is actually a good idea. I can't see a clear guarantee on the order, so individual providers couldn't rely on that short-cut behaviour. Which means that if two providers would have attempted to claim the same device, something is seriously wrong and we should probably report that. > ... > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > > > > exactly where. If an isolation provider doesn't explicitly put a > > > > > > device into a group, the device should go into the group of its parent > > > > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > > > > explicitly aware of). 
DMAs from devices on the subordinate bus can be > > > > > > translated by the top-level IOMMU (assuming it sees them as coming > > > > > > from the bridge), but they all need to be treated as one group. > > > > > > > > > > Why would the top level IOMMU provider not set the isolation group in > > > > > this case. > > > > > > > > Because it knows nothing about the subordinate bus. For example > > > > imagine a VT-d system, with a wacky PCI card into which you plug some > > > > other type of DMA capable device. The PCI card is acting as a bridge > > > > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > > > > have a FooBus notifier, since it knows nothing about FooBus. But the > > > > FooBus devices still need to end up in the group of the PCI bridge > > > > device, since their DMA operations will appear as coming from the PCI > > > > bridge card to the IOMMU, and can be isolated from the rest of the > > > > system (but not each other) on that basis. > > > > > > I guess I was imagining that it's ok to have devices without an > > > isolation group. > > > > It is, but having NULL isolation group has a pretty specific meaning - > > it means it's never safe to give that device to userspace, but it also > > means that normal kernel driver operation of that device must not > > interfere with anything in any iso group (otherwise we can never no > > that those iso groups _are_ safe to hand out). Likewise userspace > > operation of any isolation group can't mess with no-group devices. > > This is where wanting to use isolation groups as the working unit for an > iommu ops layer and also wanting to use iommu ops to replace dma ops > seem to collide a bit. Do we want two interfaces for dma, one group > based and one for non-isolated devices? Well, I don't know enough about what dwmw2 had planned to answer that. I was assuming the in-kernel dma reply would remain device/bus focussed, and how it looked up and used the groups was an internal matter. 
I would expect it would probably need a fallback path for "no group" for transition purposes, even if it wasn't planned to keep that forever. > Isolation providers like > intel-iommu would always use one, non-isolation capable dma paths, like > swiotlb or non-isolation hardware iommus, would use another. And how do > we factor in the degree of isolation to avoid imposing policy in the > kernel? MSI isolation is an example. We should allow userspace to set > a policy of whether lack of MSI protection is an acceptable risk. Does > that mean we can have isolation groups that provide no isolation and > sysfs files indicating capabilities? Perhaps certain classes of groups > don't even allow manager binding? Ugh. I thought about flagging groups with some sort of bitmap of isolation strength properties, but I think that way lies madness. We might want a global policy bitmask though which expresses which constraints are acceptable risks. The providers would then have to provide groups which are as strong as requested, or none at all. > > None of these conditions is true for the hypothetical Foobus case. > > The bus as a whole could be safely given to userspace, the devices on > > it *could* mess with an existing isolation group (suppose the group > > consisted of a PCI segment with the FooBus bridge plus another PCI > > card - FooBus DMAs would be bridged onto the PCI segment and might > > target the other card's MMIO space). And other grouped devices can > > certainly mess with the FooBus devices (if userspace grabs the bridge > > and manipulates its IOMMU mappings, that would clearly screw up any > > kernel drivers doing DMA from FooBus devices behind it). > > ---segment-----+---------------+ > | | > [whackyPCIcard] [other device] > | > +---FooBus---+-------+ > | | > [FooDev1] [FooDev2] > > This is another example of the quality of the isolation group and what > factors we incorporate in judging that. 
If the bridge/switch port > generating the segment does not support or enable PCI ACS then the IOMMU > may be able to differentiate whackyPCIcard from the other device but not > prevent peer-to-peer traffic between the two (or between FooDevs and the > other device - same difference). This would suggest that we might have > an isolation group at the root of the segment for which the provider can > guarantee isolation (includes everything on and below the bus), and we > might also have isolation groups at whackyPCIcard and the other device > that have a difference quality of isolation. /me pauses for rant about > superior hardware... ;) Nested isolation groups? Please no. An isolation group at the bridge encompassing that segment was exactly what I had in mind, but my point is that all the FooBus devices *also* need to be in that group, even though the isolation provider knows nothing about FooBus. > > Oh.. and I just thought of an existing-in-the-field case with the same > > problem. I'm pretty sure for devices that can appear on PCI but also > > have "dumb-bus" versions used on embedded systems, at least some of > > the kernel drivers are structured so that there is a separate struct > > device for the PCI "wrapper" and the device inside. If the inner > > driver is initiating DMA, as it usually would, it has to be in the > > same iso group as it's PCI device parent. > > I don't know that we can ever generically identify such devices, but > maybe this introduces the idea that we need to be able to blacklist > certain devices as multi-homed and tainting any isolation group that > contains them. Sorry, I was unclear. These are not multi-homed devices, just multiple-variant devices, any given instance will either be PCI or not. But to handle the two variants easily, the drivers have nested struct devices. 
It's essentially a pure software artifact we're dealing with here, but nonetheless we need to get the group-ownership of those struct devices correct, whether they're synthetic or not. > > > When that happens we can traverse up the bus to find a > > > higher level isolation group. > > > > Well, that's one possible answer to my "where should the hook be > > question": rather than an initialization hook, when we look up a > > device's isolation group, if it doesn't say one explicitly, we try > > it's bridge parent and so forth up the tree. I wonder about the > > overhead of having to walk all the way up the bus heirarchy before > > returning NULL whenever we ask about the group of a device that > > doesn't have one. > > Yep, that could definitely be a concern for streaming dma. Right, which is why I'm suggesting handling at init time instead. We'd need something that runs in the generic hot-add path, after bus notifiers, which would set the device's group to the same as its parent's if a provider hasn't already given it one. > > > It would probably make sense to have some > > > kind of top-most isolation group that contains everything as it's an > > > easy proof that if you own everything, you're isolated. > > > > Hrm, no. Things that have no IOMMU above them will have ungated > > access to system RAM, and so can never be safely isolated for > > userspace purposes, even if userspace owned every _device_ in the > > system (not that you could do that in practice, anyway). > > RAM is a good point, there are "non-devices" to worry about. > Potentially that top level group doesn't allow managers. I don't really see the point of a "top-level" group. Groups are exclusive, not heirarchical, and I don't see that there are any paths really simplified by having a (not manageable) "leftovers" group. > > > Potentially > > > though, wackyPCIcard can also register isolation groups for each of it's > > > FooBus devices if it's able to provide that capability. 
Thanks, > > > > It could, but why should it. It doesn't know anything about IOMMUs or > > isolation, and it doesn't need to. Even less so for PCI devices which > > create subordinate non-PCI struct devices for internal reasons. > > Sorry I wasn't clear. If wackyPCIcard happens to include an onboard > IOMMU of some sort that's capable of isolation, it might be able to > register groups for each of the devices below it. Therefore we could > end up with a scenario like above that the segment may not have ACS and > therefore not be able to isolate wackyPCIcard from otherdevice, but > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice. I think > that means we potentially need to check all devices downstream of an > isolation group to allow a manager to lock it, as well as checking the > path upstream to make sure it isn't used above... Are you sure we're > ready for this kind of infrastructure? Thanks, Hrm, at this stage I think I'm prepared to assume that IOMMUs are either strictly independent (e.g. covering different PCI domains) or are at least vaguely aware of each other (i.e. platform conventions cover the combination). Truly nested IOMMUs, that are entirely unaware of each other is a problem for another day (by which I mean probably never). That said, again, groups have to be exclusive, not heirarchical, so in the above case, I think we'd have a few possible ways to go: 1) The FooBus isolation provider could refuse to initialize if the FooBus bridge is sharing an iso group with something else before it probes the FooBus. 2) The FooBus isolation provider could be in the FooBus bridge driver - meaning that if someone tried to grab the PCI group including the FooBridge, the provider driver would be booted out, causing the FooBus groups to cease to exist (meaning the PCI group manager would never get lock while someone was already using the FooGroups). In this case, it gets a bit complex. 
When the FooBus isolation provider is active, the FooBus devices would be in their own groups, not the group of the FooBridge and its sibling. When the FooBus isolation provider is removed, it would have to configure the FooBus IOMMU to a passthrough mode, and revert the FooBus devices to the parent's group. Hm. Complicated.
On Sat, 2012-03-17 at 15:57 +1100, David Gibson wrote: > On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote: > > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote: > > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote: > > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > > > > > +/* > > > > > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > > > > > + * must be told about the devices they contain. Expect this to be called > > > > > > > > + * from isolation group providers via notifier. > > > > > > > > + */ > > > > > > > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > > > > > provider is integrated into host bridge code. > > > > > > > > > > > > Sure, a provider could do this on it's own if it wants. This just > > > > > > provides some infrastructure for a common path. Also note that this > > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > > > > > to be seen whether that can reasonably be the case once isolation groups > > > > > > are added to streaming DMA paths. > > > > > > > > > > Right, but other than the #ifdef safety, which could be achieved more > > > > > simply, I'm not seeing what benefit the infrastructure provides over > > > > > directly calling the bus notifier function. The infrastructure groups > > > > > the notifiers by bus type internally, but AFAICT exactly one bus > > > > > notifier call would become exactly one isolation notifier call, and > > > > > the notifier callback itself would be almost identical. 
> > > > > > > > I guess I don't see this as a fundamental design point of the proposal, > > > > it's just a convenient way to initialize groups as a side-band addition > > > > until isolation groups become a more fundamental part of the iommu > > > > infrastructure. If you want to do that level of integration in your > > > > provider, do it and make the callbacks w/o the notifier. If nobody ends > > > > up using them, we'll remove them. Maybe it will just end up being a > > > > bootstrap. In the typical case, yes, one bus notifier is one isolation > > > > notifier. It does however also allow one bus notifier to become > > > > multiple isolation notifiers, and includes some filtering that would > > > > just be duplicated if every provider decided to implement their own bus > > > > notifier. > > > > > > Uh.. I didn't notice any filtering? That's why I'm asking. > > > > Not much, but a little: > > > > + switch (action) { > > + case BUS_NOTIFY_ADD_DEVICE: > > + if (!dev->isolation_group) > > + blocking_notifier_call_chain(&notifier->notifier, > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > + break; > > + case BUS_NOTIFY_DEL_DEVICE: > > + if (dev->isolation_group) > > + blocking_notifier_call_chain(&notifier->notifier, > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > + break; > > + } > > > Ah, I see, fair enough. > > A couple of tangential observations. First, I suspect using > BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug, > it might be better to have an unplug callback in the group instead. There's really one already. Assuming the device is attached to a group driver, the .remove entry point on the driver will get called first. It doesn't get to return error, but it can block. The DEL_DEVICE will only happen after the group driver has relinquished the device, or at least returned from remove(). For a device with no driver, the group would only learn about it through the notifier, but it probably doesn't need anything more direct.
> Second, I don't think aborting the call chain early for hot-plug is > actually a good idea. I can't see a clear guarantee on the order, so > individual providers couldn't rely on that short-cut behaviour. Which > means that if two providers would have attempted to claim the same > device, something is seriously wrong and we should probably report > that. Yeah, that seems like a reasonable safety check. > > ... > > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > > > > > exactly where. If an isolation provider doesn't explicitly put a > > > > > > > device into a group, the device should go into the group of its parent > > > > > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > > > > > explicitly aware of). DMAs from devices on the subordinate bus can be > > > > > > > translated by the top-level IOMMU (assuming it sees them as coming > > > > > > > from the bridge), but they all need to be treated as one group. > > > > > > > > > > > > Why would the top level IOMMU provider not set the isolation group in > > > > > > this case. > > > > > > > > > > Because it knows nothing about the subordinate bus. For example > > > > > imagine a VT-d system, with a wacky PCI card into which you plug some > > > > > other type of DMA capable device. The PCI card is acting as a bridge > > > > > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > > > > > have a FooBus notifier, since it knows nothing about FooBus. But the > > > > > FooBus devices still need to end up in the group of the PCI bridge > > > > > device, since their DMA operations will appear as coming from the PCI > > > > > bridge card to the IOMMU, and can be isolated from the rest of the > > > > > system (but not each other) on that basis. > > > > > > > > I guess I was imagining that it's ok to have devices without an > > > > isolation group. 
> > > > > > It is, but having NULL isolation group has a pretty specific meaning - > > > it means it's never safe to give that device to userspace, but it also > > > means that normal kernel driver operation of that device must not > > > interfere with anything in any iso group (otherwise we can never no > > > that those iso groups _are_ safe to hand out). Likewise userspace > > > operation of any isolation group can't mess with no-group devices. > > > > This is where wanting to use isolation groups as the working unit for an > > iommu ops layer and also wanting to use iommu ops to replace dma ops > > seem to collide a bit. Do we want two interfaces for dma, one group > > based and one for non-isolated devices? > > Well, I don't know enough about what dwmw2 had planned to answer > that. I was assuming the in-kernel dma reply would remain device/bus > focussed, and how it looked up and used the groups was an internal > matter. I would expect it would probably need a fallback path for "no > group" for transition purposes, even if it wasn't planned to keep that > forever. Hmm, I think the value of a core isolation group layer starts to fall apart if an iommu driver can't count on a group being present for any device that might do dma. Some ideas below. > > Isolation providers like > > intel-iommu would always use one, non-isolation capable dma paths, like > > swiotlb or non-isolation hardware iommus, would use another. And how do > > we factor in the degree of isolation to avoid imposing policy in the > > kernel? MSI isolation is an example. We should allow userspace to set > > a policy of whether lack of MSI protection is an acceptable risk. Does > > that mean we can have isolation groups that provide no isolation and > > sysfs files indicating capabilities? Perhaps certain classes of groups > > don't even allow manager binding? > > Ugh. I thought about flagging groups with some sort of bitmap of > isolation strength properties, but I think that way lies madness. 
We > might want a global policy bitmask though which expresses which > constraints are acceptable risks. The providers would then have to > provide groups which are as strong as requested, or none at all. That's effectively how the current iommu_device_group() interface works; identify isolate-able groups, or none at all. I don't think we get that luxury if we push isolation groups down into the device layer though. If we want to use them for dma_ops and to solve various device quirks, they can't only be present on some configurations. I think you're right on the global policy though, we just need to apply it differently. I'm thinking we need something like "isolation_allow=" allowing options of "nomsi" and "nop2p" (just for starters). When a provider creates a group, they provide a set of flags for the group. A manager is not allowed to bind to the group if the global policy doesn't match the group flags. We'll need some support functions, maybe even a sysfs file, so userspace can know in advance and vfio can avoid creating entries for unusable groups. > > > None of these conditions is true for the hypothetical Foobus case. > > > The bus as a whole could be safely given to userspace, the devices on > > > it *could* mess with an existing isolation group (suppose the group > > > consisted of a PCI segment with the FooBus bridge plus another PCI > > > card - FooBus DMAs would be bridged onto the PCI segment and might > > > target the other card's MMIO space). And other grouped devices can > > > certainly mess with the FooBus devices (if userspace grabs the bridge > > > and manipulates its IOMMU mappings, that would clearly screw up any > > > kernel drivers doing DMA from FooBus devices behind it). > > > > ---segment-----+---------------+ > > | | > > [whackyPCIcard] [other device] > > | > > +---FooBus---+-------+ > > | | > > [FooDev1] [FooDev2] > > > > This is another example of the quality of the isolation group and what > > factors we incorporate in judging that. 
If the bridge/switch port > > generating the segment does not support or enable PCI ACS then the IOMMU > > may be able to differentiate whackyPCIcard from the other device but not > > prevent peer-to-peer traffic between the two (or between FooDevs and the > > other device - same difference). This would suggest that we might have > > an isolation group at the root of the segment for which the provider can > > guarantee isolation (includes everything on and below the bus), and we > > might also have isolation groups at whackyPCIcard and the other device > > that have a difference quality of isolation. /me pauses for rant about > > superior hardware... ;) > > Nested isolation groups? Please no. > > An isolation group at the bridge encompassing that segment was exactly > what I had in mind, but my point is that all the FooBus devices *also* > need to be in that group, even though the isolation provider knows > nothing about FooBus. If we're able to strictly say "no isolation = no group" and ignore FooBus for a moment, yes, the group at the bridge makes sense. But as I said, I don't think we get that luxury. There will be different "acceptable" levels of isolation and we'll need to support both the model of single group encompassing the segment as well as separate isolation groups for each device, whackyPCICard/other. A question then is how do we support both? Perhaps it's a provider option whether to only create fully isolated group, which makes it traverse up to the segment. The default might be to make separate groups at whackyPCICard and other, each with the nop2p flag. It gets a lot more complicated, but maybe we support both so if system policy prevents a manager from binding to a sub-group due to nop2p, it can walk up a level. I suspect dma_ops always wants the closer group to avoid traversing levels. Back to FooBus, dma_ops only knows how to provide dma for certain types of devices. intel-iommu only knows about struct pci_dev. 
So if FooDev needed to do dma, it would need some kind of intermediary to do the mapping via whackyPCICard. So from that perspective, we could get away w/o including FooBus devices in the group. Of course if we want a manager to attach to the group at whackyPCICard (ignoring nop2p), we need to put requirements on the state of the FooDevs. I think that might actually give some credibility to nested groups, hierarchical really, ie if we need to walk all the devices below a group, why not allow groups below a group? I don't like it either, but I'm not sure it's avoidable. > > > Oh.. and I just thought of an existing-in-the-field case with the same > > > problem. I'm pretty sure for devices that can appear on PCI but also > > > have "dumb-bus" versions used on embedded systems, at least some of > > > the kernel drivers are structured so that there is a separate struct > > > device for the PCI "wrapper" and the device inside. If the inner > > > driver is initiating DMA, as it usually would, it has to be in the > > > same iso group as it's PCI device parent. > > > > I don't know that we can ever generically identify such devices, but > > maybe this introduces the idea that we need to be able to blacklist > > certain devices as multi-homed and tainting any isolation group that > > contains them. > > Sorry, I was unclear. These are not multi-homed devices, just > multiple-variant devices, any given instance will either be PCI or > not. But to handle the two variants easily, the drivers have nested > struct devices. It's essentially a pure software artifact we're > dealing with here, but nonetheless we need to get the group-ownership > of those struct devices correct, whether they're synthetic or not. So long as the ADD_DEVICE happens with the currently running variant, which seems like it has to be the case, I'm not sure how this matters. I'm trying to be very careful to only use struct device for groups. 
> > > > When that happens we can traverse up the bus to find a > > > > higher level isolation group. > > > > > > Well, that's one possible answer to my "where should the hook be > > > question": rather than an initialization hook, when we look up a > > > device's isolation group, if it doesn't say one explicitly, we try > > > it's bridge parent and so forth up the tree. I wonder about the > > > overhead of having to walk all the way up the bus heirarchy before > > > returning NULL whenever we ask about the group of a device that > > > doesn't have one. > > > > Yep, that could definitely be a concern for streaming dma. > > Right, which is why I'm suggesting handling at init time instead. > We'd need something that runs in the generic hot-add path, after bus > notifiers, which would set the device's group to the same as its > parent's if a provider hasn't already given it one. I don't think that works. Back to the FooBus example, if the isolation group becomes a fundamental part of the dma_ops path, we may end up with groups at each FooDev (or maybe one for FooBus) generated by the intermediary that sets up dma using whackyPCICard. > > > > It would probably make sense to have some > > > > kind of top-most isolation group that contains everything as it's an > > > > easy proof that if you own everything, you're isolated. > > > > > > Hrm, no. Things that have no IOMMU above them will have ungated > > > access to system RAM, and so can never be safely isolated for > > > userspace purposes, even if userspace owned every _device_ in the > > > system (not that you could do that in practice, anyway). > > > > RAM is a good point, there are "non-devices" to worry about. > > Potentially that top level group doesn't allow managers. > > I don't really see the point of a "top-level" group. Groups are > exclusive, not heirarchical, and I don't see that there are any paths > really simplified by having a (not manageable) "leftovers" group. 
Yeah, a top level group may not make sense, but if we plan for dma_ops, we need non-manageable groups, which I think is just the degenerate case of isolation quality. > > > > Potentially > > > > though, wackyPCIcard can also register isolation groups for each of it's > > > > FooBus devices if it's able to provide that capability. Thanks, > > > > > > It could, but why should it. It doesn't know anything about IOMMUs or > > > isolation, and it doesn't need to. Even less so for PCI devices which > > > create subordinate non-PCI struct devices for internal reasons. > > > > Sorry I wasn't clear. If wackyPCIcard happens to include an onboard > > IOMMU of some sort that's capable of isolation, it might be able to > > register groups for each of the devices below it. Therefore we could > > end up with a scenario like above that the segment may not have ACS and > > therefore not be able to isolate wackyPCIcard from otherdevice, but > > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice. I think > > that means we potentially need to check all devices downstream of an > > isolation group to allow a manager to lock it, as well as checking the > > path upstream to make sure it isn't used above... Are you sure we're > > ready for this kind of infrastructure? Thanks, > > Hrm, at this stage I think I'm prepared to assume that IOMMUs are > either strictly independent (e.g. covering different PCI domains) or > are at least vaguely aware of each other (i.e. platform conventions > cover the combination). Truly nested IOMMUs, that are entirely > unaware of each other is a problem for another day (by which I mean > probably never). Ok, but even if there's no iommu onboard whackyPCICard, it can still be possible to create groups on FooBus just to facilitate dma_ops and handle quirks. 
> That said, again, groups have to be exclusive, not heirarchical, so > in the above case, I think we'd have a few possible ways to go: Ownership absolutely has to be exclusive, but it looks like groups themselves are hierarchical. > 1) The FooBus isolation provider could refuse to initialize if > the FooBus bridge is sharing an iso group with something else before > it probes the FooBus. Sharing, as in a parent group is managed? Yes, something special needs to happen in that case, I'd probably investigate having it initialized in a managed state versus not initialized at all. > 2) The FooBus isolation provider could be in the FooBus bridge > driver - meaning that if someone tried to grab the PCI group including > the FooBridge, the provider driver would be booted out, causing the > FooBus groups to cease to exist (meaning the PCI group manager would > never get lock while someone was already using the FooGroups). In Right, I don't know that they need to cease to exist, but if a manager is attached anywhere above or below the group it wants to lock, it has to be blocked. > this case, it gets a bit complex. When the FooBus isolation provider > is active, the FooBus devices would be in their own groups, not the > group of the FooBridge and its sibling. When the FooBus isolation > provider is removed, it would have to configure the FooBus IOMMU to a > passthrough mode, and revert the FooBus devices to the parent's > group. Hm. Complicated. Yep. I think we're arriving at the same point. Groups are hierarchical, but ownership via a manager cannot be nested. So to manage a group, we need to walk the entire tree of devices below each device checking that none of the groups are managed and all the devices are using the right driver, then walk up from the group to verify no group of a parent device is managed. Thanks, Alex
On Wed, Mar 21, 2012 at 03:12:58PM -0600, Alex Williamson wrote: > On Sat, 2012-03-17 at 15:57 +1100, David Gibson wrote: > > On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote: > > > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote: > > > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote: > > > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > > > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > > > > > > +/* > > > > > > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > > > > > > + * must be told about the devices they contain. Expect this to be called > > > > > > > > > + * from isolation group providers via notifier. > > > > > > > > > + */ > > > > > > > > > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > > > > > > provider is integrated into host bridge code. > > > > > > > > > > > > > > Sure, a provider could do this on it's own if it wants. This just > > > > > > > provides some infrastructure for a common path. Also note that this > > > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > > > > > > to be seen whether that can reasonably be the case once isolation groups > > > > > > > are added to streaming DMA paths. > > > > > > > > > > > > Right, but other than the #ifdef safety, which could be achieved more > > > > > > simply, I'm not seeing what benefit the infrastructure provides over > > > > > > directly calling the bus notifier function. The infrastructure groups > > > > > > the notifiers by bus type internally, but AFAICT exactly one bus > > > > > > notifier call would become exactly one isolation notifier call, and > > > > > > the notifier callback itself would be almost identical. 
> > > > > > > > > > I guess I don't see this as a fundamental design point of the proposal, > > > > > it's just a convenient way to initialize groups as a side-band addition > > > > > until isolation groups become a more fundamental part of the iommu > > > > > infrastructure. If you want to do that level of integration in your > > > > > provider, do it and make the callbacks w/o the notifier. If nobody ends > > > > > up using them, we'll remove them. Maybe it will just end up being a > > > > > bootstrap. In the typical case, yes, one bus notifier is one isolation > > > > > notifier. It does however also allow one bus notifier to become > > > > > multiple isolation notifiers, and includes some filtering that would > > > > > just be duplicated if every provider decided to implement their own bus > > > > > notifier. > > > > > > > > Uh.. I didn't notice any filtering? That's why I'm asking. > > > > > > Not much, but a little: > > > > > > + switch (action) { > > > + case BUS_NOTIFY_ADD_DEVICE: > > > + if (!dev->isolation_group) > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > + break; > > > + case BUS_NOTIFY_DEL_DEVICE: > > > + if (dev->isolation_group) > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > > + break; > > > + } T> > > > > > Ah, I see, fair enough. > > > > A couple of tangential observations. First, I suspect using > > BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug, > > it might be better to have an unplug callback in the group instead. > > There's really one already. Assuming the device is attached to a group > driver, the .remove entry point on the driver will get called first. It > doesn't get to return error, but it can block. Hrm. Assuming the isolation provider has installed a driver for the relevant bus type. 
And as we're discussing elsewhere, I think we have situations where the groups will get members on (subordinate) busses which the isolation provider doesn't have explicit knowledge of. > The DEL_DEVICE will only > happen after the group driver has relinquished the device, or at least > returned from remove(). For a device with no driver, the group would > only learn about it through the notifier, but it probably doesn't need > anything more direct. DEL_DEVICE is certainly sufficient, it just seems a bit unnecessarily awkward. > > Second, I don't think aborting the call chain early for hot-plug is > > actually a good idea. I can't see a clear guarantee on the order, so > > individual providers couldn't rely on that short-cut behaviour. Which > > means that if two providers would have attempted to claim the same > > device, something is seriously wrong and we should probably report > > that. > > Yeah, that seems like a reasonable safety check. > > > > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > > > > > > exactly where. If an isolation provider doesn't explicitly put a > > > > > > > > device into a group, the device should go into the group of its parent > > > > > > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > > > > > > explicitly aware of). DMAs from devices on the subordinate bus can be > > > > > > > > translated by the top-level IOMMU (assuming it sees them as coming > > > > > > > > from the bridge), but they all need to be treated as one group. > > > > > > > > > > > > > > Why would the top level IOMMU provider not set the isolation group in > > > > > > > this case. > > > > > > > > > > > > Because it knows nothing about the subordinate bus. For example > > > > > > imagine a VT-d system, with a wacky PCI card into which you plug some > > > > > > other type of DMA capable device. 
The PCI card is acting as a bridge > > > > > > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > > > > > > have a FooBus notifier, since it knows nothing about FooBus. But the > > > > > > FooBus devices still need to end up in the group of the PCI bridge > > > > > > device, since their DMA operations will appear as coming from the PCI > > > > > > bridge card to the IOMMU, and can be isolated from the rest of the > > > > > > system (but not each other) on that basis. > > > > > > > > > > I guess I was imagining that it's ok to have devices without an > > > > > isolation group. > > > > > > > > It is, but having NULL isolation group has a pretty specific meaning - > > > > it means it's never safe to give that device to userspace, but it also > > > > means that normal kernel driver operation of that device must not > > > > interfere with anything in any iso group (otherwise we can never no > > > > that those iso groups _are_ safe to hand out). Likewise userspace > > > > operation of any isolation group can't mess with no-group devices. > > > > > > This is where wanting to use isolation groups as the working unit for an > > > iommu ops layer and also wanting to use iommu ops to replace dma ops > > > seem to collide a bit. Do we want two interfaces for dma, one group > > > based and one for non-isolated devices? > > > > Well, I don't know enough about what dwmw2 had planned to answer > > that. I was assuming the in-kernel dma reply would remain device/bus > > focussed, and how it looked up and used the groups was an internal > > matter. I would expect it would probably need a fallback path for "no > > group" for transition purposes, even if it wasn't planned to keep that > > forever. > > Hmm, I think the value of a core isolation group layer starts to fall > apart if an iommu driver can't count on a group being present for any > device that might do dma. Some ideas below. 
Ah, yes, to clarify: I think anything subject to IOMMU translation should have a group (which might need a no-manager flag). However, for devices with no IOMMU, or with an IOMMU we can switch permanently into bypass mode, I think it's reasonable to leave them without group. > > > Isolation providers like > > > intel-iommu would always use one, non-isolation capable dma paths, like > > > swiotlb or non-isolation hardware iommus, would use another. And how do > > > we factor in the degree of isolation to avoid imposing policy in the > > > kernel? MSI isolation is an example. We should allow userspace to set > > > a policy of whether lack of MSI protection is an acceptable risk. Does > > > that mean we can have isolation groups that provide no isolation and > > > sysfs files indicating capabilities? Perhaps certain classes of groups > > > don't even allow manager binding? > > > > Ugh. I thought about flagging groups with some sort of bitmap of > > isolation strength properties, but I think that way lies madness. We > > might want a global policy bitmask though which expresses which > > constraints are acceptable risks. The providers would then have to > > provide groups which are as strong as requested, or none at all. > > That's effectively how the current iommu_device_group() interface works; > identify isolate-able groups, or none at all. I don't think we get that > luxury if we push isolation groups down into the device layer though. > If we want to use them for dma_ops and to solve various device quirks, > they can't only be present on some configurations. I think you're right > on the global policy though, we just need to apply it differently. I'm > thinking we need something like "isolation_allow=" allowing options of > "nomsi" and "nop2p" (just for starters). When a provider creates a > group, they provide a set of flags for the group. A manager is not > allowed to bind to the group if the global policy doesn't match the > group flags. 
We'll need some support functions, maybe even a sysfs > file, so userspace can know in advance and vfio can avoid creating > entries for unusable groups. Just banning managers for groups of insufficient strength isn't quite enough, because that doesn't allow for consolidation of several poorly-isolated groups into one strongly isolated group which may be possible on some hardware configurations. > > > > None of these conditions is true for the hypothetical Foobus case. > > > > The bus as a whole could be safely given to userspace, the devices on > > > > it *could* mess with an existing isolation group (suppose the group > > > > consisted of a PCI segment with the FooBus bridge plus another PCI > > > > card - FooBus DMAs would be bridged onto the PCI segment and might > > > > target the other card's MMIO space). And other grouped devices can > > > > certainly mess with the FooBus devices (if userspace grabs the bridge > > > > and manipulates its IOMMU mappings, that would clearly screw up any > > > > kernel drivers doing DMA from FooBus devices behind it). > > > > > > ---segment-----+---------------+ > > > | | > > > [whackyPCIcard] [other device] > > > | > > > +---FooBus---+-------+ > > > | | > > > [FooDev1] [FooDev2] > > > > > > This is another example of the quality of the isolation group and what > > > factors we incorporate in judging that. If the bridge/switch port > > > generating the segment does not support or enable PCI ACS then the IOMMU > > > may be able to differentiate whackyPCIcard from the other device but not > > > prevent peer-to-peer traffic between the two (or between FooDevs and the > > > other device - same difference). This would suggest that we might have > > > an isolation group at the root of the segment for which the provider can > > > guarantee isolation (includes everything on and below the bus), and we > > > might also have isolation groups at whackyPCIcard and the other device > > > that have a different quality of isolation. 
/me pauses for rant about > > > superior hardware... ;) > > > > Nested isolation groups? Please no. > > > > An isolation group at the bridge encompassing that segment was exactly > > what I had in mind, but my point is that all the FooBus devices *also* > > need to be in that group, even though the isolation provider knows > > nothing about FooBus. > > If we're able to strictly say "no isolation = no group" and ignore > FooBus for a moment, yes, the group at the bridge makes sense. But as I > said, I don't think we get that luxury. There will be different > "acceptable" levels of isolation and we'll need to support both the > model of single group encompassing the segment as well as separate > isolation groups for each device, whackyPCICard/other. Well, sure. But in that case, the FooBus devices still need to live in the same group as their bridge card, since they're subject to the same translations. > A question then is how do we support both? Perhaps it's a provider > option whether to only create fully isolated group, which makes it > traverse up to the segment. The default might be to make separate > groups at whackyPCICard and other, each with the nop2p flag. It gets a > lot more complicated, but maybe we support both so if system policy > prevents a manager from binding to a sub-group due to nop2p, it can walk > up a level. I suspect dma_ops always wants the closer group to avoid > traversing levels. Yeah. I really wish I knew more about what dwmw2 had in mind. > Back to FooBus, dma_ops only knows how to provide dma for certain types > of devices. intel-iommu only knows about struct pci_dev. So if FooDev > needed to do dma, it would need some kind of intermediary to do the > mapping via whackyPCICard. So from that perspective, we could get away > w/o including FooBus devices in the group. That seems very fragile Logically the FooBus devices should be in the same group. 
Yes, the FooBus dma hooks will need to know how to find the bridge PCI card and thereby manipulate it's IOMMU mappings, but they still belong in the same isolation group. > Of course if we want a > manager to attach to the group at whackyPCICard (ignoring nop2p), we > need to put requirements on the state of the FooDevs. I think that > might actually give some credibility to nested groups, hierarchical > really, ie if we need to walk all the devices below a group, why not > allow groups below a group? I don't like it either, but I'm not sure > it's avoidable. Urgh. Nested groups, a group parent must always have a strictly stronger set of isolation properties than subgroups. I suppose that could work, it's really setting off my over-engineering alarms, though. > > > > Oh.. and I just thought of an existing-in-the-field case with the same > > > > problem. I'm pretty sure for devices that can appear on PCI but also > > > > have "dumb-bus" versions used on embedded systems, at least some of > > > > the kernel drivers are structured so that there is a separate struct > > > > device for the PCI "wrapper" and the device inside. If the inner > > > > driver is initiating DMA, as it usually would, it has to be in the > > > > same iso group as it's PCI device parent. > > > > > > I don't know that we can ever generically identify such devices, but > > > maybe this introduces the idea that we need to be able to blacklist > > > certain devices as multi-homed and tainting any isolation group that > > > contains them. > > > > Sorry, I was unclear. These are not multi-homed devices, just > > multiple-variant devices, any given instance will either be PCI or > > not. But to handle the two variants easily, the drivers have nested > > struct devices. It's essentially a pure software artifact we're > > dealing with here, but nonetheless we need to get the group-ownership > > of those struct devices correct, whether they're synthetic or not. 
> > So long as the ADD_DEVICE happens with the currently running variant, > which seems like it has to be the case, I'm not sure how this matters. > I'm trying to be very careful to only use struct device for groups. Nope, I still haven't managed to convey the situation. The *hardware* comes in two variants, PCI and "dumb bus". The core part of the driver software binds (only) to a dumb bus struct device (usually a platform device, actually). To support the PCI variant, there is a "wrapper" driver. This is a PCI driver which does any PCI specific initialization, determines the register addresses from config space then creates a dumb bus struct device as a child of the pci_dev. The core driver then attaches to that dumb bus (e.g. platform) device. My point is that in the second case, the dumb bus struct device needs to be in the same group as its pci_dev parent. > > > > > When that happens we can traverse up the bus to find a > > > > > higher level isolation group. > > > > > > > > Well, that's one possible answer to my "where should the hook be > > > > question": rather than an initialization hook, when we look up a > > > > device's isolation group, if it doesn't say one explicitly, we try > > > > it's bridge parent and so forth up the tree. I wonder about the > > > > overhead of having to walk all the way up the bus heirarchy before > > > > returning NULL whenever we ask about the group of a device that > > > > doesn't have one. > > > > > > Yep, that could definitely be a concern for streaming dma. > > > > Right, which is why I'm suggesting handling at init time instead. > > We'd need something that runs in the generic hot-add path, after bus > > notifiers, which would set the device's group to the same as its > > parent's if a provider hasn't already given it one. > > I don't think that works. 
Back to the FooBus example, if the isolation > group becomes a fundamental part of the dma_ops path, we may end up with > groups at each FooDev (or maybe one for FooBus) generated by the > intermediary that sets up dma using whackyPCICard. That's why this would only do something *iff* nothing else has set a group (like that intermediary). > > > > > It would probably make sense to have some > > > > > kind of top-most isolation group that contains everything as it's an > > > > > easy proof that if you own everything, you're isolated. > > > > > > > > Hrm, no. Things that have no IOMMU above them will have ungated > > > > access to system RAM, and so can never be safely isolated for > > > > userspace purposes, even if userspace owned every _device_ in the > > > > system (not that you could do that in practice, anyway). > > > > > > RAM is a good point, there are "non-devices" to worry about. > > > Potentially that top level group doesn't allow managers. > > > > I don't really see the point of a "top-level" group. Groups are > > exclusive, not heirarchical, and I don't see that there are any paths > > really simplified by having a (not manageable) "leftovers" group. > > Yeah, a top level group may not make sense, but if we plan for dma_ops, > we need non-manageable groups, which I think is just the degenerate case > of isolation quality. Yeah, I'll buy that. Pending more details on dwmw2's plans, anyway. > > > > > Potentially > > > > > though, wackyPCIcard can also register isolation groups for each of it's > > > > > FooBus devices if it's able to provide that capability. Thanks, > > > > > > > > It could, but why should it. It doesn't know anything about IOMMUs or > > > > isolation, and it doesn't need to. Even less so for PCI devices which > > > > create subordinate non-PCI struct devices for internal reasons. > > > > > > Sorry I wasn't clear. 
If wackyPCIcard happens to include an onboard > > > IOMMU of some sort that's capable of isolation, it might be able to > > > register groups for each of the devices below it. Therefore we could > > > end up with a scenario like above that the segment may not have ACS and > > > therefore not be able to isolate wackyPCIcard from otherdevice, but > > > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice. I think > > > that means we potentially need to check all devices downstream of an > > > isolation group to allow a manager to lock it, as well as checking the > > > path upstream to make sure it isn't used above... Are you sure we're > > > ready for this kind of infrastructure? Thanks, > > > > Hrm, at this stage I think I'm prepared to assume that IOMMUs are > > either strictly independent (e.g. covering different PCI domains) or > > are at least vaguely aware of each other (i.e. platform conventions > > cover the combination). Truly nested IOMMUs, that are entirely > > unaware of each other is a problem for another day (by which I mean > > probably never). > > Ok, but even if there's no iommu onboard whackyPCICard, it can still be > possible to create groups on FooBus just to facilitate dma_ops and > handle quirks. No, you can't arbitrarily go creating groups - the group boundary means something specific. If dma_ops or something needs more info, it will need that *in addition* to the group. > > That said, again, groups have to be exclusive, not heirarchical, so > > in the above case, I think we'd have a few possible ways to go: > > Ownership absolutely has to be exclusive, but it looks like groups > themselves are hierarchical. Doesn't really make sense, since a group is defined as the minimum isolatable parcel. You can do something if as you walk down the tree isolation level decreases, but it's certainly not feeling right. 
> > 1) The FooBus isolation provider could refuse to initialize if > > the FooBus bridge is sharing an iso group with something else before > > it probes the FooBus. > > Sharing, as in a parent group is managed? No, just as in a group exists which contains both the FooBus bridge and something else. Basically this option means that if you plugged the FooBus bridge into a slot that wasn't sufficiently isolated, it would simply refuse to work. > Yes, something special needs > to happen in that case, I'd probably investigate having it initialized > in a managed state versus not initialized at all. > > > 2) The FooBus isolation provider could be in the FooBus bridge > > driver - meaning that if someone tried to grab the PCI group including > > the FooBridge, the provider driver would be booted out, causing the > > FooBus groups to cease to exist (meaning the PCI group manager would > > never get lock while someone was already using the FooGroups). In > > Right, I don't know that they need to cease to exist, but if a manager > is attached anywhere above or below the group it wants to lock, it has > to be blocked. > > > this case, it gets a bit complex. When the FooBus isolation provider > > is active, the FooBus devices would be in their own groups, not the > > group of the FooBridge and its sibling. When the FooBus isolation > > provider is removed, it would have to configure the FooBus IOMMU to a > > passthrough mode, and revert the FooBus devices to the parent's > > group. Hm. Complicated. > > Yep. I think we're arriving at the same point. Groups are > hierarchical, but ownership via a manager cannot be nested. So to > manage a group, we need to walk the entire tree of devices below each > device checking that none of the groups are managed and all the devices > are using the right driver, then walk up from the group to verify no > group of a parent device is managed. Thanks, Blargh. I really, really hope we can come up with a simpler model than that.
On Tue, 2012-03-27 at 16:14 +1100, David Gibson wrote: > On Wed, Mar 21, 2012 at 03:12:58PM -0600, Alex Williamson wrote: > > On Sat, 2012-03-17 at 15:57 +1100, David Gibson wrote: > > > On Fri, Mar 16, 2012 at 01:31:18PM -0600, Alex Williamson wrote: > > > > On Fri, 2012-03-16 at 14:45 +1100, David Gibson wrote: > > > > > On Thu, Mar 15, 2012 at 02:15:01PM -0600, Alex Williamson wrote: > > > > > > On Wed, 2012-03-14 at 20:58 +1100, David Gibson wrote: > > > > > > > On Tue, Mar 13, 2012 at 10:49:47AM -0600, Alex Williamson wrote: > > > > > > > > On Wed, 2012-03-14 at 01:33 +1100, David Gibson wrote: > > > > > > > > > On Mon, Mar 12, 2012 at 04:32:54PM -0600, Alex Williamson wrote: > > > > > > > > > > +/* > > > > > > > > > > + * Add a device to an isolation group. Isolation groups start empty and > > > > > > > > > > + * must be told about the devices they contain. Expect this to be called > > > > > > > > > > + * from isolation group providers via notifier. > > > > > > > > > > + */ > > > > > > > > > > > > > > > > > > Doesn't necessarily have to be from a notifier, particularly if the > > > > > > > > > provider is integrated into host bridge code. > > > > > > > > > > > > > > > > Sure, a provider could do this on it's own if it wants. This just > > > > > > > > provides some infrastructure for a common path. Also note that this > > > > > > > > helps to eliminate all the #ifdef CONFIG_ISOLATION in the provider. Yet > > > > > > > > to be seen whether that can reasonably be the case once isolation groups > > > > > > > > are added to streaming DMA paths. > > > > > > > > > > > > > > Right, but other than the #ifdef safety, which could be achieved more > > > > > > > simply, I'm not seeing what benefit the infrastructure provides over > > > > > > > directly calling the bus notifier function. 
The infrastructure groups > > > > > > > the notifiers by bus type internally, but AFAICT exactly one bus > > > > > > > notifier call would become exactly one isolation notifier call, and > > > > > > > the notifier callback itself would be almost identical. > > > > > > > > > > > > I guess I don't see this as a fundamental design point of the proposal, > > > > > > it's just a convenient way to initialize groups as a side-band addition > > > > > > until isolation groups become a more fundamental part of the iommu > > > > > > infrastructure. If you want to do that level of integration in your > > > > > > provider, do it and make the callbacks w/o the notifier. If nobody ends > > > > > > up using them, we'll remove them. Maybe it will just end up being a > > > > > > bootstrap. In the typical case, yes, one bus notifier is one isolation > > > > > > notifier. It does however also allow one bus notifier to become > > > > > > multiple isolation notifiers, and includes some filtering that would > > > > > > just be duplicated if every provider decided to implement their own bus > > > > > > notifier. > > > > > > > > > > Uh.. I didn't notice any filtering? That's why I'm asking. > > > > > > > > Not much, but a little: > > > > > > > > + switch (action) { > > > > + case BUS_NOTIFY_ADD_DEVICE: > > > > + if (!dev->isolation_group) > > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > > + ISOLATION_NOTIFY_ADD_DEVICE, dev); > > > > + break; > > > > + case BUS_NOTIFY_DEL_DEVICE: > > > > + if (dev->isolation_group) > > > > + blocking_notifier_call_chain(¬ifier->notifier, > > > > + ISOLATION_NOTIFY_DEL_DEVICE, dev); > > > > + break; > > > > + } > T> > > > > > > > Ah, I see, fair enough. > > > > > > A couple of tangential observations. First, I suspect using > > > BUS_NOTIFY_DEL_DEVICE is a very roundabout way of handling hot-unplug, > > > it might be better to have an unplug callback in the group instead. > > > > There's really one already. 
Assuming the device is attached to a group > > driver, the .remove entry point on the driver will get called first. It > > doesn't get to return error, but it can block. > > Hrm. Assuming the isolation provider has installed a driver for the > relevant bus type. And as we're discussing elsewhere, I think we have > situations where the groups will get members on (subordinate) busses > which the isolation provider doesn't have explicit knowledge of. > > > The DEL_DEVICE will only > > happen after the group driver has relinquished the device, or at least > > returned from remove(). For a device with no driver, the group would > > only learn about it through the notifier, but it probably doesn't need > > anything more direct. > > DEL_DEVICE is certainly sufficient, it just seems a bit unnecessarily > awkward. > > > > Second, I don't think aborting the call chain early for hot-plug is > > > actually a good idea. I can't see a clear guarantee on the order, so > > > individual providers couldn't rely on that short-cut behaviour. Which > > > means that if two providers would have attempted to claim the same > > > device, something is seriously wrong and we should probably report > > > that. > > > > Yeah, that seems like a reasonable safety check. > > > > > > > > > > > So, somewhere, I think we need a fallback path, but I'm not sure > > > > > > > > > exactly where. If an isolation provider doesn't explicitly put a > > > > > > > > > device into a group, the device should go into the group of its parent > > > > > > > > > bridge. This covers the case of a bus with IOMMU which has below it a > > > > > > > > > bridge to a different type of DMA capable bus (which the IOMMU isn't > > > > > > > > > explicitly aware of). DMAs from devices on the subordinate bus can be > > > > > > > > > translated by the top-level IOMMU (assuming it sees them as coming > > > > > > > > > from the bridge), but they all need to be treated as one group. 
> > > > > > > > > > > > > > > > Why would the top level IOMMU provider not set the isolation group in > > > > > > > > this case. > > > > > > > > > > > > > > Because it knows nothing about the subordinate bus. For example > > > > > > > imagine a VT-d system, with a wacky PCI card into which you plug some > > > > > > > other type of DMA capable device. The PCI card is acting as a bridge > > > > > > > from PCI to this, let's call it FooBus. Obviously the VT-d code won't > > > > > > > have a FooBus notifier, since it knows nothing about FooBus. But the > > > > > > > FooBus devices still need to end up in the group of the PCI bridge > > > > > > > device, since their DMA operations will appear as coming from the PCI > > > > > > > bridge card to the IOMMU, and can be isolated from the rest of the > > > > > > > system (but not each other) on that basis. > > > > > > > > > > > > I guess I was imagining that it's ok to have devices without an > > > > > > isolation group. > > > > > > > > > > It is, but having NULL isolation group has a pretty specific meaning - > > > > > it means it's never safe to give that device to userspace, but it also > > > > > means that normal kernel driver operation of that device must not > > > > > interfere with anything in any iso group (otherwise we can never no > > > > > that those iso groups _are_ safe to hand out). Likewise userspace > > > > > operation of any isolation group can't mess with no-group devices. > > > > > > > > This is where wanting to use isolation groups as the working unit for an > > > > iommu ops layer and also wanting to use iommu ops to replace dma ops > > > > seem to collide a bit. Do we want two interfaces for dma, one group > > > > based and one for non-isolated devices? > > > > > > Well, I don't know enough about what dwmw2 had planned to answer > > > that. I was assuming the in-kernel dma reply would remain device/bus > > > focussed, and how it looked up and used the groups was an internal > > > matter. 
I would expect it would probably need a fallback path for "no > > > group" for transition purposes, even if it wasn't planned to keep that > > > forever. > > > > Hmm, I think the value of a core isolation group layer starts to fall > > apart if an iommu driver can't count on a group being present for any > > device that might do dma. Some ideas below. > > Ah, yes, to clarify: I think anything subject to IOMMU translation > should have a group (which might need a no-manager flag). However, > for devices with no IOMMU, or with an IOMMU we can switch permanently > into bypass mode, I think it's reasonable to leave them without group. > > > > > Isolation providers like > > > > intel-iommu would always use one, non-isolation capable dma paths, like > > > > swiotlb or non-isolation hardware iommus, would use another. And how do > > > > we factor in the degree of isolation to avoid imposing policy in the > > > > kernel? MSI isolation is an example. We should allow userspace to set > > > > a policy of whether lack of MSI protection is an acceptable risk. Does > > > > that mean we can have isolation groups that provide no isolation and > > > > sysfs files indicating capabilities? Perhaps certain classes of groups > > > > don't even allow manager binding? > > > > > > Ugh. I thought about flagging groups with some sort of bitmap of > > > isolation strength properties, but I think that way lies madness. We > > > might want a global policy bitmask though which expresses which > > > constraints are acceptable risks. The providers would then have to > > > provide groups which are as strong as requested, or none at all. > > > > That's effectively how the current iommu_device_group() interface works; > > identify isolate-able groups, or none at all. I don't think we get that > > luxury if we push isolation groups down into the device layer though. > > If we want to use them for dma_ops and to solve various device quirks, > > they can't only be present on some configurations. 
I think you're right > > on the global policy though, we just need to apply it differently. I'm > > thinking we need something like "isolation_allow=" allowing options of > > "nomsi" and "nop2p" (just for starters). When a provider creates a > > group, they provide a set of flags for the group. A manager is not > > allowed to bind to the group if the global policy doesn't match the > > group flags. We'll need some support functions, maybe even a sysfs > > file, so userspace can know in advance and vfio can avoid creating > > entries for unusable groups. > > Just banning managers for groups of insufficient strength isn't quite > enough, because that doesn't allow for consolidation of several > poorly-isolated groups into one strongly isolated groups which may be > possible on some hardware configurations. This is the part where things start to completely fall apart. > > > > > None of these conditions is true for the hypothetical Foobus case. > > > > > The bus as a whole could be safely given to userspace, the devices on > > > > > it *could* mess with an existing isolation group (suppose the group > > > > > consisted of a PCI segment with the FooBus bridge plus another PCI > > > > > card - FooBus DMAs would be bridged onto the PCI segment and might > > > > > target the other card's MMIO space). And other grouped devices can > > > > > certainly mess with the FooBus devices (if userspace grabs the bridge > > > > > and manipulates its IOMMU mappings, that would clearly screw up any > > > > > kernel drivers doing DMA from FooBus devices behind it). > > > > > > > > ---segment-----+---------------+ > > > > | | > > > > [whackyPCIcard] [other device] > > > > | > > > > +---FooBus---+-------+ > > > > | | > > > > [FooDev1] [FooDev2] > > > > > > > > This is another example of the quality of the isolation group and what > > > > factors we incorporate in judging that. 
If the bridge/switch port > > > > generating the segment does not support or enable PCI ACS then the IOMMU > > > > may be able to differentiate whackyPCIcard from the other device but not > > > > prevent peer-to-peer traffic between the two (or between FooDevs and the > > > > other device - same difference). This would suggest that we might have > > > > an isolation group at the root of the segment for which the provider can > > > > guarantee isolation (includes everything on and below the bus), and we > > > > might also have isolation groups at whackyPCIcard and the other device > > > > that have a difference quality of isolation. /me pauses for rant about > > > > superior hardware... ;) > > > > > > Nested isolation groups? Please no. > > > > > > An isolation group at the bridge encompassing that segment was exactly > > > what I had in mind, but my point is that all the FooBus devices *also* > > > need to be in that group, even though the isolation provider knows > > > nothing about FooBus. > > > > If we're able to strictly say "no isolation = no group" and ignore > > FooBus for a moment, yes, the group at the bridge makes sense. But as I > > said, I don't think we get that luxury. There will be different > > "acceptable" levels of isolation and we'll need to support both the > > model of single group encompassing the segment as well as separate > > isolation groups for each device, whackyPCICard/other. > > Well, sure. But in that case, the FooBus devices still need to live > in the same group as their bridge card, since they're subject to the > same translations. > > > A question then is how do we support both? Perhaps it's a provider > > option whether to only create fully isolated group, which makes it > > traverse up to the segment. The default might be to make separate > > groups at whackyPCICard and other, each with the nop2p flag. 
It gets a > > lot more complicated, but maybe we support both so if system policy > > prevents a manager from binding to a sub-group due to nop2p, it can walk > > up a level. I suspect dma_ops always wants the closer group to avoid > > traversing levels. > > Yeah. I really wish I knew more about what dwmw2 had in mind. > > > Back to FooBus, dma_ops only knows how to provide dma for certain types > > of devices. intel-iommu only knows about struct pci_dev. So if FooDev > > needed to do dma, it would need some kind of intermediary to do the > > mapping via whackyPCICard. So from that perspective, we could get away > > w/o including FooBus devices in the group. > > That seems very fragile Logically the FooBus devices should be in the > same group. Yes, the FooBus dma hooks will need to know how to find > the bridge PCI card and thereby manipulate it's IOMMU mappings, but > they still belong in the same isolation group. > > > Of course if we want a > > manager to attach to the group at whackyPCICard (ignoring nop2p), we > > need to put requirements on the state of the FooDevs. I think that > > might actually give some credibility to nested groups, hierarchical > > really, ie if we need to walk all the devices below a group, why not > > allow groups below a group? I don't like it either, but I'm not sure > > it's avoidable. > > Urgh. Nested groups, a group parent must always have a strictly > stronger set of isolation properties than subgroups. I suppose that > could work, it's really setting off my over-engineering alarms, > though. This entire thing is over engineered, I think it's time to take a step back. We're combining iommu visibility with device quirks with isolation strength and hierarchical groups with manager locking and driver autoprobing... it's all over the place. > > > > > Oh.. and I just thought of an existing-in-the-field case with the same > > > > > problem. 
I'm pretty sure for devices that can appear on PCI but also > > > > > have "dumb-bus" versions used on embedded systems, at least some of > > > > > the kernel drivers are structured so that there is a separate struct > > > > > device for the PCI "wrapper" and the device inside. If the inner > > > > > driver is initiating DMA, as it usually would, it has to be in the > > > > > same iso group as it's PCI device parent. > > > > > > > > I don't know that we can ever generically identify such devices, but > > > > maybe this introduces the idea that we need to be able to blacklist > > > > certain devices as multi-homed and tainting any isolation group that > > > > contains them. > > > > > > Sorry, I was unclear. These are not multi-homed devices, just > > > multiple-variant devices, any given instance will either be PCI or > > > not. But to handle the two variants easily, the drivers have nested > > > struct devices. It's essentially a pure software artifact we're > > > dealing with here, but nonetheless we need to get the group-ownership > > > of those struct devices correct, whether they're synthetic or not. > > > > So long as the ADD_DEVICE happens with the currently running variant, > > which seems like it has to be the case, I'm not sure how this matters. > > I'm trying to be very careful to only use struct device for groups. > > Nope, I still haven't managed to convey the situation. The *hardware* > comes in two varants, PCI and "dumb bus". The core part of the driver > software binds (only) to a dumb bus struct device (usually a platform > device, actually). > > To support the PCI variant, there is a "wrapper" driver. This is a > PCI driver which does any PCI specific initialization, determines the > register addresses from config space then creates a dumb bus struct > device as a child of the pci_dev. The core driver then attaches to > that dumb bus (e.g. platform) device. 
> > My point is that in the second case, the dumb bus struct device needs > to be in the same group as its pci_dev parent. > > > > > > > When that happens we can traverse up the bus to find a > > > > > > higher level isolation group. > > > > > > > > > > Well, that's one possible answer to my "where should the hook be > > > > > question": rather than an initialization hook, when we look up a > > > > > device's isolation group, if it doesn't say one explicitly, we try > > > > > it's bridge parent and so forth up the tree. I wonder about the > > > > > overhead of having to walk all the way up the bus heirarchy before > > > > > returning NULL whenever we ask about the group of a device that > > > > > doesn't have one. > > > > > > > > Yep, that could definitely be a concern for streaming dma. > > > > > > Right, which is why I'm suggesting handling at init time instead. > > > We'd need something that runs in the generic hot-add path, after bus > > > notifiers, which would set the device's group to the same as its > > > parent's if a provider hasn't already given it one. > > > > I don't think that works. Back to the FooBus example, if the isolation > > group becomes a fundamental part of the dma_ops path, we may end up with > > groups at each FooDev (or maybe one for FooBus) generated by the > > intermediary that sets up dma using whackyPCICard. > > That's why this would only do something *iff* nothing else has set a > group (like that intermediary). > > > > > > > It would probably make sense to have some > > > > > > kind of top-most isolation group that contains everything as it's an > > > > > > easy proof that if you own everything, you're isolated. > > > > > > > > > > Hrm, no. Things that have no IOMMU above them will have ungated > > > > > access to system RAM, and so can never be safely isolated for > > > > > userspace purposes, even if userspace owned every _device_ in the > > > > > system (not that you could do that in practice, anyway). 
> > > > > > > > RAM is a good point, there are "non-devices" to worry about. > > > > Potentially that top level group doesn't allow managers. > > > > > > I don't really see the point of a "top-level" group. Groups are > > > exclusive, not heirarchical, and I don't see that there are any paths > > > really simplified by having a (not manageable) "leftovers" group. > > > > Yeah, a top level group may not make sense, but if we plan for dma_ops, > > we need non-manageable groups, which I think is just the degenerate case > > of isolation quality. > > Yeah, I'll buy that. Pending more details on dwmw2's plans, anyway. > > > > > > > Potentially > > > > > > though, wackyPCIcard can also register isolation groups for each of it's > > > > > > FooBus devices if it's able to provide that capability. Thanks, > > > > > > > > > > It could, but why should it. It doesn't know anything about IOMMUs or > > > > > isolation, and it doesn't need to. Even less so for PCI devices which > > > > > create subordinate non-PCI struct devices for internal reasons. > > > > > > > > Sorry I wasn't clear. If wackyPCIcard happens to include an onboard > > > > IOMMU of some sort that's capable of isolation, it might be able to > > > > register groups for each of the devices below it. Therefore we could > > > > end up with a scenario like above that the segment may not have ACS and > > > > therefore not be able to isolate wackyPCIcard from otherdevice, but > > > > wackyPCIcard can isolate FooDev1 from FooDev2 and otherdevice. I think > > > > that means we potentially need to check all devices downstream of an > > > > isolation group to allow a manager to lock it, as well as checking the > > > > path upstream to make sure it isn't used above... Are you sure we're > > > > ready for this kind of infrastructure? Thanks, > > > > > > Hrm, at this stage I think I'm prepared to assume that IOMMUs are > > > either strictly independent (e.g. 
covering different PCI domains) or > > > are at least vaguely aware of each other (i.e. platform conventions > > > cover the combination). Truly nested IOMMUs, that are entirely > > > unaware of each other is a problem for another day (by which I mean > > > probably never). > > > > Ok, but even if there's no iommu onboard whackyPCICard, it can still be > > possible to create groups on FooBus just to facilitate dma_ops and > > handle quirks. > > No, you can't arbitrarily go creating groups - the group boundary > means something specific. If dma_ops or something needs more info, it > will need that *in addition* to the group. > > > > That said, again, groups have to be exclusive, not heirarchical, so > > > in the above case, I think we'd have a few possible ways to go: > > > > Ownership absolutely has to be exclusive, but it looks like groups > > themselves are hierarchical. > > Doesn't really make sense, since a group is defined as the minimum > isolatable parcel. You can do something if as you walk down the tree > isolation level decreases, but it's certainly not feeling right. > > > > 1) The FooBus isolation provider could refuse to initialize if > > > the FooBus bridge is sharing an iso group with something else before > > > it probes the FooBus. > > > > Sharing, as in a parent group is managed? > > No, just as in a group exists which contains both the FooBus bridge > and something else. Basically this option means that if you plugged > the FooBus bridge into a slot that wasn't sufficiently isolated, it > would simply refuse to work. > > > Yes, something special needs > > to happen in that case, I'd probably investigate having it initialized > > in a managed state versus not initialized at all. 
> > > > > 2) The FooBus isolation provider could be in the FooBus bridge > > > driver - meaning that if someone tried to grab the PCI group including > > > the FooBridge, the provider driver would be booted out, causing the > > > FooBus groups to cease to exist (meaning the PCI group manager would > > > never get lock while someone was already using the FooGroups). In > > > > Right, I don't know that they need to cease to exist, but if a manager > > is attached anywhere above or below the group it wants to lock, it has > > to be blocked. > > > > > this case, it gets a bit complex. When the FooBus isolation provider > > > is active, the FooBus devices would be in their own groups, not the > > > group of the FooBridge and its sibling. When the FooBus isolation > > > provider is removed, it would have to configure the FooBus IOMMU to a > > > passthrough mode, and revert the FooBus devices to the parent's > > > group. Hm. Complicated. > > > > Yep. I think we're arriving at the same point. Groups are > > hierarchical, but ownership via a manager cannot be nested. So to > > manage a group, we need to walk the entire tree of devices below each > > device checking that none of the groups are managed and all the devices > > are using the right driver, then walk up from the group to verify no > > group of a parent device is managed. Thanks, > > Blargh. I really, really hope we can come up with a simpler model > than that. Yep, I'm pretty well at the end of this experiment. Honestly, I think isolation groups are the wrong approach. We're trying to wrap too many concepts together and it's completely unmanageable. I cannot see adding the complexity we're talking about here to the core device model for such a niche usage. I think we're better off going back to the iommu_device_group() and building that out into something more complete, starting with group based iommu ops and a dma quirk infrastructure. 
From there we can add some basic facilities to toggle driver autoprobe, maybe setup notifications for the group, and hopefully include a way to share iommu mappings between groups. Anything much beyond that we should probably leave for something like the vfio driver. Thanks, Alex
On Tue, Mar 27, 2012 at 01:34:43PM -0600, Alex Williamson wrote: [snip] > > > > this case, it gets a bit complex. When the FooBus isolation provider > > > > is active, the FooBus devices would be in their own groups, not the > > > > group of the FooBridge and its sibling. When the FooBus isolation > > > > provider is removed, it would have to configure the FooBus IOMMU to a > > > > passthrough mode, and revert the FooBus devices to the parent's > > > > group. Hm. Complicated. > > > > > > Yep. I think we're arriving at the same point. Groups are > > > hierarchical, but ownership via a manager cannot be nested. So to > > > manage a group, we need to walk the entire tree of devices below each > > > device checking that none of the groups are managed and all the devices > > > are using the right driver, then walk up from the group to verify no > > > group of a parent device is managed. Thanks, > > > > Blargh. I really, really hope we can come up with a simpler model > > than that. > > Yep, I'm pretty well at the end of this experiment. Honestly, I think > isolation groups are the wrong approach. We're trying to wrap too many > concepts together and it's completely unmanageable. I cannot see adding > the complexity we're talking about here to the core device model for > such a niche usage. I think we're better off going back to the > iommu_device_group() and building that out into something more complete, > starting with group based iommu ops and a dma quirk infrastructure. > From there we can add some basic facilities to toggle driver autoprobe, > maybe setup notifications for the group, and hopefully include a way to > share iommu mappings between groups. Anything much beyond that we > should probably leave for something like the vfio driver. Thanks, Yes, well, I was hoping for a simpler model that didn't involve simply sweeping all the issues under a rug.
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 7be9f79..e98a5f3 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -189,4 +189,14 @@ config DMA_SHARED_BUFFER APIs extension; the file's descriptor can then be passed on to other driver. +config ISOLATION_GROUPS + bool "Enable Isolation Group API" + default n + depends on EXPERIMENTAL && IOMMU_API + help + This option enables grouping of devices into Isolation Groups + which may be used by other subsystems to perform quirks across + sets of devices as well as userspace drivers for guaranteeing + devices are isolated from the rest of the system. + endmenu diff --git a/drivers/base/Makefile b/drivers/base/Makefile index 610f999..047b5f9 100644 --- a/drivers/base/Makefile +++ b/drivers/base/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_MODULES) += module.o endif obj-$(CONFIG_SYS_HYPERVISOR) += hypervisor.o obj-$(CONFIG_REGMAP) += regmap/ +obj-$(CONFIG_ISOLATION_GROUPS) += isolation.o ccflags-$(CONFIG_DEBUG_DRIVER) := -DDEBUG diff --git a/drivers/base/base.h b/drivers/base/base.h index b858dfd..376758a 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -1,4 +1,5 @@ #include <linux/notifier.h> +#include <linux/isolation.h> /** * struct subsys_private - structure to hold the private to the driver core portions of the bus_type/class structure. @@ -108,6 +109,10 @@ extern int driver_probe_device(struct device_driver *drv, struct device *dev); static inline int driver_match_device(struct device_driver *drv, struct device *dev) { + if (isolation_group_managed(to_isolation_group(dev)) && + !isolation_group_manager_driver(to_isolation_group(dev), drv)) + return 0; + return drv->bus->match ? drv->bus->match(dev, drv) : 1; } diff --git a/drivers/base/isolation.c b/drivers/base/isolation.c new file mode 100644 index 0000000..c01365c --- /dev/null +++ b/drivers/base/isolation.c @@ -0,0 +1,798 @@ +/* + * Isolation group interface + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. 
+ * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/device.h> +#include <linux/iommu.h> +#include <linux/isolation.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/slab.h> +#include <linux/uuid.h> + +static struct kset *isolation_kset; +/* XXX add more complete locking, maybe rcu */ +static DEFINE_MUTEX(isolation_lock); +static LIST_HEAD(isolation_groups); +static LIST_HEAD(isolation_notifiers); + +/* Keep these private */ +struct isolation_manager_driver { + struct device_driver *drv; + struct list_head list; +}; + +struct isolation_manager { + struct list_head drivers; + /* Likely need managers to register some callbacks */ +}; + +struct isolation_group { + struct list_head list; + struct list_head devices; + struct kobject kobj; + struct kobject *devices_kobj; + struct iommu_domain *iommu_domain; + struct iommu_ops *iommu_ops; + struct isolation_manager *manager; + uuid_le uuid; +}; + +struct isolation_device { + struct device *dev; + struct list_head list; +}; + +struct isolation_notifier { + struct bus_type *bus; + struct notifier_block nb; + struct blocking_notifier_head notifier; + struct list_head list; + int refcnt; +}; + +struct iso_attribute { + struct attribute attr; + ssize_t (*show)(struct isolation_group *group, char *buf); + ssize_t (*store)(struct isolation_group *group, + const char *buf, size_t count); +}; + +#define to_iso_attr(_attr) container_of(_attr, struct iso_attribute, attr) +#define to_iso_group(_kobj) container_of(_kobj, struct isolation_group, kobj) + +static ssize_t iso_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct iso_attribute *iso_attr = to_iso_attr(attr); + struct isolation_group *group = to_iso_group(kobj); + ssize_t ret = 
-EIO; + + if (iso_attr->show) + ret = iso_attr->show(group, buf); + return ret; +} + +static ssize_t iso_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct iso_attribute *iso_attr = to_iso_attr(attr); + struct isolation_group *group = to_iso_group(kobj); + ssize_t ret = -EIO; + + if (iso_attr->store) + ret = iso_attr->store(group, buf, count); + return ret; +} + +static void iso_release(struct kobject *kobj) +{ + struct isolation_group *group = to_iso_group(kobj); + kfree(group); +} + +static const struct sysfs_ops iso_sysfs_ops = { + .show = iso_attr_show, + .store = iso_attr_store, +}; + +static struct kobj_type iso_ktype = { + .sysfs_ops = &iso_sysfs_ops, + .release = iso_release, +}; + +/* + * Create an isolation group. Isolation group "providers" register a + * notifier for their bus(es) and call this to create a new isolation + * group. + * Returns the new group or an ERR_PTR() on failure. + */ +struct isolation_group *isolation_create_group(void) +{ + struct isolation_group *group, *tmp; + int ret; + + if (unlikely(!isolation_kset)) + return ERR_PTR(-ENODEV); + + group = kzalloc(sizeof(*group), GFP_KERNEL); + if (!group) + return ERR_PTR(-ENOMEM); + + mutex_lock(&isolation_lock); + + /* + * Use a UUID for group identification simply because it seems wrong + * to expose it as a kernel pointer (%p). Neither is user friendly. + * Mostly only expect userspace to do anything with this. + */ +new_uuid: + uuid_le_gen(&group->uuid); + + /* Unlikely, but nothing prevents duplicates */ + list_for_each_entry(tmp, &isolation_groups, list) + if (memcmp(&group->uuid, &tmp->uuid, sizeof(group->uuid)) == 0) + goto new_uuid; + + INIT_LIST_HEAD(&group->devices); + group->kobj.kset = isolation_kset; + + ret = kobject_init_and_add(&group->kobj, &iso_ktype, NULL, + "%pUl", &group->uuid); + if (ret) { + /* + * Once kobject_init_and_add() has been called, the kobject + * must be released with kobject_put(); iso_release() then + * frees group. A bare kfree() here would bypass kobject + * cleanup and leak the allocated name. + */ + kobject_put(&group->kobj); + mutex_unlock(&isolation_lock); + return ERR_PTR(ret); + } + + /* Add a subdirectory to save links for devices within the group. 
*/ + group->devices_kobj = kobject_create_and_add("devices", &group->kobj); + if (!group->devices_kobj) { + kobject_put(&group->kobj); + mutex_unlock(&isolation_lock); + return ERR_PTR(-ENOMEM); + } + + list_add(&group->list, &isolation_groups); + + mutex_unlock(&isolation_lock); + + return group; +} + +/* + * Free isolation group. XXX need to cleanup/reject based on manager status. + */ +int isolation_free_group(struct isolation_group *group) +{ + mutex_lock(&isolation_lock); + + if (!list_empty(&group->devices)) { + mutex_unlock(&isolation_lock); + return -EBUSY; + } + + list_del(&group->list); + kobject_put(group->devices_kobj); + kobject_put(&group->kobj); + + mutex_unlock(&isolation_lock); + return 0; +} + +/* + * Add a device to an isolation group. Isolation groups start empty and + * must be told about the devices they contain. Expect this to be called + * from isolation group providers via notifier. + */ +int isolation_group_add_dev(struct isolation_group *group, struct device *dev) +{ + struct isolation_device *device; + int ret = 0; + + mutex_lock(&isolation_lock); + + if (dev->isolation_group) { + ret = -EBUSY; + goto out; + } + + device = kzalloc(sizeof(*device), GFP_KERNEL); + if (!device) { + ret = -ENOMEM; + goto out; + } + + device->dev = dev; + + /* Cross link the device in sysfs */ + ret = sysfs_create_link(&dev->kobj, &group->kobj, + "isolation_group"); + if (ret) { + kfree(device); + goto out; + } + + ret = sysfs_create_link(group->devices_kobj, &dev->kobj, + kobject_name(&dev->kobj)); + if (ret) { + sysfs_remove_link(&dev->kobj, "isolation_group"); + kfree(device); + goto out; + } + + list_add(&device->list, &group->devices); + + dev->isolation_group = group; + + if (group->iommu_domain) { + ret = group->iommu_ops->attach_dev(group->iommu_domain, dev); + if (ret) { + /* XXX fail device? */ + } + } + + /* XXX signal manager? 
*/ +out: + mutex_unlock(&isolation_lock); + + return 0; +} + +/* + * Remove a device from an isolation group, likely because the device + * went away. + */ +int isolation_group_del_dev(struct device *dev) +{ + struct isolation_group *group = dev->isolation_group; + struct isolation_device *device; + + if (!group) + return -ENODEV; + + mutex_lock(&isolation_lock); + + list_for_each_entry(device, &group->devices, list) { + if (device->dev == dev) { + /* XXX signal manager? */ + + if (group->iommu_domain) + group->iommu_ops->detach_dev( + group->iommu_domain, dev); + list_del(&device->list); + kfree(device); + dev->isolation_group = NULL; + sysfs_remove_link(group->devices_kobj, + kobject_name(&dev->kobj)); + sysfs_remove_link(&dev->kobj, "isolation_group"); + break; + } + } + + mutex_unlock(&isolation_lock); + + return 0; +} + +/* + * Filter and call out to our own notifier chains + */ +static int isolation_bus_notify(struct notifier_block *nb, + unsigned long action, void *data) +{ + struct isolation_notifier *notifier = + container_of(nb, struct isolation_notifier, nb); + struct device *dev = data; + + switch (action) { + case BUS_NOTIFY_ADD_DEVICE: + if (!dev->isolation_group) + blocking_notifier_call_chain(¬ifier->notifier, + ISOLATION_NOTIFY_ADD_DEVICE, dev); + break; + case BUS_NOTIFY_DEL_DEVICE: + if (dev->isolation_group) + blocking_notifier_call_chain(¬ifier->notifier, + ISOLATION_NOTIFY_DEL_DEVICE, dev); + break; + } + + return NOTIFY_DONE; +} + +/* + * Wrapper for manual playback of BUS_NOTIFY_ADD_DEVICE when we add + * a new notifier. + */ +static int isolation_do_add_notify(struct device *dev, void *data) +{ + struct notifier_block *nb = data; + + if (!dev->isolation_group) + nb->notifier_call(nb, ISOLATION_NOTIFY_ADD_DEVICE, dev); + + return 0; +} + +/* + * Register a notifier. This is for isolation group providers. 
It's + * possible we could have multiple isolation group providers for the same + * bus type, so we create a struct isolation_notifier per bus_type, each + * with a notifier block. Providers are therefore welcome to register + * as many buses as they can handle. + */ +int isolation_register_notifier(struct bus_type *bus, struct notifier_block *nb) +{ + struct isolation_notifier *notifier; + bool found = false; + int ret; + + mutex_lock(&isolation_lock); + list_for_each_entry(notifier, &isolation_notifiers, list) { + if (notifier->bus == bus) { + found = true; + break; + } + } + + if (!found) { + notifier = kzalloc(sizeof(*notifier), GFP_KERNEL); + if (!notifier) { + mutex_unlock(&isolation_lock); + return -ENOMEM; + } + notifier->bus = bus; + notifier->nb.notifier_call = isolation_bus_notify; + BLOCKING_INIT_NOTIFIER_HEAD(&notifier->notifier); + /* NOTE(review): bus_register_notifier() return value is + * ignored here - TODO check and unwind on failure. */ + bus_register_notifier(bus, &notifier->nb); + list_add(&notifier->list, &isolation_notifiers); + } + + ret = blocking_notifier_chain_register(&notifier->notifier, nb); + if (ret) { + /* Tear down the per-bus notifier if we just created it. */ + if (notifier->refcnt == 0) { + bus_unregister_notifier(bus, &notifier->nb); + list_del(&notifier->list); + kfree(notifier); + } + mutex_unlock(&isolation_lock); + return ret; + } + notifier->refcnt++; + + mutex_unlock(&isolation_lock); + + /* Replay ADD_DEVICE for devices that existed before registration. */ + bus_for_each_dev(bus, NULL, nb, isolation_do_add_notify); + + return 0; +} + +/* + * Unregister... 
+ */
+int isolation_unregister_notifier(struct bus_type *bus,
+				  struct notifier_block *nb)
+{
+	struct isolation_notifier *notifier;
+	bool found = false;
+	int ret;
+
+	mutex_lock(&isolation_lock);
+	list_for_each_entry(notifier, &isolation_notifiers, list) {
+		if (notifier->bus == bus) {
+			found = true;
+			break;
+		}
+	}
+
+	if (!found) {
+		mutex_unlock(&isolation_lock);
+		return -ENODEV;
+	}
+
+	ret = blocking_notifier_chain_unregister(&notifier->notifier, nb);
+	if (ret) {
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+
+	/* Free if nobody left in the notifier block */
+	if (--notifier->refcnt == 0) {
+		bus_unregister_notifier(bus, &notifier->nb);
+		list_del(&notifier->list);
+		kfree(notifier);
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * Set iommu_ops on an isolation group.  Hopefully all DMA will eventually
+ * be done through isolation groups, for now, we just provide another
+ * interface to the iommu api.
+ */
+int isolation_group_set_iommu_ops(struct isolation_group *group,
+				  struct iommu_ops *iommu_ops)
+{
+	mutex_lock(&isolation_lock);
+
+	if (group->iommu_ops) {
+		mutex_unlock(&isolation_lock);
+		return -EBUSY;
+	}
+
+	group->iommu_ops = iommu_ops;
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * Attach all the devices for a group to the specified iommu domain.
+ */
+static int __isolation_group_domain_attach_devs(struct iommu_domain *domain,
+						struct list_head *devices)
+{
+	struct isolation_device *device;
+	struct device *dev;
+	int ret = 0;
+
+	list_for_each_entry(device, devices, list) {
+		dev = device->dev;
+
+		ret = domain->ops->attach_dev(domain, dev);
+		if (ret) {
+			list_for_each_entry_continue_reverse(device,
+							     devices, list) {
+				dev = device->dev;
+				domain->ops->detach_dev(domain, dev);
+			}
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * And detach...
+ */ +static void __isolation_group_domain_detach_devs(struct iommu_domain *domain, + struct list_head *devices) +{ + struct isolation_device *device; + struct device *dev; + + list_for_each_entry(device, devices, list) { + dev = device->dev; + + domain->ops->detach_dev(domain, dev); + } +} + +/* + * Initialize the iommu domain for an isolation group. This is to ease the + * transition to using isolation groups and expected to be used by current + * users of the iommu api for now. + */ +int isolation_group_init_iommu_domain(struct isolation_group *group) +{ + int ret; + + mutex_lock(&isolation_lock); + + if (!group->iommu_ops || list_empty(&group->devices)) { + mutex_unlock(&isolation_lock); + return -EINVAL; + } + + if (group->iommu_domain) { + mutex_unlock(&isolation_lock); + return -EBUSY; + } + + group->iommu_domain = kzalloc(sizeof(struct iommu_domain), GFP_KERNEL); + if (!group->iommu_domain) { + mutex_unlock(&isolation_lock); + return -ENOMEM; + } + + group->iommu_domain->ops = group->iommu_ops; + + ret = group->iommu_ops->domain_init(group->iommu_domain); + if (ret) { + kfree(group->iommu_domain); + group->iommu_domain = NULL; + mutex_unlock(&isolation_lock); + return ret; + } + + /* Automatically attach all the devices in the group. */ + ret = __isolation_group_domain_attach_devs(group->iommu_domain, + &group->devices); + if (ret) { + group->iommu_ops->domain_destroy(group->iommu_domain); + kfree(group->iommu_domain); + group->iommu_domain = NULL; + mutex_unlock(&isolation_lock); + return ret; + } + + mutex_unlock(&isolation_lock); + return 0; +} + +/* + * And free... + * Note just below, we add the ability to add another group to an iommu + * domain, so we don't always destroy and free the domain here. 
+ */
+void isolation_group_free_iommu_domain(struct isolation_group *group)
+{
+	struct isolation_group *tmp;
+	bool inuse = false;
+
+	if (!group->iommu_domain || !group->iommu_ops)
+		return;
+
+	mutex_lock(&isolation_lock);
+
+	__isolation_group_domain_detach_devs(group->iommu_domain,
+					     &group->devices);
+
+	list_for_each_entry(tmp, &isolation_groups, list) {
+		if (tmp == group)
+			continue;
+		if (tmp->iommu_domain == group->iommu_domain) {
+			inuse = true;
+			break;
+		}
+	}
+
+	if (!inuse) {
+		group->iommu_ops->domain_destroy(group->iommu_domain);
+		kfree(group->iommu_domain);
+	}
+
+	group->iommu_domain = NULL;
+
+	mutex_unlock(&isolation_lock);
+}
+
+/*
+ * This handles the VFIO "merge" interface, allowing us to manage multiple
+ * isolation groups with a single domain.  We just rely on attach_dev to
+ * tell us whether this is ok.
+ */
+int isolation_group_iommu_domain_add_group(struct isolation_group *groupa,
+					   struct isolation_group *groupb)
+{
+	int ret;
+
+	if (!groupa->iommu_domain ||
+	    groupb->iommu_domain || list_empty(&groupb->devices))
+		return -EINVAL;
+
+	if (groupa->iommu_ops != groupb->iommu_ops)
+		return -EIO;
+
+	mutex_lock(&isolation_lock);
+
+	ret = __isolation_group_domain_attach_devs(groupa->iommu_domain,
+						   &groupb->devices);
+	if (ret) {
+		mutex_unlock(&isolation_lock);
+		return ret;
+	}
+
+	groupb->iommu_domain = groupa->iommu_domain;
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * XXX page mapping/unmapping and more iommu api wrappers
+ */
+
+/*
+ * Do we have a group manager?  Group managers restrict what drivers are
+ * allowed to attach to devices.  A group is "locked" when all of the devices
+ * for a group belong to group manager drivers (or no driver at all).  At
+ * that point, the group manager can initialize the iommu.  vfio is an example
+ * of a group manager and vfio-pci is an example of a driver that a group
+ * manager may add as a "managed" driver.
+ * Note that we don't gate iommu
+ * manipulation on being managed because we want to use it for regular DMA
+ * at some point as well.
+ */
+bool isolation_group_managed(struct isolation_group *group)
+{
+	return group->manager != NULL;
+}
+
+/*
+ * Initialize the group manager struct.  At this point the isolation group
+ * becomes "managed".
+ */
+int isolation_group_init_manager(struct isolation_group *group)
+{
+	mutex_lock(&isolation_lock);
+
+	if (group->manager) {
+		mutex_unlock(&isolation_lock);
+		return -EBUSY;
+	}
+
+	group->manager = kzalloc(sizeof(struct isolation_manager), GFP_KERNEL);
+	if (!group->manager) {
+		mutex_unlock(&isolation_lock);
+		return -ENOMEM;
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * And free... cleanup registered managed drivers too.
+ */
+void isolation_group_free_manager(struct isolation_group *group)
+{
+	struct isolation_manager_driver *driver, *tmp;
+
+	if (!group->manager)
+		return;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry_safe(driver, tmp, &group->manager->drivers, list) {
+		list_del(&driver->list);
+		kfree(driver);
+	}
+
+	kfree(group->manager);
+	group->manager = NULL;
+	mutex_unlock(&isolation_lock);
+}
+
+/*
+ * Add a managed driver.  Drivers added here are the only ones that will
+ * be allowed to attach to a managed isolation group.
+ */
+int isolation_group_manager_add_driver(struct isolation_group *group,
+				       struct device_driver *drv)
+{
+	struct isolation_manager_driver *driver;
+
+	if (!group->manager)
+		return -EINVAL;
+
+	driver = kzalloc(sizeof(*driver), GFP_KERNEL);
+	if (!driver)
+		return -ENOMEM;
+
+	driver->drv = drv;
+
+	mutex_lock(&isolation_lock);
+
+	list_add(&driver->list, &group->manager->drivers);
+
+	mutex_unlock(&isolation_lock);
+
+	return 0;
+}
+
+/*
+ * And remove a driver.  Don't really expect to need this much.
+ */
+void isolation_group_manager_del_driver(struct isolation_group *group,
+					struct device_driver *drv)
+{
+	struct isolation_manager_driver *driver;
+
+	if (!group->manager)
+		return;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry(driver, &group->manager->drivers, list) {
+		if (driver->drv == drv) {
+			list_del(&driver->list);
+			kfree(driver);
+			break;
+		}
+	}
+
+	mutex_unlock(&isolation_lock);
+}
+
+/*
+ * Test whether a driver is a "managed" driver for the group.  This allows
+ * us to preempt normal driver matching and only let our drivers in.
+ */
+bool isolation_group_manager_driver(struct isolation_group *group,
+				    struct device_driver *drv)
+{
+	struct isolation_manager_driver *driver;
+	bool found = false;
+
+	if (!group->manager)
+		return found;
+
+	mutex_lock(&isolation_lock);
+
+	list_for_each_entry(driver, &group->manager->drivers, list) {
+		if (driver->drv == drv) {
+			found = true;
+			break;
+		}
+	}
+
+	mutex_unlock(&isolation_lock);
+
+	return found;
+}
+
+/*
+ * Does the group manager have control of all of the devices in the group?
+ * We consider driver-less devices to be ok here as they don't do DMA and
+ * don't present any interfaces to subvert the rest of the group.
+ */ +bool isolation_group_manager_locked(struct isolation_group *group) +{ + struct isolation_device *device; + struct isolation_manager_driver *driver; + bool found, locked = true; + + if (!group->manager) + return false; + + mutex_lock(&isolation_lock); + + list_for_each_entry(device, &group->devices, list) { + found = false; + + if (!device->dev->driver) + continue; + + list_for_each_entry(driver, &group->manager->drivers, list) { + if (device->dev->driver == driver->drv) { + found = true; + break; + } + } + + if (!found) { + locked = false; + break; + } + } + + mutex_unlock(&isolation_lock); + + return locked; +} + +static int __init isolation_init(void) +{ + isolation_kset = kset_create_and_add("isolation", NULL, NULL); + + WARN(!isolation_kset, "Failed to initialize isolation group kset\n"); + + return isolation_kset ? 0 : -1; +} +subsys_initcall(isolation_init); diff --git a/include/linux/device.h b/include/linux/device.h index b63fb39..5805c56 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -663,6 +663,10 @@ struct device { struct device_dma_parameters *dma_parms; +#ifdef CONFIG_ISOLATION_GROUPS + struct isolation_group *isolation_group; +#endif + struct list_head dma_pools; /* dma pools (if dma'ble) */ struct dma_coherent_mem *dma_mem; /* internal for coherent mem diff --git a/include/linux/isolation.h b/include/linux/isolation.h new file mode 100644 index 0000000..1d87caf --- /dev/null +++ b/include/linux/isolation.h @@ -0,0 +1,138 @@ +/* + * Isolation group interface + * + * Copyright (C) 2012 Red Hat, Inc. All rights reserved. + * Author: Alex Williamson <alex.williamson@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#ifndef _LINUX_ISOLATION_H +#define _LINUX_ISOLATION_H + +#define ISOLATION_NOTIFY_ADD_DEVICE 1 +#define ISOLATION_NOTIFY_DEL_DEVICE 2 + +#ifdef CONFIG_ISOLATION_GROUPS + +extern struct isolation_group *isolation_create_group(void); +extern int isolation_free_group(struct isolation_group *group); +extern int isolation_group_add_dev(struct isolation_group *group, + struct device *dev); +extern int isolation_group_del_dev(struct device *dev); +extern int isolation_register_notifier(struct bus_type *bus, + struct notifier_block *nb); +extern int isolation_unregister_notifier(struct bus_type *bus, + struct notifier_block *nb); +extern int isolation_group_set_iommu_ops(struct isolation_group *group, + struct iommu_ops *iommu_ops); +extern int isolation_group_init_iommu_domain(struct isolation_group *group); +extern void isolation_group_free_iommu_domain(struct isolation_group *group); +extern int isolation_group_iommu_domain_add_group( + struct isolation_group *groupa, struct isolation_group *groupb); +extern bool isolation_group_managed(struct isolation_group *group); +extern int isolation_group_init_manager(struct isolation_group *group); +extern void isolation_group_free_manager(struct isolation_group *group); +extern int isolation_group_manager_add_driver(struct isolation_group *group, + struct device_driver *drv); +extern void isolation_group_manager_del_driver(struct isolation_group *group, + struct device_driver *drv); +extern bool isolation_group_manager_driver(struct isolation_group *group, + struct device_driver *drv); +extern bool isolation_group_manager_locked(struct isolation_group *group); + +#define to_isolation_group(dev) ((dev)->isolation_group) + +#else /* CONFIG_ISOLATION_GROUPS */ + +static inline struct isolation_group *isolation_create_group(void) +{ + return NULL; +} + +static inline int isolation_free_group(struct isolation_group *group) +{ + return 0; +} + +static inline int isolation_group_add_dev(struct isolation_group *group, + 
struct device *dev) +{ + return 0; +} + +static inline int isolation_group_del_dev(struct device *dev) +{ + return 0; +} + +static int isolation_register_notifier(struct bus_type *bus, + struct notifier_block *nb) +{ + return 0; +} + +static int isolation_unregister_notifier(struct bus_type *bus, + struct notifier_block *nb) +{ + return 0; +} + +static int isolation_group_set_iommu_ops(struct isolation_group *group, + struct iommu_ops *iommu_ops) +{ + return -ENOSYS; +} + +static int isolation_group_init_iommu_domain(struct isolation_group *group) +{ + return -ENOSYS; +} + +static void isolation_group_free_iommu_domain(struct isolation_group *group) +{ +} + +static int isolation_group_iommu_domain_add_group( + struct isolation_group *groupa, struct isolation_group *groupb) +{ + return -ENOSYS; +} + +static int isolation_group_init_manager(struct isolation_group *group) +{ + return -ENOSYS; +} + +static void isolation_group_free_manager(struct isolation_group *group) +{ +} + +static int isolation_group_manager_add_driver(struct isolation_group *group, + struct device_driver *drv) +{ + return -ENOSYS; +} + +static void isolation_group_manager_del_driver(struct isolation_group *group, + struct device_driver *drv) +{ +} + +static bool isolation_group_manager_locked(struct isolation_group *group) +{ + return false; +} + +#define to_isolation_group(dev) (NULL) + +#define isolation_group_managed(group) (false) + +#define isolation_group_manager_driver(group, drv) (false) + +#endif /* CONFIG_ISOLATION_GROUPS */ + +#endif /* _LINUX_ISOLATION_H */
Signed-off-by: Alex Williamson <alex.williamson@redhat.com> --- drivers/base/Kconfig | 10 + drivers/base/Makefile | 1 drivers/base/base.h | 5 drivers/base/isolation.c | 798 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/device.h | 4 include/linux/isolation.h | 138 ++++++++ 6 files changed, 956 insertions(+), 0 deletions(-) create mode 100644 drivers/base/isolation.c create mode 100644 include/linux/isolation.h