diff mbox

[RFC,v4,1/3] Mediated device Core driver

Message ID 1464119897-10844-2-git-send-email-kwankhede@nvidia.com
State New
Headers show

Commit Message

Kirti Wankhede May 24, 2016, 7:58 p.m. UTC
Design for Mediated Device Driver:
Main purpose of this driver is to provide a common interface for mediated
device management that can be used by differnt drivers of different
devices.

This module provides a generic interface to create the device, add it to
mediated bus, add device to IOMMU group and then add it to vfio group.

Below is the high Level block diagram, with Nvidia, Intel and IBM devices
as example, since these are the devices which are going to actively use
this module as of now.

 +---------------+
 |               |
 | +-----------+ |  mdev_register_driver() +--------------+
 | |           | +<------------------------+ __init()     |
 | |           | |                         |              |
 | |  mdev     | +------------------------>+              |<-> VFIO user
 | |  bus      | |     probe()/remove()    | vfio_mpci.ko |    APIs
 | |  driver   | |                         |              |
 | |           | |                         +--------------+
 | |           | |  mdev_register_driver() +--------------+
 | |           | +<------------------------+ __init()     |
 | |           | |                         |              |
 | |           | +------------------------>+              |<-> VFIO user
 | +-----------+ |     probe()/remove()    | vfio_mccw.ko |    APIs
 |               |                         |              |
 |  MDEV CORE    |                         +--------------+
 |   MODULE      |
 |   mdev.ko     |
 | +-----------+ |  mdev_register_device() +--------------+
 | |           | +<------------------------+              |
 | |           | |                         |  nvidia.ko   |<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | | Physical  | |
 | |  device   | |  mdev_register_device() +--------------+
 | | interface | |<------------------------+              |
 | |           | |                         |  i915.ko     |<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | |           | |
 | |           | |  mdev_register_device() +--------------+
 | |           | +<------------------------+              |
 | |           | |                         | ccw_device.ko|<-> physical
 | |           | +------------------------>+              |    device
 | |           | |        callback         +--------------+
 | +-----------+ |
 +---------------+

Core driver provides two types of registration interfaces:
1. Registration interface for mediated bus driver:

/**
  * struct mdev_driver - Mediated device's driver
  * @name: driver name
  * @probe: called when new device created
  * @remove:called when device removed
  * @match: called when new device or driver is added for this bus.
	    Return 1 if given device can be handled by given driver and
	    zero otherwise.
  * @driver:device driver structure
  *
  **/
struct mdev_driver {
         const char *name;
         int  (*probe)  (struct device *dev);
         void (*remove) (struct device *dev);
	 int  (*match)(struct device *dev);
         struct device_driver    driver;
};

int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
void mdev_unregister_driver(struct mdev_driver *drv);

Mediated device's driver for mdev should use this interface to register
with Core driver. With this, mediated devices driver for such devices is
responsible to add mediated device to VFIO group.

2. Physical device driver interface
This interface provides vendor driver the set APIs to manage physical
device related work in their own driver. APIs are :
- supported_config: provide supported configuration list by the vendor
		    driver
- create: to allocate basic resources in vendor driver for a mediated
	  device.
- destroy: to free resources in vendor driver when mediated device is
	   destroyed.
- start: to initiate mediated device initialization process from vendor
	 driver when VM boots and before QEMU starts.
- shutdown: to teardown mediated device resources during VM teardown.
- read : read emulation callback.
- write: write emulation callback.
- set_irqs: send interrupt configuration information that QEMU sets.
- get_region_info: to provide region size and its flags for the mediated
		   device.
- validate_map_request: to validate remap pfn request.

This registration interface should be used by vendor drivers to register
each physical device to mdev core driver.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I88f4482f7608f40550a152c5f882b64271287c62
---
 drivers/vfio/Kconfig             |   1 +
 drivers/vfio/Makefile            |   1 +
 drivers/vfio/mdev/Kconfig        |  11 +
 drivers/vfio/mdev/Makefile       |   5 +
 drivers/vfio/mdev/mdev-core.c    | 462 +++++++++++++++++++++++++++++++++++++++
 drivers/vfio/mdev/mdev-driver.c  | 139 ++++++++++++
 drivers/vfio/mdev/mdev-sysfs.c   | 312 ++++++++++++++++++++++++++
 drivers/vfio/mdev/mdev_private.h |  33 +++
 include/linux/mdev.h             | 224 +++++++++++++++++++
 9 files changed, 1188 insertions(+)
 create mode 100644 drivers/vfio/mdev/Kconfig
 create mode 100644 drivers/vfio/mdev/Makefile
 create mode 100644 drivers/vfio/mdev/mdev-core.c
 create mode 100644 drivers/vfio/mdev/mdev-driver.c
 create mode 100644 drivers/vfio/mdev/mdev-sysfs.c
 create mode 100644 drivers/vfio/mdev/mdev_private.h
 create mode 100644 include/linux/mdev.h

Comments

Tian, Kevin May 25, 2016, 7:55 a.m. UTC | #1
> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
> Sent: Wednesday, May 25, 2016 3:58 AM
> 
> Design for Mediated Device Driver:
> Main purpose of this driver is to provide a common interface for mediated
> device management that can be used by differnt drivers of different
> devices.
> 
> This module provides a generic interface to create the device, add it to
> mediated bus, add device to IOMMU group and then add it to vfio group.
> 
> Below is the high Level block diagram, with Nvidia, Intel and IBM devices
> as example, since these are the devices which are going to actively use
> this module as of now.
> 
>  +---------------+
>  |               |
>  | +-----------+ |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |           | |                         |              |
>  | |  mdev     | +------------------------>+              |<-> VFIO user
>  | |  bus      | |     probe()/remove()    | vfio_mpci.ko |    APIs
>  | |  driver   | |                         |              |
>  | |           | |                         +--------------+
>  | |           | |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |           | |                         |              |
>  | |           | +------------------------>+              |<-> VFIO user
>  | +-----------+ |     probe()/remove()    | vfio_mccw.ko |    APIs
>  |               |                         |              |
>  |  MDEV CORE    |                         +--------------+
>  |   MODULE      |
>  |   mdev.ko     |
>  | +-----------+ |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         |  nvidia.ko   |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | | Physical  | |
>  | |  device   | |  mdev_register_device() +--------------+
>  | | interface | |<------------------------+              |
>  | |           | |                         |  i915.ko     |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | |           | |
>  | |           | |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         | ccw_device.ko|<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | +-----------+ |
>  +---------------+
> 
> Core driver provides two types of registration interfaces:
> 1. Registration interface for mediated bus driver:
> 
> /**
>   * struct mdev_driver - Mediated device's driver
>   * @name: driver name
>   * @probe: called when new device created
>   * @remove:called when device removed
>   * @match: called when new device or driver is added for this bus.
> 	    Return 1 if given device can be handled by given driver and
> 	    zero otherwise.
>   * @driver:device driver structure
>   *
>   **/
> struct mdev_driver {
>          const char *name;
>          int  (*probe)  (struct device *dev);
>          void (*remove) (struct device *dev);
> 	 int  (*match)(struct device *dev);
>          struct device_driver    driver;
> };
> 
> int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> void mdev_unregister_driver(struct mdev_driver *drv);
> 
> Mediated device's driver for mdev should use this interface to register
> with Core driver. With this, mediated devices driver for such devices is
> responsible to add mediated device to VFIO group.
> 
> 2. Physical device driver interface
> This interface provides vendor driver the set APIs to manage physical
> device related work in their own driver. APIs are :
> - supported_config: provide supported configuration list by the vendor
> 		    driver
> - create: to allocate basic resources in vendor driver for a mediated
> 	  device.
> - destroy: to free resources in vendor driver when mediated device is
> 	   destroyed.
> - start: to initiate mediated device initialization process from vendor
> 	 driver when VM boots and before QEMU starts.
> - shutdown: to teardown mediated device resources during VM teardown.
> - read : read emulation callback.
> - write: write emulation callback.
> - set_irqs: send interrupt configuration information that QEMU sets.
> - get_region_info: to provide region size and its flags for the mediated
> 		   device.
> - validate_map_request: to validate remap pfn request.
> 
> This registration interface should be used by vendor drivers to register
> each physical device to mdev core driver.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I88f4482f7608f40550a152c5f882b64271287c62
> ---
>  drivers/vfio/Kconfig             |   1 +
>  drivers/vfio/Makefile            |   1 +
>  drivers/vfio/mdev/Kconfig        |  11 +
>  drivers/vfio/mdev/Makefile       |   5 +
>  drivers/vfio/mdev/mdev-core.c    | 462
> +++++++++++++++++++++++++++++++++++++++
>  drivers/vfio/mdev/mdev-driver.c  | 139 ++++++++++++
>  drivers/vfio/mdev/mdev-sysfs.c   | 312 ++++++++++++++++++++++++++
>  drivers/vfio/mdev/mdev_private.h |  33 +++
>  include/linux/mdev.h             | 224 +++++++++++++++++++
>  9 files changed, 1188 insertions(+)
>  create mode 100644 drivers/vfio/mdev/Kconfig
>  create mode 100644 drivers/vfio/mdev/Makefile
>  create mode 100644 drivers/vfio/mdev/mdev-core.c
>  create mode 100644 drivers/vfio/mdev/mdev-driver.c
>  create mode 100644 drivers/vfio/mdev/mdev-sysfs.c
>  create mode 100644 drivers/vfio/mdev/mdev_private.h
>  create mode 100644 include/linux/mdev.h
> 
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index da6e2ce77495..23eced02aaf6 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU
> 
>  source "drivers/vfio/pci/Kconfig"
>  source "drivers/vfio/platform/Kconfig"
> +source "drivers/vfio/mdev/Kconfig"
>  source "virt/lib/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 7b8a31f63fea..7c70753e54ab 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) +=
> vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
>  obj-$(CONFIG_VFIO_PLATFORM) += platform/
> +obj-$(CONFIG_MDEV) += mdev/
> diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
> new file mode 100644
> index 000000000000..951e2bb06a3f
> --- /dev/null
> +++ b/drivers/vfio/mdev/Kconfig
> @@ -0,0 +1,11 @@
> +
> +config MDEV
> +    tristate "Mediated device driver framework"

Sorry not a native speaker. Is it cleaner to say "Driver framework for Mediated 
Devices" or "Mediated Device Framework"? Should we focus on driver or device
here?

> +    depends on VFIO
> +    default n
> +    help
> +        MDEV provides a framework to virtualize device without SR-IOV cap
> +        See Documentation/mdev.txt for more details.

Looks Documentation/mdev.txt is not included in this version.

> +
> +        If you don't know what do here, say N.
> +
> diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
> new file mode 100644
> index 000000000000..4adb069febce
> --- /dev/null
> +++ b/drivers/vfio/mdev/Makefile
> @@ -0,0 +1,5 @@
> +
> +mdev-y := mdev-core.o mdev-sysfs.o mdev-driver.o
> +
> +obj-$(CONFIG_MDEV) += mdev.o
> +
> diff --git a/drivers/vfio/mdev/mdev-core.c b/drivers/vfio/mdev/mdev-core.c
> new file mode 100644
> index 000000000000..af070d73735f
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev-core.c
> @@ -0,0 +1,462 @@
> +/*
> + * Mediated device Core Driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/slab.h>
> +#include <linux/cdev.h>
> +#include <linux/sched.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +#define DRIVER_VERSION		"0.1"
> +#define DRIVER_AUTHOR		"NVIDIA Corporation"
> +#define DRIVER_DESC		"Mediated device Core Driver"
> +
> +#define MDEV_CLASS_NAME		"mdev"
> +
> +/*
> + * Global Structures
> + */
> +
> +static struct devices_list {
> +	struct list_head    dev_list;
> +	struct mutex        list_lock;
> +} mdevices, phy_devices;

phy_devices -> pdevices? and similarly we can use pdev/mdev
pair in other places...

> +
> +/*
> + * Functions
> + */
> +
> +static int mdev_add_attribute_group(struct device *dev,
> +				    const struct attribute_group **groups)
> +{
> +	return sysfs_create_groups(&dev->kobj, groups);
> +}
> +
> +static void mdev_remove_attribute_group(struct device *dev,
> +					const struct attribute_group **groups)
> +{
> +	sysfs_remove_groups(&dev->kobj, groups);
> +}
> +
> +static struct mdev_device *find_mdev_device(uuid_le uuid, int instance)

can we just call it "struct mdev* or "mdevice"? "dev_device" looks redundant.

Sorry I may have to ask same question since I didn't get an answer yet.
what exactly does 'instance' mean here? since uuid is unique, why do 
we need match instance too?

> +{
> +	struct mdev_device *vdev = NULL, *v;

better to unify the notation here. what's the difference between mdev
and vdev?

> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_for_each_entry(v, &mdevices.dev_list, next) {
> +		if ((uuid_le_cmp(v->uuid, uuid) == 0) &&
> +		    (v->instance == instance)) {
> +			vdev = v;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&mdevices.list_lock);
> +	return vdev;
> +}
> +
> +static struct mdev_device *find_next_mdev_device(struct phy_device *phy_dev)
> +{
> +	struct mdev_device *mdev = NULL, *p;
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_for_each_entry(p, &mdevices.dev_list, next) {
> +		if (p->phy_dev == phy_dev) {
> +			mdev = p;
> +			break;
> +		}
> +	}

Looks above is to find the first mdev for a given physical device, instead of
finding next mdev

> +	mutex_unlock(&mdevices.list_lock);
> +	return mdev;
> +}
> +
> +static struct phy_device *find_physical_device(struct device *dev)
> +{
> +	struct phy_device *pdev = NULL, *p;
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	list_for_each_entry(p, &phy_devices.dev_list, next) {
> +		if (p->dev == dev) {
> +			pdev = p;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&phy_devices.list_lock);
> +	return pdev;
> +}
> +
> +static void mdev_destroy_device(struct mdev_device *mdevice)
> +{
> +	struct phy_device *phy_dev = mdevice->phy_dev;
> +
> +	if (phy_dev) {
> +		mutex_lock(&phy_devices.list_lock);
> +
> +		/*
> +		* If vendor driver doesn't return success that means vendor
> +		* driver doesn't support hot-unplug
> +		*/
> +		if (phy_dev->ops->destroy) {
> +			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
> +						  mdevice->instance)) {
> +				mutex_unlock(&phy_devices.list_lock);

a warning message is preferred. Also better to return -EBUSY here.

> +				return;
> +			}
> +		}
> +
> +		mdev_remove_attribute_group(&mdevice->dev,
> +					    phy_dev->ops->mdev_attr_groups);
> +		mdevice->phy_dev = NULL;

Am I missing something here? You didn't remove this mdev node from
the list, and below...

> +		mutex_unlock(&phy_devices.list_lock);

you should use mutex of mdevices list

> +	}
> +
> +	mdev_put_device(mdevice);
> +	device_unregister(&mdevice->dev);
> +}
> +
> +/*
> + * Find mediated device from given iommu_group and increment refcount of
> + * mediated device. Caller should call mdev_put_device() when the use of
> + * mdev_device is done.
> + */
> +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group)
> +{
> +	struct mdev_device *mdev = NULL, *p;
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_for_each_entry(p, &mdevices.dev_list, next) {
> +		if (!p->group)
> +			continue;
> +
> +		if (iommu_group_id(p->group) == iommu_group_id(group)) {
> +			mdev = mdev_get_device(p);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&mdevices.list_lock);
> +	return mdev;
> +}
> +EXPORT_SYMBOL_GPL(mdev_get_device_by_group);
> +
> +/*
> + * mdev_register_device : Register a device
> + * @dev: device structure representing physical device.
> + * @phy_device_ops: Physical device operation structure to be registered.
> + *
> + * Add device to list of registered physical devices.
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_device(struct device *dev, const struct phy_device_ops *ops)
> +{
> +	int ret = 0;
> +	struct phy_device *phy_dev, *pdev;
> +
> +	if (!dev || !ops)
> +		return -EINVAL;
> +
> +	/* Check for duplicate */
> +	pdev = find_physical_device(dev);
> +	if (pdev)
> +		return -EEXIST;
> +
> +	phy_dev = kzalloc(sizeof(*phy_dev), GFP_KERNEL);
> +	if (!phy_dev)
> +		return -ENOMEM;
> +
> +	phy_dev->dev = dev;
> +	phy_dev->ops = ops;
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	ret = mdev_create_sysfs_files(dev);
> +	if (ret)
> +		goto add_sysfs_error;
> +
> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
> +	if (ret)
> +		goto add_group_error;

any reason to include sysfs operations inside the mutex which is
purely about phy_devices list?

> +
> +	list_add(&phy_dev->next, &phy_devices.dev_list);
> +	dev_info(dev, "MDEV: Registered\n");
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	return 0;
> +
> +add_group_error:
> +	mdev_remove_sysfs_files(dev);
> +add_sysfs_error:
> +	mutex_unlock(&phy_devices.list_lock);
> +	kfree(phy_dev);
> +	return ret;
> +}
> +EXPORT_SYMBOL(mdev_register_device);
> +
> +/*
> + * mdev_unregister_device : Unregister a physical device
> + * @dev: device structure representing physical device.
> + *
> + * Remove device from list of registered physical devices. Gives a change to
> + * free existing mediated devices for the given physical device.
> + */
> +
> +void mdev_unregister_device(struct device *dev)
> +{
> +	struct phy_device *phy_dev;
> +	struct mdev_device *vdev = NULL;
> +
> +	phy_dev = find_physical_device(dev);
> +
> +	if (!phy_dev)
> +		return;
> +
> +	dev_info(dev, "MDEV: Unregistering\n");
> +
> +	while ((vdev = find_next_mdev_device(phy_dev)))
> +		mdev_destroy_device(vdev);

Need check return value here since ops->destroy may fail.

> +
> +	mutex_lock(&phy_devices.list_lock);
> +	list_del(&phy_dev->next);
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	mdev_remove_attribute_group(dev,
> +				    phy_dev->ops->dev_attr_groups);
> +
> +	mdev_remove_sysfs_files(dev);
> +	kfree(phy_dev);
> +}
> +EXPORT_SYMBOL(mdev_unregister_device);
> +
> +/*
> + * Functions required for mdev-sysfs
> + */
> +
> +static struct mdev_device *mdev_device_alloc(uuid_le uuid, int instance)
> +{
> +	struct mdev_device *mdevice = NULL;
> +
> +	mdevice = kzalloc(sizeof(*mdevice), GFP_KERNEL);
> +	if (!mdevice)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&mdevice->kref);
> +	memcpy(&mdevice->uuid, &uuid, sizeof(uuid_le));
> +	mdevice->instance = instance;
> +	mutex_init(&mdevice->ops_lock);
> +
> +	return mdevice;
> +}
> +
> +static void mdev_device_release(struct device *dev)

what's the difference between this release and earlier destroy version?

> +{
> +	struct mdev_device *mdevice = to_mdev_device(dev);
> +
> +	if (!mdevice)
> +		return;
> +
> +	dev_info(&mdevice->dev, "MDEV: destroying\n");
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_del(&mdevice->next);
> +	mutex_unlock(&mdevices.list_lock);
> +
> +	kfree(mdevice);
> +}
> +
> +int create_mdev_device(struct device *dev, uuid_le uuid, uint32_t instance,
> +		       char *mdev_params)
> +{
> +	int retval = 0;
> +	struct mdev_device *mdevice = NULL;
> +	struct phy_device *phy_dev;
> +
> +	phy_dev = find_physical_device(dev);
> +	if (!phy_dev)
> +		return -EINVAL;
> +
> +	mdevice = mdev_device_alloc(uuid, instance);
> +	if (IS_ERR(mdevice)) {
> +		retval = PTR_ERR(mdevice);
> +		return retval;
> +	}
> +
> +	mdevice->dev.parent  = dev;
> +	mdevice->dev.bus     = &mdev_bus_type;
> +	mdevice->dev.release = mdev_device_release;
> +	dev_set_name(&mdevice->dev, "%pUb-%d", uuid.b, instance);
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_add(&mdevice->next, &mdevices.dev_list);
> +	mutex_unlock(&mdevices.list_lock);

update list in the end, since even ops->create hasn't been invoked yet.

> +
> +	retval = device_register(&mdevice->dev);
> +	if (retval) {
> +		mdev_put_device(mdevice);
> +		return retval;
> +	}
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	if (phy_dev->ops->create) {
> +		retval = phy_dev->ops->create(dev, mdevice->uuid,
> +					      instance, mdev_params);
> +		if (retval)
> +			goto create_failed;
> +	}
> +
> +	retval = mdev_add_attribute_group(&mdevice->dev,
> +					  phy_dev->ops->mdev_attr_groups);
> +	if (retval)
> +		goto create_failed;
> +
> +	mdevice->phy_dev = phy_dev;
> +	mutex_unlock(&phy_devices.list_lock);
> +	mdev_get_device(mdevice);
> +	dev_info(&mdevice->dev, "MDEV: created\n");
> +
> +	return retval;
> +
> +create_failed:
> +	mutex_unlock(&phy_devices.list_lock);
> +	device_unregister(&mdevice->dev);
> +	return retval;
> +}
> +
> +int destroy_mdev_device(uuid_le uuid, uint32_t instance)
> +{
> +	struct mdev_device *vdev;
> +
> +	vdev = find_mdev_device(uuid, instance);
> +
> +	if (!vdev)
> +		return -EINVAL;
> +
> +	mdev_destroy_device(vdev);
> +	return 0;
> +}
> +
> +void get_mdev_supported_types(struct device *dev, char *str)
> +{
> +	struct phy_device *phy_dev;
> +
> +	phy_dev = find_physical_device(dev);
> +
> +	if (phy_dev) {
> +		mutex_lock(&phy_devices.list_lock);
> +		if (phy_dev->ops->supported_config)
> +			phy_dev->ops->supported_config(phy_dev->dev, str);
> +		mutex_unlock(&phy_devices.list_lock);
> +	}
> +}
> +
> +int mdev_start_callback(uuid_le uuid, uint32_t instance)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdevice;
> +	struct phy_device *phy_dev;
> +
> +	mdevice = find_mdev_device(uuid, instance);
> +
> +	if (!mdevice)
> +		return -EINVAL;
> +
> +	phy_dev = mdevice->phy_dev;
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	if (phy_dev->ops->start)
> +		ret = phy_dev->ops->start(mdevice->uuid);
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	if (ret < 0)
> +		pr_err("mdev_start failed  %d\n", ret);
> +	else
> +		kobject_uevent(&mdevice->dev.kobj, KOBJ_ONLINE);
> +
> +	return ret;
> +}
> +
> +int mdev_shutdown_callback(uuid_le uuid, uint32_t instance)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdevice;
> +	struct phy_device *phy_dev;
> +
> +	mdevice = find_mdev_device(uuid, instance);
> +
> +	if (!mdevice)
> +		return -EINVAL;
> +
> +	phy_dev = mdevice->phy_dev;
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	if (phy_dev->ops->shutdown)
> +		ret = phy_dev->ops->shutdown(mdevice->uuid);
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	if (ret < 0)
> +		pr_err("mdev_shutdown failed %d\n", ret);
> +	else
> +		kobject_uevent(&mdevice->dev.kobj, KOBJ_OFFLINE);
> +
> +	return ret;
> +}
> +
> +static struct class mdev_class = {
> +	.name		= MDEV_CLASS_NAME,
> +	.owner		= THIS_MODULE,
> +	.class_attrs	= mdev_class_attrs,
> +};
> +
> +static int __init mdev_init(void)
> +{
> +	int rc = 0;
> +
> +	mutex_init(&mdevices.list_lock);
> +	INIT_LIST_HEAD(&mdevices.dev_list);
> +	mutex_init(&phy_devices.list_lock);
> +	INIT_LIST_HEAD(&phy_devices.dev_list);
> +
> +	rc = class_register(&mdev_class);
> +	if (rc < 0) {
> +		pr_err("Failed to register mdev class\n");
> +		return rc;
> +	}
> +
> +	rc = mdev_bus_register();
> +	if (rc < 0) {
> +		pr_err("Failed to register mdev bus\n");
> +		class_unregister(&mdev_class);
> +		return rc;
> +	}
> +
> +	return rc;
> +}
> +
> +static void __exit mdev_exit(void)
> +{

should we check any remaining mdev/pdev which are not cleaned
up correctly here?

> +	mdev_bus_unregister();
> +	class_unregister(&mdev_class);
> +}
> +
> +module_init(mdev_init)
> +module_exit(mdev_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/mdev/mdev-driver.c b/drivers/vfio/mdev/mdev-driver.c
> new file mode 100644
> index 000000000000..bc8a169782bc
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev-driver.c
> @@ -0,0 +1,139 @@
> +/*
> + * MDEV driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/device.h>
> +#include <linux/iommu.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +static int mdevice_attach_iommu(struct mdev_device *mdevice)
> +{
> +	int retval = 0;
> +	struct iommu_group *group = NULL;
> +
> +	group = iommu_group_alloc();
> +	if (IS_ERR(group)) {
> +		dev_err(&mdevice->dev, "MDEV: failed to allocate group!\n");
> +		return PTR_ERR(group);
> +	}
> +
> +	retval = iommu_group_add_device(group, &mdevice->dev);
> +	if (retval) {
> +		dev_err(&mdevice->dev, "MDEV: failed to add dev to group!\n");
> +		goto attach_fail;
> +	}
> +
> +	mdevice->group = group;
> +
> +	dev_info(&mdevice->dev, "MDEV: group_id = %d\n",
> +				 iommu_group_id(group));
> +attach_fail:
> +	iommu_group_put(group);
> +	return retval;
> +}
> +
> +static void mdevice_detach_iommu(struct mdev_device *mdevice)
> +{
> +	iommu_group_remove_device(&mdevice->dev);
> +	dev_info(&mdevice->dev, "MDEV: detaching iommu\n");
> +}
> +
> +static int mdevice_probe(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdevice = to_mdev_device(dev);
> +	int status = 0;
> +
> +	status = mdevice_attach_iommu(mdevice);
> +	if (status) {
> +		dev_err(dev, "Failed to attach IOMMU\n");
> +		return status;
> +	}
> +
> +	if (drv && drv->probe)
> +		status = drv->probe(dev);
> +
> +	return status;
> +}
> +
> +static int mdevice_remove(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdevice = to_mdev_device(dev);
> +
> +	if (drv && drv->remove)
> +		drv->remove(dev);
> +
> +	mdevice_detach_iommu(mdevice);
> +
> +	return 0;
> +}
> +
> +static int mdevice_match(struct device *dev, struct device_driver *drv)
> +{
> +	int ret = 0;
> +	struct mdev_driver *mdrv = to_mdev_driver(drv);
> +
> +	if (mdrv && mdrv->match)
> +		ret = mdrv->match(dev);
> +
> +	return ret;
> +}
> +
> +struct bus_type mdev_bus_type = {
> +	.name		= "mdev",
> +	.match		= mdevice_match,
> +	.probe		= mdevice_probe,
> +	.remove		= mdevice_remove,
> +};
> +EXPORT_SYMBOL_GPL(mdev_bus_type);
> +
> +/**
> + * mdev_register_driver - register a new MDEV driver
> + * @drv: the driver to register
> + * @owner: owner module of driver ro register
> + *
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
> +{
> +	/* initialize common driver fields */
> +	drv->driver.name = drv->name;
> +	drv->driver.bus = &mdev_bus_type;
> +	drv->driver.owner = owner;
> +
> +	/* register with core */
> +	return driver_register(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_register_driver);
> +
> +/**
> + * mdev_unregister_driver - unregister MDEV driver
> + * @drv: the driver to unregister
> + *
> + */
> +void mdev_unregister_driver(struct mdev_driver *drv)
> +{
> +	driver_unregister(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_unregister_driver);
> +
> +int mdev_bus_register(void)
> +{
> +	return bus_register(&mdev_bus_type);
> +}
> +
> +void mdev_bus_unregister(void)
> +{
> +	bus_unregister(&mdev_bus_type);
> +}
> diff --git a/drivers/vfio/mdev/mdev-sysfs.c b/drivers/vfio/mdev/mdev-sysfs.c
> new file mode 100644
> index 000000000000..79d351a7a502
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev-sysfs.c
> @@ -0,0 +1,312 @@
> +/*
> + * File attributes for Mediated devices
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +/* Prototypes */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf);
> +static DEVICE_ATTR_RO(mdev_supported_types);
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_create);
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_destroy);
> +
> +/* Static functions */
> +
> +#define UUID_CHAR_LENGTH	36
> +#define UUID_BYTE_LENGTH	16
> +
> +#define SUPPORTED_TYPE_BUFFER_LENGTH	1024
> +
> +static inline bool is_uuid_sep(char sep)
> +{
> +	if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
> +		return true;
> +	return false;
> +}
> +
> +static int uuid_parse(const char *str, uuid_le *uuid)
> +{
> +	int i;
> +
> +	if (strlen(str) < UUID_CHAR_LENGTH)
> +		return -1;
> +
> +	for (i = 0; i < UUID_BYTE_LENGTH; i++) {
> +		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
> +			pr_err("%s err", __func__);
> +			return -EINVAL;
> +		}
> +
> +		uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
> +		str += 2;
> +		if (is_uuid_sep(*str))
> +			str++;
> +	}
> +
> +	return 0;
> +}
> +
> +/* Functions */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	char *str;
> +	ssize_t n;
> +
> +	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	get_mdev_supported_types(dev, str);
> +
> +	n = sprintf(buf, "%s\n", str);
> +	kfree(str);
> +
> +	return n;
> +}
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	char *str, *pstr;
> +	char *uuid_str, *instance_str, *mdev_params = NULL;
> +	uuid_le uuid;
> +	uint32_t instance;
> +	int ret = 0;
> +
> +	pstr = str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_create: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (!str) {
> +		pr_err("mdev_create: mdev instance not present %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	instance_str = strsep(&str, ":");
> +	if (!instance_str) {
> +		pr_err("mdev_create: Empty instance string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	ret = kstrtouint(instance_str, 0, &instance);
> +	if (ret) {
> +		pr_err("mdev_create: mdev instance parsing error %s\n", buf);
> +		goto create_error;
> +	}
> +
> +	if (!str) {
> +		pr_err("mdev_create: mdev params not specified %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	mdev_params = kstrdup(str, GFP_KERNEL);
> +
> +	if (!mdev_params) {
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_create: UUID parse error %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (create_mdev_device(dev, uuid, instance, mdev_params) < 0) {
> +		pr_err("mdev_create: Failed to create mdev device\n");
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +	ret = count;
> +
> +create_error:
> +	kfree(mdev_params);
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{
> +	char *uuid_str, *str, *pstr;
> +	uuid_le uuid;
> +	unsigned int instance;
> +	int ret;
> +
> +	str = pstr = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	if (str == NULL) {
> +		pr_err("mdev_destroy: instance not specified %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	ret = kstrtouint(str, 0, &instance);
> +	if (ret) {
> +		pr_err("mdev_destroy: instance parsing error %s\n", buf);
> +		goto destroy_error;
> +	}
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	ret = destroy_mdev_device(uuid, instance);
> +	if (ret < 0)
> +		goto destroy_error;
> +
> +	ret = count;
> +
> +destroy_error:
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +ssize_t mdev_start_store(struct class *class, struct class_attribute *attr,
> +			 const char *buf, size_t count)
> +{
> +	char *uuid_str;
> +	uuid_le uuid;
> +	int ret = 0;
> +
> +	uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_start: UUID parse error  %s\n", buf);
> +		ret = -EINVAL;
> +		goto start_error;
> +	}
> +
> +	ret = mdev_start_callback(uuid, 0);
> +	if (ret < 0)
> +		goto start_error;
> +
> +	ret = count;
> +
> +start_error:
> +	kfree(uuid_str);
> +	return ret;
> +}
> +
> +ssize_t mdev_shutdown_store(struct class *class, struct class_attribute *attr,
> +			    const char *buf, size_t count)
> +{
> +	char *uuid_str;
> +	uuid_le uuid;
> +	int ret = 0;
> +
> +	uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_shutdown: UUID parse error %s\n", buf);
> +		ret = -EINVAL;
> +	}
> +
> +	ret = mdev_shutdown_callback(uuid, 0);
> +	if (ret < 0)
> +		goto shutdown_error;
> +
> +	ret = count;
> +
> +shutdown_error:
> +	kfree(uuid_str);
> +	return ret;
> +
> +}
> +
> +struct class_attribute mdev_class_attrs[] = {
> +	__ATTR_WO(mdev_start),
> +	__ATTR_WO(mdev_shutdown),
> +	__ATTR_NULL
> +};
> +
> +int mdev_create_sysfs_files(struct device *dev)
> +{
> +	int retval;
> +
> +	retval = sysfs_create_file(&dev->kobj,
> +				   &dev_attr_mdev_supported_types.attr);
> +	if (retval) {
> +		pr_err("Failed to create mdev_supported_types sysfs entry\n");
> +		return retval;
> +	}
> +
> +	retval = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	if (retval) {
> +		pr_err("Failed to create mdev_create sysfs entry\n");
> +		return retval;
> +	}
> +
> +	retval = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +	if (retval) {
> +		pr_err("Failed to create mdev_destroy sysfs entry\n");
> +		return retval;
> +	}
> +
> +	return 0;
> +}
> +
> +void mdev_remove_sysfs_files(struct device *dev)
> +{
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +}
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> new file mode 100644
> index 000000000000..a472310c7749
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -0,0 +1,33 @@
> +/*
> + * Mediated device interal definitions
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_PRIVATE_H
> +#define MDEV_PRIVATE_H
> +
> +int  mdev_bus_register(void);
> +void mdev_bus_unregister(void);
> +
> +/* Function prototypes for mdev_sysfs */
> +
> +extern struct class_attribute mdev_class_attrs[];
> +
> +int  mdev_create_sysfs_files(struct device *dev);
> +void mdev_remove_sysfs_files(struct device *dev);
> +
> +int  create_mdev_device(struct device *dev, uuid_le uuid, uint32_t instance,
> +			char *mdev_params);
> +int  destroy_mdev_device(uuid_le uuid, uint32_t instance);
> +void get_mdev_supported_types(struct device *dev, char *str);
> +int  mdev_start_callback(uuid_le uuid, uint32_t instance);
> +int  mdev_shutdown_callback(uuid_le uuid, uint32_t instance);
> +
> +#endif /* MDEV_PRIVATE_H */
> diff --git a/include/linux/mdev.h b/include/linux/mdev.h
> new file mode 100644
> index 000000000000..d9633acd85f2
> --- /dev/null
> +++ b/include/linux/mdev.h
> @@ -0,0 +1,224 @@
> +/*
> + * Mediated device definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_H
> +#define MDEV_H
> +
> +/* Common Data structures */
> +
> +struct pci_region_info {
> +	uint64_t start;
> +	uint64_t size;
> +	uint32_t flags;		/*!< VFIO region info flags */
> +};
> +
> +enum mdev_emul_space {
> +	EMUL_CONFIG_SPACE,	/*!< PCI configuration space */
> +	EMUL_IO,		/*!< I/O register space */
> +	EMUL_MMIO		/*!< Memory-mapped I/O space */
> +};
> +
> +struct phy_device;
> +
> +/*
> + * Mediated device
> + */
> +
> +struct mdev_device {
> +	struct kref		kref;
> +	struct device		dev;
> +	struct phy_device	*phy_dev;
> +	struct iommu_group	*group;
> +	void			*iommu_data;
> +	uuid_le			uuid;
> +	uint32_t		instance;
> +	void			*driver_data;
> +	struct mutex		ops_lock;
> +	struct list_head	next;
> +};
> +
> +
> +/**
> + * struct phy_device_ops - Structure to be registered for each physical device
> + * to register the device to mdev module.
> + *
> + * @owner:		The module owner.
> + * @dev_attr_groups:	Default attributes of the physical device.
> + * @mdev_attr_groups:	Default attributes of the mediated device.
> + * @supported_config:	Called to get information about supported types.
> + *			@dev : device structure of physical device.
> + *			@config: should return string listing supported config
> + *			Returns integer: success (0) or error (< 0)
> + * @create:		Called to allocate basic resources in physical device's
> + *			driver for a particular mediated device
> + *			@dev: physical pci device structure on which mediated
> + *			      device should be created
> + *			@uuid: VM's uuid for which VM it is intended to
> + *			@instance: mediated instance in that VM
> + *			@mdev_params: extra parameters required by physical
> + *			device's driver.
> + *			Returns integer: success (0) or error (< 0)
> + * @destroy:		Called to free resources in physical device's driver for
> + *			a mediated device instance of that VM.
> + *			@dev: physical device structure to which this mediated
> + *			      device points to.
> + *			@uuid: VM's uuid for which the mediated device belongs
> + *			@instance: mdev instance in that VM
> + *			Returns integer: success (0) or error (< 0)
> + *			If VM is running and destroy() is called that means the
> + *			mdev is being hotunpluged. Return error if VM is running
> + *			and driver doesn't support mediated device hotplug.
> + * @start:		Called to do initiate mediated device initialization
> + *			process in physical device's driver when VM boots before
> + *			qemu starts.
> + *			@uuid: VM's UUID which is booting.
> + *			Returns integer: success (0) or error (< 0)
> + * @shutdown:		Called to teardown mediated device related resources for
> + *			the VM
> + *			@uuid: VM's UUID which is shutting down .
> + *			Returns integer: success (0) or error (< 0)
> + * @read:		Read emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: read buffer
> + *			@count: number bytes to read
> + *			@address_space: specifies for which address
> + *			space the request is: pci_config_space, IO
> + *			register space or MMIO space.
> + *			@pos: offset from base address.
> + *			Retuns number on bytes read on success or error.
> + * @write:		Write emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: write buffer
> + *			@count: number bytes to be written
> + *			@address_space: specifies for which address space the
> + *			request is: pci_config_space, IO register space or MMIO
> + *			space.
> + *			@pos: offset from base address.
> + *			Retuns number on bytes written on success or error.
> + * @set_irqs:		Called to send about interrupts configuration
> + *			information that VMM sets.
> + *			@mdev: mediated device structure
> + *			@flags, index, start, count and *data : same as that of
> + *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
> + * @get_region_info:	Called to get BAR size and flags of mediated device.
> + *			@mdev: mediated device structure
> + *			@region_index: VFIO region index
> + *			@region_info: output, returns size and flags of
> + *				      requested region.
> + *			Returns integer: success (0) or error (< 0)
> + * @validate_map_request: Validate remap pfn request
> + *			@mdev: mediated device structure
> + *			@virtaddr: target user address to start at
> + *			@pfn: physical address of kernel memory, vendor driver
> + *			      can change if required.
> + *			@size: size of map area, vendor driver can change the
> + *			       size of map area if desired.
> + *			@prot: page protection flags for this mapping, vendor
> + *			       driver can change, if required.
> + *			Returns integer: success (0) or error (< 0)
> + *
> + * Physical device that support mediated device should be registered with mdev
> + * module with phy_device_ops structure.
> + */
> +
> +struct phy_device_ops {
> +	struct module   *owner;
> +	const struct attribute_group **dev_attr_groups;
> +	const struct attribute_group **mdev_attr_groups;
> +
> +	int	(*supported_config)(struct device *dev, char *config);
> +	int     (*create)(struct device *dev, uuid_le uuid,
> +			  uint32_t instance, char *mdev_params);
> +	int     (*destroy)(struct device *dev, uuid_le uuid,
> +			   uint32_t instance);
> +	int     (*start)(uuid_le uuid);
> +	int     (*shutdown)(uuid_le uuid);
> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> +			enum mdev_emul_space address_space, loff_t pos);
> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> +			 enum mdev_emul_space address_space, loff_t pos);
> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> +			    unsigned int index, unsigned int start,
> +			    unsigned int count, void *data);
> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> +				 struct pci_region_info *region_info);
> +	int	(*validate_map_request)(struct mdev_device *vdev,
> +					unsigned long virtaddr,
> +					unsigned long *pfn, unsigned long *size,
> +					pgprot_t *prot);
> +};
> +
> +/*
> + * Physical Device
> + */
> +struct phy_device {
> +	struct device                   *dev;
> +	const struct phy_device_ops     *ops;
> +	struct list_head                next;
> +};
> +
> +/**
> + * struct mdev_driver - Mediated device driver
> + * @name: driver name
> + * @probe: called when new device created
> + * @remove: called when device removed
> + * @match: called when new device or driver is added for this bus. Return 1 if
> + *	   given device can be handled by given driver and zero otherwise.
> + * @driver: device driver structure
> + *
> + **/
> +struct mdev_driver {
> +	const char *name;
> +	int  (*probe)(struct device *dev);
> +	void (*remove)(struct device *dev);
> +	int  (*match)(struct device *dev);
> +	struct device_driver driver;
> +};
> +
> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
> +{
> +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
> +}
> +
> +static inline struct mdev_device *to_mdev_device(struct device *dev)
> +{
> +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
> +}
> +
> +static inline struct mdev_device *mdev_get_device(struct mdev_device *vdev)
> +{
> +	return (vdev && get_device(&vdev->dev)) ? vdev : NULL;
> +}
> +
> +static inline  void mdev_put_device(struct mdev_device *vdev)
> +{
> +	if (vdev)
> +		put_device(&vdev->dev);
> +}
> +
> +extern struct bus_type mdev_bus_type;
> +
> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
> +
> +extern int  mdev_register_device(struct device *dev,
> +				 const struct phy_device_ops *ops);
> +extern void mdev_unregister_device(struct device *dev);
> +
> +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> +extern void mdev_unregister_driver(struct mdev_driver *drv);
> +
> +extern int mdev_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
> +				uint32_t len, uint32_t flags);
> +
> +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
> +
> +#endif /* MDEV_H */
> --
> 2.7.0
Kirti Wankhede May 25, 2016, 2:47 p.m. UTC | #2
On 5/25/2016 1:25 PM, Tian, Kevin wrote:
>> From: Kirti Wankhede [mailto:kwankhede@nvidia.com]
>> Sent: Wednesday, May 25, 2016 3:58 AM
>>

...

>> +
>> +config MDEV
>> +    tristate "Mediated device driver framework"
>
> Sorry not a native speaker. Is it cleaner to say "Driver framework for
Mediated
> Devices" or "Mediated Device Framework"? Should we focus on driver or
device
> here?
>

Both, device and driver. This framework provides way to register
physical *devices* and also register *driver* for mediated devices.


>> +    depends on VFIO
>> +    default n
>> +    help
>> +        MDEV provides a framework to virtualize device without
SR-IOV cap
>> +        See Documentation/mdev.txt for more details.
>
> Looks Documentation/mdev.txt is not included in this version.
>

Yes, will have Documentation/mdev.txt in next version of patch.


>> +static struct devices_list {
>> +	struct list_head    dev_list;
>> +	struct mutex        list_lock;
>> +} mdevices, phy_devices;
>
> phy_devices -> pdevices? and similarly we can use pdev/mdev
> pair in other places...
>

'pdevices' sometimes also refers to 'pointer to devices' that's the
reason I perfer to use phy_devices to represent 'physical devices'


>> +static struct mdev_device *find_mdev_device(uuid_le uuid, int instance)
>
> can we just call it "struct mdev* or "mdevice"? "dev_device" looks
redundant.
>

'struct mdev_device' represents 'device structure for device created by
mdev module'. Still that doesn't satisfy major folks, I'm open to change
it.


> Sorry I may have to ask same question since I didn't get an answer yet.
> what exactly does 'instance' mean here? since uuid is unique, why do
> we need match instance too?
>

'uuid' could be UUID of a VM for whom it is created. To support mutiple
mediated devices for same VM, name should be unique. Hence we need a
instance number to identify each mediated device uniquely in one VM.



>> +		if (phy_dev->ops->destroy) {
>> +			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
>> +						  mdevice->instance)) {
>> +				mutex_unlock(&phy_devices.list_lock);
>
> a warning message is preferred. Also better to return -EBUSY here.
>

mdev_destroy_device() is called from 2 paths, one is sysfs mdev_destroy
and mdev_unregister_device(). For the later case, return from here will
any ways ignored. mdev_unregister_device() is called from the remove
function of physical device and that doesn't care about return error, it
just removes the device from subsystem.

>> +				return;
>> +			}
>> +		}
>> +
>> +		mdev_remove_attribute_group(&mdevice->dev,
>> +					    phy_dev->ops->mdev_attr_groups);
>> +		mdevice->phy_dev = NULL;
>
> Am I missing something here? You didn't remove this mdev node from
> the list, and below...
>

device_unregister() calls put_device(dev) and if refcount is zero its
release function is called, which is mdev_device_release(), that is
hooked during device_register(). This node is removed from list from
mdev_device_release().


>> +		mutex_unlock(&phy_devices.list_lock);
>
> you should use mutex of mdevices list
>

No, this lock is for phy_dev.


>> +	phy_dev->dev = dev;
>> +	phy_dev->ops = ops;
>> +
>> +	mutex_lock(&phy_devices.list_lock);
>> +	ret = mdev_create_sysfs_files(dev);
>> +	if (ret)
>> +		goto add_sysfs_error;
>> +
>> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
>> +	if (ret)
>> +		goto add_group_error;
>
> any reason to include sysfs operations inside the mutex which is
> purely about phy_devices list?
>

dev_attr_groups attribute is for physical device, hence inside
phy_devices.list_lock.

* @dev_attr_groups:    Default attributes of the physical device.


>> +void mdev_unregister_device(struct device *dev)
>> +{
>> +	struct phy_device *phy_dev;
>> +	struct mdev_device *vdev = NULL;
>> +
>> +	phy_dev = find_physical_device(dev);
>> +
>> +	if (!phy_dev)
>> +		return;
>> +
>> +	dev_info(dev, "MDEV: Unregistering\n");
>> +
>> +	while ((vdev = find_next_mdev_device(phy_dev)))
>> +		mdev_destroy_device(vdev);
>
> Need check return value here since ops->destroy may fail.
>

See my comment above.


>> +static void mdev_device_release(struct device *dev)
>
> what's the difference between this release and earlier destroy version?
>

See my comment above


>> +static void __exit mdev_exit(void)
>> +{
>
> should we check any remaining mdev/pdev which are not cleaned
> up correctly here?
>

If there are any physical device registered with this module then the
usage count is not zero so rmmod would anyways fail.

All mdev_devices, which are created for any physical device,  are
destroyed from mdev_unregister_device(physial_device);

Hence no need to explicitly add the code here which would never get used.

>> +	mdev_bus_unregister();
>> +	class_unregister(&mdev_class);
>> +}


Thanks,
Kirti
Alex Williamson May 25, 2016, 10:39 p.m. UTC | #3
On Wed, 25 May 2016 01:28:15 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Design for Mediated Device Driver:
> Main purpose of this driver is to provide a common interface for mediated
> device management that can be used by differnt drivers of different
> devices.
> 
> This module provides a generic interface to create the device, add it to
> mediated bus, add device to IOMMU group and then add it to vfio group.
> 
> Below is the high Level block diagram, with Nvidia, Intel and IBM devices
> as example, since these are the devices which are going to actively use
> this module as of now.
> 
>  +---------------+
>  |               |
>  | +-----------+ |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |           | |                         |              |
>  | |  mdev     | +------------------------>+              |<-> VFIO user
>  | |  bus      | |     probe()/remove()    | vfio_mpci.ko |    APIs
>  | |  driver   | |                         |              |
>  | |           | |                         +--------------+
>  | |           | |  mdev_register_driver() +--------------+
>  | |           | +<------------------------+ __init()     |
>  | |           | |                         |              |
>  | |           | +------------------------>+              |<-> VFIO user
>  | +-----------+ |     probe()/remove()    | vfio_mccw.ko |    APIs
>  |               |                         |              |
>  |  MDEV CORE    |                         +--------------+
>  |   MODULE      |
>  |   mdev.ko     |
>  | +-----------+ |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         |  nvidia.ko   |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | | Physical  | |
>  | |  device   | |  mdev_register_device() +--------------+
>  | | interface | |<------------------------+              |
>  | |           | |                         |  i915.ko     |<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | |           | |
>  | |           | |  mdev_register_device() +--------------+
>  | |           | +<------------------------+              |
>  | |           | |                         | ccw_device.ko|<-> physical
>  | |           | +------------------------>+              |    device
>  | |           | |        callback         +--------------+
>  | +-----------+ |
>  +---------------+
> 
> Core driver provides two types of registration interfaces:
> 1. Registration interface for mediated bus driver:
> 
> /**
>   * struct mdev_driver - Mediated device's driver
>   * @name: driver name
>   * @probe: called when new device created
>   * @remove:called when device removed
>   * @match: called when new device or driver is added for this bus.
> 	    Return 1 if given device can be handled by given driver and
> 	    zero otherwise.
>   * @driver:device driver structure
>   *
>   **/
> struct mdev_driver {
>          const char *name;
>          int  (*probe)  (struct device *dev);
>          void (*remove) (struct device *dev);
> 	 int  (*match)(struct device *dev);
>          struct device_driver    driver;
> };
> 
> int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> void mdev_unregister_driver(struct mdev_driver *drv);
> 
> Mediated device's driver for mdev should use this interface to register
> with Core driver. With this, mediated devices driver for such devices is
> responsible to add mediated device to VFIO group.
> 
> 2. Physical device driver interface
> This interface provides vendor driver the set APIs to manage physical
> device related work in their own driver. APIs are :
> - supported_config: provide supported configuration list by the vendor
> 		    driver
> - create: to allocate basic resources in vendor driver for a mediated
> 	  device.
> - destroy: to free resources in vendor driver when mediated device is
> 	   destroyed.
> - start: to initiate mediated device initialization process from vendor
> 	 driver when VM boots and before QEMU starts.
> - shutdown: to teardown mediated device resources during VM teardown.
> - read : read emulation callback.
> - write: write emulation callback.
> - set_irqs: send interrupt configuration information that QEMU sets.
> - get_region_info: to provide region size and its flags for the mediated
> 		   device.
> - validate_map_request: to validate remap pfn request.

nit, vfio is a userspace driver interface where QEMU is simply a user
of that interface.  We should never assume QEMU is the only user.

> 
> This registration interface should be used by vendor drivers to register
> each physical device to mdev core driver.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I88f4482f7608f40550a152c5f882b64271287c62
> ---
>  drivers/vfio/Kconfig             |   1 +
>  drivers/vfio/Makefile            |   1 +
>  drivers/vfio/mdev/Kconfig        |  11 +
>  drivers/vfio/mdev/Makefile       |   5 +
>  drivers/vfio/mdev/mdev-core.c    | 462 +++++++++++++++++++++++++++++++++++++++
>  drivers/vfio/mdev/mdev-driver.c  | 139 ++++++++++++
>  drivers/vfio/mdev/mdev-sysfs.c   | 312 ++++++++++++++++++++++++++
>  drivers/vfio/mdev/mdev_private.h |  33 +++
>  include/linux/mdev.h             | 224 +++++++++++++++++++
>  9 files changed, 1188 insertions(+)
>  create mode 100644 drivers/vfio/mdev/Kconfig
>  create mode 100644 drivers/vfio/mdev/Makefile
>  create mode 100644 drivers/vfio/mdev/mdev-core.c
>  create mode 100644 drivers/vfio/mdev/mdev-driver.c
>  create mode 100644 drivers/vfio/mdev/mdev-sysfs.c
>  create mode 100644 drivers/vfio/mdev/mdev_private.h

- or _, pick one.  Underscore is more prevalent.

>  create mode 100644 include/linux/mdev.h
> diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
> index da6e2ce77495..23eced02aaf6 100644
> --- a/drivers/vfio/Kconfig
> +++ b/drivers/vfio/Kconfig
> @@ -48,4 +48,5 @@ menuconfig VFIO_NOIOMMU
>  
>  source "drivers/vfio/pci/Kconfig"
>  source "drivers/vfio/platform/Kconfig"
> +source "drivers/vfio/mdev/Kconfig"
>  source "virt/lib/Kconfig"
> diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
> index 7b8a31f63fea..7c70753e54ab 100644
> --- a/drivers/vfio/Makefile
> +++ b/drivers/vfio/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
>  obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
>  obj-$(CONFIG_VFIO_PCI) += pci/
>  obj-$(CONFIG_VFIO_PLATFORM) += platform/
> +obj-$(CONFIG_MDEV) += mdev/
> diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
> new file mode 100644
> index 000000000000..951e2bb06a3f
> --- /dev/null
> +++ b/drivers/vfio/mdev/Kconfig
> @@ -0,0 +1,11 @@
> +
> +config MDEV
> +    tristate "Mediated device driver framework"
> +    depends on VFIO
> +    default n
> +    help
> +        MDEV provides a framework to virtualize device without SR-IOV cap
> +        See Documentation/mdev.txt for more details.

I don't see that file anywhere in this series.  Also note that SR-IOV
is a PCI specific technology while as a framework this is specifically
not limited to PCI.  In fact, there's really no requirement here that
physical hardware even exists, this interface could be used to provide
in-kernel emulation of a device.

> +
> +        If you don't know what do here, say N.
> +
> diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
> new file mode 100644
> index 000000000000..4adb069febce
> --- /dev/null
> +++ b/drivers/vfio/mdev/Makefile
> @@ -0,0 +1,5 @@
> +
> +mdev-y := mdev-core.o mdev-sysfs.o mdev-driver.o
> +
> +obj-$(CONFIG_MDEV) += mdev.o
> +
> diff --git a/drivers/vfio/mdev/mdev-core.c b/drivers/vfio/mdev/mdev-core.c
> new file mode 100644
> index 000000000000..af070d73735f
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev-core.c
> @@ -0,0 +1,462 @@
> +/*
> + * Mediated device Core Driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/slab.h>
> +#include <linux/cdev.h>
> +#include <linux/sched.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/sysfs.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +#define DRIVER_VERSION		"0.1"
> +#define DRIVER_AUTHOR		"NVIDIA Corporation"
> +#define DRIVER_DESC		"Mediated device Core Driver"
> +
> +#define MDEV_CLASS_NAME		"mdev"
> +
> +/*
> + * Global Structures
> + */
> +
> +static struct devices_list {
> +	struct list_head    dev_list;
> +	struct mutex        list_lock;
> +} mdevices, phy_devices;
> +
> +/*
> + * Functions
> + */
> +
> +static int mdev_add_attribute_group(struct device *dev,
> +				    const struct attribute_group **groups)
> +{
> +	return sysfs_create_groups(&dev->kobj, groups);
> +}
> +
> +static void mdev_remove_attribute_group(struct device *dev,
> +					const struct attribute_group **groups)
> +{
> +	sysfs_remove_groups(&dev->kobj, groups);
> +}
> +
> +static struct mdev_device *find_mdev_device(uuid_le uuid, int instance)
> +{
> +	struct mdev_device *vdev = NULL, *v;
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_for_each_entry(v, &mdevices.dev_list, next) {
> +		if ((uuid_le_cmp(v->uuid, uuid) == 0) &&
> +		    (v->instance == instance)) {
> +			vdev = v;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&mdevices.list_lock);
> +	return vdev;
> +}
> +
> +static struct mdev_device *find_next_mdev_device(struct phy_device *phy_dev)
> +{

What's "next" about this function?  "next" generally means the caller
provides a starting point in the list and the search continues from
there.

> +	struct mdev_device *mdev = NULL, *p;
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_for_each_entry(p, &mdevices.dev_list, next) {
> +		if (p->phy_dev == phy_dev) {
> +			mdev = p;
> +			break;
> +		}
> +	}

Ah, I see from the unregister below that this is intended as a first
entry type function, so the naming is not consistent with Linux
terminology.  Suggest "first" in the name.

> +	mutex_unlock(&mdevices.list_lock);
> +	return mdev;
> +}
> +
> +static struct phy_device *find_physical_device(struct device *dev)
> +{
> +	struct phy_device *pdev = NULL, *p;
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	list_for_each_entry(p, &phy_devices.dev_list, next) {
> +		if (p->dev == dev) {
> +			pdev = p;
> +			break;
> +		}
> +	}
> +	mutex_unlock(&phy_devices.list_lock);
> +	return pdev;
> +}
> +
> +static void mdev_destroy_device(struct mdev_device *mdevice)
> +{
> +	struct phy_device *phy_dev = mdevice->phy_dev;
> +
> +	if (phy_dev) {
> +		mutex_lock(&phy_devices.list_lock);
> +
> +		/*
> +		* If vendor driver doesn't return success that means vendor
> +		* driver doesn't support hot-unplug
> +		*/
> +		if (phy_dev->ops->destroy) {
> +			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
> +						  mdevice->instance)) {
> +				mutex_unlock(&phy_devices.list_lock);
> +				return;
> +			}
> +		}
> +
> +		mdev_remove_attribute_group(&mdevice->dev,
> +					    phy_dev->ops->mdev_attr_groups);
> +		mdevice->phy_dev = NULL;
> +		mutex_unlock(&phy_devices.list_lock);

Locking here appears arbitrary, how does the above code interact with
phy_devices.dev_list?

> +	}
> +
> +	mdev_put_device(mdevice);
> +	device_unregister(&mdevice->dev);
> +}
> +
> +/*
> + * Find mediated device from given iommu_group and increment refcount of
> + * mediated device. Caller should call mdev_put_device() when the use of
> + * mdev_device is done.
> + */
> +struct mdev_device *mdev_get_device_by_group(struct iommu_group *group)
> +{
> +	struct mdev_device *mdev = NULL, *p;
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_for_each_entry(p, &mdevices.dev_list, next) {
> +		if (!p->group)
> +			continue;
> +
> +		if (iommu_group_id(p->group) == iommu_group_id(group)) {
> +			mdev = mdev_get_device(p);
> +			break;
> +		}
> +	}
> +	mutex_unlock(&mdevices.list_lock);
> +	return mdev;
> +}
> +EXPORT_SYMBOL_GPL(mdev_get_device_by_group);
> +
> +/*
> + * mdev_register_device : Register a device
> + * @dev: device structure representing physical device.
> + * @phy_device_ops: Physical device operation structure to be registered.

Why are we insistent that there's a physical device?

> + *
> + * Add device to list of registered physical devices.
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_device(struct device *dev, const struct phy_device_ops *ops)
> +{
> +	int ret = 0;
> +	struct phy_device *phy_dev, *pdev;
> +
> +	if (!dev || !ops)
> +		return -EINVAL;
> +
> +	/* Check for duplicate */
> +	pdev = find_physical_device(dev);
> +	if (pdev)
> +		return -EEXIST;
> +

Why do we need a separate variable for this test vs just using phy_dev?

> +	phy_dev = kzalloc(sizeof(*phy_dev), GFP_KERNEL);
> +	if (!phy_dev)
> +		return -ENOMEM;
> +
> +	phy_dev->dev = dev;
> +	phy_dev->ops = ops;
> +

There's a race between where we searched for the physical device above
and when we grab this lock.

> +	mutex_lock(&phy_devices.list_lock);
> +	ret = mdev_create_sysfs_files(dev);
> +	if (ret)
> +		goto add_sysfs_error;
> +
> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
> +	if (ret)
> +		goto add_group_error;
> +
> +	list_add(&phy_dev->next, &phy_devices.dev_list);
> +	dev_info(dev, "MDEV: Registered\n");
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	return 0;
> +
> +add_group_error:
> +	mdev_remove_sysfs_files(dev);
> +add_sysfs_error:
> +	mutex_unlock(&phy_devices.list_lock);
> +	kfree(phy_dev);
> +	return ret;
> +}
> +EXPORT_SYMBOL(mdev_register_device);
> +
> +/*
> + * mdev_unregister_device : Unregister a physical device
> + * @dev: device structure representing physical device.
> + *
> + * Remove device from list of registered physical devices. Gives a change to
> + * free existing mediated devices for the given physical device.
> + */
> +
> +void mdev_unregister_device(struct device *dev)
> +{
> +	struct phy_device *phy_dev;
> +	struct mdev_device *vdev = NULL;

vdev?  Why not mdev?

> +
> +	phy_dev = find_physical_device(dev);
> +
> +	if (!phy_dev)
> +		return;
> +
> +	dev_info(dev, "MDEV: Unregistering\n");
> +
> +	while ((vdev = find_next_mdev_device(phy_dev)))
> +		mdev_destroy_device(vdev);
> +

I'm guessing there's a race here that a new mdev could be added before
the phy_dev gets removed.  Probably need to fix ordering to remove the
phy_dev from the list first to prevent new mdev's being created from it.

> +	mutex_lock(&phy_devices.list_lock);
> +	list_del(&phy_dev->next);
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	mdev_remove_attribute_group(dev,
> +				    phy_dev->ops->dev_attr_groups);
> +
> +	mdev_remove_sysfs_files(dev);
> +	kfree(phy_dev);
> +}
> +EXPORT_SYMBOL(mdev_unregister_device);
> +
> +/*
> + * Functions required for mdev-sysfs
> + */
> +
> +static struct mdev_device *mdev_device_alloc(uuid_le uuid, int instance)
> +{
> +	struct mdev_device *mdevice = NULL;

Used mdev above, then vdev, now mdevice, pick one.

> +
> +	mdevice = kzalloc(sizeof(*mdevice), GFP_KERNEL);
> +	if (!mdevice)
> +		return ERR_PTR(-ENOMEM);
> +
> +	kref_init(&mdevice->kref);
> +	memcpy(&mdevice->uuid, &uuid, sizeof(uuid_le));
> +	mdevice->instance = instance;
> +	mutex_init(&mdevice->ops_lock);
> +
> +	return mdevice;
> +}
> +
> +static void mdev_device_release(struct device *dev)
> +{
> +	struct mdev_device *mdevice = to_mdev_device(dev);
> +
> +	if (!mdevice)
> +		return;
> +
> +	dev_info(&mdevice->dev, "MDEV: destroying\n");
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_del(&mdevice->next);
> +	mutex_unlock(&mdevices.list_lock);
> +
> +	kfree(mdevice);
> +}
> +
> +int create_mdev_device(struct device *dev, uuid_le uuid, uint32_t instance,
> +		       char *mdev_params)

Naming seems inconsistent, mdev_device_create()?

> +{
> +	int retval = 0;
> +	struct mdev_device *mdevice = NULL;
> +	struct phy_device *phy_dev;
> +
> +	phy_dev = find_physical_device(dev);
> +	if (!phy_dev)
> +		return -EINVAL;
> +
> +	mdevice = mdev_device_alloc(uuid, instance);
> +	if (IS_ERR(mdevice)) {
> +		retval = PTR_ERR(mdevice);
> +		return retval;
> +	}
> +
> +	mdevice->dev.parent  = dev;
> +	mdevice->dev.bus     = &mdev_bus_type;
> +	mdevice->dev.release = mdev_device_release;
> +	dev_set_name(&mdevice->dev, "%pUb-%d", uuid.b, instance);
> +
> +	mutex_lock(&mdevices.list_lock);
> +	list_add(&mdevice->next, &mdevices.dev_list);

We assume no conflicts?

> +	mutex_unlock(&mdevices.list_lock);
> +
> +	retval = device_register(&mdevice->dev);
> +	if (retval) {
> +		mdev_put_device(mdevice);
> +		return retval;
> +	}
> +
> +	mutex_lock(&phy_devices.list_lock);

What are we locking here?  We found phy_dev under lock but we have
absolutely no guarantee that it still exists and holding this mutex
doesn't change that.

> +	if (phy_dev->ops->create) {
> +		retval = phy_dev->ops->create(dev, mdevice->uuid,
> +					      instance, mdev_params);
> +		if (retval)
> +			goto create_failed;
> +	}
> +
> +	retval = mdev_add_attribute_group(&mdevice->dev,
> +					  phy_dev->ops->mdev_attr_groups);
> +	if (retval)
> +		goto create_failed;
> +
> +	mdevice->phy_dev = phy_dev;
> +	mutex_unlock(&phy_devices.list_lock);
> +	mdev_get_device(mdevice);
> +	dev_info(&mdevice->dev, "MDEV: created\n");
> +
> +	return retval;
> +
> +create_failed:
> +	mutex_unlock(&phy_devices.list_lock);
> +	device_unregister(&mdevice->dev);
> +	return retval;
> +}
> +
> +int destroy_mdev_device(uuid_le uuid, uint32_t instance)
> +{
> +	struct mdev_device *vdev;
> +
> +	vdev = find_mdev_device(uuid, instance);
> +
> +	if (!vdev)
> +		return -EINVAL;
> +
> +	mdev_destroy_device(vdev);
> +	return 0;
> +}
> +
> +void get_mdev_supported_types(struct device *dev, char *str)

Is there some defined max for the string?  How do we know how much the
caller has allocated?  Should we have a char** here so we can allocate
it?

> +{
> +	struct phy_device *phy_dev;
> +
> +	phy_dev = find_physical_device(dev);
> +
> +	if (phy_dev) {
> +		mutex_lock(&phy_devices.list_lock);

Again, this lock doesn't protect anything.  We either need a reference
or we need end-to-end locking.

> +		if (phy_dev->ops->supported_config)
> +			phy_dev->ops->supported_config(phy_dev->dev, str);
> +		mutex_unlock(&phy_devices.list_lock);
> +	}
> +}
> +
> +int mdev_start_callback(uuid_le uuid, uint32_t instance)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdevice;
> +	struct phy_device *phy_dev;
> +
> +	mdevice = find_mdev_device(uuid, instance);
> +
> +	if (!mdevice)
> +		return -EINVAL;
> +
> +	phy_dev = mdevice->phy_dev;
> +
> +	mutex_lock(&phy_devices.list_lock);

Ineffective locking...

> +	if (phy_dev->ops->start)
> +		ret = phy_dev->ops->start(mdevice->uuid);
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	if (ret < 0)
> +		pr_err("mdev_start failed  %d\n", ret);
> +	else
> +		kobject_uevent(&mdevice->dev.kobj, KOBJ_ONLINE);
> +
> +	return ret;
> +}
> +
> +int mdev_shutdown_callback(uuid_le uuid, uint32_t instance)
> +{
> +	int ret = 0;
> +	struct mdev_device *mdevice;
> +	struct phy_device *phy_dev;
> +
> +	mdevice = find_mdev_device(uuid, instance);
> +
> +	if (!mdevice)
> +		return -EINVAL;
> +
> +	phy_dev = mdevice->phy_dev;
> +
> +	mutex_lock(&phy_devices.list_lock);
> +	if (phy_dev->ops->shutdown)
> +		ret = phy_dev->ops->shutdown(mdevice->uuid);
> +	mutex_unlock(&phy_devices.list_lock);
> +
> +	if (ret < 0)
> +		pr_err("mdev_shutdown failed %d\n", ret);
> +	else
> +		kobject_uevent(&mdevice->dev.kobj, KOBJ_OFFLINE);
> +
> +	return ret;
> +}
> +
> +static struct class mdev_class = {
> +	.name		= MDEV_CLASS_NAME,
> +	.owner		= THIS_MODULE,
> +	.class_attrs	= mdev_class_attrs,
> +};
> +
> +static int __init mdev_init(void)
> +{
> +	int rc = 0;
> +
> +	mutex_init(&mdevices.list_lock);
> +	INIT_LIST_HEAD(&mdevices.dev_list);
> +	mutex_init(&phy_devices.list_lock);
> +	INIT_LIST_HEAD(&phy_devices.dev_list);
> +
> +	rc = class_register(&mdev_class);
> +	if (rc < 0) {
> +		pr_err("Failed to register mdev class\n");
> +		return rc;
> +	}
> +
> +	rc = mdev_bus_register();
> +	if (rc < 0) {
> +		pr_err("Failed to register mdev bus\n");
> +		class_unregister(&mdev_class);
> +		return rc;
> +	}
> +
> +	return rc;
> +}
> +
> +static void __exit mdev_exit(void)
> +{
> +	mdev_bus_unregister();
> +	class_unregister(&mdev_class);
> +}
> +
> +module_init(mdev_init)
> +module_exit(mdev_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
> diff --git a/drivers/vfio/mdev/mdev-driver.c b/drivers/vfio/mdev/mdev-driver.c
> new file mode 100644
> index 000000000000..bc8a169782bc
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev-driver.c
> @@ -0,0 +1,139 @@
> +/*
> + * MDEV driver
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/device.h>
> +#include <linux/iommu.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +static int mdevice_attach_iommu(struct mdev_device *mdevice)
> +{
> +	int retval = 0;
> +	struct iommu_group *group = NULL;
> +
> +	group = iommu_group_alloc();
> +	if (IS_ERR(group)) {
> +		dev_err(&mdevice->dev, "MDEV: failed to allocate group!\n");
> +		return PTR_ERR(group);
> +	}
> +
> +	retval = iommu_group_add_device(group, &mdevice->dev);
> +	if (retval) {
> +		dev_err(&mdevice->dev, "MDEV: failed to add dev to group!\n");
> +		goto attach_fail;
> +	}
> +
> +	mdevice->group = group;
> +
> +	dev_info(&mdevice->dev, "MDEV: group_id = %d\n",
> +				 iommu_group_id(group));

I assume a lot of these should probably be dev_dbg() or just removed
before we actually think about committing this code.

> +attach_fail:
> +	iommu_group_put(group);
> +	return retval;
> +}
> +
> +static void mdevice_detach_iommu(struct mdev_device *mdevice)
> +{
> +	iommu_group_remove_device(&mdevice->dev);
> +	dev_info(&mdevice->dev, "MDEV: detaching iommu\n");
> +}
> +
> +static int mdevice_probe(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdevice = to_mdev_device(dev);
> +	int status = 0;

status here, retval above, ret in previous file, please use some
consistency.  mdevice vs mdev, same.

> +
> +	status = mdevice_attach_iommu(mdevice);
> +	if (status) {
> +		dev_err(dev, "Failed to attach IOMMU\n");
> +		return status;
> +	}
> +
> +	if (drv && drv->probe)
> +		status = drv->probe(dev);
> +
> +	return status;
> +}
> +
> +static int mdevice_remove(struct device *dev)
> +{
> +	struct mdev_driver *drv = to_mdev_driver(dev->driver);
> +	struct mdev_device *mdevice = to_mdev_device(dev);
> +
> +	if (drv && drv->remove)
> +		drv->remove(dev);
> +
> +	mdevice_detach_iommu(mdevice);
> +
> +	return 0;
> +}
> +
> +static int mdevice_match(struct device *dev, struct device_driver *drv)
> +{
> +	int ret = 0;
> +	struct mdev_driver *mdrv = to_mdev_driver(drv);
> +
> +	if (mdrv && mdrv->match)
> +		ret = mdrv->match(dev);
> +
> +	return ret;
> +}
> +
> +struct bus_type mdev_bus_type = {
> +	.name		= "mdev",
> +	.match		= mdevice_match,
> +	.probe		= mdevice_probe,
> +	.remove		= mdevice_remove,
> +};
> +EXPORT_SYMBOL_GPL(mdev_bus_type);
> +
> +/**
> + * mdev_register_driver - register a new MDEV driver
> + * @drv: the driver to register
> + * @owner: owner module of driver ro register
> + *
> + * Returns a negative value on error, otherwise 0.
> + */
> +int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
> +{
> +	/* initialize common driver fields */
> +	drv->driver.name = drv->name;
> +	drv->driver.bus = &mdev_bus_type;
> +	drv->driver.owner = owner;
> +
> +	/* register with core */
> +	return driver_register(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_register_driver);
> +
> +/**
> + * mdev_unregister_driver - unregister MDEV driver
> + * @drv: the driver to unregister
> + *
> + */
> +void mdev_unregister_driver(struct mdev_driver *drv)
> +{
> +	driver_unregister(&drv->driver);
> +}
> +EXPORT_SYMBOL(mdev_unregister_driver);
> +
> +int mdev_bus_register(void)
> +{
> +	return bus_register(&mdev_bus_type);
> +}
> +
> +void mdev_bus_unregister(void)
> +{
> +	bus_unregister(&mdev_bus_type);
> +}
> diff --git a/drivers/vfio/mdev/mdev-sysfs.c b/drivers/vfio/mdev/mdev-sysfs.c
> new file mode 100644
> index 000000000000..79d351a7a502
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev-sysfs.c
> @@ -0,0 +1,312 @@
> +/*
> + * File attributes for Mediated devices
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/sysfs.h>
> +#include <linux/ctype.h>
> +#include <linux/device.h>
> +#include <linux/slab.h>
> +#include <linux/uuid.h>
> +#include <linux/mdev.h>
> +
> +#include "mdev_private.h"
> +
> +/* Prototypes */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf);
> +static DEVICE_ATTR_RO(mdev_supported_types);
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_create);
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count);
> +static DEVICE_ATTR_WO(mdev_destroy);
> +
> +/* Static functions */
> +
> +#define UUID_CHAR_LENGTH	36
> +#define UUID_BYTE_LENGTH	16
> +
> +#define SUPPORTED_TYPE_BUFFER_LENGTH	1024
> +
> +static inline bool is_uuid_sep(char sep)
> +{
> +	if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
> +		return true;
> +	return false;
> +}
> +
> +static int uuid_parse(const char *str, uuid_le *uuid)
> +{
> +	int i;
> +
> +	if (strlen(str) < UUID_CHAR_LENGTH)
> +		return -1;
> +
> +	for (i = 0; i < UUID_BYTE_LENGTH; i++) {
> +		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
> +			pr_err("%s err", __func__);
> +			return -EINVAL;
> +		}
> +
> +		uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
> +		str += 2;
> +		if (is_uuid_sep(*str))
> +			str++;
> +	}
> +
> +	return 0;
> +}
> +
> +/* Functions */
> +static ssize_t mdev_supported_types_show(struct device *dev,
> +					 struct device_attribute *attr,
> +					 char *buf)
> +{
> +	char *str;
> +	ssize_t n;
> +
> +	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
> +	if (!str)
> +		return -ENOMEM;
> +
> +	get_mdev_supported_types(dev, str);
> +
> +	n = sprintf(buf, "%s\n", str);
> +	kfree(str);
> +
> +	return n;
> +}
> +
> +static ssize_t mdev_create_store(struct device *dev,
> +				 struct device_attribute *attr,
> +				 const char *buf, size_t count)
> +{
> +	char *str, *pstr;
> +	char *uuid_str, *instance_str, *mdev_params = NULL;
> +	uuid_le uuid;
> +	uint32_t instance;
> +	int ret = 0;
> +
> +	pstr = str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_create: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (!str) {
> +		pr_err("mdev_create: mdev instance not present %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	instance_str = strsep(&str, ":");
> +	if (!instance_str) {
> +		pr_err("mdev_create: Empty instance string %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	ret = kstrtouint(instance_str, 0, &instance);
> +	if (ret) {
> +		pr_err("mdev_create: mdev instance parsing error %s\n", buf);
> +		goto create_error;
> +	}
> +
> +	if (!str) {
> +		pr_err("mdev_create: mdev params not specified %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}

Are they necessarily required?  What if the driver doesn't have
multiple types?  The supported_config callback is optional per previous
code.

> +
> +	mdev_params = kstrdup(str, GFP_KERNEL);
> +
> +	if (!mdev_params) {
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_create: UUID parse error %s\n", buf);
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +
> +	if (create_mdev_device(dev, uuid, instance, mdev_params) < 0) {
> +		pr_err("mdev_create: Failed to create mdev device\n");
> +		ret = -EINVAL;
> +		goto create_error;
> +	}
> +	ret = count;
> +
> +create_error:
> +	kfree(mdev_params);
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +static ssize_t mdev_destroy_store(struct device *dev,
> +				  struct device_attribute *attr,
> +				  const char *buf, size_t count)
> +{
> +	char *uuid_str, *str, *pstr;
> +	uuid_le uuid;
> +	unsigned int instance;
> +	int ret;
> +
> +	str = pstr = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!str)
> +		return -ENOMEM;
> +
> +	uuid_str = strsep(&str, ":");
> +	if (!uuid_str) {
> +		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	if (str == NULL) {
> +		pr_err("mdev_destroy: instance not specified %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	ret = kstrtouint(str, 0, &instance);
> +	if (ret) {
> +		pr_err("mdev_destroy: instance parsing error %s\n", buf);
> +		goto destroy_error;
> +	}
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
> +		ret = -EINVAL;
> +		goto destroy_error;
> +	}
> +
> +	ret = destroy_mdev_device(uuid, instance);
> +	if (ret < 0)
> +		goto destroy_error;
> +
> +	ret = count;
> +
> +destroy_error:
> +	kfree(pstr);
> +	return ret;
> +}
> +
> +ssize_t mdev_start_store(struct class *class, struct class_attribute *attr,
> +			 const char *buf, size_t count)
> +{
> +	char *uuid_str;
> +	uuid_le uuid;
> +	int ret = 0;
> +
> +	uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_start: UUID parse error  %s\n", buf);
> +		ret = -EINVAL;
> +		goto start_error;
> +	}
> +
> +	ret = mdev_start_callback(uuid, 0);
> +	if (ret < 0)
> +		goto start_error;
> +
> +	ret = count;
> +
> +start_error:
> +	kfree(uuid_str);
> +	return ret;
> +}
> +
> +ssize_t mdev_shutdown_store(struct class *class, struct class_attribute *attr,
> +			    const char *buf, size_t count)
> +{
> +	char *uuid_str;
> +	uuid_le uuid;
> +	int ret = 0;
> +
> +	uuid_str = kstrndup(buf, count, GFP_KERNEL);
> +
> +	if (!uuid_str)
> +		return -ENOMEM;
> +
> +	if (uuid_parse(uuid_str, &uuid) < 0) {
> +		pr_err("mdev_shutdown: UUID parse error %s\n", buf);
> +		ret = -EINVAL;
> +	}
> +
> +	ret = mdev_shutdown_callback(uuid, 0);
> +	if (ret < 0)
> +		goto shutdown_error;
> +
> +	ret = count;
> +
> +shutdown_error:
> +	kfree(uuid_str);
> +	return ret;
> +
> +}
> +
> +struct class_attribute mdev_class_attrs[] = {
> +	__ATTR_WO(mdev_start),
> +	__ATTR_WO(mdev_shutdown),
> +	__ATTR_NULL
> +};
> +
> +int mdev_create_sysfs_files(struct device *dev)
> +{
> +	int retval;
> +
> +	retval = sysfs_create_file(&dev->kobj,
> +				   &dev_attr_mdev_supported_types.attr);
> +	if (retval) {
> +		pr_err("Failed to create mdev_supported_types sysfs entry\n");
> +		return retval;
> +	}
> +
> +	retval = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	if (retval) {
> +		pr_err("Failed to create mdev_create sysfs entry\n");
> +		return retval;
> +	}
> +
> +	retval = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +	if (retval) {
> +		pr_err("Failed to create mdev_destroy sysfs entry\n");
> +		return retval;
> +	}
> +
> +	return 0;
> +}
> +
> +void mdev_remove_sysfs_files(struct device *dev)
> +{
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
> +	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
> +}
> diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
> new file mode 100644
> index 000000000000..a472310c7749
> --- /dev/null
> +++ b/drivers/vfio/mdev/mdev_private.h
> @@ -0,0 +1,33 @@
> +/*
> + * Mediated device interal definitions
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_PRIVATE_H
> +#define MDEV_PRIVATE_H
> +
> +int  mdev_bus_register(void);
> +void mdev_bus_unregister(void);
> +
> +/* Function prototypes for mdev_sysfs */
> +
> +extern struct class_attribute mdev_class_attrs[];
> +
> +int  mdev_create_sysfs_files(struct device *dev);
> +void mdev_remove_sysfs_files(struct device *dev);
> +
> +int  create_mdev_device(struct device *dev, uuid_le uuid, uint32_t instance,
> +			char *mdev_params);
> +int  destroy_mdev_device(uuid_le uuid, uint32_t instance);
> +void get_mdev_supported_types(struct device *dev, char *str);
> +int  mdev_start_callback(uuid_le uuid, uint32_t instance);
> +int  mdev_shutdown_callback(uuid_le uuid, uint32_t instance);
> +
> +#endif /* MDEV_PRIVATE_H */
> diff --git a/include/linux/mdev.h b/include/linux/mdev.h
> new file mode 100644
> index 000000000000..d9633acd85f2
> --- /dev/null
> +++ b/include/linux/mdev.h
> @@ -0,0 +1,224 @@
> +/*
> + * Mediated device definition
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#ifndef MDEV_H
> +#define MDEV_H
> +
> +/* Common Data structures */
> +
> +struct pci_region_info {
> +	uint64_t start;
> +	uint64_t size;
> +	uint32_t flags;		/*!< VFIO region info flags */

Perhaps more clear to say "Same as vfio_region_info.flags"  Also prefix
with mdev_

What's with this comment style /*!< ?

> +};
> +
> +enum mdev_emul_space {
> +	EMUL_CONFIG_SPACE,	/*!< PCI configuration space */
> +	EMUL_IO,		/*!< I/O register space */
> +	EMUL_MMIO		/*!< Memory-mapped I/O space */
> +};
> +
> +struct phy_device;
> +
> +/*
> + * Mediated device
> + */
> +
> +struct mdev_device {
> +	struct kref		kref;
> +	struct device		dev;
> +	struct phy_device	*phy_dev;
> +	struct iommu_group	*group;
> +	void			*iommu_data;
> +	uuid_le			uuid;
> +	uint32_t		instance;
> +	void			*driver_data;
> +	struct mutex		ops_lock;
> +	struct list_head	next;
> +};

Could this be in the private header?  Seems like this should be opaque
outside of mdev core.

> +
> +
> +/**
> + * struct phy_device_ops - Structure to be registered for each physical device
> + * to register the device to mdev module.
> + *
> + * @owner:		The module owner.
> + * @dev_attr_groups:	Default attributes of the physical device.
> + * @mdev_attr_groups:	Default attributes of the mediated device.
> + * @supported_config:	Called to get information about supported types.
> + *			@dev : device structure of physical device.
> + *			@config: should return string listing supported config
> + *			Returns integer: success (0) or error (< 0)
> + * @create:		Called to allocate basic resources in physical device's
> + *			driver for a particular mediated device
> + *			@dev: physical pci device structure on which mediated
> + *			      device should be created

Not necessarily pci.

> + *			@uuid: VM's uuid for which VM it is intended to
> + *			@instance: mediated instance in that VM
> + *			@mdev_params: extra parameters required by physical
> + *			device's driver.
> + *			Returns integer: success (0) or error (< 0)
> + * @destroy:		Called to free resources in physical device's driver for
> + *			a mediated device instance of that VM.
> + *			@dev: physical device structure to which this mediated
> + *			      device points to.
> + *			@uuid: VM's uuid for which the mediated device belongs
> + *			@instance: mdev instance in that VM
> + *			Returns integer: success (0) or error (< 0)
> + *			If VM is running and destroy() is called that means the
> + *			mdev is being hotunpluged. Return error if VM is running
> + *			and driver doesn't support mediated device hotplug.
> + * @start:		Called to do initiate mediated device initialization
> + *			process in physical device's driver when VM boots before
> + *			qemu starts.
> + *			@uuid: VM's UUID which is booting.
> + *			Returns integer: success (0) or error (< 0)
> + * @shutdown:		Called to teardown mediated device related resources for
> + *			the VM
> + *			@uuid: VM's UUID which is shutting down .
> + *			Returns integer: success (0) or error (< 0)
> + * @read:		Read emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: read buffer
> + *			@count: number bytes to read
> + *			@address_space: specifies for which address
> + *			space the request is: pci_config_space, IO
> + *			register space or MMIO space.

Seems like I asked before and it's no more clear in the code, how do we
handle multiple spaces for various types?  ie. a device might have
multiple MMIO spaces.

> + *			@pos: offset from base address.

What's the base address, zero?

> + *			Retuns number on bytes read on success or error.
> + * @write:		Write emulation callback
> + *			@mdev: mediated device structure
> + *			@buf: write buffer
> + *			@count: number bytes to be written
> + *			@address_space: specifies for which address space the
> + *			request is: pci_config_space, IO register space or MMIO
> + *			space.
> + *			@pos: offset from base address.
> + *			Retuns number on bytes written on success or error.
> + * @set_irqs:		Called to send about interrupts configuration
> + *			information that VMM sets.
> + *			@mdev: mediated device structure
> + *			@flags, index, start, count and *data : same as that of
> + *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
> + * @get_region_info:	Called to get BAR size and flags of mediated device.
> + *			@mdev: mediated device structure
> + *			@region_index: VFIO region index
> + *			@region_info: output, returns size and flags of
> + *				      requested region.

I don't see how vfio regions indexes here map to read/write
address_space and position above.

> + *			Returns integer: success (0) or error (< 0)
> + * @validate_map_request: Validate remap pfn request
> + *			@mdev: mediated device structure
> + *			@virtaddr: target user address to start at
> + *			@pfn: physical address of kernel memory, vendor driver
> + *			      can change if required.
> + *			@size: size of map area, vendor driver can change the
> + *			       size of map area if desired.
> + *			@prot: page protection flags for this mapping, vendor
> + *			       driver can change, if required.
> + *			Returns integer: success (0) or error (< 0)

Still no invalidation interface?

> + *
> + * Physical device that support mediated device should be registered with mdev
> + * module with phy_device_ops structure.
> + */
> +
> +struct phy_device_ops {
> +	struct module   *owner;
> +	const struct attribute_group **dev_attr_groups;
> +	const struct attribute_group **mdev_attr_groups;
> +
> +	int	(*supported_config)(struct device *dev, char *config);
> +	int     (*create)(struct device *dev, uuid_le uuid,
> +			  uint32_t instance, char *mdev_params);
> +	int     (*destroy)(struct device *dev, uuid_le uuid,
> +			   uint32_t instance);
> +	int     (*start)(uuid_le uuid);
> +	int     (*shutdown)(uuid_le uuid);
> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> +			enum mdev_emul_space address_space, loff_t pos);
> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> +			 enum mdev_emul_space address_space, loff_t pos);
> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> +			    unsigned int index, unsigned int start,
> +			    unsigned int count, void *data);
> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> +				 struct pci_region_info *region_info);

*pci*_region_info??

> +	int	(*validate_map_request)(struct mdev_device *vdev,
> +					unsigned long virtaddr,
> +					unsigned long *pfn, unsigned long *size,
> +					pgprot_t *prot);
> +};
> +
> +/*
> + * Physical Device
> + */
> +struct phy_device {
> +	struct device                   *dev;
> +	const struct phy_device_ops     *ops;
> +	struct list_head                next;
> +};

I would really like to be able to use the mediated device interface to
create a purely virtual device, is the expectation that my physical
device interface would create a virtual struct device which would
become the parent and control point in sysfs for creating all the mdev
devices? Should we be calling this a host_device or mdev_parent_dev in
that case since there's really no requirement that it be a physical
device?  Thanks,

Alex

> +
> +/**
> + * struct mdev_driver - Mediated device driver
> + * @name: driver name
> + * @probe: called when new device created
> + * @remove: called when device removed
> + * @match: called when new device or driver is added for this bus. Return 1 if
> + *	   given device can be handled by given driver and zero otherwise.
> + * @driver: device driver structure
> + *
> + **/
> +struct mdev_driver {
> +	const char *name;
> +	int  (*probe)(struct device *dev);
> +	void (*remove)(struct device *dev);
> +	int  (*match)(struct device *dev);
> +	struct device_driver driver;
> +};
> +
> +static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
> +{
> +	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
> +}
> +
> +static inline struct mdev_device *to_mdev_device(struct device *dev)
> +{
> +	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
> +}
> +
> +static inline struct mdev_device *mdev_get_device(struct mdev_device *vdev)
> +{
> +	return (vdev && get_device(&vdev->dev)) ? vdev : NULL;
> +}
> +
> +static inline  void mdev_put_device(struct mdev_device *vdev)
> +{
> +	if (vdev)
> +		put_device(&vdev->dev);
> +}
> +
> +extern struct bus_type mdev_bus_type;
> +
> +#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
> +
> +extern int  mdev_register_device(struct device *dev,
> +				 const struct phy_device_ops *ops);
> +extern void mdev_unregister_device(struct device *dev);
> +
> +extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
> +extern void mdev_unregister_driver(struct mdev_driver *drv);
> +
> +extern int mdev_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
> +				uint32_t len, uint32_t flags);
> +
> +extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
> +
> +#endif /* MDEV_H */
Kirti Wankhede May 26, 2016, 9:03 a.m. UTC | #4
Thanks Alex.

I'll consider all the nits and fix those in next version of patch.

More below:

On 5/26/2016 4:09 AM, Alex Williamson wrote:
> On Wed, 25 May 2016 01:28:15 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>

...

>> +
>> +config MDEV
>> +    tristate "Mediated device driver framework"
>> +    depends on VFIO
>> +    default n
>> +    help
>> +        MDEV provides a framework to virtualize device without
SR-IOV cap
>> +        See Documentation/mdev.txt for more details.
>
> I don't see that file anywhere in this series.

Yes, missed this file in this patch. I'll add it in next version of patch.
Since mdev module is moved in vfio directory, should I place this file
in vfio directory, Documentation/vfio/mdev.txt? or keep documentation of
mdev module within vfio.txt itself?


>> +	if (phy_dev) {
>> +		mutex_lock(&phy_devices.list_lock);
>> +
>> +		/*
>> +		* If vendor driver doesn't return success that means vendor
>> +		* driver doesn't support hot-unplug
>> +		*/
>> +		if (phy_dev->ops->destroy) {
>> +			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
>> +						  mdevice->instance)) {
>> +				mutex_unlock(&phy_devices.list_lock);
>> +				return;
>> +			}
>> +		}
>> +
>> +		mdev_remove_attribute_group(&mdevice->dev,
>> +					    phy_dev->ops->mdev_attr_groups);
>> +		mdevice->phy_dev = NULL;
>> +		mutex_unlock(&phy_devices.list_lock);
>
> Locking here appears arbitrary, how does the above code interact with
> phy_devices.dev_list?
>

Sorry for not being clear about phy_devices.list_lock, probably I
shouldn't have named it 'list_lock'. This lock is also to synchronize
register_device & unregister_device and physical device specific
callbacks: supported_config, create, destroy, start and shutdown.
Although supported_config, create and destroy are per phy_device
specific callbacks while start and shutdown could refer to multiple
phy_devices indirectly when there are multiple mdev devices of same type
on different physical devices. There could be race condition in start
callback and destroy & unregister_device. I'm revisiting this lock again
and will see to use per phy device lock for phy_device specific callbacks.


>> +struct mdev_device {
>> +	struct kref		kref;
>> +	struct device		dev;
>> +	struct phy_device	*phy_dev;
>> +	struct iommu_group	*group;
>> +	void			*iommu_data;
>> +	uuid_le			uuid;
>> +	uint32_t		instance;
>> +	void			*driver_data;
>> +	struct mutex		ops_lock;
>> +	struct list_head	next;
>> +};
>
> Could this be in the private header?  Seems like this should be opaque
> outside of mdev core.
>

No, this structure is used in mediated device call back functions to
vendor driver so that vendor driver could identify mdev device, similar
to pci_dev structure in pci bus subsystem. (I'll remove kref which is
not being used at all.)


>> + * @read:		Read emulation callback
>> + *			@mdev: mediated device structure
>> + *			@buf: read buffer
>> + *			@count: number bytes to read
>> + *			@address_space: specifies for which address
>> + *			space the request is: pci_config_space, IO
>> + *			register space or MMIO space.
>
> Seems like I asked before and it's no more clear in the code, how do we
> handle multiple spaces for various types?  ie. a device might have
> multiple MMIO spaces.
>
>> + *			@pos: offset from base address.

Sorry, updated the code but missed to update comment here.
pos = base_address + offset
(its not 'pos' anymore, will rename it to addr)

so vendor driver is aware about base addresses of multiple MMIO spaces
and its size, they can identify MMIO space based on addr.

>> +/*
>> + * Physical Device
>> + */
>> +struct phy_device {
>> +	struct device                   *dev;
>> +	const struct phy_device_ops     *ops;
>> +	struct list_head                next;
>> +};
>
> I would really like to be able to use the mediated device interface to
> create a purely virtual device, is the expectation that my physical
> device interface would create a virtual struct device which would
> become the parent and control point in sysfs for creating all the mdev
> devices? Should we be calling this a host_device or mdev_parent_dev in
> that case since there's really no requirement that it be a physical
> device?

Makes sense. I'll rename it to parent_device.

Thanks,
Kirti.
Alex Williamson May 26, 2016, 2:06 p.m. UTC | #5
On Thu, 26 May 2016 14:33:39 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Thanks Alex.
> 
> I'll consider all the nits and fix those in next version of patch.
> 
> More below:
> 
> On 5/26/2016 4:09 AM, Alex Williamson wrote:
> > On Wed, 25 May 2016 01:28:15 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >  
> 
> ...
> 
> >> +
> >> +config MDEV
> >> +    tristate "Mediated device driver framework"
> >> +    depends on VFIO
> >> +    default n
> >> +    help
> >> +        MDEV provides a framework to virtualize device without  
> SR-IOV cap
> >> +        See Documentation/mdev.txt for more details.  
> >
> > I don't see that file anywhere in this series.  
> 
> Yes, missed this file in this patch. I'll add it in next version of patch.
> Since mdev module is moved in vfio directory, should I place this file
> in vfio directory, Documentation/vfio/mdev.txt? or keep documentation of
> mdev module within vfio.txt itself?

Maybe just call it vfio-mediated-device.txt

> >> +	if (phy_dev) {
> >> +		mutex_lock(&phy_devices.list_lock);
> >> +
> >> +		/*
> >> +		* If vendor driver doesn't return success that means vendor
> >> +		* driver doesn't support hot-unplug
> >> +		*/
> >> +		if (phy_dev->ops->destroy) {
> >> +			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
> >> +						  mdevice->instance)) {
> >> +				mutex_unlock(&phy_devices.list_lock);
> >> +				return;
> >> +			}
> >> +		}
> >> +
> >> +		mdev_remove_attribute_group(&mdevice->dev,
> >> +					    phy_dev->ops->mdev_attr_groups);
> >> +		mdevice->phy_dev = NULL;
> >> +		mutex_unlock(&phy_devices.list_lock);  
> >
> > Locking here appears arbitrary, how does the above code interact with
> > phy_devices.dev_list?
> >  
> 
> Sorry for not being clear about phy_devices.list_lock, probably I
> shouldn't have named it 'list_lock'. This lock is also to synchronize
> register_device & unregister_device and physical device specific
> callbacks: supported_config, create, destroy, start and shutdown.
> Although supported_config, create and destroy are per phy_device
> specific callbacks while start and shutdown could refer to multiple
> phy_devices indirectly when there are multiple mdev devices of same type
> on different physical devices. There could be race condition in start
> callback and destroy & unregister_device. I'm revisiting this lock again
> and will see to use per phy device lock for phy_device specific callbacks.
> 
> 
> >> +struct mdev_device {
> >> +	struct kref		kref;
> >> +	struct device		dev;
> >> +	struct phy_device	*phy_dev;
> >> +	struct iommu_group	*group;
> >> +	void			*iommu_data;
> >> +	uuid_le			uuid;
> >> +	uint32_t		instance;
> >> +	void			*driver_data;
> >> +	struct mutex		ops_lock;
> >> +	struct list_head	next;
> >> +};  
> >
> > Could this be in the private header?  Seems like this should be opaque
> > outside of mdev core.
> >  
> 
> No, this structure is used in mediated device call back functions to
> vendor driver so that vendor driver could identify mdev device, similar
> to pci_dev structure in pci bus subsystem. (I'll remove kref which is
> not being used at all.)

Personally I'd prefer to see more use of reference counting and less
locking, especially since the locking is mostly ineffective in this
version.

> >> + * @read:		Read emulation callback
> >> + *			@mdev: mediated device structure
> >> + *			@buf: read buffer
> >> + *			@count: number bytes to read
> >> + *			@address_space: specifies for which address
> >> + *			space the request is: pci_config_space, IO
> >> + *			register space or MMIO space.  
> >
> > Seems like I asked before and it's no more clear in the code, how do we
> > handle multiple spaces for various types?  ie. a device might have
> > multiple MMIO spaces.
> >  
> >> + *			@pos: offset from base address.  
> 
> Sorry, updated the code but missed to update comment here.
> pos = base_address + offset
> (its not 'pos' anymore, will rename it to addr)
> 
> so vendor driver is aware about base addresses of multiple MMIO spaces
> and its size, they can identify MMIO space based on addr.

Why not let the vendor driver provide vfio_region_info directly,
including the offset within the device file descriptor thedn the
mediated device core simply pass read/write through without caring what
the address space is?  Thanks,

Alex
 
> >> +/*
> >> + * Physical Device
> >> + */
> >> +struct phy_device {
> >> +	struct device                   *dev;
> >> +	const struct phy_device_ops     *ops;
> >> +	struct list_head                next;
> >> +};  
> >
> > I would really like to be able to use the mediated device interface to
> > create a purely virtual device, is the expectation that my physical
> > device interface would create a virtual struct device which would
> > become the parent and control point in sysfs for creating all the mdev
> > devices? Should we be calling this a host_device or mdev_parent_dev in
> > that case since there's really no requirement that it be a physical
> > device?  
> 
> Makes sense. I'll rename it to parent_device.
> 
> Thanks,
> Kirti.
>
Tian, Kevin May 27, 2016, 9 a.m. UTC | #6
> From: Kirti Wankhede
> Sent: Wednesday, May 25, 2016 10:47 PM
> 
> 
> >> +static struct devices_list {
> >> +	struct list_head    dev_list;
> >> +	struct mutex        list_lock;
> >> +} mdevices, phy_devices;
> >
> > phy_devices -> pdevices? and similarly we can use pdev/mdev
> > pair in other places...
> >
> 
> 'pdevices' sometimes also refers to 'pointer to devices' that's the
> reason I perfer to use phy_devices to represent 'physical devices'

well, I think it should be clear in this context where 'p' means 'physical'.
Just like frequently used pdev in pci files for pci_dev...

> 
> 
> >> +static struct mdev_device *find_mdev_device(uuid_le uuid, int instance)
> >
> > can we just call it "struct mdev* or "mdevice"? "dev_device" looks
> redundant.
> >
> 
> 'struct mdev_device' represents 'device structure for device created by
> mdev module'. Still that doesn't satisfy major folks, I'm open to change
> it.

IMO 'mdev' should be clearly enough to represent a mediated device

> 
> 
> > Sorry I may have to ask same question since I didn't get an answer yet.
> > what exactly does 'instance' mean here? since uuid is unique, why do
> > we need match instance too?
> >
> 
> 'uuid' could be UUID of a VM for whom it is created. To support mutiple
> mediated devices for same VM, name should be unique. Hence we need a
> instance number to identify each mediated device uniquely in one VM.

If UUID alone cannot universally identify a mediated device, what's the
point of explicitly tagging it in kernel? Either we assign a new UUID for
this mdev itself, or possibly better define it as a string? Management 
stack can pass any ID/name in string format which is sufficiently to identify 
mdev to its own purpose? Then in this framework we just do simple
string match...

> 
> 
> 
> >> +		if (phy_dev->ops->destroy) {
> >> +			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
> >> +						  mdevice->instance)) {
> >> +				mutex_unlock(&phy_devices.list_lock);
> >
> > a warning message is preferred. Also better to return -EBUSY here.
> >
> 
> mdev_destroy_device() is called from 2 paths, one is sysfs mdev_destroy
> and mdev_unregister_device(). For the later case, return from here will
> any ways ignored. mdev_unregister_device() is called from the remove
> function of physical device and that doesn't care about return error, it
> just removes the device from subsystem.

Regardless of whether caller will handle error, this function itself should
return error since it makes sense in other path e.g. sysfs to let user
know what's happening.

> 
> >> +				return;
> >> +			}
> >> +		}
> >> +
> >> +		mdev_remove_attribute_group(&mdevice->dev,
> >> +					    phy_dev->ops->mdev_attr_groups);
> >> +		mdevice->phy_dev = NULL;
> >
> > Am I missing something here? You didn't remove this mdev node from
> > the list, and below...
> >
> 
> device_unregister() calls put_device(dev) and if refcount is zero its
> release function is called, which is mdev_device_release(), that is
> hooked during device_register(). This node is removed from list from
> mdev_device_release().

I'm not sure whether there'll be some race condition here, since you
put a defunc mdev on the list... 

> 
> >> +	phy_dev->dev = dev;
> >> +	phy_dev->ops = ops;
> >> +
> >> +	mutex_lock(&phy_devices.list_lock);
> >> +	ret = mdev_create_sysfs_files(dev);
> >> +	if (ret)
> >> +		goto add_sysfs_error;
> >> +
> >> +	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
> >> +	if (ret)
> >> +		goto add_group_error;
> >
> > any reason to include sysfs operations inside the mutex which is
> > purely about phy_devices list?
> >
> 
> dev_attr_groups attribute is for physical device, hence inside
> phy_devices.list_lock.

phy_devices.list_lock only protects the list, when you plan to add a
new phy_device node after it's initialized and get ready. sysfs
attribute setup is still part of initialization.


Thanks
Kevin
Dong Jia Shi June 3, 2016, 8:57 a.m. UTC | #7
On Wed, 25 May 2016 01:28:15 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:


...snip...

> +struct phy_device_ops {
> +	struct module   *owner;
> +	const struct attribute_group **dev_attr_groups;
> +	const struct attribute_group **mdev_attr_groups;
> +
> +	int	(*supported_config)(struct device *dev, char *config);
> +	int     (*create)(struct device *dev, uuid_le uuid,
> +			  uint32_t instance, char *mdev_params);
> +	int     (*destroy)(struct device *dev, uuid_le uuid,
> +			   uint32_t instance);
> +	int     (*start)(uuid_le uuid);
> +	int     (*shutdown)(uuid_le uuid);
> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> +			enum mdev_emul_space address_space, loff_t pos);
> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> +			 enum mdev_emul_space address_space, loff_t pos);
> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> +			    unsigned int index, unsigned int start,
> +			    unsigned int count, void *data);
> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> +				 struct pci_region_info *region_info);
> +	int	(*validate_map_request)(struct mdev_device *vdev,
> +					unsigned long virtaddr,
> +					unsigned long *pfn, unsigned long *size,
> +					pgprot_t *prot);
> +};

Dear Kirti:

When I rebased my vfio-ccw patches on this series, I found I need an
extra 'ioctl' callback in phy_device_ops.

The ccw physical device only supports one ccw mediated device. And I
have two new ioctl commands for the ccw mediated device. One is 
to hot-reset the resource in the physical device that allocated for
the mediated device, the other is to do an I/O instruction translation
and perform an I/O operation on the physical device. I found the
existing callbacks could not meet my requirements.

Something like the following would be fine for my case:
	int (*ioctl)(struct mdev_device *vdev,
		     unsigned int cmd,
		     unsigned long arg);

What do you think about this?

--------
Dong Jia
Tian, Kevin June 3, 2016, 9:40 a.m. UTC | #8
> From: Dong Jia [mailto:bjsdjshi@linux.vnet.ibm.com]
> Sent: Friday, June 03, 2016 4:58 PM
> 
> 
> ...snip...
> 
> > +struct phy_device_ops {
> > +	struct module   *owner;
> > +	const struct attribute_group **dev_attr_groups;
> > +	const struct attribute_group **mdev_attr_groups;
> > +
> > +	int	(*supported_config)(struct device *dev, char *config);
> > +	int     (*create)(struct device *dev, uuid_le uuid,
> > +			  uint32_t instance, char *mdev_params);
> > +	int     (*destroy)(struct device *dev, uuid_le uuid,
> > +			   uint32_t instance);
> > +	int     (*start)(uuid_le uuid);
> > +	int     (*shutdown)(uuid_le uuid);
> > +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> > +			enum mdev_emul_space address_space, loff_t pos);
> > +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> > +			 enum mdev_emul_space address_space, loff_t pos);
> > +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> > +			    unsigned int index, unsigned int start,
> > +			    unsigned int count, void *data);
> > +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> > +				 struct pci_region_info *region_info);
> > +	int	(*validate_map_request)(struct mdev_device *vdev,
> > +					unsigned long virtaddr,
> > +					unsigned long *pfn, unsigned long *size,
> > +					pgprot_t *prot);
> > +};
> 
> Dear Kirti:
> 
> When I rebased my vfio-ccw patches on this series, I found I need an
> extra 'ioctl' callback in phy_device_ops.
> 
> The ccw physical device only supports one ccw mediated device. And I
> have two new ioctl commands for the ccw mediated device. One is
> to hot-reset the resource in the physical device that allocated for
> the mediated device, the other is to do an I/O instruction translation
> and perform an I/O operation on the physical device. I found the
> existing callbacks could not meet my requirements.
> 
> Something like the following would be fine for my case:
> 	int (*ioctl)(struct mdev_device *vdev,
> 		     unsigned int cmd,
> 		     unsigned long arg);
> 
> What do you think about this?
> 

'reset' should be generic. better to define an individual callback
for it (then we can also expose a node under vgpu path in sysfs).

Thanks
Kevin
Dong Jia Shi June 6, 2016, 2:24 a.m. UTC | #9
On Fri, 3 Jun 2016 09:40:16 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Dong Jia [mailto:bjsdjshi@linux.vnet.ibm.com]
> > Sent: Friday, June 03, 2016 4:58 PM
> > 
> > 
> > ...snip...
> > 
> > > +struct phy_device_ops {
> > > +	struct module   *owner;
> > > +	const struct attribute_group **dev_attr_groups;
> > > +	const struct attribute_group **mdev_attr_groups;
> > > +
> > > +	int	(*supported_config)(struct device *dev, char *config);
> > > +	int     (*create)(struct device *dev, uuid_le uuid,
> > > +			  uint32_t instance, char *mdev_params);
> > > +	int     (*destroy)(struct device *dev, uuid_le uuid,
> > > +			   uint32_t instance);
> > > +	int     (*start)(uuid_le uuid);
> > > +	int     (*shutdown)(uuid_le uuid);
> > > +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> > > +			enum mdev_emul_space address_space, loff_t pos);
> > > +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> > > +			 enum mdev_emul_space address_space, loff_t pos);
> > > +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> > > +			    unsigned int index, unsigned int start,
> > > +			    unsigned int count, void *data);
> > > +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> > > +				 struct pci_region_info *region_info);
> > > +	int	(*validate_map_request)(struct mdev_device *vdev,
> > > +					unsigned long virtaddr,
> > > +					unsigned long *pfn, unsigned long *size,
> > > +					pgprot_t *prot);
> > > +};
> > 
> > Dear Kirti:
> > 
> > When I rebased my vfio-ccw patches on this series, I found I need an
> > extra 'ioctl' callback in phy_device_ops.
> > 
> > The ccw physical device only supports one ccw mediated device. And I
> > have two new ioctl commands for the ccw mediated device. One is
> > to hot-reset the resource in the physical device that allocated for
> > the mediated device, the other is to do an I/O instruction translation
> > and perform an I/O operation on the physical device. I found the
> > existing callbacks could not meet my requirements.
> > 
> > Something like the following would be fine for my case:
> > 	int (*ioctl)(struct mdev_device *vdev,
> > 		     unsigned int cmd,
> > 		     unsigned long arg);
> > 
> > What do you think about this?
> > 
> 
> 'reset' should be generic. better to define an individual callback
> for it (then we can also expose a node under vgpu path in sysfs).
> 
Sounds reasonable for me. :>

> Thanks
> Kevin
> 



--------
Dong Jia
Kirti Wankhede June 6, 2016, 5:27 a.m. UTC | #10
On 6/3/2016 2:27 PM, Dong Jia wrote:
> On Wed, 25 May 2016 01:28:15 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> 
> ...snip...
> 
>> +struct phy_device_ops {
>> +	struct module   *owner;
>> +	const struct attribute_group **dev_attr_groups;
>> +	const struct attribute_group **mdev_attr_groups;
>> +
>> +	int	(*supported_config)(struct device *dev, char *config);
>> +	int     (*create)(struct device *dev, uuid_le uuid,
>> +			  uint32_t instance, char *mdev_params);
>> +	int     (*destroy)(struct device *dev, uuid_le uuid,
>> +			   uint32_t instance);
>> +	int     (*start)(uuid_le uuid);
>> +	int     (*shutdown)(uuid_le uuid);
>> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
>> +			enum mdev_emul_space address_space, loff_t pos);
>> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
>> +			 enum mdev_emul_space address_space, loff_t pos);
>> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
>> +			    unsigned int index, unsigned int start,
>> +			    unsigned int count, void *data);
>> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
>> +				 struct pci_region_info *region_info);
>> +	int	(*validate_map_request)(struct mdev_device *vdev,
>> +					unsigned long virtaddr,
>> +					unsigned long *pfn, unsigned long *size,
>> +					pgprot_t *prot);
>> +};
> 
> Dear Kirti:
> 
> When I rebased my vfio-ccw patches on this series, I found I need an
> extra 'ioctl' callback in phy_device_ops.
> 

Thanks for taking closer look. As per my knowledge ccw is not PCI
device, right? Correct me if I'm wrong. I'm curious to know. Are you
planning to write a driver (vfio-mccw) for mediated ccw device?

Thanks,
Kirti

> The ccw physical device only supports one ccw mediated device. And I
> have two new ioctl commands for the ccw mediated device. One is 
> to hot-reset the resource in the physical device that allocated for
> the mediated device, the other is to do an I/O instruction translation
> and perform an I/O operation on the physical device. I found the
> existing callbacks could not meet my requirements.
> 
> Something like the following would be fine for my case:
> 	int (*ioctl)(struct mdev_device *vdev,
> 		     unsigned int cmd,
> 		     unsigned long arg);
> 
> What do you think about this?
> 
> --------
> Dong Jia
>
Dong Jia Shi June 6, 2016, 6:01 a.m. UTC | #11
On Mon, 6 Jun 2016 10:57:49 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> 
> 
> On 6/3/2016 2:27 PM, Dong Jia wrote:
> > On Wed, 25 May 2016 01:28:15 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > 
> > 
> > ...snip...
> > 
> >> +struct phy_device_ops {
> >> +	struct module   *owner;
> >> +	const struct attribute_group **dev_attr_groups;
> >> +	const struct attribute_group **mdev_attr_groups;
> >> +
> >> +	int	(*supported_config)(struct device *dev, char *config);
> >> +	int     (*create)(struct device *dev, uuid_le uuid,
> >> +			  uint32_t instance, char *mdev_params);
> >> +	int     (*destroy)(struct device *dev, uuid_le uuid,
> >> +			   uint32_t instance);
> >> +	int     (*start)(uuid_le uuid);
> >> +	int     (*shutdown)(uuid_le uuid);
> >> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> >> +			enum mdev_emul_space address_space, loff_t pos);
> >> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> >> +			 enum mdev_emul_space address_space, loff_t pos);
> >> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> >> +			    unsigned int index, unsigned int start,
> >> +			    unsigned int count, void *data);
> >> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> >> +				 struct pci_region_info *region_info);
> >> +	int	(*validate_map_request)(struct mdev_device *vdev,
> >> +					unsigned long virtaddr,
> >> +					unsigned long *pfn, unsigned long *size,
> >> +					pgprot_t *prot);
> >> +};
> > 
> > Dear Kirti:
> > 
> > When I rebased my vfio-ccw patches on this series, I found I need an
> > extra 'ioctl' callback in phy_device_ops.
> > 
> 
> Thanks for taking closer look. As per my knowledge ccw is not PCI
> device, right? Correct me if I'm wrong.
Dear Kirti:

You are right. CCW is different to PCI. The official term is 'Channel
I/O device'. They use 'Channels' (co-processors) and CCWs (channel
command words) to handle I/O operations.

> I'm curious to know. Are you planning to write a driver (vfio-mccw) for
> mediated ccw device?
I wrote two drivers:
1. A vfio-pccw driver for the physical ccw device, which will reigister
the device and callbacks to mdev framework. With this, I could create
a mediated ccw device for the physical one then.
2. A vfio-mccw driver for the mediated ccw device, which will add
itself to a vfio_group, mimiced what vfio-mpci did.

The problem is, vfio-mccw need to implement new ioctls besides the
existing ones (VFIO_DEVICE_GET_INFO, etc). And these ioctls really need
the physical device help to handle.

> 
> Thanks,
> Kirti
> 
> > The ccw physical device only supports one ccw mediated device. And I
> > have two new ioctl commands for the ccw mediated device. One is 
> > to hot-reset the resource in the physical device that allocated for
> > the mediated device, the other is to do an I/O instruction translation
> > and perform an I/O operation on the physical device. I found the
> > existing callbacks could not meet my requirements.
> > 
> > Something like the following would be fine for my case:
> > 	int (*ioctl)(struct mdev_device *vdev,
> > 		     unsigned int cmd,
> > 		     unsigned long arg);
> > 
> > What do you think about this?
> > 
> > --------
> > Dong Jia
> > 
> 

--------
Dong Jia
Neo Jia June 6, 2016, 6:27 a.m. UTC | #12
On Mon, Jun 06, 2016 at 02:01:48PM +0800, Dong Jia wrote:
> On Mon, 6 Jun 2016 10:57:49 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
> > 
> > 
> > On 6/3/2016 2:27 PM, Dong Jia wrote:
> > > On Wed, 25 May 2016 01:28:15 +0530
> > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > 
> > > 
> > > ...snip...
> > > 
> > >> +struct phy_device_ops {
> > >> +	struct module   *owner;
> > >> +	const struct attribute_group **dev_attr_groups;
> > >> +	const struct attribute_group **mdev_attr_groups;
> > >> +
> > >> +	int	(*supported_config)(struct device *dev, char *config);
> > >> +	int     (*create)(struct device *dev, uuid_le uuid,
> > >> +			  uint32_t instance, char *mdev_params);
> > >> +	int     (*destroy)(struct device *dev, uuid_le uuid,
> > >> +			   uint32_t instance);
> > >> +	int     (*start)(uuid_le uuid);
> > >> +	int     (*shutdown)(uuid_le uuid);
> > >> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> > >> +			enum mdev_emul_space address_space, loff_t pos);
> > >> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> > >> +			 enum mdev_emul_space address_space, loff_t pos);
> > >> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> > >> +			    unsigned int index, unsigned int start,
> > >> +			    unsigned int count, void *data);
> > >> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> > >> +				 struct pci_region_info *region_info);
> > >> +	int	(*validate_map_request)(struct mdev_device *vdev,
> > >> +					unsigned long virtaddr,
> > >> +					unsigned long *pfn, unsigned long *size,
> > >> +					pgprot_t *prot);
> > >> +};
> > > 
> > > Dear Kirti:
> > > 
> > > When I rebased my vfio-ccw patches on this series, I found I need an
> > > extra 'ioctl' callback in phy_device_ops.
> > > 
> > 
> > Thanks for taking closer look. As per my knowledge ccw is not PCI
> > device, right? Correct me if I'm wrong.
> Dear Kirti:
> 
> You are right. CCW is different to PCI. The official term is 'Channel
> I/O device'. They use 'Channels' (co-processors) and CCWs (channel
> command words) to handle I/O operations.
> 
> > I'm curious to know. Are you planning to write a driver (vfio-mccw) for
> > mediated ccw device?
> I wrote two drivers:
> 1. A vfio-pccw driver for the physical ccw device, which will reigister
> the device and callbacks to mdev framework. With this, I could create
> a mediated ccw device for the physical one then.
> 2. A vfio-mccw driver for the mediated ccw device, which will add
> itself to a vfio_group, mimiced what vfio-mpci did.
> 
> The problem is, vfio-mccw need to implement new ioctls besides the
> existing ones (VFIO_DEVICE_GET_INFO, etc). And these ioctls really need
> the physical device help to handle.

Hi Dong,

Could you please help us understand a bit more about the new VFIO ioctl? Since it is
a new ioctl it is send down by QEMU in this case right? More details?

Thanks,
Neo

> 
> > 
> > Thanks,
> > Kirti
> > 
> > > The ccw physical device only supports one ccw mediated device. And I
> > > have two new ioctl commands for the ccw mediated device. One is 
> > > to hot-reset the resource in the physical device that allocated for
> > > the mediated device, the other is to do an I/O instruction translation
> > > and perform an I/O operation on the physical device. I found the
> > > existing callbacks could not meet my requirements.
> > > 
> > > Something like the following would be fine for my case:
> > > 	int (*ioctl)(struct mdev_device *vdev,
> > > 		     unsigned int cmd,
> > > 		     unsigned long arg);
> > > 
> > > What do you think about this?
> > > 
> > > --------
> > > Dong Jia
> > > 
> > 
> 
> --------
> Dong Jia
>
Dong Jia Shi June 6, 2016, 8:29 a.m. UTC | #13
On Sun, 5 Jun 2016 23:27:42 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Mon, Jun 06, 2016 at 02:01:48PM +0800, Dong Jia wrote:
> > On Mon, 6 Jun 2016 10:57:49 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > 
> > > 
> > > 
> > > On 6/3/2016 2:27 PM, Dong Jia wrote:
> > > > On Wed, 25 May 2016 01:28:15 +0530
> > > > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> > > > 
> > > > 
> > > > ...snip...
> > > > 
> > > >> +struct phy_device_ops {
> > > >> +	struct module   *owner;
> > > >> +	const struct attribute_group **dev_attr_groups;
> > > >> +	const struct attribute_group **mdev_attr_groups;
> > > >> +
> > > >> +	int	(*supported_config)(struct device *dev, char *config);
> > > >> +	int     (*create)(struct device *dev, uuid_le uuid,
> > > >> +			  uint32_t instance, char *mdev_params);
> > > >> +	int     (*destroy)(struct device *dev, uuid_le uuid,
> > > >> +			   uint32_t instance);
> > > >> +	int     (*start)(uuid_le uuid);
> > > >> +	int     (*shutdown)(uuid_le uuid);
> > > >> +	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
> > > >> +			enum mdev_emul_space address_space, loff_t pos);
> > > >> +	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
> > > >> +			 enum mdev_emul_space address_space, loff_t pos);
> > > >> +	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
> > > >> +			    unsigned int index, unsigned int start,
> > > >> +			    unsigned int count, void *data);
> > > >> +	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
> > > >> +				 struct pci_region_info *region_info);
> > > >> +	int	(*validate_map_request)(struct mdev_device *vdev,
> > > >> +					unsigned long virtaddr,
> > > >> +					unsigned long *pfn, unsigned long *size,
> > > >> +					pgprot_t *prot);
> > > >> +};
> > > > 
> > > > Dear Kirti:
> > > > 
> > > > When I rebased my vfio-ccw patches on this series, I found I need an
> > > > extra 'ioctl' callback in phy_device_ops.
> > > > 
> > > 
> > > Thanks for taking closer look. As per my knowledge ccw is not PCI
> > > device, right? Correct me if I'm wrong.
> > Dear Kirti:
> > 
> > You are right. CCW is different to PCI. The official term is 'Channel
> > I/O device'. They use 'Channels' (co-processors) and CCWs (channel
> > command words) to handle I/O operations.
> > 
> > > I'm curious to know. Are you planning to write a driver (vfio-mccw) for
> > > mediated ccw device?
> > I wrote two drivers:
> > 1. A vfio-pccw driver for the physical ccw device, which will reigister
> > the device and callbacks to mdev framework. With this, I could create
> > a mediated ccw device for the physical one then.
> > 2. A vfio-mccw driver for the mediated ccw device, which will add
> > itself to a vfio_group, mimiced what vfio-mpci did.
> > 
> > The problem is, vfio-mccw need to implement new ioctls besides the
> > existing ones (VFIO_DEVICE_GET_INFO, etc). And these ioctls really need
> > the physical device help to handle.
> 
> Hi Dong,
> 
> Could you please help us understand a bit more about the new VFIO ioctl?
Dear Neo,

Sure, with pleasure.
Since I tried not to bring too much ccw specific technical details
here, I wrote quite briefly. Please feel free to ask me for
supplements. :>

> Since it is a new ioctl it is send down by QEMU in this case right?
Right.

> More details?
As mentioned in the former emails, I currently added two new ioctl
commands.

1. VFIO_DEVICE_CCW_HOT_RESET
Both the name and the purpose of this command looks pretty the same
with VFIO_DEVICE_PCI_HOT_RESET.
Since Kevin proposed an individual callback for hot-reset handling, I
believe this will not be a problem (only if you accept the proposal).

2. VFIO_DEVICE_CCW_CMD_REQUEST
This intends to handle an intercepted channel I/O instruction. It
basically need to do the following thing:
  a. Copy the raw data of the CCW program (a group of chained CCWs) from
     user into kernel space buffers.
  b. Do CCW program translation based on the raw data to get a
     real-device runnable CCW program. We'd pin pages for those CCWs
     which have memory space pointers for their offload, and update the
     CCW program with the pinned results (phys).
  c. Issue the translated CCW program to a real-device to perform the
     I/O operation, and wait for the I/O result interrupt.
  d. Once we got the I/O result, copy the result back to user, and
     unpin the pages.

Step c could only be done by the physical device driver, since it's it
that the int_handler belongs to.
Step b and d should be done by the physical device driver. Or we'd
pin/unpin pages in the mediated device driver?

That's why I asked for the new callback.

> 
> Thanks,
> Neo
> 
> > 
> > > 
> > > Thanks,
> > > Kirti
> > > 
> > > > The ccw physical device only supports one ccw mediated device. And I
> > > > have two new ioctl commands for the ccw mediated device. One is 
> > > > to hot-reset the resource in the physical device that allocated for
> > > > the mediated device, the other is to do an I/O instruction translation
> > > > and perform an I/O operation on the physical device. I found the
> > > > existing callbacks could not meet my requirements.
> > > > 
> > > > Something like the following would be fine for my case:
> > > > 	int (*ioctl)(struct mdev_device *vdev,
> > > > 		     unsigned int cmd,
> > > > 		     unsigned long arg);
> > > > 
> > > > What do you think about this?
> > > > 
> > > > --------
> > > > Dong Jia
> > > > 
> > > 
> > 
> > --------
> > Dong Jia
> > 
> 



--------
Dong Jia
Neo Jia June 6, 2016, 5:44 p.m. UTC | #14
On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:
> On Sun, 5 Jun 2016 23:27:42 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> 2. VFIO_DEVICE_CCW_CMD_REQUEST
> This intends to handle an intercepted channel I/O instruction. It
> basically need to do the following thing:

May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
first place?

Thanks,
Neo

>   a. Copy the raw data of the CCW program (a group of chained CCWs) from
>      user into kernel space buffers.
>   b. Do CCW program translation based on the raw data to get a
>      real-device runnable CCW program. We'd pin pages for those CCWs
>      which have memory space pointers for their offload, and update the
>      CCW program with the pinned results (phys).
>   c. Issue the translated CCW program to a real-device to perform the
>      I/O operation, and wait for the I/O result interrupt.
>   d. Once we got the I/O result, copy the result back to user, and
>      unpin the pages.
> 
> Step c could only be done by the physical device driver, since it's it
> that the int_handler belongs to.
> Step b and d should be done by the physical device driver. Or we'd
> pin/unpin pages in the mediated device driver?
> 
> That's why I asked for the new callback.
>
Alex Williamson June 6, 2016, 7:31 p.m. UTC | #15
On Mon, 6 Jun 2016 10:44:25 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:
> > On Sun, 5 Jun 2016 23:27:42 -0700
> > Neo Jia <cjia@nvidia.com> wrote:
> > 
> > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > This intends to handle an intercepted channel I/O instruction. It
> > basically need to do the following thing:  
> 
> May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> first place?

Yep, this is my question as well.  It sounds a bit like there's an
emulated device in QEMU that's trying to tell the mediated device when
to start an operation when we probably should be passing through
whatever i/o operations indicate that status directly to the mediated
device. Thanks,

Alex
Tian, Kevin June 7, 2016, 3:03 a.m. UTC | #16
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Tuesday, June 07, 2016 3:31 AM
> 
> On Mon, 6 Jun 2016 10:44:25 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:
> > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > Neo Jia <cjia@nvidia.com> wrote:
> > >
> > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > This intends to handle an intercepted channel I/O instruction. It
> > > basically need to do the following thing:
> >
> > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > first place?
> 
> Yep, this is my question as well.  It sounds a bit like there's an
> emulated device in QEMU that's trying to tell the mediated device when
> to start an operation when we probably should be passing through
> whatever i/o operations indicate that status directly to the mediated
> device. Thanks,
> 
> Alex

Below is copied from Dong's earlier post which said clear that
a guest cmd submission will trigger the whole flow:

----
Explanation:
Q1-Q4: Qemu side process.
K1-K6: Kernel side process.

Q1. Intercept a ssch instruction.
Q2. Translate the guest ccw program to a user space ccw program
    (u_ccwchain).
Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
    K1. Copy from u_ccwchain to kernel (k_ccwchain).
    K2. Translate the user space ccw program to a kernel space ccw
        program, which becomes runnable for a real device.
    K3. With the necessary information contained in the orb passed in
        by Qemu, issue the k_ccwchain to the device, and wait event q
        for the I/O result.
    K4. Interrupt handler gets the I/O result, and wakes up the wait q.
    K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
        update the user space irb.
    K6. Copy irb and scsw back to user space.
Q4. Update the irb for the guest.
----

My understanding is that such thing belongs to how device is mediated
(so device driver specific), instead of something to be abstracted in 
VFIO which manages resource but doesn't care how resource is used.

Actually we have same requirement in vGPU case, that a guest driver 
needs submit GPU commands through some MMIO register. vGPU device 
model will intercept the submission request (in its own way), do its 
necessary scan/audit to ensure correctness/security, and then submit 
to physical GPU through vendor specific interface. 

No difference with channel I/O here.

Thanks
Kevin
Alex Williamson June 7, 2016, 10:42 p.m. UTC | #17
On Tue, 7 Jun 2016 03:03:32 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Tuesday, June 07, 2016 3:31 AM
> > 
> > On Mon, 6 Jun 2016 10:44:25 -0700
> > Neo Jia <cjia@nvidia.com> wrote:
> >   
> > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:  
> > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > Neo Jia <cjia@nvidia.com> wrote:
> > > >
> > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > This intends to handle an intercepted channel I/O instruction. It
> > > > basically need to do the following thing:  
> > >
> > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > first place?  
> > 
> > Yep, this is my question as well.  It sounds a bit like there's an
> > emulated device in QEMU that's trying to tell the mediated device when
> > to start an operation when we probably should be passing through
> > whatever i/o operations indicate that status directly to the mediated
> > device. Thanks,
> > 
> > Alex  
> 
> Below is copied from Dong's earlier post which said clear that
> a guest cmd submission will trigger the whole flow:
> 
> ----
> Explanation:
> Q1-Q4: Qemu side process.
> K1-K6: Kernel side process.
> 
> Q1. Intercept a ssch instruction.
> Q2. Translate the guest ccw program to a user space ccw program
>     (u_ccwchain).
> Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
>     K1. Copy from u_ccwchain to kernel (k_ccwchain).
>     K2. Translate the user space ccw program to a kernel space ccw
>         program, which becomes runnable for a real device.
>     K3. With the necessary information contained in the orb passed in
>         by Qemu, issue the k_ccwchain to the device, and wait event q
>         for the I/O result.
>     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
>     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
>         update the user space irb.
>     K6. Copy irb and scsw back to user space.
> Q4. Update the irb for the guest.
> ----

Right, but this was the pre-mediated device approach, now we no longer
need step Q2 so we really only need Q1 and therefore Q3 to exist in
QEMU if those are operations that are not visible to the mediated
device; which they very well might be, since it's described as an
instruction rather than an i/o operation.  It's not terrible if that's
the case, vfio-pci has its own ioctl for doing a hot reset.
 
> My understanding is that such thing belongs to how device is mediated
> (so device driver specific), instead of something to be abstracted in 
> VFIO which manages resource but doesn't care how resource is used.
> 
> Actually we have same requirement in vGPU case, that a guest driver 
> needs submit GPU commands through some MMIO register. vGPU device 
> model will intercept the submission request (in its own way), do its 
> necessary scan/audit to ensure correctness/security, and then submit 
> to physical GPU through vendor specific interface. 
> 
> No difference with channel I/O here.

Well, if the GPU command is submitted through an MMIO register, is that
MMIO register part of the mediated device?  If so, could the mediated
device recognize the command and do the scan/audit itself?  QEMU must
not be the point at which mediation occurs for security purposes, QEMU
is userspace and userspace is not to be trusted.  I'm still open to
ioctls where it makes sense, as above, we have PCI specific ioctls and
already, but we need to evaluate each one, why it needs to exist, and
whether we can skip it if the mediated device can trigger the action on
its own.  After all, that's why we're using the vfio api, so we can
re-use much of the existing infrastructure, especially for a vGPU that
exposes itself as a PCI device.  Thanks,

Alex
Tian, Kevin June 8, 2016, 1:18 a.m. UTC | #18
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, June 08, 2016 6:42 AM
> 
> On Tue, 7 Jun 2016 03:03:32 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Tuesday, June 07, 2016 3:31 AM
> > >
> > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > Neo Jia <cjia@nvidia.com> wrote:
> > >
> > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:
> > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > >
> > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > basically need to do the following thing:
> > > >
> > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > first place?
> > >
> > > Yep, this is my question as well.  It sounds a bit like there's an
> > > emulated device in QEMU that's trying to tell the mediated device when
> > > to start an operation when we probably should be passing through
> > > whatever i/o operations indicate that status directly to the mediated
> > > device. Thanks,
> > >
> > > Alex
> >
> > Below is copied from Dong's earlier post which said clear that
> > a guest cmd submission will trigger the whole flow:
> >
> > ----
> > Explanation:
> > Q1-Q4: Qemu side process.
> > K1-K6: Kernel side process.
> >
> > Q1. Intercept a ssch instruction.
> > Q2. Translate the guest ccw program to a user space ccw program
> >     (u_ccwchain).
> > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> >     K2. Translate the user space ccw program to a kernel space ccw
> >         program, which becomes runnable for a real device.
> >     K3. With the necessary information contained in the orb passed in
> >         by Qemu, issue the k_ccwchain to the device, and wait event q
> >         for the I/O result.
> >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> >         update the user space irb.
> >     K6. Copy irb and scsw back to user space.
> > Q4. Update the irb for the guest.
> > ----
> 
> Right, but this was the pre-mediated device approach, now we no longer
> need step Q2 so we really only need Q1 and therefore Q3 to exist in
> QEMU if those are operations that are not visible to the mediated
> device; which they very well might be, since it's described as an
> instruction rather than an i/o operation.  It's not terrible if that's
> the case, vfio-pci has its own ioctl for doing a hot reset.



> 
> > My understanding is that such thing belongs to how device is mediated
> > (so device driver specific), instead of something to be abstracted in
> > VFIO which manages resource but doesn't care how resource is used.
> >
> > Actually we have same requirement in vGPU case, that a guest driver
> > needs submit GPU commands through some MMIO register. vGPU device
> > model will intercept the submission request (in its own way), do its
> > necessary scan/audit to ensure correctness/security, and then submit
> > to physical GPU through vendor specific interface.
> >
> > No difference with channel I/O here.
> 
> Well, if the GPU command is submitted through an MMIO register, is that
> MMIO register part of the mediated device?  If so, could the mediated
> device recognize the command and do the scan/audit itself?  QEMU must
> not be the point at which mediation occurs for security purposes, QEMU
> is userspace and userspace is not to be trusted.  I'm still open to
> ioctls where it makes sense, as above, we have PCI specific ioctls and
> already, but we need to evaluate each one, why it needs to exist, and
> whether we can skip it if the mediated device can trigger the action on
> its own.  After all, that's why we're using the vfio api, so we can
> re-use much of the existing infrastructure, especially for a vGPU that
> exposes itself as a PCI device.  Thanks,
> 

My point is that a guest submission on vGPU is just a normal trapped 
register write, which is forwarded from Qemu to VFIO through pwrite 
interface and then hit mediated vGPU device. The mediated device
will recognize this register write as a submission request and then do
necessary scan (looks we are saying same thing) and then submit to
physical device driver. If loading ccw cmds on channel i/o are also 
through some I/O registers, it can be implemented same way w/o
introducing new ioctl. The r/w handler of mediated device can figure
out whether it's a ccw submission or not. But my understanding might 
be wrong here.

Thanks
Kevin
Alex Williamson June 8, 2016, 1:39 a.m. UTC | #19
On Wed, 8 Jun 2016 01:18:42 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Wednesday, June 08, 2016 6:42 AM
> > 
> > On Tue, 7 Jun 2016 03:03:32 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > >
> > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > Neo Jia <cjia@nvidia.com> wrote:
> > > >  
> > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:  
> > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > >
> > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > basically need to do the following thing:  
> > > > >
> > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > first place?  
> > > >
> > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > to start an operation when we probably should be passing through
> > > > whatever i/o operations indicate that status directly to the mediated
> > > > device. Thanks,
> > > >
> > > > Alex  
> > >
> > > Below is copied from Dong's earlier post which said clear that
> > > a guest cmd submission will trigger the whole flow:
> > >
> > > ----
> > > Explanation:
> > > Q1-Q4: Qemu side process.
> > > K1-K6: Kernel side process.
> > >
> > > Q1. Intercept a ssch instruction.
> > > Q2. Translate the guest ccw program to a user space ccw program
> > >     (u_ccwchain).
> > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > >     K2. Translate the user space ccw program to a kernel space ccw
> > >         program, which becomes runnable for a real device.
> > >     K3. With the necessary information contained in the orb passed in
> > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > >         for the I/O result.
> > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > >         update the user space irb.
> > >     K6. Copy irb and scsw back to user space.
> > > Q4. Update the irb for the guest.
> > > ----  
> > 
> > Right, but this was the pre-mediated device approach, now we no longer
> > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > QEMU if those are operations that are not visible to the mediated
> > device; which they very well might be, since it's described as an
> > instruction rather than an i/o operation.  It's not terrible if that's
> > the case, vfio-pci has its own ioctl for doing a hot reset.  
> 
> 
> 
> >   
> > > My understanding is that such thing belongs to how device is mediated
> > > (so device driver specific), instead of something to be abstracted in
> > > VFIO which manages resource but doesn't care how resource is used.
> > >
> > > Actually we have same requirement in vGPU case, that a guest driver
> > > needs submit GPU commands through some MMIO register. vGPU device
> > > model will intercept the submission request (in its own way), do its
> > > necessary scan/audit to ensure correctness/security, and then submit
> > > to physical GPU through vendor specific interface.
> > >
> > > No difference with channel I/O here.  
> > 
> > Well, if the GPU command is submitted through an MMIO register, is that
> > MMIO register part of the mediated device?  If so, could the mediated
> > device recognize the command and do the scan/audit itself?  QEMU must
> > not be the point at which mediation occurs for security purposes, QEMU
> > is userspace and userspace is not to be trusted.  I'm still open to
> > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > already, but we need to evaluate each one, why it needs to exist, and
> > whether we can skip it if the mediated device can trigger the action on
> > its own.  After all, that's why we're using the vfio api, so we can
> > re-use much of the existing infrastructure, especially for a vGPU that
> > exposes itself as a PCI device.  Thanks,
> >   
> 
> My point is that a guest submission on vGPU is just a normal trapped 
> register write, which is forwarded from Qemu to VFIO through pwrite 
> interface and then hit mediated vGPU device. The mediated device
> will recognize this register write as a submission request and then do
> necessary scan (looks we are saying same thing) and then submit to
> physical device driver. If loading ccw cmds on channel i/o are also 
> through some I/O registers, it can be implemented same way w/o
> introducing new ioctl. The r/w handler of mediated device can figure
> out whether it's a ccw submission or not. But my understanding might 
> be wrong here.

I think we're in violent agreement ;)
Dong Jia Shi June 8, 2016, 3:18 a.m. UTC | #20
On Tue, 7 Jun 2016 19:39:21 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 8 Jun 2016 01:18:42 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Wednesday, June 08, 2016 6:42 AM
> > > 
> > > On Tue, 7 Jun 2016 03:03:32 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >   
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > > >
> > > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > >  
> > > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:  
> > > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > >
> > > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > > basically need to do the following thing:  
> > > > > >
> > > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > > first place?  
> > > > >
> > > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > > to start an operation when we probably should be passing through
> > > > > whatever i/o operations indicate that status directly to the mediated
> > > > > device. Thanks,
> > > > >
> > > > > Alex  
> > > >
> > > > Below is copied from Dong's earlier post which said clear that
> > > > a guest cmd submission will trigger the whole flow:
> > > >
> > > > ----
> > > > Explanation:
> > > > Q1-Q4: Qemu side process.
> > > > K1-K6: Kernel side process.
> > > >
> > > > Q1. Intercept a ssch instruction.
> > > > Q2. Translate the guest ccw program to a user space ccw program
> > > >     (u_ccwchain).
> > > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > > >     K2. Translate the user space ccw program to a kernel space ccw
> > > >         program, which becomes runnable for a real device.
> > > >     K3. With the necessary information contained in the orb passed in
> > > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > > >         for the I/O result.
> > > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > > >         update the user space irb.
> > > >     K6. Copy irb and scsw back to user space.
> > > > Q4. Update the irb for the guest.
> > > > ----  
> > > 
> > > Right, but this was the pre-mediated device approach, now we no longer
> > > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > > QEMU if those are operations that are not visible to the mediated
> > > device; which they very well might be, since it's described as an
> > > instruction rather than an i/o operation.  It's not terrible if that's
> > > the case, vfio-pci has its own ioctl for doing a hot reset.  
Dear Alex, Kevin and Neo,

'ssch' is a privileged I/O instruction, which should be finally issued
to the dedicated subchannel of the physical device.

BTW, I did remove step Q2 with all of the user-space translation code,
according to your comments in another thread.

> > 
> > 
> > >   
> > > > My understanding is that such thing belongs to how device is mediated
> > > > (so device driver specific), instead of something to be abstracted in
> > > > VFIO which manages resource but doesn't care how resource is used.
> > > >
> > > > Actually we have same requirement in vGPU case, that a guest driver
> > > > needs submit GPU commands through some MMIO register. vGPU device
> > > > model will intercept the submission request (in its own way), do its
> > > > necessary scan/audit to ensure correctness/security, and then submit
> > > > to physical GPU through vendor specific interface.
> > > >
> > > > No difference with channel I/O here.  
> > > 
> > > Well, if the GPU command is submitted through an MMIO register, is that
> > > MMIO register part of the mediated device?  If so, could the mediated
> > > device recognize the command and do the scan/audit itself?  QEMU must
> > > not be the point at which mediation occurs for security purposes, QEMU
> > > is userspace and userspace is not to be trusted.  I'm still open to
> > > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > > already, but we need to evaluate each one, why it needs to exist, and
> > > whether we can skip it if the mediated device can trigger the action on
> > > its own.  After all, that's why we're using the vfio api, so we can
> > > re-use much of the existing infrastructure, especially for a vGPU that
> > > exposes itself as a PCI device.  Thanks,
> > >   
> > 
> > My point is that a guest submission on vGPU is just a normal trapped 
> > register write, which is forwarded from Qemu to VFIO through pwrite 
> > interface and then hit mediated vGPU device. The mediated device
> > will recognize this register write as a submission request and then do
> > necessary scan (looks we are saying same thing) and then submit to
> > physical device driver. If loading ccw cmds on channel i/o are also 
> > through some I/O registers, it can be implemented same way w/o
> > introducing new ioctl.
We are different here. The target of an I/O instruction is the
subchannel. CCW devices don't have these kind of registers. The mediated
ccw device can not recognize such an submission by its own capbilities. 

A CCW device does not have such registers in both the physical and the
mediated devices to sense or recognize the submission request. It's the
CPU that recognizes the submission by intercepting the guest ssch
instruction.

CPU can not tell if it is issued from a passed thru device driver or a
virtio device driver from the guest. So it has to exit to QEMU, and let
QEMU take over.

Once QEMU identifies the target subchannel is serving a passed thru
device, it uses the ioctl to pass the instruction parameters into the
kernel all the way along the mediated driver to the physical driver to
the subchannel to perform the I/O operation.

> > The r/w handler of mediated device can figure
> > out whether it's a ccw submission or not. But my understanding might 
> > be wrong here.
We don't have registers to sense an instruction or operation.

> 
> I think we're in violent agreement ;)
> 

--------
Dong Jia
Neo Jia June 8, 2016, 3:48 a.m. UTC | #21
On Wed, Jun 08, 2016 at 11:18:42AM +0800, Dong Jia wrote:
> On Tue, 7 Jun 2016 19:39:21 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Wed, 8 Jun 2016 01:18:42 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > 
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Wednesday, June 08, 2016 6:42 AM
> > > > 
> > > > On Tue, 7 Jun 2016 03:03:32 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > >   
> > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > > > >
> > > > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > >  
> > > > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:  
> > > > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > > >
> > > > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > > > basically need to do the following thing:  
> > > > > > >
> > > > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > > > first place?  
> > > > > >
> > > > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > > > to start an operation when we probably should be passing through
> > > > > > whatever i/o operations indicate that status directly to the mediated
> > > > > > device. Thanks,
> > > > > >
> > > > > > Alex  
> > > > >
> > > > > Below is copied from Dong's earlier post which said clear that
> > > > > a guest cmd submission will trigger the whole flow:
> > > > >
> > > > > ----
> > > > > Explanation:
> > > > > Q1-Q4: Qemu side process.
> > > > > K1-K6: Kernel side process.
> > > > >
> > > > > Q1. Intercept a ssch instruction.
> > > > > Q2. Translate the guest ccw program to a user space ccw program
> > > > >     (u_ccwchain).
> > > > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > > > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > > > >     K2. Translate the user space ccw program to a kernel space ccw
> > > > >         program, which becomes runnable for a real device.
> > > > >     K3. With the necessary information contained in the orb passed in
> > > > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > > > >         for the I/O result.
> > > > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > > > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > > > >         update the user space irb.
> > > > >     K6. Copy irb and scsw back to user space.
> > > > > Q4. Update the irb for the guest.
> > > > > ----  
> > > > 
> > > > Right, but this was the pre-mediated device approach, now we no longer
> > > > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > > > QEMU if those are operations that are not visible to the mediated
> > > > device; which they very well might be, since it's described as an
> > > > instruction rather than an i/o operation.  It's not terrible if that's
> > > > the case, vfio-pci has its own ioctl for doing a hot reset.  
> Dear Alex, Kevin and Neo,
> 
> 'ssch' is a privileged I/O instruction, which should be finally issued
> to the dedicated subchannel of the physical device.
> 
> BTW, I did remove step Q2 with all of the user-space translation code,
> according to your comments in another thread.
> 
> > > 
> > > 
> > > >   
> > > > > My understanding is that such thing belongs to how device is mediated
> > > > > (so device driver specific), instead of something to be abstracted in
> > > > > VFIO which manages resource but doesn't care how resource is used.
> > > > >
> > > > > Actually we have same requirement in vGPU case, that a guest driver
> > > > > needs submit GPU commands through some MMIO register. vGPU device
> > > > > model will intercept the submission request (in its own way), do its
> > > > > necessary scan/audit to ensure correctness/security, and then submit
> > > > > to physical GPU through vendor specific interface.
> > > > >
> > > > > No difference with channel I/O here.  
> > > > 
> > > > Well, if the GPU command is submitted through an MMIO register, is that
> > > > MMIO register part of the mediated device?  If so, could the mediated
> > > > device recognize the command and do the scan/audit itself?  QEMU must
> > > > not be the point at which mediation occurs for security purposes, QEMU
> > > > is userspace and userspace is not to be trusted.  I'm still open to
> > > > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > > > already, but we need to evaluate each one, why it needs to exist, and
> > > > whether we can skip it if the mediated device can trigger the action on
> > > > its own.  After all, that's why we're using the vfio api, so we can
> > > > re-use much of the existing infrastructure, especially for a vGPU that
> > > > exposes itself as a PCI device.  Thanks,
> > > >   
> > > 
> > > My point is that a guest submission on vGPU is just a normal trapped 
> > > register write, which is forwarded from Qemu to VFIO through pwrite 
> > > interface and then hit mediated vGPU device. The mediated device
> > > will recognize this register write as a submission request and then do
> > > necessary scan (looks we are saying same thing) and then submit to
> > > physical device driver. If loading ccw cmds on channel i/o are also 
> > > through some I/O registers, it can be implemented same way w/o
> > > introducing new ioctl.
> We are different here. The target of an I/O instruction is the
> subchannel. CCW devices don't have these kind of registers. The mediated
> ccw device can not recognize such an submission by its own capbilities. 
> 
> A CCW device does not have such registers in both the physical and the
> mediated devices to sense or recognize the submission request. It's the
> CPU that recognizes the submission by intercepting the guest ssch
> instruction.
> 
> CPU can not tell if it is issued from a passed thru device driver or a
> virtio device driver from the guest. So it has to exit to QEMU, and let
> QEMU take over.

Hi Dong,

What actually has triggered the VM_EXIT to QEMU of that vCPU? Is it an MMIO
access of the "virtual device" inside guest?

Thanks,
Neo

> 
> Once QEMU identifies the target subchannel is serving a passed thru
> device, it uses the ioctl to pass the instruction parameters into the
> kernel all the way along the mediated driver to the physical driver to
> the subchannel to perform the I/O operation.
> 
> > > The r/w handler of mediated device can figure
> > > out whether it's a ccw submission or not. But my understanding might 
> > > be wrong here.
> We don't have registers to sense an instruction or operation.
> 
> > 
> > I think we're in violent agreement ;)
> > 
> 
> --------
> Dong Jia
>
Alex Williamson June 8, 2016, 4:29 a.m. UTC | #22
On Wed, 8 Jun 2016 11:18:42 +0800
Dong Jia <bjsdjshi@linux.vnet.ibm.com> wrote:

> On Tue, 7 Jun 2016 19:39:21 -0600
> Alex Williamson <alex.williamson@redhat.com> wrote:
> 
> > On Wed, 8 Jun 2016 01:18:42 +0000
> > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> >   
> > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > Sent: Wednesday, June 08, 2016 6:42 AM
> > > > 
> > > > On Tue, 7 Jun 2016 03:03:32 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > >     
> > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > > > >
> > > > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > >    
> > > > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:    
> > > > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > > >
> > > > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > > > basically need to do the following thing:    
> > > > > > >
> > > > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > > > first place?    
> > > > > >
> > > > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > > > to start an operation when we probably should be passing through
> > > > > > whatever i/o operations indicate that status directly to the mediated
> > > > > > device. Thanks,
> > > > > >
> > > > > > Alex    
> > > > >
> > > > > Below is copied from Dong's earlier post which said clear that
> > > > > a guest cmd submission will trigger the whole flow:
> > > > >
> > > > > ----
> > > > > Explanation:
> > > > > Q1-Q4: Qemu side process.
> > > > > K1-K6: Kernel side process.
> > > > >
> > > > > Q1. Intercept a ssch instruction.
> > > > > Q2. Translate the guest ccw program to a user space ccw program
> > > > >     (u_ccwchain).
> > > > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > > > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > > > >     K2. Translate the user space ccw program to a kernel space ccw
> > > > >         program, which becomes runnable for a real device.
> > > > >     K3. With the necessary information contained in the orb passed in
> > > > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > > > >         for the I/O result.
> > > > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > > > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > > > >         update the user space irb.
> > > > >     K6. Copy irb and scsw back to user space.
> > > > > Q4. Update the irb for the guest.
> > > > > ----    
> > > > 
> > > > Right, but this was the pre-mediated device approach, now we no longer
> > > > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > > > QEMU if those are operations that are not visible to the mediated
> > > > device; which they very well might be, since it's described as an
> > > > instruction rather than an i/o operation.  It's not terrible if that's
> > > > the case, vfio-pci has its own ioctl for doing a hot reset.    
> Dear Alex, Kevin and Neo,
> 
> 'ssch' is a privileged I/O instruction, which should be finally issued
> to the dedicated subchannel of the physical device.
> 
> BTW, I did remove step Q2 with all of the user-space translation code,
> according to your comments in another thread.
> 
> > > 
> > >   
> > > >     
> > > > > My understanding is that such thing belongs to how device is mediated
> > > > > (so device driver specific), instead of something to be abstracted in
> > > > > VFIO which manages resource but doesn't care how resource is used.
> > > > >
> > > > > Actually we have same requirement in vGPU case, that a guest driver
> > > > > needs submit GPU commands through some MMIO register. vGPU device
> > > > > model will intercept the submission request (in its own way), do its
> > > > > necessary scan/audit to ensure correctness/security, and then submit
> > > > > to physical GPU through vendor specific interface.
> > > > >
> > > > > No difference with channel I/O here.    
> > > > 
> > > > Well, if the GPU command is submitted through an MMIO register, is that
> > > > MMIO register part of the mediated device?  If so, could the mediated
> > > > device recognize the command and do the scan/audit itself?  QEMU must
> > > > not be the point at which mediation occurs for security purposes, QEMU
> > > > is userspace and userspace is not to be trusted.  I'm still open to
> > > > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > > > already, but we need to evaluate each one, why it needs to exist, and
> > > > whether we can skip it if the mediated device can trigger the action on
> > > > its own.  After all, that's why we're using the vfio api, so we can
> > > > re-use much of the existing infrastructure, especially for a vGPU that
> > > > exposes itself as a PCI device.  Thanks,
> > > >     
> > > 
> > > My point is that a guest submission on vGPU is just a normal trapped 
> > > register write, which is forwarded from Qemu to VFIO through pwrite 
> > > interface and then hit mediated vGPU device. The mediated device
> > > will recognize this register write as a submission request and then do
> > > necessary scan (looks we are saying same thing) and then submit to
> > > physical device driver. If loading ccw cmds on channel i/o are also 
> > > through some I/O registers, it can be implemented same way w/o
> > > introducing new ioctl.  
> We are different here. The target of an I/O instruction is the
> subchannel. CCW devices don't have these kind of registers. The mediated
> ccw device can not recognize such an submission by its own capbilities. 
> 
> A CCW device does not have such registers in both the physical and the
> mediated devices to sense or recognize the submission request. It's the
> CPU that recognizes the submission by intercepting the guest ssch
> instruction.
> 
> CPU can not tell if it is issued from a passed thru device driver or a
> virtio device driver from the guest. So it has to exit to QEMU, and let
> QEMU take over.
> 
> Once QEMU identifies the target subchannel is serving a passed thru
> device, it uses the ioctl to pass the instruction parameters into the
> kernel all the way along the mediated driver to the physical driver to
> the subchannel to perform the I/O operation.
> 
> > > The r/w handler of mediated device can figure
> > > out whether it's a ccw submission or not. But my understanding might 
> > > be wrong here.  
> We don't have registers to sense an instruction or operation.

Ok, so it seems we need to create some sort of interface to initiate
the ccw program, but I suppose I'm not yet convinced that it needs a
new ioctl.  For instance if you only need to "kick" the device to tell
it when to begin translation and execution, we could create a virtual
interrupt into the mediated device with an irqfd.  QEMU writes to the
irqfd (eventfd), the mediated driver receives this kick and begins
processing.  Another virtual interrupt out to the user might indicate
completion. On the other hand if the ioctl was intended to write the
ccw program itself to the device, we have vfio device regions that can
do this.  Simply define within the vfio-ccw API that one of the regions
is a virtual program buffer and define the API between the mediated
driver and user the sequence of writes that load the program state,
initiate the program, and return the result.

The vfio API already has a very extensible mechanism for communicating
with a device through regions and interrupts, not all of which
necessarily need to match physical attributes of the device.  ioctls
can be added, but lets exhaust the mechanisms we already have through
the vfio api first.  Thanks,

Alex
Dong Jia Shi June 8, 2016, 6:13 a.m. UTC | #23
On Tue, 7 Jun 2016 20:48:42 -0700
Neo Jia <cjia@nvidia.com> wrote:

> On Wed, Jun 08, 2016 at 11:18:42AM +0800, Dong Jia wrote:
> > On Tue, 7 Jun 2016 19:39:21 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > > On Wed, 8 Jun 2016 01:18:42 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > 
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Wednesday, June 08, 2016 6:42 AM
> > > > > 
> > > > > On Tue, 7 Jun 2016 03:03:32 +0000
> > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > >   
> > > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > > > > >
> > > > > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > >  
> > > > > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:  
> > > > > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > > > >
> > > > > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > > > > basically need to do the following thing:  
> > > > > > > >
> > > > > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > > > > first place?  
> > > > > > >
> > > > > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > > > > to start an operation when we probably should be passing through
> > > > > > > whatever i/o operations indicate that status directly to the mediated
> > > > > > > device. Thanks,
> > > > > > >
> > > > > > > Alex  
> > > > > >
> > > > > > Below is copied from Dong's earlier post which said clear that
> > > > > > a guest cmd submission will trigger the whole flow:
> > > > > >
> > > > > > ----
> > > > > > Explanation:
> > > > > > Q1-Q4: Qemu side process.
> > > > > > K1-K6: Kernel side process.
> > > > > >
> > > > > > Q1. Intercept a ssch instruction.
> > > > > > Q2. Translate the guest ccw program to a user space ccw program
> > > > > >     (u_ccwchain).
> > > > > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > > > > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > > > > >     K2. Translate the user space ccw program to a kernel space ccw
> > > > > >         program, which becomes runnable for a real device.
> > > > > >     K3. With the necessary information contained in the orb passed in
> > > > > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > > > > >         for the I/O result.
> > > > > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > > > > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > > > > >         update the user space irb.
> > > > > >     K6. Copy irb and scsw back to user space.
> > > > > > Q4. Update the irb for the guest.
> > > > > > ----  
> > > > > 
> > > > > Right, but this was the pre-mediated device approach, now we no longer
> > > > > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > > > > QEMU if those are operations that are not visible to the mediated
> > > > > device; which they very well might be, since it's described as an
> > > > > instruction rather than an i/o operation.  It's not terrible if that's
> > > > > the case, vfio-pci has its own ioctl for doing a hot reset.  
> > Dear Alex, Kevin and Neo,
> > 
> > 'ssch' is a privileged I/O instruction, which should be finally issued
> > to the dedicated subchannel of the physical device.
> > 
> > BTW, I did remove step Q2 with all of the user-space translation code,
> > according to your comments in another thread.
> > 
> > > > 
> > > > 
> > > > >   
> > > > > > My understanding is that such thing belongs to how device is mediated
> > > > > > (so device driver specific), instead of something to be abstracted in
> > > > > > VFIO which manages resource but doesn't care how resource is used.
> > > > > >
> > > > > > Actually we have same requirement in vGPU case, that a guest driver
> > > > > > needs submit GPU commands through some MMIO register. vGPU device
> > > > > > model will intercept the submission request (in its own way), do its
> > > > > > necessary scan/audit to ensure correctness/security, and then submit
> > > > > > to physical GPU through vendor specific interface.
> > > > > >
> > > > > > No difference with channel I/O here.  
> > > > > 
> > > > > Well, if the GPU command is submitted through an MMIO register, is that
> > > > > MMIO register part of the mediated device?  If so, could the mediated
> > > > > device recognize the command and do the scan/audit itself?  QEMU must
> > > > > not be the point at which mediation occurs for security purposes, QEMU
> > > > > is userspace and userspace is not to be trusted.  I'm still open to
> > > > > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > > > > already, but we need to evaluate each one, why it needs to exist, and
> > > > > whether we can skip it if the mediated device can trigger the action on
> > > > > its own.  After all, that's why we're using the vfio api, so we can
> > > > > re-use much of the existing infrastructure, especially for a vGPU that
> > > > > exposes itself as a PCI device.  Thanks,
> > > > >   
> > > > 
> > > > My point is that a guest submission on vGPU is just a normal trapped 
> > > > register write, which is forwarded from Qemu to VFIO through pwrite 
> > > > interface and then hit mediated vGPU device. The mediated device
> > > > will recognize this register write as a submission request and then do
> > > > necessary scan (looks we are saying same thing) and then submit to
> > > > physical device driver. If loading ccw cmds on channel i/o are also 
> > > > through some I/O registers, it can be implemented same way w/o
> > > > introducing new ioctl.
> > We are different here. The target of an I/O instruction is the
> > subchannel. CCW devices don't have these kind of registers. The mediated
> > ccw device can not recognize such an submission by its own capbilities. 
> > 
> > A CCW device does not have such registers in both the physical and the
> > mediated devices to sense or recognize the submission request. It's the
> > CPU that recognizes the submission by intercepting the guest ssch
> > instruction.
> > 
> > CPU can not tell if it is issued from a passed thru device driver or a
> > virtio device driver from the guest. So it has to exit to QEMU, and let
> > QEMU take over.
> 
> Hi Dong,
> 
> What actually has triggered the VM_EXIT to QEMU of that vCPU? Is it an MMIO
> access of the "virtual device" inside guest?
Dear Neo,

It's not a MMIO access, but an I/O instruction.

Our cpu has a mode (like vt-x in the x86 world? I guess...) to oversee
the execution of programs in a virtual machine environment. Once the cpu
enters this mode, it commence execution of the guest program. It could
handle many aspects of an virtual machine, or, when for some
instructions if such handling is not provided, cpu will exit from this
mode. The I/O instruction 'ssch' is one kind of the instructions that
this cpu mode could not handle. So a ssch issued from the guest will
trigger the exit of this cpu mode with the exit_reason, and then the
vcpu gets the reason and exit to QEMU.

> 
> Thanks,
> Neo
> 
> > 
> > Once QEMU identifies the target subchannel is serving a passed thru
> > device, it uses the ioctl to pass the instruction parameters into the
> > kernel all the way along the mediated driver to the physical driver to
> > the subchannel to perform the I/O operation.
> > 
> > > > The r/w handler of mediated device can figure
> > > > out whether it's a ccw submission or not. But my understanding might 
> > > > be wrong here.
> > We don't have registers to sense an instruction or operation.
> > 
> > > 
> > > I think we're in violent agreement ;)
> > > 
> > 
> > --------
> > Dong Jia
> > 
> 



--------
Dong Jia
Neo Jia June 8, 2016, 6:22 a.m. UTC | #24
On Wed, Jun 08, 2016 at 02:13:49PM +0800, Dong Jia wrote:
> On Tue, 7 Jun 2016 20:48:42 -0700
> Neo Jia <cjia@nvidia.com> wrote:
> 
> > On Wed, Jun 08, 2016 at 11:18:42AM +0800, Dong Jia wrote:
> > > On Tue, 7 Jun 2016 19:39:21 -0600
> > > Alex Williamson <alex.williamson@redhat.com> wrote:
> > > 
> > > > On Wed, 8 Jun 2016 01:18:42 +0000
> > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > 
> > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > Sent: Wednesday, June 08, 2016 6:42 AM
> > > > > > 
> > > > > > On Tue, 7 Jun 2016 03:03:32 +0000
> > > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > > >   
> > > > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > > > > > >
> > > > > > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > > >  
> > > > > > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:  
> > > > > > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > > > > >
> > > > > > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > > > > > basically need to do the following thing:  
> > > > > > > > >
> > > > > > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > > > > > first place?  
> > > > > > > >
> > > > > > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > > > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > > > > > to start an operation when we probably should be passing through
> > > > > > > > whatever i/o operations indicate that status directly to the mediated
> > > > > > > > device. Thanks,
> > > > > > > >
> > > > > > > > Alex  
> > > > > > >
> > > > > > > Below is copied from Dong's earlier post which said clear that
> > > > > > > a guest cmd submission will trigger the whole flow:
> > > > > > >
> > > > > > > ----
> > > > > > > Explanation:
> > > > > > > Q1-Q4: Qemu side process.
> > > > > > > K1-K6: Kernel side process.
> > > > > > >
> > > > > > > Q1. Intercept a ssch instruction.
> > > > > > > Q2. Translate the guest ccw program to a user space ccw program
> > > > > > >     (u_ccwchain).
> > > > > > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > > > > > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > > > > > >     K2. Translate the user space ccw program to a kernel space ccw
> > > > > > >         program, which becomes runnable for a real device.
> > > > > > >     K3. With the necessary information contained in the orb passed in
> > > > > > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > > > > > >         for the I/O result.
> > > > > > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > > > > > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > > > > > >         update the user space irb.
> > > > > > >     K6. Copy irb and scsw back to user space.
> > > > > > > Q4. Update the irb for the guest.
> > > > > > > ----  
> > > > > > 
> > > > > > Right, but this was the pre-mediated device approach, now we no longer
> > > > > > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > > > > > QEMU if those are operations that are not visible to the mediated
> > > > > > device; which they very well might be, since it's described as an
> > > > > > instruction rather than an i/o operation.  It's not terrible if that's
> > > > > > the case, vfio-pci has its own ioctl for doing a hot reset.  
> > > Dear Alex, Kevin and Neo,
> > > 
> > > 'ssch' is a privileged I/O instruction, which should be finally issued
> > > to the dedicated subchannel of the physical device.
> > > 
> > > BTW, I did remove step Q2 with all of the user-space translation code,
> > > according to your comments in another thread.
> > > 
> > > > > 
> > > > > 
> > > > > >   
> > > > > > > My understanding is that such thing belongs to how device is mediated
> > > > > > > (so device driver specific), instead of something to be abstracted in
> > > > > > > VFIO which manages resource but doesn't care how resource is used.
> > > > > > >
> > > > > > > Actually we have same requirement in vGPU case, that a guest driver
> > > > > > > needs submit GPU commands through some MMIO register. vGPU device
> > > > > > > model will intercept the submission request (in its own way), do its
> > > > > > > necessary scan/audit to ensure correctness/security, and then submit
> > > > > > > to physical GPU through vendor specific interface.
> > > > > > >
> > > > > > > No difference with channel I/O here.  
> > > > > > 
> > > > > > Well, if the GPU command is submitted through an MMIO register, is that
> > > > > > MMIO register part of the mediated device?  If so, could the mediated
> > > > > > device recognize the command and do the scan/audit itself?  QEMU must
> > > > > > not be the point at which mediation occurs for security purposes, QEMU
> > > > > > is userspace and userspace is not to be trusted.  I'm still open to
> > > > > > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > > > > > already, but we need to evaluate each one, why it needs to exist, and
> > > > > > whether we can skip it if the mediated device can trigger the action on
> > > > > > its own.  After all, that's why we're using the vfio api, so we can
> > > > > > re-use much of the existing infrastructure, especially for a vGPU that
> > > > > > exposes itself as a PCI device.  Thanks,
> > > > > >   
> > > > > 
> > > > > My point is that a guest submission on vGPU is just a normal trapped 
> > > > > register write, which is forwarded from Qemu to VFIO through pwrite 
> > > > > interface and then hit mediated vGPU device. The mediated device
> > > > > will recognize this register write as a submission request and then do
> > > > > necessary scan (looks we are saying same thing) and then submit to
> > > > > physical device driver. If loading ccw cmds on channel i/o are also 
> > > > > through some I/O registers, it can be implemented same way w/o
> > > > > introducing new ioctl.
> > > We are different here. The target of an I/O instruction is the
> > > subchannel. CCW devices don't have these kind of registers. The mediated
> > > ccw device can not recognize such an submission by its own capbilities. 
> > > 
> > > A CCW device does not have such registers in both the physical and the
> > > mediated devices to sense or recognize the submission request. It's the
> > > CPU that recognizes the submission by intercepting the guest ssch
> > > instruction.
> > > 
> > > CPU can not tell if it is issued from a passed thru device driver or a
> > > virtio device driver from the guest. So it has to exit to QEMU, and let
> > > QEMU take over.
> > 
> > Hi Dong,
> > 
> > What actually has triggered the VM_EXIT to QEMU of that vCPU? Is it an MMIO
> > access of the "virtual device" inside guest?
> Dear Neo,
> 
> It's not a MMIO access, but an I/O instruction.
> 
> Our cpu has a mode (like vt-x in the x86 world? I guess...) to oversee
> the execution of programs in a virtual machine environment. Once the cpu
> enters this mode, it commence execution of the guest program. It could
> handle many aspects of an virtual machine, or, when for some
> instructions if such handling is not provided, cpu will exit from this
> mode. The I/O instruction 'ssch' is one kind of the instructions that
> this cpu mode could not handle. So a ssch issued from the guest will
> trigger the exit of this cpu mode with the exit_reason, and then the
> vcpu gets the reason and exit to QEMU.

Hi Dong,

Thanks for the details. 

Can you claim a MMIO region for your virtual device? If yes, then the I/O
instruction triggered VM_EXIT can be forward to your device by a pwrite from
QEMU thru this new region.

Thanks,
Neo

> 
> > 
> > Thanks,
> > Neo
> > 
> > > 
> > > Once QEMU identifies the target subchannel is serving a passed thru
> > > device, it uses the ioctl to pass the instruction parameters into the
> > > kernel all the way along the mediated driver to the physical driver to
> > > the subchannel to perform the I/O operation.
> > > 
> > > > > The r/w handler of mediated device can figure
> > > > > out whether it's a ccw submission or not. But my understanding might 
> > > > > be wrong here.
> > > We don't have registers to sense an instruction or operation.
> > > 
> > > > 
> > > > I think we're in violent agreement ;)
> > > > 
> > > 
> > > --------
> > > Dong Jia
> > > 
> > 
> 
> 
> 
> --------
> Dong Jia
>
Dong Jia Shi June 15, 2016, 6:37 a.m. UTC | #25
On Tue, 7 Jun 2016 22:29:30 -0600
Alex Williamson <alex.williamson@redhat.com> wrote:

> On Wed, 8 Jun 2016 11:18:42 +0800
> Dong Jia <bjsdjshi@linux.vnet.ibm.com> wrote:
> 
> > On Tue, 7 Jun 2016 19:39:21 -0600
> > Alex Williamson <alex.williamson@redhat.com> wrote:
> > 
> > > On Wed, 8 Jun 2016 01:18:42 +0000
> > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > >   
> > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > Sent: Wednesday, June 08, 2016 6:42 AM
> > > > > 
> > > > > On Tue, 7 Jun 2016 03:03:32 +0000
> > > > > "Tian, Kevin" <kevin.tian@intel.com> wrote:
> > > > >     
> > > > > > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > > > > > Sent: Tuesday, June 07, 2016 3:31 AM
> > > > > > >
> > > > > > > On Mon, 6 Jun 2016 10:44:25 -0700
> > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > >    
> > > > > > > > On Mon, Jun 06, 2016 at 04:29:11PM +0800, Dong Jia wrote:    
> > > > > > > > > On Sun, 5 Jun 2016 23:27:42 -0700
> > > > > > > > > Neo Jia <cjia@nvidia.com> wrote:
> > > > > > > > >
> > > > > > > > > 2. VFIO_DEVICE_CCW_CMD_REQUEST
> > > > > > > > > This intends to handle an intercepted channel I/O instruction. It
> > > > > > > > > basically need to do the following thing:    
> > > > > > > >
> > > > > > > > May I ask how and when QEMU knows that he needs to issue such VFIO ioctl at
> > > > > > > > first place?    
> > > > > > >
> > > > > > > Yep, this is my question as well.  It sounds a bit like there's an
> > > > > > > emulated device in QEMU that's trying to tell the mediated device when
> > > > > > > to start an operation when we probably should be passing through
> > > > > > > whatever i/o operations indicate that status directly to the mediated
> > > > > > > device. Thanks,
> > > > > > >
> > > > > > > Alex    
> > > > > >
> > > > > > Below is copied from Dong's earlier post which said clear that
> > > > > > a guest cmd submission will trigger the whole flow:
> > > > > >
> > > > > > ----
> > > > > > Explanation:
> > > > > > Q1-Q4: Qemu side process.
> > > > > > K1-K6: Kernel side process.
> > > > > >
> > > > > > Q1. Intercept a ssch instruction.
> > > > > > Q2. Translate the guest ccw program to a user space ccw program
> > > > > >     (u_ccwchain).
> > > > > > Q3. Call VFIO_DEVICE_CCW_CMD_REQUEST (u_ccwchain, orb, irb).
> > > > > >     K1. Copy from u_ccwchain to kernel (k_ccwchain).
> > > > > >     K2. Translate the user space ccw program to a kernel space ccw
> > > > > >         program, which becomes runnable for a real device.
> > > > > >     K3. With the necessary information contained in the orb passed in
> > > > > >         by Qemu, issue the k_ccwchain to the device, and wait event q
> > > > > >         for the I/O result.
> > > > > >     K4. Interrupt handler gets the I/O result, and wakes up the wait q.
> > > > > >     K5. CMD_REQUEST ioctl gets the I/O result, and uses the result to
> > > > > >         update the user space irb.
> > > > > >     K6. Copy irb and scsw back to user space.
> > > > > > Q4. Update the irb for the guest.
> > > > > > ----    
> > > > > 
> > > > > Right, but this was the pre-mediated device approach, now we no longer
> > > > > need step Q2 so we really only need Q1 and therefore Q3 to exist in
> > > > > QEMU if those are operations that are not visible to the mediated
> > > > > device; which they very well might be, since it's described as an
> > > > > instruction rather than an i/o operation.  It's not terrible if that's
> > > > > the case, vfio-pci has its own ioctl for doing a hot reset.    
> > Dear Alex, Kevin and Neo,
> > 
> > 'ssch' is a privileged I/O instruction, which should be finally issued
> > to the dedicated subchannel of the physical device.
> > 
> > BTW, I did remove step Q2 with all of the user-space translation code,
> > according to your comments in another thread.
> > 
> > > > 
> > > >   
> > > > >     
> > > > > > My understanding is that such thing belongs to how device is mediated
> > > > > > (so device driver specific), instead of something to be abstracted in
> > > > > > VFIO which manages resource but doesn't care how resource is used.
> > > > > >
> > > > > > Actually we have same requirement in vGPU case, that a guest driver
> > > > > > needs submit GPU commands through some MMIO register. vGPU device
> > > > > > model will intercept the submission request (in its own way), do its
> > > > > > necessary scan/audit to ensure correctness/security, and then submit
> > > > > > to physical GPU through vendor specific interface.
> > > > > >
> > > > > > No difference with channel I/O here.    
> > > > > 
> > > > > Well, if the GPU command is submitted through an MMIO register, is that
> > > > > MMIO register part of the mediated device?  If so, could the mediated
> > > > > device recognize the command and do the scan/audit itself?  QEMU must
> > > > > not be the point at which mediation occurs for security purposes, QEMU
> > > > > is userspace and userspace is not to be trusted.  I'm still open to
> > > > > ioctls where it makes sense, as above, we have PCI specific ioctls and
> > > > > already, but we need to evaluate each one, why it needs to exist, and
> > > > > whether we can skip it if the mediated device can trigger the action on
> > > > > its own.  After all, that's why we're using the vfio api, so we can
> > > > > re-use much of the existing infrastructure, especially for a vGPU that
> > > > > exposes itself as a PCI device.  Thanks,
> > > > >     
> > > > 
> > > > My point is that a guest submission on vGPU is just a normal trapped 
> > > > register write, which is forwarded from Qemu to VFIO through pwrite 
> > > > interface and then hit mediated vGPU device. The mediated device
> > > > will recognize this register write as a submission request and then do
> > > > necessary scan (looks we are saying same thing) and then submit to
> > > > physical device driver. If loading ccw cmds on channel i/o are also 
> > > > through some I/O registers, it can be implemented same way w/o
> > > > introducing new ioctl.  
> > We are different here. The target of an I/O instruction is the
> > subchannel. CCW devices don't have these kind of registers. The mediated
> > ccw device can not recognize such an submission by its own capbilities. 
> > 
> > A CCW device does not have such registers in both the physical and the
> > mediated devices to sense or recognize the submission request. It's the
> > CPU that recognizes the submission by intercepting the guest ssch
> > instruction.
> > 
> > CPU can not tell if it is issued from a passed thru device driver or a
> > virtio device driver from the guest. So it has to exit to QEMU, and let
> > QEMU take over.
> > 
> > Once QEMU identifies the target subchannel is serving a passed thru
> > device, it uses the ioctl to pass the instruction parameters into the
> > kernel all the way along the mediated driver to the physical driver to
> > the subchannel to perform the I/O operation.
> > 
> > > > The r/w handler of mediated device can figure
> > > > out whether it's a ccw submission or not. But my understanding might 
> > > > be wrong here.  
> > We don't have registers to sense an instruction or operation.
> 
> Ok, so it seems we need to create some sort of interface to initiate
> the ccw program, but I suppose I'm not yet convinced that it needs a
> new ioctl.  For instance if you only need to "kick" the device to tell
> it when to begin translation and execution, we could create a virtual
> interrupt into the mediated device with an irqfd.  QEMU writes to the
> irqfd (eventfd), the mediated driver receives this kick and begins
> processing.  Another virtual interrupt out to the user might indicate
> completion. On the other hand if the ioctl was intended to write the
> ccw program itself to the device, we have vfio device regions that can
> do this.  Simply define within the vfio-ccw API that one of the regions
> is a virtual program buffer and define the API between the mediated
> driver and user the sequence of writes that load the program state,
> initiate the program, and return the result.
> 
> The vfio API already has a very extensible mechanism for communicating
> with a device through regions and interrupts, not all of which
> necessarily need to match physical attributes of the device.  ioctls
> can be added, but lets exhaust the mechanisms we already have through
> the vfio api first.  Thanks,
Dear Alex and Neo,

I tried as what you suggested - add an MMIO region to the device, and
it works fine. It's an interesting and elegant way. I like it. :>

So indeed, we neither need to introduce a new ioctl command, nor the
ioctl callback on phy_device_ops.

Thanks!

> 
> Alex
> 



--------
Dong Jia
diff mbox

Patch

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index da6e2ce77495..23eced02aaf6 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -48,4 +48,5 @@  menuconfig VFIO_NOIOMMU
 
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
+source "drivers/vfio/mdev/Kconfig"
 source "virt/lib/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 7b8a31f63fea..7c70753e54ab 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -7,3 +7,4 @@  obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
 obj-$(CONFIG_VFIO_PLATFORM) += platform/
+obj-$(CONFIG_MDEV) += mdev/
diff --git a/drivers/vfio/mdev/Kconfig b/drivers/vfio/mdev/Kconfig
new file mode 100644
index 000000000000..951e2bb06a3f
--- /dev/null
+++ b/drivers/vfio/mdev/Kconfig
@@ -0,0 +1,11 @@ 
+
+config MDEV
+    tristate "Mediated device driver framework"
+    depends on VFIO
+    default n
+    help
+        MDEV provides a framework to virtualize device without SR-IOV cap
+        See Documentation/mdev.txt for more details.
+
+        If you don't know what do here, say N.
+
diff --git a/drivers/vfio/mdev/Makefile b/drivers/vfio/mdev/Makefile
new file mode 100644
index 000000000000..4adb069febce
--- /dev/null
+++ b/drivers/vfio/mdev/Makefile
@@ -0,0 +1,5 @@ 
+
+mdev-y := mdev-core.o mdev-sysfs.o mdev-driver.o
+
+obj-$(CONFIG_MDEV) += mdev.o
+
diff --git a/drivers/vfio/mdev/mdev-core.c b/drivers/vfio/mdev/mdev-core.c
new file mode 100644
index 000000000000..af070d73735f
--- /dev/null
+++ b/drivers/vfio/mdev/mdev-core.c
@@ -0,0 +1,462 @@ 
+/*
+ * Mediated device Core Driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <linux/sched.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/sysfs.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+#define DRIVER_VERSION		"0.1"
+#define DRIVER_AUTHOR		"NVIDIA Corporation"
+#define DRIVER_DESC		"Mediated device Core Driver"
+
+#define MDEV_CLASS_NAME		"mdev"
+
+/*
+ * Global Structures
+ */
+
+static struct devices_list {
+	struct list_head    dev_list;
+	struct mutex        list_lock;
+} mdevices, phy_devices;
+
+/*
+ * Functions
+ */
+
+static int mdev_add_attribute_group(struct device *dev,
+				    const struct attribute_group **groups)
+{
+	return sysfs_create_groups(&dev->kobj, groups);
+}
+
+static void mdev_remove_attribute_group(struct device *dev,
+					const struct attribute_group **groups)
+{
+	sysfs_remove_groups(&dev->kobj, groups);
+}
+
+static struct mdev_device *find_mdev_device(uuid_le uuid, int instance)
+{
+	struct mdev_device *vdev = NULL, *v;
+
+	mutex_lock(&mdevices.list_lock);
+	list_for_each_entry(v, &mdevices.dev_list, next) {
+		if ((uuid_le_cmp(v->uuid, uuid) == 0) &&
+		    (v->instance == instance)) {
+			vdev = v;
+			break;
+		}
+	}
+	mutex_unlock(&mdevices.list_lock);
+	return vdev;
+}
+
+static struct mdev_device *find_next_mdev_device(struct phy_device *phy_dev)
+{
+	struct mdev_device *mdev = NULL, *p;
+
+	mutex_lock(&mdevices.list_lock);
+	list_for_each_entry(p, &mdevices.dev_list, next) {
+		if (p->phy_dev == phy_dev) {
+			mdev = p;
+			break;
+		}
+	}
+	mutex_unlock(&mdevices.list_lock);
+	return mdev;
+}
+
+static struct phy_device *find_physical_device(struct device *dev)
+{
+	struct phy_device *pdev = NULL, *p;
+
+	mutex_lock(&phy_devices.list_lock);
+	list_for_each_entry(p, &phy_devices.dev_list, next) {
+		if (p->dev == dev) {
+			pdev = p;
+			break;
+		}
+	}
+	mutex_unlock(&phy_devices.list_lock);
+	return pdev;
+}
+
+static void mdev_destroy_device(struct mdev_device *mdevice)
+{
+	struct phy_device *phy_dev = mdevice->phy_dev;
+
+	if (phy_dev) {
+		mutex_lock(&phy_devices.list_lock);
+
+		/*
+		* If vendor driver doesn't return success that means vendor
+		* driver doesn't support hot-unplug
+		*/
+		if (phy_dev->ops->destroy) {
+			if (phy_dev->ops->destroy(phy_dev->dev, mdevice->uuid,
+						  mdevice->instance)) {
+				mutex_unlock(&phy_devices.list_lock);
+				return;
+			}
+		}
+
+		mdev_remove_attribute_group(&mdevice->dev,
+					    phy_dev->ops->mdev_attr_groups);
+		mdevice->phy_dev = NULL;
+		mutex_unlock(&phy_devices.list_lock);
+	}
+
+	mdev_put_device(mdevice);
+	device_unregister(&mdevice->dev);
+}
+
+/*
+ * Find mediated device from given iommu_group and increment refcount of
+ * mediated device. Caller should call mdev_put_device() when the use of
+ * mdev_device is done.
+ */
+struct mdev_device *mdev_get_device_by_group(struct iommu_group *group)
+{
+	struct mdev_device *mdev = NULL, *p;
+
+	mutex_lock(&mdevices.list_lock);
+	list_for_each_entry(p, &mdevices.dev_list, next) {
+		if (!p->group)
+			continue;
+
+		if (iommu_group_id(p->group) == iommu_group_id(group)) {
+			mdev = mdev_get_device(p);
+			break;
+		}
+	}
+	mutex_unlock(&mdevices.list_lock);
+	return mdev;
+}
+EXPORT_SYMBOL_GPL(mdev_get_device_by_group);
+
+/*
+ * mdev_register_device : Register a device
+ * @dev: device structure representing physical device.
+ * @phy_device_ops: Physical device operation structure to be registered.
+ *
+ * Add device to list of registered physical devices.
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_device(struct device *dev, const struct phy_device_ops *ops)
+{
+	int ret = 0;
+	struct phy_device *phy_dev, *pdev;
+
+	if (!dev || !ops)
+		return -EINVAL;
+
+	/* Check for duplicate */
+	pdev = find_physical_device(dev);
+	if (pdev)
+		return -EEXIST;
+
+	phy_dev = kzalloc(sizeof(*phy_dev), GFP_KERNEL);
+	if (!phy_dev)
+		return -ENOMEM;
+
+	phy_dev->dev = dev;
+	phy_dev->ops = ops;
+
+	mutex_lock(&phy_devices.list_lock);
+	ret = mdev_create_sysfs_files(dev);
+	if (ret)
+		goto add_sysfs_error;
+
+	ret = mdev_add_attribute_group(dev, ops->dev_attr_groups);
+	if (ret)
+		goto add_group_error;
+
+	list_add(&phy_dev->next, &phy_devices.dev_list);
+	dev_info(dev, "MDEV: Registered\n");
+	mutex_unlock(&phy_devices.list_lock);
+
+	return 0;
+
+add_group_error:
+	mdev_remove_sysfs_files(dev);
+add_sysfs_error:
+	mutex_unlock(&phy_devices.list_lock);
+	kfree(phy_dev);
+	return ret;
+}
+EXPORT_SYMBOL(mdev_register_device);
+
+/*
+ * mdev_unregister_device : Unregister a physical device
+ * @dev: device structure representing physical device.
+ *
+ * Remove device from list of registered physical devices. Gives a change to
+ * free existing mediated devices for the given physical device.
+ */
+
+void mdev_unregister_device(struct device *dev)
+{
+	struct phy_device *phy_dev;
+	struct mdev_device *vdev = NULL;
+
+	phy_dev = find_physical_device(dev);
+
+	if (!phy_dev)
+		return;
+
+	dev_info(dev, "MDEV: Unregistering\n");
+
+	while ((vdev = find_next_mdev_device(phy_dev)))
+		mdev_destroy_device(vdev);
+
+	mutex_lock(&phy_devices.list_lock);
+	list_del(&phy_dev->next);
+	mutex_unlock(&phy_devices.list_lock);
+
+	mdev_remove_attribute_group(dev,
+				    phy_dev->ops->dev_attr_groups);
+
+	mdev_remove_sysfs_files(dev);
+	kfree(phy_dev);
+}
+EXPORT_SYMBOL(mdev_unregister_device);
+
+/*
+ * Functions required for mdev-sysfs
+ */
+
+static struct mdev_device *mdev_device_alloc(uuid_le uuid, int instance)
+{
+	struct mdev_device *mdevice = NULL;
+
+	mdevice = kzalloc(sizeof(*mdevice), GFP_KERNEL);
+	if (!mdevice)
+		return ERR_PTR(-ENOMEM);
+
+	kref_init(&mdevice->kref);
+	memcpy(&mdevice->uuid, &uuid, sizeof(uuid_le));
+	mdevice->instance = instance;
+	mutex_init(&mdevice->ops_lock);
+
+	return mdevice;
+}
+
+static void mdev_device_release(struct device *dev)
+{
+	struct mdev_device *mdevice = to_mdev_device(dev);
+
+	if (!mdevice)
+		return;
+
+	dev_info(&mdevice->dev, "MDEV: destroying\n");
+
+	mutex_lock(&mdevices.list_lock);
+	list_del(&mdevice->next);
+	mutex_unlock(&mdevices.list_lock);
+
+	kfree(mdevice);
+}
+
+int create_mdev_device(struct device *dev, uuid_le uuid, uint32_t instance,
+		       char *mdev_params)
+{
+	int retval = 0;
+	struct mdev_device *mdevice = NULL;
+	struct phy_device *phy_dev;
+
+	phy_dev = find_physical_device(dev);
+	if (!phy_dev)
+		return -EINVAL;
+
+	mdevice = mdev_device_alloc(uuid, instance);
+	if (IS_ERR(mdevice)) {
+		retval = PTR_ERR(mdevice);
+		return retval;
+	}
+
+	mdevice->dev.parent  = dev;
+	mdevice->dev.bus     = &mdev_bus_type;
+	mdevice->dev.release = mdev_device_release;
+	dev_set_name(&mdevice->dev, "%pUb-%d", uuid.b, instance);
+
+	mutex_lock(&mdevices.list_lock);
+	list_add(&mdevice->next, &mdevices.dev_list);
+	mutex_unlock(&mdevices.list_lock);
+
+	retval = device_register(&mdevice->dev);
+	if (retval) {
+		mdev_put_device(mdevice);
+		return retval;
+	}
+
+	mutex_lock(&phy_devices.list_lock);
+	if (phy_dev->ops->create) {
+		retval = phy_dev->ops->create(dev, mdevice->uuid,
+					      instance, mdev_params);
+		if (retval)
+			goto create_failed;
+	}
+
+	retval = mdev_add_attribute_group(&mdevice->dev,
+					  phy_dev->ops->mdev_attr_groups);
+	if (retval)
+		goto create_failed;
+
+	mdevice->phy_dev = phy_dev;
+	mutex_unlock(&phy_devices.list_lock);
+	mdev_get_device(mdevice);
+	dev_info(&mdevice->dev, "MDEV: created\n");
+
+	return retval;
+
+create_failed:
+	mutex_unlock(&phy_devices.list_lock);
+	device_unregister(&mdevice->dev);
+	return retval;
+}
+
+int destroy_mdev_device(uuid_le uuid, uint32_t instance)
+{
+	struct mdev_device *vdev;
+
+	vdev = find_mdev_device(uuid, instance);
+
+	if (!vdev)
+		return -EINVAL;
+
+	mdev_destroy_device(vdev);
+	return 0;
+}
+
+void get_mdev_supported_types(struct device *dev, char *str)
+{
+	struct phy_device *phy_dev;
+
+	phy_dev = find_physical_device(dev);
+
+	if (phy_dev) {
+		mutex_lock(&phy_devices.list_lock);
+		if (phy_dev->ops->supported_config)
+			phy_dev->ops->supported_config(phy_dev->dev, str);
+		mutex_unlock(&phy_devices.list_lock);
+	}
+}
+
+int mdev_start_callback(uuid_le uuid, uint32_t instance)
+{
+	int ret = 0;
+	struct mdev_device *mdevice;
+	struct phy_device *phy_dev;
+
+	mdevice = find_mdev_device(uuid, instance);
+
+	if (!mdevice)
+		return -EINVAL;
+
+	phy_dev = mdevice->phy_dev;
+
+	mutex_lock(&phy_devices.list_lock);
+	if (phy_dev->ops->start)
+		ret = phy_dev->ops->start(mdevice->uuid);
+	mutex_unlock(&phy_devices.list_lock);
+
+	if (ret < 0)
+		pr_err("mdev_start failed  %d\n", ret);
+	else
+		kobject_uevent(&mdevice->dev.kobj, KOBJ_ONLINE);
+
+	return ret;
+}
+
+int mdev_shutdown_callback(uuid_le uuid, uint32_t instance)
+{
+	int ret = 0;
+	struct mdev_device *mdevice;
+	struct phy_device *phy_dev;
+
+	mdevice = find_mdev_device(uuid, instance);
+
+	if (!mdevice)
+		return -EINVAL;
+
+	phy_dev = mdevice->phy_dev;
+
+	mutex_lock(&phy_devices.list_lock);
+	if (phy_dev->ops->shutdown)
+		ret = phy_dev->ops->shutdown(mdevice->uuid);
+	mutex_unlock(&phy_devices.list_lock);
+
+	if (ret < 0)
+		pr_err("mdev_shutdown failed %d\n", ret);
+	else
+		kobject_uevent(&mdevice->dev.kobj, KOBJ_OFFLINE);
+
+	return ret;
+}
+
+static struct class mdev_class = {
+	.name		= MDEV_CLASS_NAME,
+	.owner		= THIS_MODULE,
+	.class_attrs	= mdev_class_attrs,
+};
+
+static int __init mdev_init(void)
+{
+	int rc = 0;
+
+	mutex_init(&mdevices.list_lock);
+	INIT_LIST_HEAD(&mdevices.dev_list);
+	mutex_init(&phy_devices.list_lock);
+	INIT_LIST_HEAD(&phy_devices.dev_list);
+
+	rc = class_register(&mdev_class);
+	if (rc < 0) {
+		pr_err("Failed to register mdev class\n");
+		return rc;
+	}
+
+	rc = mdev_bus_register();
+	if (rc < 0) {
+		pr_err("Failed to register mdev bus\n");
+		class_unregister(&mdev_class);
+		return rc;
+	}
+
+	return rc;
+}
+
+static void __exit mdev_exit(void)
+{
+	mdev_bus_unregister();
+	class_unregister(&mdev_class);
+}
+
+module_init(mdev_init)
+module_exit(mdev_exit)
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/drivers/vfio/mdev/mdev-driver.c b/drivers/vfio/mdev/mdev-driver.c
new file mode 100644
index 000000000000..bc8a169782bc
--- /dev/null
+++ b/drivers/vfio/mdev/mdev-driver.c
@@ -0,0 +1,139 @@ 
+/*
+ * MDEV driver
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/device.h>
+#include <linux/iommu.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+static int mdevice_attach_iommu(struct mdev_device *mdevice)
+{
+	int retval = 0;
+	struct iommu_group *group = NULL;
+
+	group = iommu_group_alloc();
+	if (IS_ERR(group)) {
+		dev_err(&mdevice->dev, "MDEV: failed to allocate group!\n");
+		return PTR_ERR(group);
+	}
+
+	retval = iommu_group_add_device(group, &mdevice->dev);
+	if (retval) {
+		dev_err(&mdevice->dev, "MDEV: failed to add dev to group!\n");
+		goto attach_fail;
+	}
+
+	mdevice->group = group;
+
+	dev_info(&mdevice->dev, "MDEV: group_id = %d\n",
+				 iommu_group_id(group));
+attach_fail:
+	iommu_group_put(group);
+	return retval;
+}
+
+static void mdevice_detach_iommu(struct mdev_device *mdevice)
+{
+	iommu_group_remove_device(&mdevice->dev);
+	dev_info(&mdevice->dev, "MDEV: detaching iommu\n");
+}
+
+static int mdevice_probe(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdevice = to_mdev_device(dev);
+	int status = 0;
+
+	status = mdevice_attach_iommu(mdevice);
+	if (status) {
+		dev_err(dev, "Failed to attach IOMMU\n");
+		return status;
+	}
+
+	if (drv && drv->probe)
+		status = drv->probe(dev);
+
+	return status;
+}
+
+static int mdevice_remove(struct device *dev)
+{
+	struct mdev_driver *drv = to_mdev_driver(dev->driver);
+	struct mdev_device *mdevice = to_mdev_device(dev);
+
+	if (drv && drv->remove)
+		drv->remove(dev);
+
+	mdevice_detach_iommu(mdevice);
+
+	return 0;
+}
+
+static int mdevice_match(struct device *dev, struct device_driver *drv)
+{
+	int ret = 0;
+	struct mdev_driver *mdrv = to_mdev_driver(drv);
+
+	if (mdrv && mdrv->match)
+		ret = mdrv->match(dev);
+
+	return ret;
+}
+
+struct bus_type mdev_bus_type = {
+	.name		= "mdev",
+	.match		= mdevice_match,
+	.probe		= mdevice_probe,
+	.remove		= mdevice_remove,
+};
+EXPORT_SYMBOL_GPL(mdev_bus_type);
+
+/**
+ * mdev_register_driver - register a new MDEV driver
+ * @drv: the driver to register
+ * @owner: owner module of driver ro register
+ *
+ * Returns a negative value on error, otherwise 0.
+ */
+int mdev_register_driver(struct mdev_driver *drv, struct module *owner)
+{
+	/* initialize common driver fields */
+	drv->driver.name = drv->name;
+	drv->driver.bus = &mdev_bus_type;
+	drv->driver.owner = owner;
+
+	/* register with core */
+	return driver_register(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_register_driver);
+
+/**
+ * mdev_unregister_driver - unregister MDEV driver
+ * @drv: the driver to unregister
+ *
+ */
+void mdev_unregister_driver(struct mdev_driver *drv)
+{
+	driver_unregister(&drv->driver);
+}
+EXPORT_SYMBOL(mdev_unregister_driver);
+
+int mdev_bus_register(void)
+{
+	return bus_register(&mdev_bus_type);
+}
+
+void mdev_bus_unregister(void)
+{
+	bus_unregister(&mdev_bus_type);
+}
diff --git a/drivers/vfio/mdev/mdev-sysfs.c b/drivers/vfio/mdev/mdev-sysfs.c
new file mode 100644
index 000000000000..79d351a7a502
--- /dev/null
+++ b/drivers/vfio/mdev/mdev-sysfs.c
@@ -0,0 +1,312 @@ 
+/*
+ * File attributes for Mediated devices
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/sysfs.h>
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/uuid.h>
+#include <linux/mdev.h>
+
+#include "mdev_private.h"
+
+/* Prototypes */
+static ssize_t mdev_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static DEVICE_ATTR_RO(mdev_supported_types);
+
+static ssize_t mdev_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count);
+static DEVICE_ATTR_WO(mdev_create);
+
+static ssize_t mdev_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count);
+static DEVICE_ATTR_WO(mdev_destroy);
+
+/* Static functions */
+
+#define UUID_CHAR_LENGTH	36
+#define UUID_BYTE_LENGTH	16
+
+#define SUPPORTED_TYPE_BUFFER_LENGTH	1024
+
+static inline bool is_uuid_sep(char sep)
+{
+	if (sep == '\n' || sep == '-' || sep == ':' || sep == '\0')
+		return true;
+	return false;
+}
+
+static int uuid_parse(const char *str, uuid_le *uuid)
+{
+	int i;
+
+	if (strlen(str) < UUID_CHAR_LENGTH)
+		return -1;
+
+	for (i = 0; i < UUID_BYTE_LENGTH; i++) {
+		if (!isxdigit(str[0]) || !isxdigit(str[1])) {
+			pr_err("%s err", __func__);
+			return -EINVAL;
+		}
+
+		uuid->b[i] = (hex_to_bin(str[0]) << 4) | hex_to_bin(str[1]);
+		str += 2;
+		if (is_uuid_sep(*str))
+			str++;
+	}
+
+	return 0;
+}
+
+/* Functions */
+static ssize_t mdev_supported_types_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	char *str;
+	ssize_t n;
+
+	str = kzalloc(sizeof(*str) * SUPPORTED_TYPE_BUFFER_LENGTH, GFP_KERNEL);
+	if (!str)
+		return -ENOMEM;
+
+	get_mdev_supported_types(dev, str);
+
+	n = sprintf(buf, "%s\n", str);
+	kfree(str);
+
+	return n;
+}
+
+static ssize_t mdev_create_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	char *str, *pstr;
+	char *uuid_str, *instance_str, *mdev_params = NULL;
+	uuid_le uuid;
+	uint32_t instance;
+	int ret = 0;
+
+	pstr = str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		pr_err("mdev_create: Empty UUID string %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (!str) {
+		pr_err("mdev_create: mdev instance not present %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	instance_str = strsep(&str, ":");
+	if (!instance_str) {
+		pr_err("mdev_create: Empty instance string %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	ret = kstrtouint(instance_str, 0, &instance);
+	if (ret) {
+		pr_err("mdev_create: mdev instance parsing error %s\n", buf);
+		goto create_error;
+	}
+
+	if (!str) {
+		pr_err("mdev_create: mdev params not specified %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	mdev_params = kstrdup(str, GFP_KERNEL);
+
+	if (!mdev_params) {
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		pr_err("mdev_create: UUID parse error %s\n", buf);
+		ret = -EINVAL;
+		goto create_error;
+	}
+
+	if (create_mdev_device(dev, uuid, instance, mdev_params) < 0) {
+		pr_err("mdev_create: Failed to create mdev device\n");
+		ret = -EINVAL;
+		goto create_error;
+	}
+	ret = count;
+
+create_error:
+	kfree(mdev_params);
+	kfree(pstr);
+	return ret;
+}
+
+static ssize_t mdev_destroy_store(struct device *dev,
+				  struct device_attribute *attr,
+				  const char *buf, size_t count)
+{
+	char *uuid_str, *str, *pstr;
+	uuid_le uuid;
+	unsigned int instance;
+	int ret;
+
+	str = pstr = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!str)
+		return -ENOMEM;
+
+	uuid_str = strsep(&str, ":");
+	if (!uuid_str) {
+		pr_err("mdev_destroy: Empty UUID string %s\n", buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	if (str == NULL) {
+		pr_err("mdev_destroy: instance not specified %s\n", buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	ret = kstrtouint(str, 0, &instance);
+	if (ret) {
+		pr_err("mdev_destroy: instance parsing error %s\n", buf);
+		goto destroy_error;
+	}
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		pr_err("mdev_destroy: UUID parse error  %s\n", buf);
+		ret = -EINVAL;
+		goto destroy_error;
+	}
+
+	ret = destroy_mdev_device(uuid, instance);
+	if (ret < 0)
+		goto destroy_error;
+
+	ret = count;
+
+destroy_error:
+	kfree(pstr);
+	return ret;
+}
+
+ssize_t mdev_start_store(struct class *class, struct class_attribute *attr,
+			 const char *buf, size_t count)
+{
+	char *uuid_str;
+	uuid_le uuid;
+	int ret = 0;
+
+	uuid_str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!uuid_str)
+		return -ENOMEM;
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		pr_err("mdev_start: UUID parse error  %s\n", buf);
+		ret = -EINVAL;
+		goto start_error;
+	}
+
+	ret = mdev_start_callback(uuid, 0);
+	if (ret < 0)
+		goto start_error;
+
+	ret = count;
+
+start_error:
+	kfree(uuid_str);
+	return ret;
+}
+
+ssize_t mdev_shutdown_store(struct class *class, struct class_attribute *attr,
+			    const char *buf, size_t count)
+{
+	char *uuid_str;
+	uuid_le uuid;
+	int ret = 0;
+
+	uuid_str = kstrndup(buf, count, GFP_KERNEL);
+
+	if (!uuid_str)
+		return -ENOMEM;
+
+	if (uuid_parse(uuid_str, &uuid) < 0) {
+		pr_err("mdev_shutdown: UUID parse error %s\n", buf);
+		ret = -EINVAL;
+	}
+
+	ret = mdev_shutdown_callback(uuid, 0);
+	if (ret < 0)
+		goto shutdown_error;
+
+	ret = count;
+
+shutdown_error:
+	kfree(uuid_str);
+	return ret;
+
+}
+
+struct class_attribute mdev_class_attrs[] = {
+	__ATTR_WO(mdev_start),
+	__ATTR_WO(mdev_shutdown),
+	__ATTR_NULL
+};
+
+int mdev_create_sysfs_files(struct device *dev)
+{
+	int retval;
+
+	retval = sysfs_create_file(&dev->kobj,
+				   &dev_attr_mdev_supported_types.attr);
+	if (retval) {
+		pr_err("Failed to create mdev_supported_types sysfs entry\n");
+		return retval;
+	}
+
+	retval = sysfs_create_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	if (retval) {
+		pr_err("Failed to create mdev_create sysfs entry\n");
+		return retval;
+	}
+
+	retval = sysfs_create_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
+	if (retval) {
+		pr_err("Failed to create mdev_destroy sysfs entry\n");
+		return retval;
+	}
+
+	return 0;
+}
+
+void mdev_remove_sysfs_files(struct device *dev)
+{
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_supported_types.attr);
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_create.attr);
+	sysfs_remove_file(&dev->kobj, &dev_attr_mdev_destroy.attr);
+}
diff --git a/drivers/vfio/mdev/mdev_private.h b/drivers/vfio/mdev/mdev_private.h
new file mode 100644
index 000000000000..a472310c7749
--- /dev/null
+++ b/drivers/vfio/mdev/mdev_private.h
@@ -0,0 +1,33 @@ 
+/*
+ * Mediated device interal definitions
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_PRIVATE_H
+#define MDEV_PRIVATE_H
+
+int  mdev_bus_register(void);
+void mdev_bus_unregister(void);
+
+/* Function prototypes for mdev_sysfs */
+
+extern struct class_attribute mdev_class_attrs[];
+
+int  mdev_create_sysfs_files(struct device *dev);
+void mdev_remove_sysfs_files(struct device *dev);
+
+int  create_mdev_device(struct device *dev, uuid_le uuid, uint32_t instance,
+			char *mdev_params);
+int  destroy_mdev_device(uuid_le uuid, uint32_t instance);
+void get_mdev_supported_types(struct device *dev, char *str);
+int  mdev_start_callback(uuid_le uuid, uint32_t instance);
+int  mdev_shutdown_callback(uuid_le uuid, uint32_t instance);
+
+#endif /* MDEV_PRIVATE_H */
diff --git a/include/linux/mdev.h b/include/linux/mdev.h
new file mode 100644
index 000000000000..d9633acd85f2
--- /dev/null
+++ b/include/linux/mdev.h
@@ -0,0 +1,224 @@ 
+/*
+ * Mediated device definition
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef MDEV_H
+#define MDEV_H
+
+/* Common Data structures */
+
+struct pci_region_info {
+	uint64_t start;
+	uint64_t size;
+	uint32_t flags;		/*!< VFIO region info flags */
+};
+
+enum mdev_emul_space {
+	EMUL_CONFIG_SPACE,	/*!< PCI configuration space */
+	EMUL_IO,		/*!< I/O register space */
+	EMUL_MMIO		/*!< Memory-mapped I/O space */
+};
+
+struct phy_device;
+
+/*
+ * Mediated device
+ */
+
+struct mdev_device {
+	struct kref		kref;
+	struct device		dev;
+	struct phy_device	*phy_dev;
+	struct iommu_group	*group;
+	void			*iommu_data;
+	uuid_le			uuid;
+	uint32_t		instance;
+	void			*driver_data;
+	struct mutex		ops_lock;
+	struct list_head	next;
+};
+
+
+/**
+ * struct phy_device_ops - Structure to be registered for each physical device
+ * to register the device to mdev module.
+ *
+ * @owner:		The module owner.
+ * @dev_attr_groups:	Default attributes of the physical device.
+ * @mdev_attr_groups:	Default attributes of the mediated device.
+ * @supported_config:	Called to get information about supported types.
+ *			@dev : device structure of physical device.
+ *			@config: should return string listing supported config
+ *			Returns integer: success (0) or error (< 0)
+ * @create:		Called to allocate basic resources in physical device's
+ *			driver for a particular mediated device
+ *			@dev: physical pci device structure on which mediated
+ *			      device should be created
+ *			@uuid: VM's uuid for which VM it is intended to
+ *			@instance: mediated instance in that VM
+ *			@mdev_params: extra parameters required by physical
+ *			device's driver.
+ *			Returns integer: success (0) or error (< 0)
+ * @destroy:		Called to free resources in physical device's driver for
+ *			a mediated device instance of that VM.
+ *			@dev: physical device structure to which this mediated
+ *			      device points to.
+ *			@uuid: VM's uuid for which the mediated device belongs
+ *			@instance: mdev instance in that VM
+ *			Returns integer: success (0) or error (< 0)
+ *			If VM is running and destroy() is called that means the
+ *			mdev is being hotunpluged. Return error if VM is running
+ *			and driver doesn't support mediated device hotplug.
+ * @start:		Called to do initiate mediated device initialization
+ *			process in physical device's driver when VM boots before
+ *			qemu starts.
+ *			@uuid: VM's UUID which is booting.
+ *			Returns integer: success (0) or error (< 0)
+ * @shutdown:		Called to teardown mediated device related resources for
+ *			the VM
+ *			@uuid: VM's UUID which is shutting down .
+ *			Returns integer: success (0) or error (< 0)
+ * @read:		Read emulation callback
+ *			@mdev: mediated device structure
+ *			@buf: read buffer
+ *			@count: number bytes to read
+ *			@address_space: specifies for which address
+ *			space the request is: pci_config_space, IO
+ *			register space or MMIO space.
+ *			@pos: offset from base address.
+ *			Retuns number on bytes read on success or error.
+ * @write:		Write emulation callback
+ *			@mdev: mediated device structure
+ *			@buf: write buffer
+ *			@count: number bytes to be written
+ *			@address_space: specifies for which address space the
+ *			request is: pci_config_space, IO register space or MMIO
+ *			space.
+ *			@pos: offset from base address.
+ *			Retuns number on bytes written on success or error.
+ * @set_irqs:		Called to send about interrupts configuration
+ *			information that VMM sets.
+ *			@mdev: mediated device structure
+ *			@flags, index, start, count and *data : same as that of
+ *			struct vfio_irq_set of VFIO_DEVICE_SET_IRQS API.
+ * @get_region_info:	Called to get BAR size and flags of mediated device.
+ *			@mdev: mediated device structure
+ *			@region_index: VFIO region index
+ *			@region_info: output, returns size and flags of
+ *				      requested region.
+ *			Returns integer: success (0) or error (< 0)
+ * @validate_map_request: Validate remap pfn request
+ *			@mdev: mediated device structure
+ *			@virtaddr: target user address to start at
+ *			@pfn: physical address of kernel memory, vendor driver
+ *			      can change if required.
+ *			@size: size of map area, vendor driver can change the
+ *			       size of map area if desired.
+ *			@prot: page protection flags for this mapping, vendor
+ *			       driver can change, if required.
+ *			Returns integer: success (0) or error (< 0)
+ *
+ * Physical device that support mediated device should be registered with mdev
+ * module with phy_device_ops structure.
+ */
+
+struct phy_device_ops {
+	struct module   *owner;
+	const struct attribute_group **dev_attr_groups;
+	const struct attribute_group **mdev_attr_groups;
+
+	int	(*supported_config)(struct device *dev, char *config);
+	int     (*create)(struct device *dev, uuid_le uuid,
+			  uint32_t instance, char *mdev_params);
+	int     (*destroy)(struct device *dev, uuid_le uuid,
+			   uint32_t instance);
+	int     (*start)(uuid_le uuid);
+	int     (*shutdown)(uuid_le uuid);
+	ssize_t (*read)(struct mdev_device *vdev, char *buf, size_t count,
+			enum mdev_emul_space address_space, loff_t pos);
+	ssize_t (*write)(struct mdev_device *vdev, char *buf, size_t count,
+			 enum mdev_emul_space address_space, loff_t pos);
+	int     (*set_irqs)(struct mdev_device *vdev, uint32_t flags,
+			    unsigned int index, unsigned int start,
+			    unsigned int count, void *data);
+	int	(*get_region_info)(struct mdev_device *vdev, int region_index,
+				 struct pci_region_info *region_info);
+	int	(*validate_map_request)(struct mdev_device *vdev,
+					unsigned long virtaddr,
+					unsigned long *pfn, unsigned long *size,
+					pgprot_t *prot);
+};
+
+/*
+ * Physical Device
+ */
+struct phy_device {
+	struct device                   *dev;
+	const struct phy_device_ops     *ops;
+	struct list_head                next;
+};
+
+/**
+ * struct mdev_driver - Mediated device driver
+ * @name: driver name
+ * @probe: called when new device created
+ * @remove: called when device removed
+ * @match: called when new device or driver is added for this bus. Return 1 if
+ *	   given device can be handled by given driver and zero otherwise.
+ * @driver: device driver structure
+ *
+ **/
+struct mdev_driver {
+	const char *name;
+	int  (*probe)(struct device *dev);
+	void (*remove)(struct device *dev);
+	int  (*match)(struct device *dev);
+	struct device_driver driver;
+};
+
+static inline struct mdev_driver *to_mdev_driver(struct device_driver *drv)
+{
+	return drv ? container_of(drv, struct mdev_driver, driver) : NULL;
+}
+
+static inline struct mdev_device *to_mdev_device(struct device *dev)
+{
+	return dev ? container_of(dev, struct mdev_device, dev) : NULL;
+}
+
+static inline struct mdev_device *mdev_get_device(struct mdev_device *vdev)
+{
+	return (vdev && get_device(&vdev->dev)) ? vdev : NULL;
+}
+
+static inline  void mdev_put_device(struct mdev_device *vdev)
+{
+	if (vdev)
+		put_device(&vdev->dev);
+}
+
+extern struct bus_type mdev_bus_type;
+
+#define dev_is_mdev(d) ((d)->bus == &mdev_bus_type)
+
+extern int  mdev_register_device(struct device *dev,
+				 const struct phy_device_ops *ops);
+extern void mdev_unregister_device(struct device *dev);
+
+extern int  mdev_register_driver(struct mdev_driver *drv, struct module *owner);
+extern void mdev_unregister_driver(struct mdev_driver *drv);
+
+extern int mdev_map_virtual_bar(uint64_t virt_bar_addr, uint64_t phys_bar_addr,
+				uint32_t len, uint32_t flags);
+
+extern struct mdev_device *mdev_get_device_by_group(struct iommu_group *group);
+
+#endif /* MDEV_H */