diff mbox

[RFC,v3,2/3] VFIO driver for vGPU device

Message ID 1462214441-3732-3-git-send-email-kwankhede@nvidia.com
State New
Headers show

Commit Message

Kirti Wankhede May 2, 2016, 6:40 p.m. UTC
VFIO driver registers with vGPU core driver. vGPU core driver creates vGPU
device and calls probe routine of vGPU VFIO driver. This vGPU VFIO driver adds
vGPU device to VFIO core module.
Main aim of this module is to manage all VFIO APIs for each vGPU device.
Those are:
- get region information from GPU driver.
- trap and emulate PCI config space and BAR region.
- Send interrupt configuration information to GPU driver.
- mmap mappable region with invalidate mapping and fault on access to remap pfn.

Thanks,
Kirti.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I949a6b499d2e98d9c3352ae579535a608729b223
---
 drivers/vgpu/Makefile    |    1 +
 drivers/vgpu/vgpu_vfio.c |  671 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 672 insertions(+), 0 deletions(-)
 create mode 100644 drivers/vgpu/vgpu_vfio.c

Comments

Alex Williamson May 3, 2016, 10:43 p.m. UTC | #1
On Tue, 3 May 2016 00:10:40 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> VFIO driver registers with vGPU core driver. vGPU core driver creates vGPU
> device and calls probe routine of vGPU VFIO driver. This vGPU VFIO driver adds
> vGPU device to VFIO core module.
> Main aim of this module is to manage all VFIO APIs for each vGPU device.
> Those are:
> - get region information from GPU driver.
> - trap and emulate PCI config space and BAR region.
> - Send interrupt configuration information to GPU driver.
> - mmap mappable region with invalidate mapping and fault on access to remap pfn.
> 
> Thanks,
> Kirti.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I949a6b499d2e98d9c3352ae579535a608729b223
> ---
>  drivers/vgpu/Makefile    |    1 +
>  drivers/vgpu/vgpu_vfio.c |  671 ++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 672 insertions(+), 0 deletions(-)
>  create mode 100644 drivers/vgpu/vgpu_vfio.c
> 
> diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
> index f5be980..a0a2655 100644
> --- a/drivers/vgpu/Makefile
> +++ b/drivers/vgpu/Makefile
> @@ -2,3 +2,4 @@
>  vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
>  
>  obj-$(CONFIG_VGPU)			+= vgpu.o
> +obj-$(CONFIG_VGPU_VFIO)                 += vgpu_vfio.o

This is where we should add a new Kconfig entry for VGPU_VFIO, nothing
in patch 1 has any vfio dependency.  Perhaps it should also depend on
VFIO_PCI rather than VFIO since you are getting very PCI specific below.

> diff --git a/drivers/vgpu/vgpu_vfio.c b/drivers/vgpu/vgpu_vfio.c
> new file mode 100644
> index 0000000..460a4dc
> --- /dev/null
> +++ b/drivers/vgpu/vgpu_vfio.c
> @@ -0,0 +1,671 @@
> +/*
> + * VGPU VFIO device
> + *
> + * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
> + *     Author: Neo Jia <cjia@nvidia.com>
> + *	       Kirti Wankhede <kwankhede@nvidia.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/init.h>
> +#include <linux/module.h>
> +#include <linux/device.h>
> +#include <linux/kernel.h>
> +#include <linux/fs.h>
> +#include <linux/poll.h>
> +#include <linux/slab.h>
> +#include <linux/cdev.h>
> +#include <linux/sched.h>
> +#include <linux/wait.h>
> +#include <linux/uuid.h>
> +#include <linux/vfio.h>
> +#include <linux/iommu.h>
> +#include <linux/vgpu.h>
> +
> +#include "vgpu_private.h"
> +
> +#define DRIVER_VERSION  "0.1"
> +#define DRIVER_AUTHOR   "NVIDIA Corporation"
> +#define DRIVER_DESC     "VGPU VFIO Driver"
> +
> +#define VFIO_PCI_OFFSET_SHIFT   40
> +
> +#define VFIO_PCI_OFFSET_TO_INDEX(off)	(off >> VFIO_PCI_OFFSET_SHIFT)
> +#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
> +#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)

Change the name of these from vfio-pci please or shift code around to
use them directly.  You're certainly free to redefine these, but using
the same name is confusing.

> +
> +struct vfio_vgpu_device {
> +	struct iommu_group *group;
> +	struct vgpu_device *vgpu_dev;
> +	int		    refcnt;
> +	struct pci_bar_info bar_info[VFIO_PCI_NUM_REGIONS];
> +	u8		    *vconfig;
> +};
> +
> +static DEFINE_MUTEX(vfio_vgpu_lock);
> +
> +static int get_virtual_bar_info(struct vgpu_device *vgpu_dev,
> +				struct pci_bar_info *bar_info,
> +				int index)
> +{
> +	int ret = -1;

Use a real errno.

> +	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
> +
> +	if (gpu_dev->ops->vgpu_bar_info)
> +		ret = gpu_dev->ops->vgpu_bar_info(vgpu_dev, index, bar_info);

vgpu_bar_info is already optional, further validating that the vgpu
core is not PCI specific.

> +	return ret;
> +}
> +
> +static int vdev_read_base(struct vfio_vgpu_device *vdev)
> +{
> +	int index, pos;
> +	u32 start_lo, start_hi;
> +	u32 mem_type;
> +
> +	pos = PCI_BASE_ADDRESS_0;
> +
> +	for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) {
> +
> +		if (!vdev->bar_info[index].size)
> +			continue;
> +
> +		start_lo = (*(u32 *)(vdev->vconfig + pos)) &
> +					PCI_BASE_ADDRESS_MEM_MASK;
> +		mem_type = (*(u32 *)(vdev->vconfig + pos)) &
> +					PCI_BASE_ADDRESS_MEM_TYPE_MASK;
> +
> +		switch (mem_type) {
> +		case PCI_BASE_ADDRESS_MEM_TYPE_64:
> +			start_hi = (*(u32 *)(vdev->vconfig + pos + 4));
> +			pos += 4;
> +			break;
> +		case PCI_BASE_ADDRESS_MEM_TYPE_32:
> +		case PCI_BASE_ADDRESS_MEM_TYPE_1M:
> +			/* 1M mem BAR treated as 32-bit BAR */
> +		default:
> +			/* mem unknown type treated as 32-bit BAR */
> +			start_hi = 0;
> +			break;
> +		}

Let's not neglect ioport BARs here, IO_MASK is different.

> +		pos += 4;
> +		vdev->bar_info[index].start = ((u64)start_hi << 32) | start_lo;
> +	}
> +	return 0;
> +}
> +
> +static int vgpu_dev_open(void *device_data)
> +{
> +	int ret = 0;
> +	struct vfio_vgpu_device *vdev = device_data;
> +
> +	if (!try_module_get(THIS_MODULE))
> +		return -ENODEV;
> +
> +	mutex_lock(&vfio_vgpu_lock);
> +
> +	if (!vdev->refcnt) {
> +		u8 *vconfig;
> +		int vconfig_size, index;
> +
> +		for (index = 0; index < VFIO_PCI_NUM_REGIONS; index++) {

nit, region indexes are not all BARs.

> +			ret = get_virtual_bar_info(vdev->vgpu_dev,
> +						   &vdev->bar_info[index],
> +						   index);
> +			if (ret)
> +				goto open_error;
> +		}
> +		vconfig_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size;

nit, config space is not a BAR.

> +		if (!vconfig_size)
> +			goto open_error;
> +
> +		vconfig = kzalloc(vconfig_size, GFP_KERNEL);
> +		if (!vconfig) {
> +			ret = -ENOMEM;
> +			goto open_error;
> +		}
> +
> +		vdev->vconfig = vconfig;
> +	}
> +
> +	vdev->refcnt++;
> +open_error:
> +
> +	mutex_unlock(&vfio_vgpu_lock);
> +
> +	if (ret)
> +		module_put(THIS_MODULE);
> +
> +	return ret;
> +}
> +
> +static void vgpu_dev_close(void *device_data)
> +{
> +	struct vfio_vgpu_device *vdev = device_data;
> +
> +	mutex_lock(&vfio_vgpu_lock);
> +
> +	vdev->refcnt--;
> +	if (!vdev->refcnt) {
> +		memset(&vdev->bar_info, 0, sizeof(vdev->bar_info));

Why?

> +		if (vdev->vconfig)

How would we ever achieve that?

> +			kfree(vdev->vconfig);
> +	}
> +
> +	mutex_unlock(&vfio_vgpu_lock);
> +
> +	module_put(THIS_MODULE);
> +}
> +
> +static int vgpu_get_irq_count(struct vfio_vgpu_device *vdev, int irq_type)
> +{
> +	// Don't support MSIX for now
> +	if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
> +		return -1;

How are we going to expand the API later for it?  Shouldn't this just
be a passthrough to a gpu_devices_ops.vgpu_vfio_get_irq_info callback?

> +
> +	return 1;
> +}
> +
> +static long vgpu_dev_unlocked_ioctl(void *device_data,
> +		unsigned int cmd, unsigned long arg)
> +{
> +	int ret = 0;
> +	struct vfio_vgpu_device *vdev = device_data;
> +	unsigned long minsz;
> +
> +	switch (cmd)
> +	{
> +	case VFIO_DEVICE_GET_INFO:
> +	{
> +		struct vfio_device_info info;
> +		printk(KERN_INFO "%s VFIO_DEVICE_GET_INFO cmd index ", __FUNCTION__);
> +		minsz = offsetofend(struct vfio_device_info, num_irqs);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		info.flags = VFIO_DEVICE_FLAGS_PCI;
> +		info.num_regions = VFIO_PCI_NUM_REGIONS;
> +		info.num_irqs = VFIO_PCI_NUM_IRQS;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +
> +	case VFIO_DEVICE_GET_REGION_INFO:
> +	{
> +		struct vfio_region_info info;
> +
> +		minsz = offsetofend(struct vfio_region_info, offset);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz)
> +			return -EINVAL;
> +
> +		printk(KERN_INFO "%s VFIO_DEVICE_GET_REGION_INFO cmd for region_index %d", __FUNCTION__, info.index);
> +		switch (info.index) {
> +		case VFIO_PCI_CONFIG_REGION_INDEX:
> +		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
> +			info.size = vdev->bar_info[info.index].size;
> +			if (!info.size) {
> +				info.flags = 0;
> +				break;
> +			}
> +
> +			info.flags = vdev->bar_info[info.index].flags;

Ah, so bar_info.flags are vfio region info flags, that's not documented
anywhere in the API.

> +			break;
> +		case VFIO_PCI_VGA_REGION_INDEX:
> +			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
> +			info.size = 0xc0000;
> +			info.flags = VFIO_REGION_INFO_FLAG_READ |
> +				     VFIO_REGION_INFO_FLAG_WRITE;
> +				break;

I think VGA support needs to be at the discretion of the vendor
driver.  There are certainly use cases that don't require VGA.

> +
> +		case VFIO_PCI_ROM_REGION_INDEX:

So should ROM support.  What's the assumption here, that QEMU will
provide a ROM, much like is required for SR-IOV VFs?

> +		default:
> +			return -EINVAL;
> +		}
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +
> +	}
> +	case VFIO_DEVICE_GET_IRQ_INFO:
> +	{
> +		struct vfio_irq_info info;
> +
> +		printk(KERN_INFO "%s VFIO_DEVICE_GET_IRQ_INFO cmd", __FUNCTION__);

Clearly lots of debug remaining in these functions.

> +		minsz = offsetofend(struct vfio_irq_info, count);
> +
> +		if (copy_from_user(&info, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
> +			return -EINVAL;
> +
> +		switch (info.index) {
> +		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
> +		case VFIO_PCI_REQ_IRQ_INDEX:
> +			break;
> +			/* pass thru to return error */
> +		case VFIO_PCI_MSIX_IRQ_INDEX:

Lots of assumptions about what the vendor driver is going to support.

> +		default:
> +			return -EINVAL;
> +		}
> +
> +		info.count = VFIO_PCI_NUM_IRQS;
> +
> +		info.flags = VFIO_IRQ_INFO_EVENTFD;
> +		info.count = vgpu_get_irq_count(vdev, info.index);
> +
> +		if (info.count == -1)
> +			return -EINVAL;
> +
> +		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
> +			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
> +					VFIO_IRQ_INFO_AUTOMASKED);
> +		else
> +			info.flags |= VFIO_IRQ_INFO_NORESIZE;
> +
> +		return copy_to_user((void __user *)arg, &info, minsz);
> +	}
> +
> +	case VFIO_DEVICE_SET_IRQS:
> +	{
> +		struct vfio_irq_set hdr;
> +		struct gpu_device *gpu_dev = vdev->vgpu_dev->gpu_dev;
> +		u8 *data = NULL;
> +		int ret = 0;
> +		minsz = offsetofend(struct vfio_irq_set, count);
> +
> +		if (copy_from_user(&hdr, (void __user *)arg, minsz))
> +			return -EFAULT;
> +
> +		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
> +		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
> +		    VFIO_IRQ_SET_ACTION_TYPE_MASK))
> +			return -EINVAL;
> +
> +		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
> +			size_t size;
> +			int max = vgpu_get_irq_count(vdev, hdr.index);
> +
> +			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
> +				size = sizeof(uint8_t);
> +			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
> +				size = sizeof(int32_t);
> +			else
> +				return -EINVAL;
> +
> +			if (hdr.argsz - minsz < hdr.count * size ||
> +			    hdr.start >= max || hdr.start + hdr.count > max)
> +				return -EINVAL;
> +
> +			data = memdup_user((void __user *)(arg + minsz),
> +						hdr.count * size);
> +				if (IS_ERR(data))
> +					return PTR_ERR(data);
> +
> +			}
> +
> +			if (gpu_dev->ops->vgpu_set_irqs) {
> +				ret = gpu_dev->ops->vgpu_set_irqs(vdev->vgpu_dev,
> +								  hdr.flags,
> +								  hdr.index, hdr.start,
> +								  hdr.count, data);
> +			}
> +			kfree(data);
> +			return ret;
> +		}
> +
> +		default:
> +			return -EINVAL;
> +	}
> +	return ret;
> +}
> +
> +ssize_t vgpu_dev_config_rw(struct vfio_vgpu_device *vdev, char __user *buf,
> +		size_t count, loff_t *ppos, bool iswrite)
> +{
> +	struct vgpu_device *vgpu_dev = vdev->vgpu_dev;
> +	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
> +	int cfg_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size;
> +	int ret = 0;
> +	uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
> +
> +	if (pos < 0 || pos >= cfg_size ||
> +	    pos + count > cfg_size) {
> +		printk(KERN_ERR "%s pos 0x%llx out of range\n", __FUNCTION__, pos);
> +		ret = -EFAULT;
> +		goto config_rw_exit;
> +	}
> +
> +	if (iswrite) {
> +		char *user_data = kmalloc(count, GFP_KERNEL);
> +
> +		if (user_data == NULL) {
> +			ret = -ENOMEM;
> +			goto config_rw_exit;
> +		}
> +
> +		if (copy_from_user(user_data, buf, count)) {
> +			ret = -EFAULT;
> +			kfree(user_data);
> +			goto config_rw_exit;
> +		}

memdup_user()?

> +
> +		if (gpu_dev->ops->write) {
> +			ret = gpu_dev->ops->write(vgpu_dev,
> +						  user_data,
> +						  count,
> +						  vgpu_emul_space_config,
> +						  pos);
> +		}
> +
> +		memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count);

So write is expected to user_data to allow only the writable bits to be
changed?  What's really being saved in the vconfig here vs the vendor
vgpu driver?  It seems like we're only using it to cache the BAR
values, but we're not providing the BAR emulation here, which seems
like one of the few things we could provide so it's not duplicated in
every vendor driver.  But then we only need a few u32s to do that, not
all of config space.

> +		kfree(user_data);
> +	}
> +	else
> +	{
> +		char *ret_data = kzalloc(count, GFP_KERNEL);
> +
> +		if (ret_data == NULL) {
> +			ret = -ENOMEM;
> +			goto config_rw_exit;
> +		}
> +
> +		if (gpu_dev->ops->read) {
> +			ret = gpu_dev->ops->read(vgpu_dev,
> +						 ret_data,
> +						 count,
> +						 vgpu_emul_space_config,
> +						 pos);
> +		}
> +
> +		if (ret > 0 ) {
> +			if (copy_to_user(buf, ret_data, ret)) {
> +				ret = -EFAULT;
> +				kfree(ret_data);
> +				goto config_rw_exit;
> +			}
> +
> +			memcpy((void *)(vdev->vconfig + pos), (void *)ret_data, count);
> +		}
> +		kfree(ret_data);
> +	}
> +config_rw_exit:
> +	return ret;
> +}
> +
> +ssize_t vgpu_dev_bar_rw(struct vfio_vgpu_device *vdev, char __user *buf,
> +		size_t count, loff_t *ppos, bool iswrite)
> +{
> +	struct vgpu_device *vgpu_dev = vdev->vgpu_dev;
> +	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
> +	loff_t offset = *ppos & VFIO_PCI_OFFSET_MASK;
> +	loff_t pos;
> +	int bar_index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> +	int ret = 0;
> +
> +	if (!vdev->bar_info[bar_index].start) {
> +		ret = vdev_read_base(vdev);
> +		if (ret)
> +			goto bar_rw_exit;
> +	}
> +
> +	if (offset >= vdev->bar_info[bar_index].size) {
> +		ret = -EINVAL;
> +		goto bar_rw_exit;
> +	}
> +
> +	pos = vdev->bar_info[bar_index].start + offset;
> +	if (iswrite) {
> +		char *user_data = kmalloc(count, GFP_KERNEL);
> +
> +		if (user_data == NULL) {
> +			ret = -ENOMEM;
> +			goto bar_rw_exit;
> +		}
> +
> +		if (copy_from_user(user_data, buf, count)) {
> +			ret = -EFAULT;
> +			kfree(user_data);
> +			goto bar_rw_exit;
> +		}

memdup_user() again.

> +
> +		if (gpu_dev->ops->write) {
> +			ret = gpu_dev->ops->write(vgpu_dev,
> +						  user_data,
> +						  count,
> +						  vgpu_emul_space_mmio,
> +						  pos);
> +		}

What's the usefulness in a vendor driver that doesn't provide
read/write?

> +
> +		kfree(user_data);
> +	}
> +	else
> +	{
> +		char *ret_data = kmalloc(count, GFP_KERNEL);
> +
> +		if (ret_data == NULL) {
> +			ret = -ENOMEM;
> +			goto bar_rw_exit;
> +		}
> +
> +		memset(ret_data, 0, count);
> +
> +		if (gpu_dev->ops->read) {
> +			ret = gpu_dev->ops->read(vgpu_dev,
> +						 ret_data,
> +						 count,
> +						 vgpu_emul_space_mmio,
> +						 pos);
> +		}
> +
> +		if (ret > 0 ) {
> +			if (copy_to_user(buf, ret_data, ret)) {
> +				ret = -EFAULT;
> +			}
> +		}
> +		kfree(ret_data);
> +	}
> +
> +bar_rw_exit:
> +	return ret;

No freeing, no lock releasing, no cleanup, just return from the point
of error.

> +}
> +
> +
> +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf,
> +		size_t count, loff_t *ppos, bool iswrite)
> +{
> +	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> +	struct vfio_vgpu_device *vdev = device_data;
> +
> +	if (index >= VFIO_PCI_NUM_REGIONS)
> +		return -EINVAL;
> +
> +	switch (index) {
> +	case VFIO_PCI_CONFIG_REGION_INDEX:
> +		return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite);
> +
> +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> +		return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite);
> +
> +	case VFIO_PCI_ROM_REGION_INDEX:
> +	case VFIO_PCI_VGA_REGION_INDEX:

Wait a sec, who's doing the VGA emulation?  We can't be claiming to
support a VGA region and then fail to provide read/write access to it
like we said it has.

> +		break;
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +
> +static ssize_t vgpu_dev_read(void *device_data, char __user *buf,
> +			     size_t count, loff_t *ppos)
> +{
> +	int ret = 0;
> +
> +	if (count)
> +		ret = vgpu_dev_rw(device_data, buf, count, ppos, false);
> +
> +	return ret;
> +}
> +
> +static ssize_t vgpu_dev_write(void *device_data, const char __user *buf,
> +			      size_t count, loff_t *ppos)
> +{
> +	int ret = 0;
> +
> +	if (count)
> +		ret = vgpu_dev_rw(device_data, (char *)buf, count, ppos, true);
> +
> +	return ret;
> +}
> +
> +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> +{
> +	int ret = 0;
> +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
> +	struct vgpu_device *vgpu_dev;
> +	struct gpu_device *gpu_dev;
> +	u64 virtaddr = (u64)vmf->virtual_address;
> +	u64 offset, phyaddr;
> +	unsigned long req_size, pgoff;
> +	pgprot_t pg_prot;
> +
> +	if (!vdev && !vdev->vgpu_dev)
> +		return -EINVAL;
> +
> +	vgpu_dev = vdev->vgpu_dev;
> +	gpu_dev  = vgpu_dev->gpu_dev;
> +
> +	offset   = vma->vm_pgoff << PAGE_SHIFT;
> +	phyaddr  = virtaddr - vma->vm_start + offset;
> +	pgoff    = phyaddr >> PAGE_SHIFT;
> +	req_size = vma->vm_end - virtaddr;
> +	pg_prot  = vma->vm_page_prot;
> +
> +	if (gpu_dev->ops->validate_map_request) {
> +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff,
> +							 &req_size, &pg_prot);
> +		if (ret)
> +			return ret;
> +
> +		if (!req_size)
> +			return -EINVAL;
> +	}
> +
> +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);

So not supporting validate_map_request() means that the user can
directly mmap BARs of the host GPU and as shown below, we assume a 1:1
mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
scenario or should this callback be required?  It's not clear to me how
the vendor driver determines what this maps to, do they compare it to
the physical device's own BAR addresses?

> +
> +	return ret | VM_FAULT_NOPAGE;
> +}
> +
> +static const struct vm_operations_struct vgpu_dev_mmio_ops = {
> +	.fault = vgpu_dev_mmio_fault,
> +};
> +
> +
> +static int vgpu_dev_mmap(void *device_data, struct vm_area_struct *vma)
> +{
> +	unsigned int index;
> +	struct vfio_vgpu_device *vdev = device_data;
> +	struct vgpu_device *vgpu_dev = vdev->vgpu_dev;
> +	struct pci_dev *pdev = vgpu_dev->gpu_dev->dev;
> +	unsigned long pgoff;
> +
> +	loff_t offset = vma->vm_pgoff << PAGE_SHIFT;
> +
> +	index = VFIO_PCI_OFFSET_TO_INDEX(offset);
> +
> +	if (index >= VFIO_PCI_ROM_REGION_INDEX)
> +		return -EINVAL;

ioport BARs?

> +
> +	pgoff = vma->vm_pgoff &
> +		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
> +
> +	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
> +
> +	vma->vm_private_data = vdev;
> +	vma->vm_ops = &vgpu_dev_mmio_ops;
> +
> +	return 0;
> +}
> +
> +static const struct vfio_device_ops vgpu_vfio_dev_ops = {
> +	.name		= "vfio-vgpu",

Should all of this be vfio-pci-vgpu?  We've certainly gotten PCI
specific here.

> +	.open		= vgpu_dev_open,
> +	.release	= vgpu_dev_close,
> +	.ioctl		= vgpu_dev_unlocked_ioctl,
> +	.read		= vgpu_dev_read,
> +	.write		= vgpu_dev_write,
> +	.mmap		= vgpu_dev_mmap,
> +};
> +
> +int vgpu_vfio_probe(struct device *dev)
> +{
> +	struct vfio_vgpu_device *vdev;
> +	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
> +	int ret = 0;
> +
> +	if (vgpu_dev == NULL)
> +		return -EINVAL;
> +
> +	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
> +	if (!vdev) {
> +		return -ENOMEM;
> +	}
> +
> +	vdev->vgpu_dev = vgpu_dev;
> +	vdev->group = vgpu_dev->group;
> +
> +	ret = vfio_add_group_dev(dev, &vgpu_vfio_dev_ops, vdev);
> +	if (ret)
> +		kfree(vdev);
> +
> +	printk(KERN_INFO "%s ret = %d\n", __FUNCTION__, ret);
> +	return ret;
> +}
> +
> +void vgpu_vfio_remove(struct device *dev)
> +{
> +	struct vfio_vgpu_device *vdev;
> +
> +	printk(KERN_INFO "%s \n", __FUNCTION__);
> +	vdev = vfio_del_group_dev(dev);
> +	if (vdev) {
> +		printk(KERN_INFO "%s vdev being freed\n", __FUNCTION__);
> +		kfree(vdev);
> +	}
> +}
> +
> +struct vgpu_driver vgpu_vfio_driver = {
> +        .name	= "vgpu-vfio",
> +        .probe	= vgpu_vfio_probe,
> +        .remove	= vgpu_vfio_remove,
> +};
> +
> +static int __init vgpu_vfio_init(void)
> +{
> +	printk(KERN_INFO "%s \n", __FUNCTION__);
> +	return vgpu_register_driver(&vgpu_vfio_driver, THIS_MODULE);
> +}
> +
> +static void __exit vgpu_vfio_exit(void)
> +{
> +	printk(KERN_INFO "%s \n", __FUNCTION__);
> +	vgpu_unregister_driver(&vgpu_vfio_driver);
> +}
> +
> +module_init(vgpu_vfio_init)
> +module_exit(vgpu_vfio_exit)
> +
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR(DRIVER_AUTHOR);
> +MODULE_DESCRIPTION(DRIVER_DESC);
Tian, Kevin May 4, 2016, 3:23 a.m. UTC | #2
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Wednesday, May 04, 2016 6:43 AM
> > +
> > +		if (gpu_dev->ops->write) {
> > +			ret = gpu_dev->ops->write(vgpu_dev,
> > +						  user_data,
> > +						  count,
> > +						  vgpu_emul_space_config,
> > +						  pos);
> > +		}
> > +
> > +		memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count);
> 
> So write is expected to user_data to allow only the writable bits to be
> changed?  What's really being saved in the vconfig here vs the vendor
> vgpu driver?  It seems like we're only using it to cache the BAR
> values, but we're not providing the BAR emulation here, which seems
> like one of the few things we could provide so it's not duplicated in
> every vendor driver.  But then we only need a few u32s to do that, not
> all of config space.

We can borrow same vconfig emulation from existing vfio-pci driver.
But doing so doesn't mean that vendor vgpu driver cannot have its
own vconfig emulation further. vGPU is not like a real device, since
there may be no physical config space implemented for each vGPU.
So in any case the vendor vGPU driver needs to create/emulate the virtualized 
config space, while the way it is created might be vendor specific. 
So it is better to keep the interface to access the raw vconfig space from
the vendor vGPU driver.

> > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf,
> > +		size_t count, loff_t *ppos, bool iswrite)
> > +{
> > +	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> > +	struct vfio_vgpu_device *vdev = device_data;
> > +
> > +	if (index >= VFIO_PCI_NUM_REGIONS)
> > +		return -EINVAL;
> > +
> > +	switch (index) {
> > +	case VFIO_PCI_CONFIG_REGION_INDEX:
> > +		return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite);
> > +
> > +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> > +		return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite);
> > +
> > +	case VFIO_PCI_ROM_REGION_INDEX:
> > +	case VFIO_PCI_VGA_REGION_INDEX:
> 
> Wait a sec, who's doing the VGA emulation?  We can't be claiming to
> support a VGA region and then fail to provide read/write access to it
> like we said it has.

On the Intel side we plan not to support the VGA region when upstreaming our
KVMGT work, which means the Intel vGPU will be exposed only as a 
secondary graphics card, so legacy VGA is not required. There is also no
VBIOS/ROM requirement. I guess we can remove the above two regions.

> > +
> > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > +{
> > +	int ret = 0;
> > +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
> > +	struct vgpu_device *vgpu_dev;
> > +	struct gpu_device *gpu_dev;
> > +	u64 virtaddr = (u64)vmf->virtual_address;
> > +	u64 offset, phyaddr;
> > +	unsigned long req_size, pgoff;
> > +	pgprot_t pg_prot;
> > +
> > +	if (!vdev && !vdev->vgpu_dev)
> > +		return -EINVAL;
> > +
> > +	vgpu_dev = vdev->vgpu_dev;
> > +	gpu_dev  = vgpu_dev->gpu_dev;
> > +
> > +	offset   = vma->vm_pgoff << PAGE_SHIFT;
> > +	phyaddr  = virtaddr - vma->vm_start + offset;
> > +	pgoff    = phyaddr >> PAGE_SHIFT;
> > +	req_size = vma->vm_end - virtaddr;
> > +	pg_prot  = vma->vm_page_prot;
> > +
> > +	if (gpu_dev->ops->validate_map_request) {
> > +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff,
> > +							 &req_size, &pg_prot);
> > +		if (ret)
> > +			return ret;
> > +
> > +		if (!req_size)
> > +			return -EINVAL;
> > +	}
> > +
> > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
> 
> So not supporting validate_map_request() means that the user can
> directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> scenario or should this callback be required?  It's not clear to me how
> the vendor driver determines what this maps to, do they compare it to
> the physical device's own BAR addresses?

I didn't quite understand either. Based on the earlier discussion, do we need
something like this, or could we achieve the purpose just by leveraging the
recent sparse mmap support?

Thanks
Kevin
Kirti Wankhede May 4, 2016, 4:25 p.m. UTC | #3
On 5/4/2016 4:13 AM, Alex Williamson wrote:
 > On Tue, 3 May 2016 00:10:40 +0530

 >>  obj-$(CONFIG_VGPU)			+= vgpu.o
 >> +obj-$(CONFIG_VGPU_VFIO)                 += vgpu_vfio.o
 >
 > This is where we should add a new Kconfig entry for VGPU_VFIO, nothing
 > in patch 1 has any vfio dependency.  Perhaps it should also depend on
 > VFIO_PCI rather than VFIO since you are getting very PCI specific below.

VGPU_VFIO depends on VFIO but is independent of VFIO_PCI. VGPU_VFIO uses 
VFIO apis defined for PCI devices and uses common #defines but that 
doesn't mean it depends on VFIO_PCI.
I'll move Kconfig entry for VGPU_VFIO here in next version of patch.

 >> +#define VFIO_PCI_OFFSET_SHIFT   40
 >> +
 >> +#define VFIO_PCI_OFFSET_TO_INDEX(off)	(off >> VFIO_PCI_OFFSET_SHIFT)
 >> +#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << 
VFIO_PCI_OFFSET_SHIFT)
 >> +#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
 >
 > Change the name of these from vfio-pci please or shift code around to
 > use them directly.  You're certainly free to redefine these, but using
 > the same name is confusing.
 >

I'll move these defines to common location.


 >> +	if (gpu_dev->ops->vgpu_bar_info)
 >> +		ret = gpu_dev->ops->vgpu_bar_info(vgpu_dev, index, bar_info);
 >
 > vgpu_bar_info is already optional, further validating that the vgpu
 > core is not PCI specific.

It is not optional if the vgpu_vfio module is to work on the device. If 
vgpu_bar_info is not provided by the vendor driver, open() would fail. 
vgpu_vfio expects a PCI device; PCI device validation also needs to be added.


 >
 > Let's not neglect ioport BARs here, IO_MASK is different.
 >

vgpu_device is a virtual device; it is not going to drive VGA signals. 
Nvidia vGPU would not support an IO BAR.


 >> +	vdev->refcnt--;
 >> +	if (!vdev->refcnt) {
 >> +		memset(&vdev->bar_info, 0, sizeof(vdev->bar_info));
 >
 > Why?

vfio_vgpu_device is allocated when the vgpu device is created by the vgpu 
core; then QEMU/VMM calls open() on that device, where vdev->bar_info is 
populated and vconfig is allocated.
In the teardown path, QEMU/VMM calls close() on the device, and 
vfio_vgpu_device is destroyed when the vgpu device is destroyed by the vgpu core.

If QEMU/VMM restarts while the vgpu device is not destroyed, 
vdev->bar_info should be cleared so that it is fetched again from the vendor 
driver. It should not keep any stale addresses.

 >> +	if (irq_type == VFIO_PCI_MSIX_IRQ_INDEX)
 >> +		return -1;
 >
 > How are we going to expand the API later for it?  Shouldn't this just
 > be a passthrough to a gpu_devices_ops.vgpu_vfio_get_irq_info callback?

The vendor driver conveys the interrupt type by defining capabilities in the 
config space. I don't think we should add a new callback for it.


 >> +		memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count);
 >
 > So write is expected to user_data to allow only the writable bits to be
 > changed?  What's really being saved in the vconfig here vs the vendor
 > vgpu driver?  It seems like we're only using it to cache the BAR
 > values, but we're not providing the BAR emulation here, which seems
 > like one of the few things we could provide so it's not duplicated in
 > every vendor driver.  But then we only need a few u32s to do that, not
 > all of config space.
 >

The vendor driver should emulate config space. It is not just BAR addresses; 
the vendor driver should also add the capabilities supported by its vGPU device.


 >> +
 >> +		if (gpu_dev->ops->write) {
 >> +			ret = gpu_dev->ops->write(vgpu_dev,
 >> +						  user_data,
 >> +						  count,
 >> +						  vgpu_emul_space_mmio,
 >> +						  pos);
 >> +		}
 >
 > What's the usefulness in a vendor driver that doesn't provide
 > read/write?

The checks are to avoid a NULL pointer dereference if these callbacks are not 
provided. Whether it will work or not depends completely on the vendor 
driver stack in the host and guest.

 >> +	case VFIO_PCI_ROM_REGION_INDEX:
 >> +	case VFIO_PCI_VGA_REGION_INDEX:
 >
 > Wait a sec, who's doing the VGA emulation?  We can't be claiming to
 > support a VGA region and then fail to provide read/write access to it
 > like we said it has.
 >

Nvidia vGPU doesn't support IO BAR and ROM BAR. But I can move these 
cases to
case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:

So that if the vendor driver supports IO BAR or ROM BAR emulation, it would 
be handled the same as the other BARs.


 >> +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
 >
 > So not supporting validate_map_request() means that the user can
 > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
 > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
 > scenario or should this callback be required?

Yes, if restrictions are imposed such that only one vGPU device can be 
created on one physical GPU, i.e. 1:1 vGPU to host GPU.

 >  It's not clear to me how
 > the vendor driver determines what this maps to, do they compare it to
 > the physical device's own BAR addresses?
 >

Yes.

Thanks,
Kirti
Alex Williamson May 4, 2016, 5:06 p.m. UTC | #4
On Wed, 4 May 2016 03:23:13 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > Sent: Wednesday, May 04, 2016 6:43 AM  
> > > +
> > > +		if (gpu_dev->ops->write) {
> > > +			ret = gpu_dev->ops->write(vgpu_dev,
> > > +						  user_data,
> > > +						  count,
> > > +						  vgpu_emul_space_config,
> > > +						  pos);
> > > +		}
> > > +
> > > +		memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count);  
> > 
> > So write is expected to user_data to allow only the writable bits to be
> > changed?  What's really being saved in the vconfig here vs the vendor
> > vgpu driver?  It seems like we're only using it to cache the BAR
> > values, but we're not providing the BAR emulation here, which seems
> > like one of the few things we could provide so it's not duplicated in
> > every vendor driver.  But then we only need a few u32s to do that, not
> > all of config space.  
> 
> We can borrow same vconfig emulation from existing vfio-pci driver.
> But doing so doesn't mean that vendor vgpu driver cannot have its
> own vconfig emulation further. vGPU is not like a real device, since
> there may be no physical config space implemented for each vGPU.
> So anyway vendor vGPU driver needs to create/emulate the virtualized 
> config space while the way how is created might be vendor specific. 
> So better to keep the interface to access raw vconfig space from
> vendor vGPU driver.

I'm hoping config space will be very simple for a vgpu, so I don't know
that it makes sense to add that complexity early on.  Neo/Kirti, what
capabilities do you expect to provide?  Who provides the MSI
capability?  Is a PCIe capability provided?  Others?
 
> > > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf,
> > > +		size_t count, loff_t *ppos, bool iswrite)
> > > +{
> > > +	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> > > +	struct vfio_vgpu_device *vdev = device_data;
> > > +
> > > +	if (index >= VFIO_PCI_NUM_REGIONS)
> > > +		return -EINVAL;
> > > +
> > > +	switch (index) {
> > > +	case VFIO_PCI_CONFIG_REGION_INDEX:
> > > +		return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite);
> > > +
> > > +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> > > +		return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite);
> > > +
> > > +	case VFIO_PCI_ROM_REGION_INDEX:
> > > +	case VFIO_PCI_VGA_REGION_INDEX:  
> > 
> > Wait a sec, who's doing the VGA emulation?  We can't be claiming to
> > support a VGA region and then fail to provide read/write access to it
> > like we said it has.  
> 
> For Intel side we plan to not support VGA region when upstreaming our
> KVMGT work, which means Intel vGPU will be exposed only as a 
> secondary graphics card then so legacy VGA is not required. Also no
> VBIOS/ROM requirement. Guess we can remove above two regions.

So this needs to be optional based on what the mediation driver
provides.  It seems like we're just making passthroughs for the vendor
mediation driver to speak vfio.

> > > +
> > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > +{
> > > +	int ret = 0;
> > > +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
> > > +	struct vgpu_device *vgpu_dev;
> > > +	struct gpu_device *gpu_dev;
> > > +	u64 virtaddr = (u64)vmf->virtual_address;
> > > +	u64 offset, phyaddr;
> > > +	unsigned long req_size, pgoff;
> > > +	pgprot_t pg_prot;
> > > +
> > > +	if (!vdev && !vdev->vgpu_dev)
> > > +		return -EINVAL;
> > > +
> > > +	vgpu_dev = vdev->vgpu_dev;
> > > +	gpu_dev  = vgpu_dev->gpu_dev;
> > > +
> > > +	offset   = vma->vm_pgoff << PAGE_SHIFT;
> > > +	phyaddr  = virtaddr - vma->vm_start + offset;
> > > +	pgoff    = phyaddr >> PAGE_SHIFT;
> > > +	req_size = vma->vm_end - virtaddr;
> > > +	pg_prot  = vma->vm_page_prot;
> > > +
> > > +	if (gpu_dev->ops->validate_map_request) {
> > > +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff,
> > > +							 &req_size, &pg_prot);
> > > +		if (ret)
> > > +			return ret;
> > > +
> > > +		if (!req_size)
> > > +			return -EINVAL;
> > > +	}
> > > +
> > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);  
> > 
> > So not supporting validate_map_request() means that the user can
> > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > scenario or should this callback be required?  It's not clear to me how
> > the vendor driver determines what this maps to, do they compare it to
> > the physical device's own BAR addresses?  
> 
> I didn't quite understand too. Based on earlier discussion, do we need
> something like this, or could achieve the purpose just by leveraging
> recent sparse mmap support?

The reason for faulting in the mmio space, if I recall correctly, is to
enable an ordering where the user driver (QEMU) can mmap regions of the
device prior to resources being allocated on the host GPU to handle
them.  Sparse mmap only partially handles that, it's not dynamic.  With
this faulting mechanism, the host GPU doesn't need to commit resources
until the mmap is actually accessed.  Thanks,

Alex
Neo Jia May 4, 2016, 9:14 p.m. UTC | #5
On Wed, May 04, 2016 at 11:06:19AM -0600, Alex Williamson wrote:
> On Wed, 4 May 2016 03:23:13 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson [mailto:alex.williamson@redhat.com]
> > > Sent: Wednesday, May 04, 2016 6:43 AM  
> > > > +
> > > > +		if (gpu_dev->ops->write) {
> > > > +			ret = gpu_dev->ops->write(vgpu_dev,
> > > > +						  user_data,
> > > > +						  count,
> > > > +						  vgpu_emul_space_config,
> > > > +						  pos);
> > > > +		}
> > > > +
> > > > +		memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count);  
> > > 
> > > So write is expected to user_data to allow only the writable bits to be
> > > changed?  What's really being saved in the vconfig here vs the vendor
> > > vgpu driver?  It seems like we're only using it to cache the BAR
> > > values, but we're not providing the BAR emulation here, which seems
> > > like one of the few things we could provide so it's not duplicated in
> > > every vendor driver.  But then we only need a few u32s to do that, not
> > > all of config space.  
> > 
> > We can borrow same vconfig emulation from existing vfio-pci driver.
> > But doing so doesn't mean that vendor vgpu driver cannot have its
> > own vconfig emulation further. vGPU is not like a real device, since
> > there may be no physical config space implemented for each vGPU.
> > So anyway vendor vGPU driver needs to create/emulate the virtualized 
> > config space while the way how is created might be vendor specific. 
> > So better to keep the interface to access raw vconfig space from
> > vendor vGPU driver.
> 
> I'm hoping config space will be very simple for a vgpu, so I don't know
> that it makes sense to add that complexity early on.  Neo/Kirti, what
> capabilities do you expect to provide?  Who provides the MSI
> capability?  Is a PCIe capability provided?  Others?

Currently only standard PCI caps.

MSI cap is emulated by the vendor drivers via the above interface.

No PCIe caps so far.

>  
> > > > +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf,
> > > > +		size_t count, loff_t *ppos, bool iswrite)
> > > > +{
> > > > +	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
> > > > +	struct vfio_vgpu_device *vdev = device_data;
> > > > +
> > > > +	if (index >= VFIO_PCI_NUM_REGIONS)
> > > > +		return -EINVAL;
> > > > +
> > > > +	switch (index) {
> > > > +	case VFIO_PCI_CONFIG_REGION_INDEX:
> > > > +		return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite);
> > > > +
> > > > +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
> > > > +		return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite);
> > > > +
> > > > +	case VFIO_PCI_ROM_REGION_INDEX:
> > > > +	case VFIO_PCI_VGA_REGION_INDEX:  
> > > 
> > > Wait a sec, who's doing the VGA emulation?  We can't be claiming to
> > > support a VGA region and then fail to provide read/write access to it
> > > like we said it has.  
> > 
> > For Intel side we plan to not support VGA region when upstreaming our
> > KVMGT work, which means Intel vGPU will be exposed only as a 
> > secondary graphics card then so legacy VGA is not required. Also no
> > VBIOS/ROM requirement. Guess we can remove above two regions.
> 
> So this needs to be optional based on what the mediation driver
> provides.  It seems like we're just making passthroughs for the vendor
> mediation driver to speak vfio.
> 
> > > > +
> > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> > > > +{
> > > > +	int ret = 0;
> > > > +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
> > > > +	struct vgpu_device *vgpu_dev;
> > > > +	struct gpu_device *gpu_dev;
> > > > +	u64 virtaddr = (u64)vmf->virtual_address;
> > > > +	u64 offset, phyaddr;
> > > > +	unsigned long req_size, pgoff;
> > > > +	pgprot_t pg_prot;
> > > > +
> > > > +	if (!vdev && !vdev->vgpu_dev)
> > > > +		return -EINVAL;
> > > > +
> > > > +	vgpu_dev = vdev->vgpu_dev;
> > > > +	gpu_dev  = vgpu_dev->gpu_dev;
> > > > +
> > > > +	offset   = vma->vm_pgoff << PAGE_SHIFT;
> > > > +	phyaddr  = virtaddr - vma->vm_start + offset;
> > > > +	pgoff    = phyaddr >> PAGE_SHIFT;
> > > > +	req_size = vma->vm_end - virtaddr;
> > > > +	pg_prot  = vma->vm_page_prot;
> > > > +
> > > > +	if (gpu_dev->ops->validate_map_request) {
> > > > +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff,
> > > > +							 &req_size, &pg_prot);
> > > > +		if (ret)
> > > > +			return ret;
> > > > +
> > > > +		if (!req_size)
> > > > +			return -EINVAL;
> > > > +	}
> > > > +
> > > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);  
> > > 
> > > So not supporting validate_map_request() means that the user can
> > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > > scenario or should this callback be required?  It's not clear to me how
> > > the vendor driver determines what this maps to, do they compare it to
> > > the physical device's own BAR addresses?  
> > 
> > I didn't quite understand too. Based on earlier discussion, do we need
> > something like this, or could achieve the purpose just by leveraging
> > recent sparse mmap support?
> 
> The reason for faulting in the mmio space, if I recall correctly, is to
> enable an ordering where the user driver (QEMU) can mmap regions of the
> device prior to resources being allocated on the host GPU to handle
> them.  Sparse mmap only partially handles that, it's not dynamic.  With
> this faulting mechanism, the host GPU doesn't need to commit resources
> until the mmap is actually accessed.  Thanks,

Correct.

Thanks,
Neo

> 
> Alex
Kirti Wankhede May 5, 2016, 4:42 a.m. UTC | #6
On 5/5/2016 2:44 AM, Neo Jia wrote:
> On Wed, May 04, 2016 at 11:06:19AM -0600, Alex Williamson wrote:
>> On Wed, 4 May 2016 03:23:13 +0000
>> "Tian, Kevin" <kevin.tian@intel.com> wrote:
>>
>>>> From: Alex Williamson [mailto:alex.williamson@redhat.com]
>>>> Sent: Wednesday, May 04, 2016 6:43 AM
>>>>> +
>>>>> +		if (gpu_dev->ops->write) {
>>>>> +			ret = gpu_dev->ops->write(vgpu_dev,
>>>>> +						  user_data,
>>>>> +						  count,
>>>>> +						  vgpu_emul_space_config,
>>>>> +						  pos);
>>>>> +		}
>>>>> +
>>>>> +		memcpy((void *)(vdev->vconfig + pos), (void *)user_data, count);
>>>>
>>>> So write is expected to user_data to allow only the writable bits to be
>>>> changed?  What's really being saved in the vconfig here vs the vendor
>>>> vgpu driver?  It seems like we're only using it to cache the BAR
>>>> values, but we're not providing the BAR emulation here, which seems
>>>> like one of the few things we could provide so it's not duplicated in
>>>> every vendor driver.  But then we only need a few u32s to do that, not
>>>> all of config space.
>>>
>>> We can borrow same vconfig emulation from existing vfio-pci driver.
>>> But doing so doesn't mean that vendor vgpu driver cannot have its
>>> own vconfig emulation further. vGPU is not like a real device, since
>>> there may be no physical config space implemented for each vGPU.
>>> So anyway vendor vGPU driver needs to create/emulate the virtualized
>>> config space while the way how is created might be vendor specific.
>>> So better to keep the interface to access raw vconfig space from
>>> vendor vGPU driver.
>>
>> I'm hoping config space will be very simple for a vgpu, so I don't know
>> that it makes sense to add that complexity early on.  Neo/Kirti, what
>> capabilities do you expect to provide?  Who provides the MSI
>> capability?  Is a PCIe capability provided?  Others?
>

 From the VGPU_VFIO point of view, VGPU_VFIO would not provide or modify any 
capabilities. The vendor vGPU driver should provide the config space. The 
vendor driver can then provide PCI capabilities or PCIe capabilities; it 
might also include vendor-specific information. The VGPU_VFIO driver would 
not intercept that information.

> Currently only standard PCI caps.
>
> MSI cap is emulated by the vendor drivers via the above interface.
>
> No PCIe caps so far.
>

The Nvidia vGPU device is a standard PCI device. We tested the standard PCI caps.

Thanks,
Kirti.

>>
>>>>> +static ssize_t vgpu_dev_rw(void *device_data, char __user *buf,
>>>>> +		size_t count, loff_t *ppos, bool iswrite)
>>>>> +{
>>>>> +	unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
>>>>> +	struct vfio_vgpu_device *vdev = device_data;
>>>>> +
>>>>> +	if (index >= VFIO_PCI_NUM_REGIONS)
>>>>> +		return -EINVAL;
>>>>> +
>>>>> +	switch (index) {
>>>>> +	case VFIO_PCI_CONFIG_REGION_INDEX:
>>>>> +		return vgpu_dev_config_rw(vdev, buf, count, ppos, iswrite);
>>>>> +
>>>>> +	case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
>>>>> +		return vgpu_dev_bar_rw(vdev, buf, count, ppos, iswrite);
>>>>> +
>>>>> +	case VFIO_PCI_ROM_REGION_INDEX:
>>>>> +	case VFIO_PCI_VGA_REGION_INDEX:
>>>>
>>>> Wait a sec, who's doing the VGA emulation?  We can't be claiming to
>>>> support a VGA region and then fail to provide read/write access to it
>>>> like we said it has.
>>>
>>> For Intel side we plan to not support VGA region when upstreaming our
>>> KVMGT work, which means Intel vGPU will be exposed only as a
>>> secondary graphics card then so legacy VGA is not required. Also no
>>> VBIOS/ROM requirement. Guess we can remove above two regions.
>>
>> So this needs to be optional based on what the mediation driver
>> provides.  It seems like we're just making passthroughs for the vendor
>> mediation driver to speak vfio.
>>
>>>>> +
>>>>> +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>>>>> +{
>>>>> +	int ret = 0;
>>>>> +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
>>>>> +	struct vgpu_device *vgpu_dev;
>>>>> +	struct gpu_device *gpu_dev;
>>>>> +	u64 virtaddr = (u64)vmf->virtual_address;
>>>>> +	u64 offset, phyaddr;
>>>>> +	unsigned long req_size, pgoff;
>>>>> +	pgprot_t pg_prot;
>>>>> +
>>>>> +	if (!vdev && !vdev->vgpu_dev)
>>>>> +		return -EINVAL;
>>>>> +
>>>>> +	vgpu_dev = vdev->vgpu_dev;
>>>>> +	gpu_dev  = vgpu_dev->gpu_dev;
>>>>> +
>>>>> +	offset   = vma->vm_pgoff << PAGE_SHIFT;
>>>>> +	phyaddr  = virtaddr - vma->vm_start + offset;
>>>>> +	pgoff    = phyaddr >> PAGE_SHIFT;
>>>>> +	req_size = vma->vm_end - virtaddr;
>>>>> +	pg_prot  = vma->vm_page_prot;
>>>>> +
>>>>> +	if (gpu_dev->ops->validate_map_request) {
>>>>> +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr, &pgoff,
>>>>> +							 &req_size, &pg_prot);
>>>>> +		if (ret)
>>>>> +			return ret;
>>>>> +
>>>>> +		if (!req_size)
>>>>> +			return -EINVAL;
>>>>> +	}
>>>>> +
>>>>> +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
>>>>
>>>> So not supporting validate_map_request() means that the user can
>>>> directly mmap BARs of the host GPU and as shown below, we assume a 1:1
>>>> mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
>>>> scenario or should this callback be required?  It's not clear to me how
>>>> the vendor driver determines what this maps to, do they compare it to
>>>> the physical device's own BAR addresses?
>>>
>>> I didn't quite understand too. Based on earlier discussion, do we need
>>> something like this, or could achieve the purpose just by leveraging
>>> recent sparse mmap support?
>>
>> The reason for faulting in the mmio space, if I recall correctly, is to
>> enable an ordering where the user driver (QEMU) can mmap regions of the
>> device prior to resources being allocated on the host GPU to handle
>> them.  Sparse mmap only partially handles that, it's not dynamic.  With
>> this faulting mechanism, the host GPU doesn't need to commit resources
>> until the mmap is actually accessed.  Thanks,
>
> Correct.
>
> Thanks,
> Neo
>
>>
>> Alex
Tian, Kevin May 5, 2016, 9:24 a.m. UTC | #7
> From: Alex Williamson
> Sent: Thursday, May 05, 2016 1:06 AM
> > > > +
> > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault
> *vmf)
> > > > +{
> > > > +	int ret = 0;
> > > > +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
> > > > +	struct vgpu_device *vgpu_dev;
> > > > +	struct gpu_device *gpu_dev;
> > > > +	u64 virtaddr = (u64)vmf->virtual_address;
> > > > +	u64 offset, phyaddr;
> > > > +	unsigned long req_size, pgoff;
> > > > +	pgprot_t pg_prot;
> > > > +
> > > > +	if (!vdev && !vdev->vgpu_dev)
> > > > +		return -EINVAL;
> > > > +
> > > > +	vgpu_dev = vdev->vgpu_dev;
> > > > +	gpu_dev  = vgpu_dev->gpu_dev;
> > > > +
> > > > +	offset   = vma->vm_pgoff << PAGE_SHIFT;
> > > > +	phyaddr  = virtaddr - vma->vm_start + offset;
> > > > +	pgoff    = phyaddr >> PAGE_SHIFT;
> > > > +	req_size = vma->vm_end - virtaddr;
> > > > +	pg_prot  = vma->vm_page_prot;
> > > > +
> > > > +	if (gpu_dev->ops->validate_map_request) {
> > > > +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr,
> &pgoff,
> > > > +							 &req_size, &pg_prot);
> > > > +		if (ret)
> > > > +			return ret;
> > > > +
> > > > +		if (!req_size)
> > > > +			return -EINVAL;
> > > > +	}
> > > > +
> > > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
> > >
> > > So not supporting validate_map_request() means that the user can
> > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > > scenario or should this callback be required?  It's not clear to me how
> > > the vendor driver determines what this maps to, do they compare it to
> > > the physical device's own BAR addresses?
> >
> > I didn't quite understand too. Based on earlier discussion, do we need
> > something like this, or could achieve the purpose just by leveraging
> > recent sparse mmap support?
> 
> The reason for faulting in the mmio space, if I recall correctly, is to
> enable an ordering where the user driver (QEMU) can mmap regions of the
> device prior to resources being allocated on the host GPU to handle
> them.  Sparse mmap only partially handles that, it's not dynamic.  With
> this faulting mechanism, the host GPU doesn't need to commit resources
> until the mmap is actually accessed.  Thanks,
> 
> Alex

Neo/Kirti, is there a specific example of how the above works exactly? I can
see the difference from sparse mmap based on Alex's explanation, but I still
cannot map the first sentence to a real scenario clearly. Our side doesn't
currently use such a faulting-based method, so I'd like to understand it
clearly and then see whether there is value in doing the same for Intel GPUs.

Thanks
Kevin
Neo Jia May 5, 2016, 8:27 p.m. UTC | #8
On Thu, May 05, 2016 at 09:24:26AM +0000, Tian, Kevin wrote:
> > From: Alex Williamson
> > Sent: Thursday, May 05, 2016 1:06 AM
> > > > > +
> > > > > +static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault
> > *vmf)
> > > > > +{
> > > > > +	int ret = 0;
> > > > > +	struct vfio_vgpu_device *vdev = vma->vm_private_data;
> > > > > +	struct vgpu_device *vgpu_dev;
> > > > > +	struct gpu_device *gpu_dev;
> > > > > +	u64 virtaddr = (u64)vmf->virtual_address;
> > > > > +	u64 offset, phyaddr;
> > > > > +	unsigned long req_size, pgoff;
> > > > > +	pgprot_t pg_prot;
> > > > > +
> > > > > +	if (!vdev && !vdev->vgpu_dev)
> > > > > +		return -EINVAL;
> > > > > +
> > > > > +	vgpu_dev = vdev->vgpu_dev;
> > > > > +	gpu_dev  = vgpu_dev->gpu_dev;
> > > > > +
> > > > > +	offset   = vma->vm_pgoff << PAGE_SHIFT;
> > > > > +	phyaddr  = virtaddr - vma->vm_start + offset;
> > > > > +	pgoff    = phyaddr >> PAGE_SHIFT;
> > > > > +	req_size = vma->vm_end - virtaddr;
> > > > > +	pg_prot  = vma->vm_page_prot;
> > > > > +
> > > > > +	if (gpu_dev->ops->validate_map_request) {
> > > > > +		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr,
> > &pgoff,
> > > > > +							 &req_size, &pg_prot);
> > > > > +		if (ret)
> > > > > +			return ret;
> > > > > +
> > > > > +		if (!req_size)
> > > > > +			return -EINVAL;
> > > > > +	}
> > > > > +
> > > > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
> > > >
> > > > So not supporting validate_map_request() means that the user can
> > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > > > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > > > scenario or should this callback be required?  It's not clear to me how
> > > > the vendor driver determines what this maps to, do they compare it to
> > > > the physical device's own BAR addresses?
> > >
> > > I didn't quite understand too. Based on earlier discussion, do we need
> > > something like this, or could achieve the purpose just by leveraging
> > > recent sparse mmap support?
> > 
> > The reason for faulting in the mmio space, if I recall correctly, is to
> > enable an ordering where the user driver (QEMU) can mmap regions of the
> > device prior to resources being allocated on the host GPU to handle
> > them.  Sparse mmap only partially handles that, it's not dynamic.  With
> > this faulting mechanism, the host GPU doesn't need to commit resources
> > until the mmap is actually accessed.  Thanks,
> > 
> > Alex
> 
> Neo/Kirti, any specific example how above exactly works? I can see
> difference from sparse mmap based on Alex's explanation, but still
> cannot map the 1st sentence to a real scenario clearly. Now our side
> doesn't use such faulting-based method. So I'd like to understand it
> clearly and then see any value to do same thing for Intel GPU.

Hi Kevin,

The short answer is CPU access to GPU resources via MMIO region.

Thanks,
Neo

> 
> Thanks
> Kevin
Tian, Kevin May 11, 2016, 6:45 a.m. UTC | #9
> From: Alex Williamson
> Sent: Thursday, May 05, 2016 1:06 AM
> > > > +
> > > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
> > >
> > > So not supporting validate_map_request() means that the user can
> > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > > scenario or should this callback be required?  It's not clear to me how
> > > the vendor driver determines what this maps to, do they compare it to
> > > the physical device's own BAR addresses?
> >
> > I didn't quite understand too. Based on earlier discussion, do we need
> > something like this, or could achieve the purpose just by leveraging
> > recent sparse mmap support?
> 
> The reason for faulting in the mmio space, if I recall correctly, is to
> enable an ordering where the user driver (QEMU) can mmap regions of the
> device prior to resources being allocated on the host GPU to handle
> them.  Sparse mmap only partially handles that, it's not dynamic.  With
> this faulting mechanism, the host GPU doesn't need to commit resources
> until the mmap is actually accessed.  Thanks,
> 
> Alex

Just to double-confirm: I assume this faulting mechanism can work with
sparse mmap, right? Regardless of whether it's a full or partial region,
this faulting mechanism would commit resources only when the accessed
page has the MMAP flag set...

Thanks
Kevin
Alex Williamson May 11, 2016, 8:10 p.m. UTC | #10
On Wed, 11 May 2016 06:45:41 +0000
"Tian, Kevin" <kevin.tian@intel.com> wrote:

> > From: Alex Williamson
> > Sent: Thursday, May 05, 2016 1:06 AM  
> > > > > +
> > > > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);  
> > > >
> > > > So not supporting validate_map_request() means that the user can
> > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > > > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > > > scenario or should this callback be required?  It's not clear to me how
> > > > the vendor driver determines what this maps to, do they compare it to
> > > > the physical device's own BAR addresses?  
> > >
> > > I didn't quite understand too. Based on earlier discussion, do we need
> > > something like this, or could achieve the purpose just by leveraging
> > > recent sparse mmap support?  
> > 
> > The reason for faulting in the mmio space, if I recall correctly, is to
> > enable an ordering where the user driver (QEMU) can mmap regions of the
> > device prior to resources being allocated on the host GPU to handle
> > them.  Sparse mmap only partially handles that, it's not dynamic.  With
> > this faulting mechanism, the host GPU doesn't need to commit resources
> > until the mmap is actually accessed.  Thanks,
> > 
> > Alex  
> 
> Just double confirm. I assume this faulting mechanism can work with
> sparse mmap, right? Regardless of whether it's a full or partial region,
> this faulting mechanism would commit resource only when accessed
> page has MMAP flag set...

Yes, the vfio sparse mmap just solves the problem that a vfio region
maps to an entire device resource, for example in the case of vfio-pci,
a PCI BAR.  It turns out that specifying mmap on a whole region doesn't
give us the granularity we need.  Sparse mmap gives us a generic way to
tell userspace which areas within a region support mmap and which
should use read/write access through the vfio device file descriptor.
The latter allows us to protect specific regions or provide further
emulation/virtualization for that sub-area.  How the mmap vma is
populated for the portions that do support mmap is an orthogonal
issue.  Thanks,

Alex
Tian, Kevin May 12, 2016, 12:59 a.m. UTC | #11
> From: Alex Williamson [mailto:alex.williamson@redhat.com]
> Sent: Thursday, May 12, 2016 4:11 AM
> On Wed, 11 May 2016 06:45:41 +0000
> "Tian, Kevin" <kevin.tian@intel.com> wrote:
> 
> > > From: Alex Williamson
> > > Sent: Thursday, May 05, 2016 1:06 AM
> > > > > > +
> > > > > > +	ret = remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot);
> > > > >
> > > > > So not supporting validate_map_request() means that the user can
> > > > > directly mmap BARs of the host GPU and as shown below, we assume a 1:1
> > > > > mapping of vGPU BAR to host GPU BAR.  Is that ever valid in a vGPU
> > > > > scenario or should this callback be required?  It's not clear to me how
> > > > > the vendor driver determines what this maps to, do they compare it to
> > > > > the physical device's own BAR addresses?
> > > >
> > > > I didn't quite understand too. Based on earlier discussion, do we need
> > > > something like this, or could achieve the purpose just by leveraging
> > > > recent sparse mmap support?
> > >
> > > The reason for faulting in the mmio space, if I recall correctly, is to
> > > enable an ordering where the user driver (QEMU) can mmap regions of the
> > > device prior to resources being allocated on the host GPU to handle
> > > them.  Sparse mmap only partially handles that, it's not dynamic.  With
> > > this faulting mechanism, the host GPU doesn't need to commit resources
> > > until the mmap is actually accessed.  Thanks,
> > >
> > > Alex
> >
> > Just double confirm. I assume this faulting mechanism can work with
> > sparse mmap, right? Regardless of whether it's a full or partial region,
> > this faulting mechanism would commit resource only when accessed
> > page has MMAP flag set...
> 
> Yes, the vfio sparse mmap just solves the problem that a vfio region
> maps to an entire device resource, for example in the case of vfio-pci,
> a PCI BAR.  It turns out that specifying mmap on a whole region doesn't
> give us the granularity we need.  Sparse mmap gives us a generic way to
> tell userspace which areas within a region support mmap and which
> should use read/write access through the vfio device file descriptor.
> The latter allows us to protect specific regions or provide further
> emulation/virtualization for that sub-area.  How the mmap vma is
> populated for the portions that do support mmap is an orthogonal
> issue.  Thanks,
> 

Exactly! Thanks for confirmation.

Kevin
diff mbox

Patch

diff --git a/drivers/vgpu/Makefile b/drivers/vgpu/Makefile
index f5be980..a0a2655 100644
--- a/drivers/vgpu/Makefile
+++ b/drivers/vgpu/Makefile
@@ -2,3 +2,4 @@ 
 vgpu-y := vgpu-core.o vgpu-sysfs.o vgpu-driver.o
 
 obj-$(CONFIG_VGPU)			+= vgpu.o
+obj-$(CONFIG_VGPU_VFIO)                 += vgpu_vfio.o
diff --git a/drivers/vgpu/vgpu_vfio.c b/drivers/vgpu/vgpu_vfio.c
new file mode 100644
index 0000000..460a4dc
--- /dev/null
+++ b/drivers/vgpu/vgpu_vfio.c
@@ -0,0 +1,671 @@ 
+/*
+ * VGPU VFIO device
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
+ *     Author: Neo Jia <cjia@nvidia.com>
+ *	       Kirti Wankhede <kwankhede@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/uuid.h>
+#include <linux/vfio.h>
+#include <linux/iommu.h>
+#include <linux/vgpu.h>
+
+#include "vgpu_private.h"
+
+#define DRIVER_VERSION  "0.1"
+#define DRIVER_AUTHOR   "NVIDIA Corporation"
+#define DRIVER_DESC     "VGPU VFIO Driver"
+
+#define VFIO_PCI_OFFSET_SHIFT   40
+
+#define VFIO_PCI_OFFSET_TO_INDEX(off)	(off >> VFIO_PCI_OFFSET_SHIFT)
+#define VFIO_PCI_INDEX_TO_OFFSET(index)	((u64)(index) << VFIO_PCI_OFFSET_SHIFT)
+#define VFIO_PCI_OFFSET_MASK	(((u64)(1) << VFIO_PCI_OFFSET_SHIFT) - 1)
+
+/*
+ * Per-vGPU VFIO state.  refcnt, bar_info and vconfig are created on
+ * first open and torn down on last close, under vfio_vgpu_lock.
+ */
+struct vfio_vgpu_device {
+	struct iommu_group *group;
+	struct vgpu_device *vgpu_dev;
+	int		    refcnt;	/* open count of the vfio device fd */
+	struct pci_bar_info bar_info[VFIO_PCI_NUM_REGIONS];
+	u8		    *vconfig;	/* shadow PCI config space */
+};
+
+static DEFINE_MUTEX(vfio_vgpu_lock);
+
+/*
+ * Query the GPU driver for the virtual layout of one region.
+ *
+ * Returns 0 on success, -ENODEV when the vendor driver does not
+ * implement the vgpu_bar_info callback, or the callback's error code.
+ */
+static int get_virtual_bar_info(struct vgpu_device *vgpu_dev,
+				struct pci_bar_info *bar_info,
+				int index)
+{
+	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
+
+	/* the old "ret = -1" would surface as a misleading -EPERM */
+	if (!gpu_dev->ops->vgpu_bar_info)
+		return -ENODEV;
+
+	return gpu_dev->ops->vgpu_bar_info(vgpu_dev, index, bar_info);
+}
+
+/*
+ * Recompute each BAR's guest-programmed base address from the shadow
+ * PCI config space (vdev->vconfig).  Always returns 0.
+ *
+ * NOTE(review): pos is not advanced for BARs whose reported size is 0.
+ * That is correct for the high half of a 64-bit BAR (already consumed
+ * below), but would misalign config offsets if a disabled 32-bit BAR
+ * precedes an enabled one — TODO confirm the bar_info contract.
+ */
+static int vdev_read_base(struct vfio_vgpu_device *vdev)
+{
+	int index, pos;
+	u32 start_lo, start_hi;
+	u32 mem_type;
+
+	pos = PCI_BASE_ADDRESS_0;
+
+	for (index = 0; index <= VFIO_PCI_BAR5_REGION_INDEX; index++) {
+
+		/* skip BARs the GPU driver reported as absent */
+		if (!vdev->bar_info[index].size)
+			continue;
+
+		start_lo = (*(u32 *)(vdev->vconfig + pos)) &
+					PCI_BASE_ADDRESS_MEM_MASK;
+		mem_type = (*(u32 *)(vdev->vconfig + pos)) &
+					PCI_BASE_ADDRESS_MEM_TYPE_MASK;
+
+		switch (mem_type) {
+		case PCI_BASE_ADDRESS_MEM_TYPE_64:
+			/* 64-bit BAR: high dword lives in the next slot */
+			start_hi = (*(u32 *)(vdev->vconfig + pos + 4));
+			pos += 4;
+			break;
+		case PCI_BASE_ADDRESS_MEM_TYPE_32:
+		case PCI_BASE_ADDRESS_MEM_TYPE_1M:
+			/* 1M mem BAR treated as 32-bit BAR */
+		default:
+			/* mem unknown type treated as 32-bit BAR */
+			start_hi = 0;
+			break;
+		}
+		pos += 4;
+		vdev->bar_info[index].start = ((u64)start_hi << 32) | start_lo;
+	}
+	return 0;
+}
+
+/*
+ * Open handler for the vfio device fd.
+ *
+ * On first open, fetch the region layout from the GPU driver and
+ * allocate the shadow PCI config space.  Later opens only bump the
+ * refcount.  Returns 0 or -errno.
+ */
+static int vgpu_dev_open(void *device_data)
+{
+	int ret = 0;
+	struct vfio_vgpu_device *vdev = device_data;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&vfio_vgpu_lock);
+
+	if (!vdev->refcnt) {
+		u8 *vconfig;
+		int vconfig_size, index;
+
+		for (index = 0; index < VFIO_PCI_NUM_REGIONS; index++) {
+			ret = get_virtual_bar_info(vdev->vgpu_dev,
+						   &vdev->bar_info[index],
+						   index);
+			if (ret)
+				goto open_error;
+		}
+
+		vconfig_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size;
+		if (!vconfig_size) {
+			/*
+			 * Previously fell through with ret == 0, reporting
+			 * success without allocating vconfig or taking the
+			 * refcount; later accesses would deref NULL.
+			 */
+			ret = -EINVAL;
+			goto open_error;
+		}
+
+		vconfig = kzalloc(vconfig_size, GFP_KERNEL);
+		if (!vconfig) {
+			ret = -ENOMEM;
+			goto open_error;
+		}
+
+		vdev->vconfig = vconfig;
+	}
+
+	vdev->refcnt++;
+
+open_error:
+	mutex_unlock(&vfio_vgpu_lock);
+
+	if (ret)
+		module_put(THIS_MODULE);
+
+	return ret;
+}
+
+/*
+ * Release handler for the vfio device fd.  On last close, drop the
+ * cached region layout and the shadow config space.
+ */
+static void vgpu_dev_close(void *device_data)
+{
+	struct vfio_vgpu_device *vdev = device_data;
+
+	mutex_lock(&vfio_vgpu_lock);
+
+	vdev->refcnt--;
+	if (!vdev->refcnt) {
+		memset(&vdev->bar_info, 0, sizeof(vdev->bar_info));
+		/* kfree(NULL) is a no-op; the "if (vconfig)" guard was redundant */
+		kfree(vdev->vconfig);
+		/* clear the pointer to prevent a stale deref / double free */
+		vdev->vconfig = NULL;
+	}
+
+	mutex_unlock(&vfio_vgpu_lock);
+
+	module_put(THIS_MODULE);
+}
+
+/* Number of interrupts behind the given irq index; MSI-X unsupported. */
+static int vgpu_get_irq_count(struct vfio_vgpu_device *vdev, int irq_type)
+{
+	return (irq_type == VFIO_PCI_MSIX_IRQ_INDEX) ? -1 : 1;
+}
+
+/*
+ * Main VFIO ioctl dispatcher for a vGPU device: reports device, region
+ * and irq information, and forwards interrupt setup to the GPU driver.
+ */
+static long vgpu_dev_unlocked_ioctl(void *device_data,
+		unsigned int cmd, unsigned long arg)
+{
+	struct vfio_vgpu_device *vdev = device_data;
+	unsigned long minsz;
+
+	switch (cmd) {
+	case VFIO_DEVICE_GET_INFO:
+	{
+		struct vfio_device_info info;
+
+		printk(KERN_INFO "%s VFIO_DEVICE_GET_INFO cmd index\n", __func__);
+		minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		info.flags = VFIO_DEVICE_FLAGS_PCI;
+		info.num_regions = VFIO_PCI_NUM_REGIONS;
+		info.num_irqs = VFIO_PCI_NUM_IRQS;
+
+		/* copy_to_user() returns bytes NOT copied, not an errno */
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+	}
+	case VFIO_DEVICE_GET_REGION_INFO:
+	{
+		struct vfio_region_info info;
+
+		minsz = offsetofend(struct vfio_region_info, offset);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz)
+			return -EINVAL;
+
+		printk(KERN_INFO "%s VFIO_DEVICE_GET_REGION_INFO cmd for region_index %d\n",
+		       __func__, info.index);
+
+		switch (info.index) {
+		case VFIO_PCI_CONFIG_REGION_INDEX:
+		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = vdev->bar_info[info.index].size;
+			if (!info.size) {
+				info.flags = 0;
+				break;
+			}
+			info.flags = vdev->bar_info[info.index].flags;
+			break;
+		case VFIO_PCI_VGA_REGION_INDEX:
+			info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index);
+			info.size = 0xc0000;	/* legacy VGA window */
+			info.flags = VFIO_REGION_INFO_FLAG_READ |
+				     VFIO_REGION_INFO_FLAG_WRITE;
+			break;
+		case VFIO_PCI_ROM_REGION_INDEX:
+		default:
+			return -EINVAL;
+		}
+
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+	}
+	case VFIO_DEVICE_GET_IRQ_INFO:
+	{
+		struct vfio_irq_info info;
+		int irq_count;
+
+		printk(KERN_INFO "%s VFIO_DEVICE_GET_IRQ_INFO cmd\n", __func__);
+		minsz = offsetofend(struct vfio_irq_info, count);
+
+		if (copy_from_user(&info, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (info.argsz < minsz || info.index >= VFIO_PCI_NUM_IRQS)
+			return -EINVAL;
+
+		switch (info.index) {
+		case VFIO_PCI_INTX_IRQ_INDEX ... VFIO_PCI_MSI_IRQ_INDEX:
+		case VFIO_PCI_REQ_IRQ_INDEX:
+			break;
+			/* pass thru to return error */
+		case VFIO_PCI_MSIX_IRQ_INDEX:
+		default:
+			return -EINVAL;
+		}
+
+		/*
+		 * Check the count before storing into the unsigned
+		 * info.count (the old "info.count == -1" relied on an
+		 * implicit signed/unsigned conversion); also drop the
+		 * dead store of VFIO_PCI_NUM_IRQS.
+		 */
+		irq_count = vgpu_get_irq_count(vdev, info.index);
+		if (irq_count < 0)
+			return -EINVAL;
+
+		info.flags = VFIO_IRQ_INFO_EVENTFD;
+		info.count = irq_count;
+
+		if (info.index == VFIO_PCI_INTX_IRQ_INDEX)
+			info.flags |= (VFIO_IRQ_INFO_MASKABLE |
+				       VFIO_IRQ_INFO_AUTOMASKED);
+		else
+			info.flags |= VFIO_IRQ_INFO_NORESIZE;
+
+		return copy_to_user((void __user *)arg, &info, minsz) ?
+			-EFAULT : 0;
+	}
+	case VFIO_DEVICE_SET_IRQS:
+	{
+		struct vfio_irq_set hdr;
+		struct gpu_device *gpu_dev = vdev->vgpu_dev->gpu_dev;
+		u8 *data = NULL;
+		int ret = 0;
+
+		minsz = offsetofend(struct vfio_irq_set, count);
+
+		if (copy_from_user(&hdr, (void __user *)arg, minsz))
+			return -EFAULT;
+
+		if (hdr.argsz < minsz || hdr.index >= VFIO_PCI_NUM_IRQS ||
+		    hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+				  VFIO_IRQ_SET_ACTION_TYPE_MASK))
+			return -EINVAL;
+
+		if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+			size_t size;
+			int max = vgpu_get_irq_count(vdev, hdr.index);
+
+			if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
+				size = sizeof(uint8_t);
+			else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
+				size = sizeof(int32_t);
+			else
+				return -EINVAL;
+
+			if (hdr.argsz - minsz < hdr.count * size ||
+			    hdr.start >= max || hdr.start + hdr.count > max)
+				return -EINVAL;
+
+			data = memdup_user((void __user *)(arg + minsz),
+					   hdr.count * size);
+			if (IS_ERR(data))
+				return PTR_ERR(data);
+		}
+
+		/* forwarded even for DATA_NONE (data stays NULL) */
+		if (gpu_dev->ops->vgpu_set_irqs)
+			ret = gpu_dev->ops->vgpu_set_irqs(vdev->vgpu_dev,
+							  hdr.flags,
+							  hdr.index, hdr.start,
+							  hdr.count, data);
+		kfree(data);
+		return ret;
+	}
+	default:
+		return -EINVAL;
+	}
+}
+
+/*
+ * Emulate PCI config space accesses: forward to the GPU driver's
+ * read/write callbacks and keep the shadow copy (vdev->vconfig) in
+ * sync.  Returns bytes handled by the GPU driver, or -errno.
+ */
+ssize_t vgpu_dev_config_rw(struct vfio_vgpu_device *vdev, char __user *buf,
+		size_t count, loff_t *ppos, bool iswrite)
+{
+	struct vgpu_device *vgpu_dev = vdev->vgpu_dev;
+	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
+	int cfg_size = vdev->bar_info[VFIO_PCI_CONFIG_REGION_INDEX].size;
+	int ret = 0;
+	uint64_t pos = *ppos & VFIO_PCI_OFFSET_MASK;
+
+	/* pos is unsigned: the old "pos < 0" arm could never fire */
+	if (pos >= cfg_size || pos + count > cfg_size) {
+		printk(KERN_ERR "%s pos 0x%llx out of range\n", __func__, pos);
+		return -EFAULT;
+	}
+
+	if (iswrite) {
+		/* memdup_user() replaces the kmalloc + copy_from_user pair */
+		char *user_data = memdup_user(buf, count);
+
+		if (IS_ERR(user_data))
+			return PTR_ERR(user_data);
+
+		if (gpu_dev->ops->write)
+			ret = gpu_dev->ops->write(vgpu_dev, user_data, count,
+						  vgpu_emul_space_config, pos);
+
+		/* cache the written bytes in the shadow config space */
+		memcpy(vdev->vconfig + pos, user_data, count);
+		kfree(user_data);
+	} else {
+		char *ret_data = kzalloc(count, GFP_KERNEL);
+
+		if (!ret_data)
+			return -ENOMEM;
+
+		if (gpu_dev->ops->read)
+			ret = gpu_dev->ops->read(vgpu_dev, ret_data, count,
+						 vgpu_emul_space_config, pos);
+
+		if (ret > 0) {
+			if (copy_to_user(buf, ret_data, ret))
+				ret = -EFAULT;
+			else
+				memcpy(vdev->vconfig + pos, ret_data, count);
+		}
+		kfree(ret_data);
+	}
+
+	return ret;
+}
+
+/*
+ * Emulate BAR (MMIO) accesses by forwarding to the GPU driver's
+ * read/write callbacks at the guest-programmed BAR address.
+ * Returns bytes handled by the GPU driver, or -errno.
+ */
+ssize_t vgpu_dev_bar_rw(struct vfio_vgpu_device *vdev, char __user *buf,
+		size_t count, loff_t *ppos, bool iswrite)
+{
+	struct vgpu_device *vgpu_dev = vdev->vgpu_dev;
+	struct gpu_device *gpu_dev = vgpu_dev->gpu_dev;
+	loff_t offset = *ppos & VFIO_PCI_OFFSET_MASK;
+	loff_t pos;
+	int bar_index = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	int ret = 0;
+
+	/* lazily pick up the guest-programmed BAR bases from vconfig */
+	if (!vdev->bar_info[bar_index].start) {
+		ret = vdev_read_base(vdev);
+		if (ret)
+			return ret;
+	}
+
+	/* reject accesses starting at or straddling the end of the BAR */
+	if (offset >= vdev->bar_info[bar_index].size ||
+	    offset + count > vdev->bar_info[bar_index].size)
+		return -EINVAL;
+
+	pos = vdev->bar_info[bar_index].start + offset;
+
+	if (iswrite) {
+		/* memdup_user() replaces the kmalloc + copy_from_user pair */
+		char *user_data = memdup_user(buf, count);
+
+		if (IS_ERR(user_data))
+			return PTR_ERR(user_data);
+
+		if (gpu_dev->ops->write)
+			ret = gpu_dev->ops->write(vgpu_dev, user_data, count,
+						  vgpu_emul_space_mmio, pos);
+
+		kfree(user_data);
+	} else {
+		/* kzalloc replaces the kmalloc + memset pair */
+		char *ret_data = kzalloc(count, GFP_KERNEL);
+
+		if (!ret_data)
+			return -ENOMEM;
+
+		if (gpu_dev->ops->read)
+			ret = gpu_dev->ops->read(vgpu_dev, ret_data, count,
+						 vgpu_emul_space_mmio, pos);
+
+		if (ret > 0) {
+			if (copy_to_user(buf, ret_data, ret))
+				ret = -EFAULT;
+		}
+		kfree(ret_data);
+	}
+
+	return ret;
+}
+
+
+/* Route a read/write to the region selected by the offset's index bits. */
+static ssize_t vgpu_dev_rw(void *device_data, char __user *buf,
+		size_t count, loff_t *ppos, bool iswrite)
+{
+	struct vfio_vgpu_device *vdev = device_data;
+	unsigned int region = VFIO_PCI_OFFSET_TO_INDEX(*ppos);
+	ssize_t result = -EINVAL;
+
+	if (region < VFIO_PCI_NUM_REGIONS) {
+		switch (region) {
+		case VFIO_PCI_CONFIG_REGION_INDEX:
+			result = vgpu_dev_config_rw(vdev, buf, count, ppos,
+						    iswrite);
+			break;
+		case VFIO_PCI_BAR0_REGION_INDEX ... VFIO_PCI_BAR5_REGION_INDEX:
+			result = vgpu_dev_bar_rw(vdev, buf, count, ppos,
+						 iswrite);
+			break;
+		default:
+			/* ROM and VGA regions are not backed here */
+			break;
+		}
+	}
+
+	return result;
+}
+
+
+/* VFIO read entry point; zero-length reads succeed trivially. */
+static ssize_t vgpu_dev_read(void *device_data, char __user *buf,
+			     size_t count, loff_t *ppos)
+{
+	if (!count)
+		return 0;
+
+	return vgpu_dev_rw(device_data, buf, count, ppos, false);
+}
+
+/* VFIO write entry point; zero-length writes succeed trivially. */
+static ssize_t vgpu_dev_write(void *device_data, const char __user *buf,
+			      size_t count, loff_t *ppos)
+{
+	int ret = 0;
+
+	if (count)
+		/* keep the __user annotation when casting away const */
+		ret = vgpu_dev_rw(device_data, (char __user *)buf, count,
+				  ppos, true);
+
+	return ret;
+}
+
+/*
+ * Fault handler for mmap'd BAR space.  Resolves the backing pfn on
+ * first touch, so the GPU driver can defer committing resources until
+ * the mapping is actually accessed.
+ */
+static int vgpu_dev_mmio_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	int ret = 0;
+	struct vfio_vgpu_device *vdev = vma->vm_private_data;
+	struct vgpu_device *vgpu_dev;
+	struct gpu_device *gpu_dev;
+	u64 virtaddr = (u64)vmf->virtual_address;
+	u64 offset, phyaddr;
+	unsigned long req_size, pgoff;
+	pgprot_t pg_prot;
+
+	/* was "!vdev && !vdev->vgpu_dev": never true, and derefs NULL */
+	if (!vdev || !vdev->vgpu_dev)
+		return VM_FAULT_SIGBUS;
+
+	vgpu_dev = vdev->vgpu_dev;
+	gpu_dev  = vgpu_dev->gpu_dev;
+
+	offset   = vma->vm_pgoff << PAGE_SHIFT;
+	phyaddr  = virtaddr - vma->vm_start + offset;
+	pgoff    = phyaddr >> PAGE_SHIFT;
+	req_size = vma->vm_end - virtaddr;
+	pg_prot  = vma->vm_page_prot;
+
+	/* let the GPU driver narrow or redirect the mapping request */
+	if (gpu_dev->ops->validate_map_request) {
+		ret = gpu_dev->ops->validate_map_request(vgpu_dev, virtaddr,
+							 &pgoff, &req_size,
+							 &pg_prot);
+		if (ret || !req_size)
+			return VM_FAULT_SIGBUS;
+	}
+
+	/*
+	 * A fault handler must return VM_FAULT_* codes; OR-ing a -errno
+	 * from remap_pfn_range() into VM_FAULT_NOPAGE was bogus.
+	 */
+	if (remap_pfn_range(vma, virtaddr, pgoff, req_size, pg_prot))
+		return VM_FAULT_SIGBUS;
+
+	return VM_FAULT_NOPAGE;
+}
+
+/* Faults populate the mapping lazily; see vgpu_dev_mmio_fault(). */
+static const struct vm_operations_struct vgpu_dev_mmio_ops = {
+	.fault = vgpu_dev_mmio_fault,
+};
+
+
+/*
+ * mmap handler: set up a VMA over a BAR region without populating it;
+ * pages are filled in on demand by vgpu_dev_mmio_fault().
+ */
+static int vgpu_dev_mmap(void *device_data, struct vm_area_struct *vma)
+{
+	unsigned int index;
+	struct vfio_vgpu_device *vdev = device_data;
+	struct vgpu_device *vgpu_dev = vdev->vgpu_dev;
+	struct pci_dev *pdev = vgpu_dev->gpu_dev->dev;
+	unsigned long pgoff;
+
+	loff_t offset = vma->vm_pgoff << PAGE_SHIFT;
+
+	index = VFIO_PCI_OFFSET_TO_INDEX(offset);
+
+	/* only BAR regions are mmap'able; ROM/VGA and beyond are not */
+	if (index >= VFIO_PCI_ROM_REGION_INDEX)
+		return -EINVAL;
+
+	/* page offset within the selected region */
+	pgoff = vma->vm_pgoff &
+		((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+
+	/*
+	 * NOTE(review): rebases vm_pgoff onto the parent GPU's physical
+	 * BAR — assumes the vGPU BAR maps 1:1 onto the parent resource;
+	 * the fault handler may override via validate_map_request().
+	 * No bounds check against the vGPU BAR size here — TODO confirm
+	 * it is enforced elsewhere.
+	 */
+	vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
+
+	vma->vm_private_data = vdev;
+	vma->vm_ops = &vgpu_dev_mmio_ops;
+
+	return 0;
+}
+
+/* VFIO device callbacks registered for each vGPU with the VFIO core. */
+static const struct vfio_device_ops vgpu_vfio_dev_ops = {
+	.name		= "vfio-vgpu",
+	.open		= vgpu_dev_open,
+	.release	= vgpu_dev_close,
+	.ioctl		= vgpu_dev_unlocked_ioctl,
+	.read		= vgpu_dev_read,
+	.write		= vgpu_dev_write,
+	.mmap		= vgpu_dev_mmap,
+};
+
+/*
+ * vgpu bus probe: wrap the vGPU device in a vfio_vgpu_device and add
+ * it to its VFIO group.  Returns 0 or -errno.
+ */
+int vgpu_vfio_probe(struct device *dev)
+{
+	struct vfio_vgpu_device *vdev;
+	struct vgpu_device *vgpu_dev = to_vgpu_device(dev);
+	int ret = 0;
+
+	if (vgpu_dev == NULL)
+		return -EINVAL;
+
+	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+	if (!vdev)
+		return -ENOMEM;
+
+	vdev->vgpu_dev = vgpu_dev;
+	vdev->group = vgpu_dev->group;
+
+	ret = vfio_add_group_dev(dev, &vgpu_vfio_dev_ops, vdev);
+	if (ret)
+		kfree(vdev);
+
+	/* __func__ replaces the gcc-specific __FUNCTION__ */
+	printk(KERN_INFO "%s ret = %d\n", __func__, ret);
+	return ret;
+}
+
+/*
+ * vgpu bus remove: detach from the VFIO group and free the wrapper.
+ */
+void vgpu_vfio_remove(struct device *dev)
+{
+	struct vfio_vgpu_device *vdev;
+
+	/* __func__ replaces the gcc-specific __FUNCTION__ */
+	printk(KERN_INFO "%s \n", __func__);
+	vdev = vfio_del_group_dev(dev);
+	if (vdev) {
+		printk(KERN_INFO "%s vdev being freed\n", __func__);
+		kfree(vdev);
+	}
+}
+
+/* Registered with the vGPU core; probe/remove run per vGPU device. */
+struct vgpu_driver vgpu_vfio_driver = {
+	.name	= "vgpu-vfio",
+	.probe	= vgpu_vfio_probe,
+	.remove	= vgpu_vfio_remove,
+};
+
+/* Module init: register this driver with the vGPU core. */
+static int __init vgpu_vfio_init(void)
+{
+	pr_info("%s\n", __func__);
+	return vgpu_register_driver(&vgpu_vfio_driver, THIS_MODULE);
+}
+
+/* Module exit: unregister from the vGPU core. */
+static void __exit vgpu_vfio_exit(void)
+{
+	pr_info("%s\n", __func__);
+	vgpu_unregister_driver(&vgpu_vfio_driver);
+}
+
+/* kernel convention: module_init()/module_exit() take a trailing ';' */
+module_init(vgpu_vfio_init);
+module_exit(vgpu_vfio_exit);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);