
[v11,11/22] vfio iommu: Add blocking notifier to notify DMA_UNMAP

Message ID 1478293856-8191-12-git-send-email-kwankhede@nvidia.com
State New

Commit Message

Kirti Wankhede Nov. 4, 2016, 9:10 p.m. UTC
Added a blocking notifier to the IOMMU TYPE1 driver to notify vendor
drivers about DMA_UNMAP.
Exported two APIs: vfio_register_notifier() and vfio_unregister_notifier().
The notifier should be registered if an external user wants to use the
vfio_pin_pages()/vfio_unpin_pages() APIs to pin/unpin pages.
Vendor drivers should use the VFIO_IOMMU_NOTIFY_DMA_UNMAP action to
invalidate mappings.

Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
Signed-off-by: Neo Jia <cjia@nvidia.com>
Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
---
 drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++++++++++
 drivers/vfio/vfio_iommu_type1.c | 47 ++++++++++++++++++++------
 include/linux/vfio.h            | 11 +++++++
 3 files changed, 121 insertions(+), 10 deletions(-)
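
To illustrate the intended usage, here is a minimal sketch of a vendor
driver consuming this API; my_mdev_state, my_invalidate_range(),
my_open() and my_release() are illustrative assumptions, not part of
this patch:

#include <linux/notifier.h>
#include <linux/vfio.h>

struct my_mdev_state {
	struct device *dev;
	struct notifier_block nb;
};

/* Assumed vendor-specific helper that drops pins in an iova range. */
static void my_invalidate_range(struct my_mdev_state *state,
				u64 iova, u64 size);

static int my_iommu_notifier(struct notifier_block *nb,
			     unsigned long action, void *data)
{
	struct my_mdev_state *state =
			container_of(nb, struct my_mdev_state, nb);

	if (action == VFIO_IOMMU_NOTIFY_DMA_UNMAP) {
		struct vfio_iommu_type1_dma_unmap *unmap = data;

		/* Drop our pins covering the range being unmapped. */
		my_invalidate_range(state, unmap->iova, unmap->size);
	}

	return NOTIFY_OK;
}

static int my_open(struct my_mdev_state *state)
{
	state->nb.notifier_call = my_iommu_notifier;

	/* Must be registered before vfio_pin_pages() will succeed. */
	return vfio_register_notifier(state->dev, &state->nb);
}

static void my_release(struct my_mdev_state *state)
{
	vfio_unregister_notifier(state->dev, &state->nb);
}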

Comments

Alex Williamson Nov. 7, 2016, 11:45 p.m. UTC | #1
On Sat, 5 Nov 2016 02:40:45 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> Added blocking notifier to IOMMU TYPE1 driver to notify vendor drivers
> about DMA_UNMAP.
> Exported two APIs vfio_register_notifier() and vfio_unregister_notifier().
> Notifier should be registered, if external user wants to use
> vfio_pin_pages()/vfio_unpin_pages() APIs to pin/unpin pages.
> Vendor driver should use VFIO_IOMMU_NOTIFY_DMA_UNMAP action to invalidate
> mappings.
> 
> Signed-off-by: Kirti Wankhede <kwankhede@nvidia.com>
> Signed-off-by: Neo Jia <cjia@nvidia.com>
> Change-Id: I5910d0024d6be87f3e8d3e0ca0eaeaaa0b17f271
> ---
>  drivers/vfio/vfio.c             | 73 +++++++++++++++++++++++++++++++++++++++++
>  drivers/vfio/vfio_iommu_type1.c | 47 ++++++++++++++++++++------
>  include/linux/vfio.h            | 11 +++++++
>  3 files changed, 121 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
> index 76d260e98930..4ed1a6a247c6 100644
> --- a/drivers/vfio/vfio.c
> +++ b/drivers/vfio/vfio.c
> @@ -1895,6 +1895,79 @@ err_unpin_pages:
>  }
>  EXPORT_SYMBOL(vfio_unpin_pages);
>  
> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)

Is the expectation here that this is a generic notifier for all
vfio->mdev signaling?  That should probably be made clear in the mdev
API to avoid vendor drivers assuming their notifier callback only
occurs for unmaps, even if that's currently the case.

> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret;
> +
> +	if (!dev || !nb)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_register_nb;
> +
> +	container = group->container;
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->register_notifier))
> +		ret = driver->ops->register_notifier(container->iommu_data, nb);
> +	else
> +		ret = -EINVAL;

-ENOTTY again?  And below.

> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);
> +
> +err_register_nb:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_register_notifier);
> +
> +int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
> +{
> +	struct vfio_container *container;
> +	struct vfio_group *group;
> +	struct vfio_iommu_driver *driver;
> +	ssize_t ret;
> +
> +	if (!dev || !nb)
> +		return -EINVAL;
> +
> +	group = vfio_group_get_from_dev(dev);
> +	if (IS_ERR(group))
> +		return PTR_ERR(group);
> +
> +	ret = vfio_group_add_container_user(group);
> +	if (ret)
> +		goto err_unregister_nb;
> +
> +	container = group->container;
> +	down_read(&container->group_lock);
> +
> +	driver = container->iommu_driver;
> +	if (likely(driver && driver->ops->unregister_notifier))
> +		ret = driver->ops->unregister_notifier(container->iommu_data,
> +						       nb);
> +	else
> +		ret = -EINVAL;
> +
> +	up_read(&container->group_lock);
> +	vfio_group_try_dissolve_container(group);

The concern any time we have an unregister like this is whether the
vendor driver does proper cleanup.  Maybe we don't even need an
unregister, could we track this on the group such that releasing the
group automatically unregisters the notifier?  Maybe a single nb
slot and -EBUSY if already set, cleared on release?  Along those lines,
automatically unpinning anything would also be a nice feature (ie. if
an mdev device is unplugged while other devices are still in the
container), but then we'd need to track pinning per group and we already
have too much overhead in tracking pinning.
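
A rough sketch of that single-slot idea, assuming hypothetical nb and
nb_lock members on struct vfio_group:

static int vfio_group_set_notifier(struct vfio_group *group,
				   struct notifier_block *nb)
{
	int ret = 0;

	mutex_lock(&group->nb_lock);
	if (group->nb)
		ret = -EBUSY;	/* one registration per group */
	else
		group->nb = nb;
	mutex_unlock(&group->nb_lock);

	return ret;
}

/* Cleared from the group release path, so no unregister API needed. */
static void vfio_group_clear_notifier(struct vfio_group *group)
{
	mutex_lock(&group->nb_lock);
	group->nb = NULL;
	mutex_unlock(&group->nb_lock);
}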

> +
> +err_unregister_nb:
> +	vfio_group_put(group);
> +	return ret;
> +}
> +EXPORT_SYMBOL(vfio_unregister_notifier);
> +
>  /**
>   * Module/class support
>   */
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index e511073446a0..c2d3a84c447b 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -37,6 +37,7 @@
>  #include <linux/vfio.h>
>  #include <linux/workqueue.h>
>  #include <linux/mdev.h>
> +#include <linux/notifier.h>
>  
>  #define DRIVER_VERSION  "0.2"
>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
> @@ -60,6 +61,7 @@ struct vfio_iommu {
>  	struct vfio_domain	*external_domain; /* domain for external user */
>  	struct mutex		lock;
>  	struct rb_root		dma_list;
> +	struct blocking_notifier_head notifier;
>  	bool			v2;
>  	bool			nesting;
>  };
> @@ -550,7 +552,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
>  
>  	mutex_lock(&iommu->lock);
>  
> -	if (!iommu->external_domain) {
> +	/* Fail if notifier list is empty */
> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>  		ret = -EINVAL;
>  		goto pin_done;
>  	}
> @@ -867,6 +870,11 @@ unlock:
>  	/* Report how much was unmapped */
>  	unmap->size = unmapped;
>  
> +	if (unmapped && iommu->external_domain)
> +		blocking_notifier_call_chain(&iommu->notifier,
> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> +					     unmap);

This is after the fact, there's already a gap here where pages are
unpinned and the mdev device is still running.  The notifier needs to
happen prior to that and I suspect that we need to validate that we
have no remaining external pfn references within this vfio_dma block.
It seems like we need to root our pfn tracking in the vfio_dma so that
we can see that it's empty after the notifier chain and BUG_ON if not.
I would also add some enforcement that external pinning is only enabled
when vfio_iommu_type1 is configured for v2 semantics (ie. we only
support unmaps exactly matching previous maps).

> +
>  	return ret;
>  }
>  
> @@ -1474,6 +1482,7 @@ static void *vfio_iommu_type1_open(unsigned long arg)
>  	INIT_LIST_HEAD(&iommu->addr_space_list);
>  	iommu->dma_list = RB_ROOT;
>  	mutex_init(&iommu->lock);
> +	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
>  
>  	return iommu;
>  }
> @@ -1610,16 +1619,34 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>  	return -ENOTTY;
>  }
>  
> +static int vfio_iommu_type1_register_notifier(void *iommu_data,
> +					      struct notifier_block *nb)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +
> +	return blocking_notifier_chain_register(&iommu->notifier, nb);
> +}
> +
> +static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
> +						struct notifier_block *nb)
> +{
> +	struct vfio_iommu *iommu = iommu_data;
> +
> +	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
> +}
> +
>  static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
> -	.name		= "vfio-iommu-type1",
> -	.owner		= THIS_MODULE,
> -	.open		= vfio_iommu_type1_open,
> -	.release	= vfio_iommu_type1_release,
> -	.ioctl		= vfio_iommu_type1_ioctl,
> -	.attach_group	= vfio_iommu_type1_attach_group,
> -	.detach_group	= vfio_iommu_type1_detach_group,
> -	.pin_pages	= vfio_iommu_type1_pin_pages,
> -	.unpin_pages	= vfio_iommu_type1_unpin_pages,
> +	.name			= "vfio-iommu-type1",
> +	.owner			= THIS_MODULE,
> +	.open			= vfio_iommu_type1_open,
> +	.release		= vfio_iommu_type1_release,
> +	.ioctl			= vfio_iommu_type1_ioctl,
> +	.attach_group		= vfio_iommu_type1_attach_group,
> +	.detach_group		= vfio_iommu_type1_detach_group,
> +	.pin_pages		= vfio_iommu_type1_pin_pages,
> +	.unpin_pages		= vfio_iommu_type1_unpin_pages,
> +	.register_notifier	= vfio_iommu_type1_register_notifier,
> +	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
>  };
>  
>  static int __init vfio_iommu_type1_init(void)
> diff --git a/include/linux/vfio.h b/include/linux/vfio.h
> index ba1b64cb7d4b..dcda8fccefab 100644
> --- a/include/linux/vfio.h
> +++ b/include/linux/vfio.h
> @@ -82,6 +82,10 @@ struct vfio_iommu_driver_ops {
>  				       unsigned long *user_pfn,
>  				       unsigned long *pfn,
>  				       int npage);
> +	int		(*register_notifier)(void *iommu_data,
> +					     struct notifier_block *nb);
> +	int		(*unregister_notifier)(void *iommu_data,
> +					       struct notifier_block *nb);
>  };
>  
>  extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
> @@ -139,6 +143,13 @@ extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
>  extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
>  			    unsigned long *pfn, int npage);
>  
> +#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	1
> +
> +extern int vfio_register_notifier(struct device *dev,
> +				  struct notifier_block *nb);
> +
> +extern int vfio_unregister_notifier(struct device *dev,
> +				    struct notifier_block *nb);
>  /*
>   * IRQfd - generic
>   */
Kirti Wankhede Nov. 8, 2016, 4:26 p.m. UTC | #2
On 11/8/2016 5:15 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:45 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
...
>>  
>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
> 
> Is the expectation here that this is a generic notifier for all
> vfio->mdev signaling?  That should probably be made clear in the mdev
> API to avoid vendor drivers assuming their notifier callback only
> occurs for unmaps, even if that's currently the case.
> 

Ok. I'm adding a comment about the notifier callback to mdev_device,
which is part of the next patch.

...

>>  	mutex_lock(&iommu->lock);
>>  
>> -	if (!iommu->external_domain) {
>> +	/* Fail if notifier list is empty */
>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>>  		ret = -EINVAL;
>>  		goto pin_done;
>>  	}
>> @@ -867,6 +870,11 @@ unlock:
>>  	/* Report how much was unmapped */
>>  	unmap->size = unmapped;
>>  
>> +	if (unmapped && iommu->external_domain)
>> +		blocking_notifier_call_chain(&iommu->notifier,
>> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>> +					     unmap);
> 
> This is after the fact, there's already a gap here where pages are
> unpinned and the mdev device is still running.

Oh, there is a bug here: unpin_pages() now takes user_pfn as an argument
and finds the vfio_dma. If it's not found, it doesn't unpin the pages.
We have to call this notifier before vfio_remove_dma(). But if we call
it before vfio_remove_dma() there will be a deadlock, since iommu->lock
is already held here and vfio_iommu_type1_unpin_pages() will also try to
take iommu->lock.
If we want to call blocking_notifier_call_chain() before
vfio_remove_dma(), the sequence should be:

unmapped += dma->size;
mutex_unlock(&iommu->lock);
if (iommu->external_domain) {
	struct vfio_iommu_type1_dma_unmap nb_unmap;

	nb_unmap.iova = dma->iova;
	nb_unmap.size = dma->size;
	blocking_notifier_call_chain(&iommu->notifier,
				     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
				     &nb_unmap);
}
mutex_lock(&iommu->lock);
vfio_remove_dma(iommu, dma);


>  The notifier needs to
> happen prior to that and I suspect that we need to validate that we
> have no remaining external pfn references within this vfio_dma block.
> It seems like we need to root our pfn tracking in the vfio_dma so that
> we can see that it's empty after the notifier chain and BUG_ON if not.

There is no way to find pfns from that iova range with the current
implementation. We could have this validation if we go with a linear
array of iovas to track pfns.

> I would also add some enforcement that external pinning is only enabled
> when vfio_iommu_type1 is configured for v2 semantics (ie. we only
> support unmaps exactly matching previous maps).
> 

Ok, I'll add that check.
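
A minimal sketch of what that enforcement might look like near the top
of vfio_iommu_type1_pin_pages(); the errno choice is an assumption:

	/* Reject external pinning unless v2 semantics are in use. */
	if (!iommu->v2) {
		ret = -EACCES;
		goto pin_done;
	}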

Thanks,
Kirti
Alex Williamson Nov. 8, 2016, 5:46 p.m. UTC | #3
On Tue, 8 Nov 2016 21:56:29 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/8/2016 5:15 AM, Alex Williamson wrote:
> > On Sat, 5 Nov 2016 02:40:45 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> ...
> >>  
> >> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)  
> > 
> > Is the expectation here that this is a generic notifier for all
> > vfio->mdev signaling?  That should probably be made clear in the mdev
> > API to avoid vendor drivers assuming their notifier callback only
> > occurs for unmaps, even if that's currently the case.
> >   
> 
> Ok. Adding comment about notifier callback in mdev_device which is part
> of next patch.
> 
> ...
> 
> >>  	mutex_lock(&iommu->lock);
> >>  
> >> -	if (!iommu->external_domain) {
> >> +	/* Fail if notifier list is empty */
> >> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
> >>  		ret = -EINVAL;
> >>  		goto pin_done;
> >>  	}
> >> @@ -867,6 +870,11 @@ unlock:
> >>  	/* Report how much was unmapped */
> >>  	unmap->size = unmapped;
> >>  
> >> +	if (unmapped && iommu->external_domain)
> >> +		blocking_notifier_call_chain(&iommu->notifier,
> >> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >> +					     unmap);  
> > 
> > This is after the fact, there's already a gap here where pages are
> > unpinned and the mdev device is still running.  
> 
> Oh, there is a bug here, now unpin_pages() take user_pfn as argument and
> find vfio_dma. If its not found, it doesn't unpin pages. We have to call
> this notifier before vfio_remove_dma(). But if we call this before
> vfio_remove_dma() there will be deadlock since iommu->lock is already
> held here and vfio_iommu_type1_unpin_pages() will also try to hold
> iommu->lock.
> If we want to call blocking_notifier_call_chain() before
> vfio_remove_dma(), sequence should be:
> 
> unmapped += dma->size;
> mutex_unlock(&iommu->lock);
> if (iommu->external_domain)) {
> 	struct vfio_iommu_type1_dma_unmap nb_unmap;
> 
> 	nb_unmap.iova = dma->iova;
> 	nb_unmap.size = dma->size;
> 	blocking_notifier_call_chain(&iommu->notifier,
>         	                     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>                 	             &nb_unmap);
> }
> mutex_lock(&iommu->lock);
> vfio_remove_dma(iommu, dma);

It seems like it would be worthwhile to have the rb-tree rooted in the
vfio_dma; then we only need to call the notifier if there are pages
pinned within that vfio_dma (ie. the rb-tree is not empty).  We can
then release the lock, call the notifier, re-acquire the lock, and
BUG_ON if the rb-tree is still not empty.  We might get duplicate pfns
between separate vfio_dma structs, but as I mentioned in other replies,
that seems like an exception that we don't need to optimize for.
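
Concretely, the suggested sequence in the unmap path might look like
this, assuming a hypothetical pfn_list rb-tree rooted in struct
vfio_dma:

	if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
		struct vfio_iommu_type1_dma_unmap nb_unmap = {
			.iova = dma->iova,
			.size = dma->size,
		};

		mutex_unlock(&iommu->lock);
		blocking_notifier_call_chain(&iommu->notifier,
					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
					     &nb_unmap);
		mutex_lock(&iommu->lock);

		/* Vendor driver must have unpinned everything in range. */
		BUG_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
	}
	vfio_remove_dma(iommu, dma);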

> >  The notifier needs to
> > happen prior to that and I suspect that we need to validate that we
> > have no remaining external pfn references within this vfio_dma block.
> > It seems like we need to root our pfn tracking in the vfio_dma so that
> > we can see that it's empty after the notifier chain and BUG_ON if not.  
> 
> There is no way to find pfns from that iova range with current
> implementation. We can have this validate if we go with linear array of
> iova to track pfns.

Right, I was still hoping to avoid storing the pfn even with the
array/page-table approach, though, and instead ask the mm layer for the
mapping again.  Is that too much overhead?  Maybe the page table could
store the phys addr and we could use PAGE_MASK to store the reference
count so that each entry is still only 8 bytes(?)
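
A sketch of that packed-entry idea: since physical addresses are
page-aligned, the low PAGE_SHIFT bits of an 8-byte entry are free for a
reference count (all names below are illustrative):

typedef u64 vfio_pfn_entry;

static inline phys_addr_t entry_phys(vfio_pfn_entry e)
{
	return e & PAGE_MASK;		/* page-aligned phys addr */
}

static inline unsigned long entry_refs(vfio_pfn_entry e)
{
	return e & ~PAGE_MASK;		/* low bits: ref count */
}

static inline vfio_pfn_entry entry_make(phys_addr_t phys,
					unsigned long refs)
{
	/* Pinning must fail before refs can reach PAGE_SIZE. */
	return (phys & PAGE_MASK) | (refs & ~PAGE_MASK);
}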
 
> > I would also add some enforcement that external pinning is only enabled
> > when vfio_iommu_type1 is configured for v2 semantics (ie. we only
> > support unmaps exactly matching previous maps).
> >   
> 
> Ok I'll add that check.
> 
> Thanks,
> Kirti
Kirti Wankhede Nov. 8, 2016, 7:59 p.m. UTC | #4
On 11/8/2016 11:16 PM, Alex Williamson wrote:
> On Tue, 8 Nov 2016 21:56:29 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 11/8/2016 5:15 AM, Alex Williamson wrote:
>>> On Sat, 5 Nov 2016 02:40:45 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>> ...
>>>>  
>>>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)  
>>>
>>> Is the expectation here that this is a generic notifier for all
>>> vfio->mdev signaling?  That should probably be made clear in the mdev
>>> API to avoid vendor drivers assuming their notifier callback only
>>> occurs for unmaps, even if that's currently the case.
>>>   
>>
>> Ok. Adding comment about notifier callback in mdev_device which is part
>> of next patch.
>>
>> ...
>>
>>>>  	mutex_lock(&iommu->lock);
>>>>  
>>>> -	if (!iommu->external_domain) {
>>>> +	/* Fail if notifier list is empty */
>>>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>>>>  		ret = -EINVAL;
>>>>  		goto pin_done;
>>>>  	}
>>>> @@ -867,6 +870,11 @@ unlock:
>>>>  	/* Report how much was unmapped */
>>>>  	unmap->size = unmapped;
>>>>  
>>>> +	if (unmapped && iommu->external_domain)
>>>> +		blocking_notifier_call_chain(&iommu->notifier,
>>>> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>> +					     unmap);  
>>>
>>> This is after the fact, there's already a gap here where pages are
>>> unpinned and the mdev device is still running.  
>>
>> Oh, there is a bug here, now unpin_pages() take user_pfn as argument and
>> find vfio_dma. If its not found, it doesn't unpin pages. We have to call
>> this notifier before vfio_remove_dma(). But if we call this before
>> vfio_remove_dma() there will be deadlock since iommu->lock is already
>> held here and vfio_iommu_type1_unpin_pages() will also try to hold
>> iommu->lock.
>> If we want to call blocking_notifier_call_chain() before
>> vfio_remove_dma(), sequence should be:
>>
>> unmapped += dma->size;
>> mutex_unlock(&iommu->lock);
>> if (iommu->external_domain)) {
>> 	struct vfio_iommu_type1_dma_unmap nb_unmap;
>>
>> 	nb_unmap.iova = dma->iova;
>> 	nb_unmap.size = dma->size;
>> 	blocking_notifier_call_chain(&iommu->notifier,
>>         	                     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>                 	             &nb_unmap);
>> }
>> mutex_lock(&iommu->lock);
>> vfio_remove_dma(iommu, dma);
> 
> It seems like it would be worthwhile to have the rb-tree rooted in the
> vfio-dma, then we only need to call the notifier if there are pages
> pinned within that vfio-dma (ie. the rb-tree is not empty).  We can
> then release the lock call the notifier, re-acquire the lock, and
> BUG_ON if the rb-tree still is not empty.  We might get duplicate pfns
> between separate vfio_dma structs, but as I mentioned in other replies,
> that seems like an exception that we don't need to optimize for.
> 

If we don't optimize for the case where iovas from different vfio_dma
structures map to the same pfn, and we don't consider that case for page
accounting, then:
- have an rb tree of pinned iovas, keyed by iova, in each vfio_dma
structure.
- the iova tracking structure would hold only the iova and a ref_count.
- page accounting would only count the number of iovas in the rb tree;
the case where different iovas map to the same pfn would not be
considered in this implementation for now.
- vfio_unpin_pages() would take user_pfn and pfn as input; we would
validate that the iova exists in the rb tree and trust the vendor driver
that the corresponding pfn is correct; there is no validation of the
pfn. If we want to validate the pfn, call GUP, verify the pfn and call
put_pfn().
- In the .release() or .detach_group() path, if there are entries in
this rb tree, call GUP again using that iova, get the pfn and then call
put_pfn(pfn) ref_count+1 times. This is because we are not keeping the
pfn in our tracking logic.

Does this sound reasonable?

Thanks,
Kirti


>>>  The notifier needs to
>>> happen prior to that and I suspect that we need to validate that we
>>> have no remaining external pfn references within this vfio_dma block.
>>> It seems like we need to root our pfn tracking in the vfio_dma so that
>>> we can see that it's empty after the notifier chain and BUG_ON if not.  
>>
>> There is no way to find pfns from that iova range with current
>> implementation. We can have this validate if we go with linear array of
>> iova to track pfns.
> 
> Right, I was still hoping to avoid storing the pfn even with the
> array/page-table approach though, ask the mm layer for the mapping
> again.  Is that too much overhead?  Maybe the page table could store
> the phys addr and we could use PAGE_MASK to store the reference count
> so that each entry is still only 8bytes(?)
>  
>>> I would also add some enforcement that external pinning is only enabled
>>> when vfio_iommu_type1 is configured for v2 semantics (ie. we only
>>> support unmaps exactly matching previous maps).
>>>   
>>
>> Ok I'll add that check.
>>
>> Thanks,
>> Kirti
>
Alex Williamson Nov. 8, 2016, 9:28 p.m. UTC | #5
On Wed, 9 Nov 2016 01:29:19 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/8/2016 11:16 PM, Alex Williamson wrote:
> > On Tue, 8 Nov 2016 21:56:29 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 11/8/2016 5:15 AM, Alex Williamson wrote:  
> >>> On Sat, 5 Nov 2016 02:40:45 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >> ...  
> >>>>  
> >>>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)    
> >>>
> >>> Is the expectation here that this is a generic notifier for all
> >>> vfio->mdev signaling?  That should probably be made clear in the mdev
> >>> API to avoid vendor drivers assuming their notifier callback only
> >>> occurs for unmaps, even if that's currently the case.
> >>>     
> >>
> >> Ok. Adding comment about notifier callback in mdev_device which is part
> >> of next patch.
> >>
> >> ...
> >>  
> >>>>  	mutex_lock(&iommu->lock);
> >>>>  
> >>>> -	if (!iommu->external_domain) {
> >>>> +	/* Fail if notifier list is empty */
> >>>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
> >>>>  		ret = -EINVAL;
> >>>>  		goto pin_done;
> >>>>  	}
> >>>> @@ -867,6 +870,11 @@ unlock:
> >>>>  	/* Report how much was unmapped */
> >>>>  	unmap->size = unmapped;
> >>>>  
> >>>> +	if (unmapped && iommu->external_domain)
> >>>> +		blocking_notifier_call_chain(&iommu->notifier,
> >>>> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>> +					     unmap);    
> >>>
> >>> This is after the fact, there's already a gap here where pages are
> >>> unpinned and the mdev device is still running.    
> >>
> >> Oh, there is a bug here, now unpin_pages() take user_pfn as argument and
> >> find vfio_dma. If its not found, it doesn't unpin pages. We have to call
> >> this notifier before vfio_remove_dma(). But if we call this before
> >> vfio_remove_dma() there will be deadlock since iommu->lock is already
> >> held here and vfio_iommu_type1_unpin_pages() will also try to hold
> >> iommu->lock.
> >> If we want to call blocking_notifier_call_chain() before
> >> vfio_remove_dma(), sequence should be:
> >>
> >> unmapped += dma->size;
> >> mutex_unlock(&iommu->lock);
> >> if (iommu->external_domain)) {
> >> 	struct vfio_iommu_type1_dma_unmap nb_unmap;
> >>
> >> 	nb_unmap.iova = dma->iova;
> >> 	nb_unmap.size = dma->size;
> >> 	blocking_notifier_call_chain(&iommu->notifier,
> >>         	                     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>                 	             &nb_unmap);
> >> }
> >> mutex_lock(&iommu->lock);
> >> vfio_remove_dma(iommu, dma);  
> > 
> > It seems like it would be worthwhile to have the rb-tree rooted in the
> > vfio-dma, then we only need to call the notifier if there are pages
> > pinned within that vfio-dma (ie. the rb-tree is not empty).  We can
> > then release the lock call the notifier, re-acquire the lock, and
> > BUG_ON if the rb-tree still is not empty.  We might get duplicate pfns
> > between separate vfio_dma structs, but as I mentioned in other replies,
> > that seems like an exception that we don't need to optimize for.
> >   
> 
> If we don't optimize for the case where iova from different vfio_dma are
> mapped to same pfn and we would not consider this case for page
> accounting then:

Just to clarify, the current code (not handling mdevs) will pin and do
page accounting per iova, regardless of whether the iova translates to a
unique pfn.  As long as we do no worse than that, I'm ok.

> - have rb tree of pinned iova, where key would be iova, in each vfio_dma
> structure.
> - iova tracking structure would have iova and ref_count only.
> - page accounting would only count number of iova's in rb_tree, case
> where different iova could map to same pfn would not be considered in
> this implementation for now.
> - vfio_unpin_pages() would have user_pfn and pfn as input, we would
> validate that iova exist in rb tree and trust vendor driver that
> corresponding pfn is correct, there is no validation of pfn. If want
> validate pfn, call GUP, verify pfn and call put_pfn().
> - In .release() or .detach_group() path, if there are entries in this rb
> tree, call GUP again using that iova, get pfn and then call
> put_pfn(pfn) for ref_count+1 times. This is because we are not keeping
> pfn in our tracking logic.

Wait a sec, if we detach a group from the container and it's not the
last group in the container (which would trigger a release), we can't
assume anything about which vfio_dma entries were associated with that
device.  The vendor driver, through the release of the device(s) within
that group, needs to unpin.  In a container release, we need to send a
notifier to the vendor driver(s) to cause an unpin.  This is the only
mechanism we have to ensure that vendor drivers are not leaking
references.  If, during the release, any vfio_pfns remain after the
notifier, we need to BUG_ON, just as we do for any other DMA_UNMAP.

Also, I'll say it again, I also don't like this API of passing around
potentially giant arrays, and especially the API of relying on the
vendor driver to tell us an arbitrary pfn to unpin.  If we make the
assumption that vendor drivers do not pin lots and lots of memory,
perhaps we could use a struct vfio_pfn as:

struct vfio_pfn {
	struct rb_node		node;
	dma_addr_t		iova;		/* key */
	unsigned long		pfn;
	atomic_t		ref_count;
};

This puts us at 44 bytes per pfn, which isn't great, but I think it
puts us in a better position with the API that we could make use of a
page-table or sparse array in the future that would eliminate the
rb_node and make the iova implicit in the location of the data
structure.  That would leave only the pfn and ref_count, which could
potentially be combined into a single 8-byte field if we had per
vfio_dma (or higher) locking to avoid the atomic_t (and we're happy that
the reference count is always less than PAGE_SIZE, ie. we could fail
pinning if we get to that point).
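
For illustration, an iova-keyed lookup over such a tree might look like
the following, assuming it is rooted in struct vfio_dma as a
hypothetical pfn_list field:

static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma,
				       dma_addr_t iova)
{
	struct rb_node *node = dma->pfn_list.rb_node;

	while (node) {
		struct vfio_pfn *vpfn =
			rb_entry(node, struct vfio_pfn, node);

		if (iova < vpfn->iova)
			node = node->rb_left;
		else if (iova > vpfn->iova)
			node = node->rb_right;
		else
			return vpfn;
	}

	return NULL;
}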

That would allow for the unpin call to not provide the pfn, so could we
then look at whether we need the batching provided by the iova array at
all?  I don't have a feel for the size of memory that gets pinned by
the vendor driver, the frequency of pinning, or whether usage of
hugepages for the guest is likely to translate into contiguous memory
requests through this API.  What's your feeling?  Thanks,

Alex
Kirti Wankhede Nov. 14, 2016, 7:52 a.m. UTC | #6
On 11/9/2016 2:58 AM, Alex Williamson wrote:
> On Wed, 9 Nov 2016 01:29:19 +0530
> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> 
>> On 11/8/2016 11:16 PM, Alex Williamson wrote:
>>> On Tue, 8 Nov 2016 21:56:29 +0530
>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>   
>>>> On 11/8/2016 5:15 AM, Alex Williamson wrote:  
>>>>> On Sat, 5 Nov 2016 02:40:45 +0530
>>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
>>>>>     
>>>> ...  
>>>>>>  
>>>>>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)    
>>>>>
>>>>> Is the expectation here that this is a generic notifier for all
>>>>> vfio->mdev signaling?  That should probably be made clear in the mdev
>>>>> API to avoid vendor drivers assuming their notifier callback only
>>>>> occurs for unmaps, even if that's currently the case.
>>>>>     
>>>>
>>>> Ok. Adding comment about notifier callback in mdev_device which is part
>>>> of next patch.
>>>>
>>>> ...
>>>>  
>>>>>>  	mutex_lock(&iommu->lock);
>>>>>>  
>>>>>> -	if (!iommu->external_domain) {
>>>>>> +	/* Fail if notifier list is empty */
>>>>>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
>>>>>>  		ret = -EINVAL;
>>>>>>  		goto pin_done;
>>>>>>  	}
>>>>>> @@ -867,6 +870,11 @@ unlock:
>>>>>>  	/* Report how much was unmapped */
>>>>>>  	unmap->size = unmapped;
>>>>>>  
>>>>>> +	if (unmapped && iommu->external_domain)
>>>>>> +		blocking_notifier_call_chain(&iommu->notifier,
>>>>>> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>>> +					     unmap);    
>>>>>
>>>>> This is after the fact, there's already a gap here where pages are
>>>>> unpinned and the mdev device is still running.    
>>>>
>>>> Oh, there is a bug here, now unpin_pages() take user_pfn as argument and
>>>> find vfio_dma. If its not found, it doesn't unpin pages. We have to call
>>>> this notifier before vfio_remove_dma(). But if we call this before
>>>> vfio_remove_dma() there will be deadlock since iommu->lock is already
>>>> held here and vfio_iommu_type1_unpin_pages() will also try to hold
>>>> iommu->lock.
>>>> If we want to call blocking_notifier_call_chain() before
>>>> vfio_remove_dma(), sequence should be:
>>>>
>>>> unmapped += dma->size;
>>>> mutex_unlock(&iommu->lock);
>>>> if (iommu->external_domain)) {
>>>> 	struct vfio_iommu_type1_dma_unmap nb_unmap;
>>>>
>>>> 	nb_unmap.iova = dma->iova;
>>>> 	nb_unmap.size = dma->size;
>>>> 	blocking_notifier_call_chain(&iommu->notifier,
>>>>         	                     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
>>>>                 	             &nb_unmap);
>>>> }
>>>> mutex_lock(&iommu->lock);
>>>> vfio_remove_dma(iommu, dma);  
>>>
>>> It seems like it would be worthwhile to have the rb-tree rooted in the
>>> vfio-dma, then we only need to call the notifier if there are pages
>>> pinned within that vfio-dma (ie. the rb-tree is not empty).  We can
>>> then release the lock call the notifier, re-acquire the lock, and
>>> BUG_ON if the rb-tree still is not empty.  We might get duplicate pfns
>>> between separate vfio_dma structs, but as I mentioned in other replies,
>>> that seems like an exception that we don't need to optimize for.
>>>   
>>
>> If we don't optimize for the case where iova from different vfio_dma are
>> mapped to same pfn and we would not consider this case for page
>> accounting then:
> 
> Just to clarify, the current code (not handling mdevs) will pin and do
> page accounting per iova, regardless of whether the iova translates to a
> unique pfn.  As long as we do no worse than that, I'm ok.
> 
>> - have rb tree of pinned iova, where key would be iova, in each vfio_dma
>> structure.
>> - iova tracking structure would have iova and ref_count only.
>> - page accounting would only count number of iova's in rb_tree, case
>> where different iova could map to same pfn would not be considered in
>> this implementation for now.
>> - vfio_unpin_pages() would have user_pfn and pfn as input, we would
>> validate that iova exist in rb tree and trust vendor driver that
>> corresponding pfn is correct, there is no validation of pfn. If want
>> validate pfn, call GUP, verify pfn and call put_pfn().
>> - In .release() or .detach_group() path, if there are entries in this rb
>> tree, call GUP again using that iova, get pfn and then call
>> put_pfn(pfn) for ref_count+1 times. This is because we are not keeping
>> pfn in our tracking logic.
> 
> Wait a sec, if we detach a group from the container and it's not the
> last group in the container (which would trigger a release), we can't
> assume anything about which vfio_dma entries were associated with that
> device.  The vendor driver, through the release of the device(s) within
> that group, needs to unpin.  In a container release, we need to send a
> notifier to the vendor driver(s) to cause an unpin.  This is the only
> mechanism we have to ensure that vendor drivers are not leaking
> references.  If during the release, after the notifier, if any
> vfio_pfns remain, we need to BUG_ON, just like we need to do for any
> other DMA_UNMAP.
> 
> Also, I'll say it again, I also don't like this API of passing around
> potentially giant arrays, and especially the API of relying on the
> vendor driver to tell us an arbitrary pfn to unpin.  If we make the
> assumption that vendor drivers do not pin lots and lots of memory,
> perhaps we could use a struct vfio_pfn as:
> 
> struct vfio_pfn {
>         struct rb_node          node;
> 	dma_addr_t		iova; /* key */
>         unsigned long           pfn;
>         atomic_t                ref_count;
> };
> 
> This puts us at 44-bytes per pfn, which isn't great, but I think it
> puts us in a better position with the API that we could make use of a
> page-table or sparse array in the future that would eliminate the
> rb_node and make the iova implicit in the location of the data
> structure.  That would leave only the pfn and ref_count, which could
> potentially be combined into a single 8-byte field if we had per
> vfio_dma (or higher) locking to avoid the atomic_t (and we're happy that
> the reference count is always less than PAGE_SIZE, ie. we could fail
> pinning if we get to that point).
> 

Ok.
- I'll have the above structure to track pinned pfns for now, and an
rb-tree in the vfio_dma structure that keeps track of the pages pinned
in that range, dma->iova to dma->iova + dma->size.
- The key for the pfn_list rb-tree would be the iova, instead of the pfn.
- Removing the address space structure. vfio_dma keeps the task
structure, which would be used to get the mm structure (using
get_task_mm(task) and mmput(mm)) for pin/unpin and page accounting.
- vfio_unpin_pages() would take an array of user_pfns as its input
argument, instead of an array of pfns.
- On vfio_pin_pages(), pinning would happen once. On a later call to
vfio_pin_pages() with the same user_pfn, if the iova is found in
pfn_list, only the ref_count would be incremented (see the sketch below).
- In vfio_unpin_pages(), the ref_count is decremented and the page is
unpinned when the ref_count reaches 0.
- For the vfio_pin_pages() and vfio_unpin_pages() input arrays, the
number of elements should be less than PAGE_SIZE. If a vendor driver
wants to use them for more pages, the array should be split into chunks
of PAGE_SIZE.
- Updating the page accounting logic with the above changes.
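
A sketch of the pin path described above; vfio_find_vpfn() is an
iova-keyed lookup over the pfn_list, and vfio_pin_page_external() /
vfio_add_to_pfn_list() are assumed helpers, not existing code:

static int vfio_pin_one_external(struct vfio_dma *dma, dma_addr_t iova,
				 unsigned long *pfn)
{
	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
	int ret;

	if (vpfn) {
		/* Same user_pfn pinned again: only bump the ref count. */
		atomic_inc(&vpfn->ref_count);
		*pfn = vpfn->pfn;
		return 0;
	}

	/* First pin of this iova: fault the page via dma->task's mm. */
	ret = vfio_pin_page_external(dma, iova, pfn);
	if (!ret)
		ret = vfio_add_to_pfn_list(dma, iova, *pfn);

	return ret;
}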

Thanks,
Kirti

> That would allow for the unpin call to not provide the pfn, so could we
> then look at whether we need the batching provided by the iova array at
> all?  I don't have a feel for the size of memory that gets pinned by
> the vendor driver, the frequency of pinning, or whether usage of
> hugepages for the guest is likely to translate into contiguous memory
> requests through this API.  What's your feeling?  Thanks,
>
Alex Williamson Nov. 14, 2016, 3:37 p.m. UTC | #7
On Mon, 14 Nov 2016 13:22:08 +0530
Kirti Wankhede <kwankhede@nvidia.com> wrote:

> On 11/9/2016 2:58 AM, Alex Williamson wrote:
> > On Wed, 9 Nov 2016 01:29:19 +0530
> > Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >   
> >> On 11/8/2016 11:16 PM, Alex Williamson wrote:  
> >>> On Tue, 8 Nov 2016 21:56:29 +0530
> >>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>     
> >>>> On 11/8/2016 5:15 AM, Alex Williamson wrote:    
> >>>>> On Sat, 5 Nov 2016 02:40:45 +0530
> >>>>> Kirti Wankhede <kwankhede@nvidia.com> wrote:
> >>>>>       
> >>>> ...    
> >>>>>>  
> >>>>>> +int vfio_register_notifier(struct device *dev, struct notifier_block *nb)      
> >>>>>
> >>>>> Is the expectation here that this is a generic notifier for all
> >>>>> vfio->mdev signaling?  That should probably be made clear in the mdev
> >>>>> API to avoid vendor drivers assuming their notifier callback only
> >>>>> occurs for unmaps, even if that's currently the case.
> >>>>>       
> >>>>
> >>>> Ok. Adding comment about notifier callback in mdev_device which is part
> >>>> of next patch.
> >>>>
> >>>> ...
> >>>>    
> >>>>>>  	mutex_lock(&iommu->lock);
> >>>>>>  
> >>>>>> -	if (!iommu->external_domain) {
> >>>>>> +	/* Fail if notifier list is empty */
> >>>>>> +	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
> >>>>>>  		ret = -EINVAL;
> >>>>>>  		goto pin_done;
> >>>>>>  	}
> >>>>>> @@ -867,6 +870,11 @@ unlock:
> >>>>>>  	/* Report how much was unmapped */
> >>>>>>  	unmap->size = unmapped;
> >>>>>>  
> >>>>>> +	if (unmapped && iommu->external_domain)
> >>>>>> +		blocking_notifier_call_chain(&iommu->notifier,
> >>>>>> +					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>>>> +					     unmap);      
> >>>>>
> >>>>> This is after the fact, there's already a gap here where pages are
> >>>>> unpinned and the mdev device is still running.      
> >>>>
> >>>> Oh, there is a bug here, now unpin_pages() take user_pfn as argument and
> >>>> find vfio_dma. If its not found, it doesn't unpin pages. We have to call
> >>>> this notifier before vfio_remove_dma(). But if we call this before
> >>>> vfio_remove_dma() there will be deadlock since iommu->lock is already
> >>>> held here and vfio_iommu_type1_unpin_pages() will also try to hold
> >>>> iommu->lock.
> >>>> If we want to call blocking_notifier_call_chain() before
> >>>> vfio_remove_dma(), sequence should be:
> >>>>
> >>>> unmapped += dma->size;
> >>>> mutex_unlock(&iommu->lock);
> >>>> if (iommu->external_domain)) {
> >>>> 	struct vfio_iommu_type1_dma_unmap nb_unmap;
> >>>>
> >>>> 	nb_unmap.iova = dma->iova;
> >>>> 	nb_unmap.size = dma->size;
> >>>> 	blocking_notifier_call_chain(&iommu->notifier,
> >>>>         	                     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
> >>>>                 	             &nb_unmap);
> >>>> }
> >>>> mutex_lock(&iommu->lock);
> >>>> vfio_remove_dma(iommu, dma);    
> >>>
> >>> It seems like it would be worthwhile to have the rb-tree rooted in the
> >>> vfio-dma, then we only need to call the notifier if there are pages
> >>> pinned within that vfio-dma (ie. the rb-tree is not empty).  We can
> >>> then release the lock call the notifier, re-acquire the lock, and
> >>> BUG_ON if the rb-tree still is not empty.  We might get duplicate pfns
> >>> between separate vfio_dma structs, but as I mentioned in other replies,
> >>> that seems like an exception that we don't need to optimize for.
> >>>     
> >>
> >> If we don't optimize for the case where iova from different vfio_dma are
> >> mapped to same pfn and we would not consider this case for page
> >> accounting then:  
> > 
> > Just to clarify, the current code (not handling mdevs) will pin and do
> > page accounting per iova, regardless of whether the iova translates to a
> > unique pfn.  As long as we do no worse than that, I'm ok.
> >   
> >> - have rb tree of pinned iova, where key would be iova, in each vfio_dma
> >> structure.
> >> - iova tracking structure would have iova and ref_count only.
> >> - page accounting would only count number of iova's in rb_tree, case
> >> where different iova could map to same pfn would not be considered in
> >> this implementation for now.
> >> - vfio_unpin_pages() would have user_pfn and pfn as input, we would
> >> validate that iova exist in rb tree and trust vendor driver that
> >> corresponding pfn is correct, there is no validation of pfn. If want
> >> validate pfn, call GUP, verify pfn and call put_pfn().
> >> - In .release() or .detach_group() path, if there are entries in this rb
> >> tree, call GUP again using that iova, get pfn and then call
> >> put_pfn(pfn) for ref_count+1 times. This is because we are not keeping
> >> pfn in our tracking logic.  
> > 
> > Wait a sec, if we detach a group from the container and it's not the
> > last group in the container (which would trigger a release), we can't
> > assume anything about which vfio_dma entries were associated with that
> > device.  The vendor driver, through the release of the device(s) within
> > that group, needs to unpin.  In a container release, we need to send a
> > notifier to the vendor driver(s) to cause an unpin.  This is the only
> > mechanism we have to ensure that vendor drivers are not leaking
> > references.  If during the release, after the notifier, if any
> > vfio_pfns remain, we need to BUG_ON, just like we need to do for any
> > other DMA_UNMAP.
> > 
> > Also, I'll say it again, I also don't like this API of passing around
> > potentially giant arrays, and especially the API of relying on the
> > vendor driver to tell us an arbitrary pfn to unpin.  If we make the
> > assumption that vendor drivers do not pin lots and lots of memory,
> > perhaps we could use a struct vfio_pfn as:
> > 
> > struct vfio_pfn {
> >         struct rb_node          node;
> > 	dma_addr_t		iova; /* key */
> >         unsigned long           pfn;
> >         atomic_t                ref_count;
> > };
> > 
> > This puts us at 44-bytes per pfn, which isn't great, but I think it
> > puts us in a better position with the API that we could make use of a
> > page-table or sparse array in the future that would eliminate the
> > rb_node and make the iova implicit in the location of the data
> > structure.  That would leave only the pfn and ref_count, which could
> > potentially be combined into a single 8-byte field if we had per
> > vfio_dma (or higher) locking to avoid the atomic_t (and we're happy that
> > the reference count is always less than PAGE_SIZE, ie. we could fail
> > pinning if we get to that point).
> >   
> 
> Ok.
> - I'll have above structure to track pinned pfn for now and a rb-tree in
> vfio_dma structure that would keep track of pages pinned in that range,
> dma->iova to dma->iova + dma->size.
> - Key for pfn_list rb-tree would be iova, instead of pfn.
> - Removing address space structure. vfio_dma keeps task structure, which
> would be used to get mm structure (using get_task_mm(task) and
> mmput(mm)) for pin/unpin and page accounting.
> - vfio_unpin_pages() would have array of user_pfns as input argument,
> instead of array of pfns.
> - On vfio_pin_pages(), pinning would happen once. On later call to
> vfio_pin_pages() with same user_pfn, if iova is found in pfn_list, only
> ref_count would be incremented.
> - In vfio_unpin_pages(), ref_count is decremented and page will be
> unpinned when ref_count is 0.
> - For vfio_pin_pages() and vfio_unpin_pages() input array, number of
> elements in array should be less that PAGE_SIZE. If vendor driver wants
> to use for more pages, array should  be split it in chunks of PAGE_SIZE.

Yes, this is what we discussed offline: the size of the arrays should
never exceed PAGE_SIZE, and therefore the number of entries should never
exceed PAGE_SIZE/sizeof(pfn).  The iommu driver should fail with -E2BIG
if the vendor driver attempts to exceed this.
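
A minimal sketch of that guard at the top of the type1 pin/unpin entry
points; the placement and exact bound are assumptions:

	/* Reject arrays whose backing buffer would exceed one page. */
	if (npage <= 0 || npage > PAGE_SIZE / sizeof(unsigned long))
		return -E2BIG;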

> - Updating page accounting logic with above changes.

Thanks,
Alex

Patch

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 76d260e98930..4ed1a6a247c6 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1895,6 +1895,79 @@  err_unpin_pages:
 }
 EXPORT_SYMBOL(vfio_unpin_pages);
 
+int vfio_register_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret;
+
+	if (!dev || !nb)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto err_register_nb;
+
+	container = group->container;
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->register_notifier))
+		ret = driver->ops->register_notifier(container->iommu_data, nb);
+	else
+		ret = -EINVAL;
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+err_register_nb:
+	vfio_group_put(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_register_notifier);
+
+int vfio_unregister_notifier(struct device *dev, struct notifier_block *nb)
+{
+	struct vfio_container *container;
+	struct vfio_group *group;
+	struct vfio_iommu_driver *driver;
+	ssize_t ret;
+
+	if (!dev || !nb)
+		return -EINVAL;
+
+	group = vfio_group_get_from_dev(dev);
+	if (IS_ERR(group))
+		return PTR_ERR(group);
+
+	ret = vfio_group_add_container_user(group);
+	if (ret)
+		goto err_unregister_nb;
+
+	container = group->container;
+	down_read(&container->group_lock);
+
+	driver = container->iommu_driver;
+	if (likely(driver && driver->ops->unregister_notifier))
+		ret = driver->ops->unregister_notifier(container->iommu_data,
+						       nb);
+	else
+		ret = -EINVAL;
+
+	up_read(&container->group_lock);
+	vfio_group_try_dissolve_container(group);
+
+err_unregister_nb:
+	vfio_group_put(group);
+	return ret;
+}
+EXPORT_SYMBOL(vfio_unregister_notifier);
+
 /**
  * Module/class support
  */
diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index e511073446a0..c2d3a84c447b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -37,6 +37,7 @@ 
 #include <linux/vfio.h>
 #include <linux/workqueue.h>
 #include <linux/mdev.h>
+#include <linux/notifier.h>
 
 #define DRIVER_VERSION  "0.2"
 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
@@ -60,6 +61,7 @@  struct vfio_iommu {
 	struct vfio_domain	*external_domain; /* domain for external user */
 	struct mutex		lock;
 	struct rb_root		dma_list;
+	struct blocking_notifier_head notifier;
 	bool			v2;
 	bool			nesting;
 };
@@ -550,7 +552,8 @@  static int vfio_iommu_type1_pin_pages(void *iommu_data,
 
 	mutex_lock(&iommu->lock);
 
-	if (!iommu->external_domain) {
+	/* Fail if notifier list is empty */
+	if ((!iommu->external_domain) || (!iommu->notifier.head)) {
 		ret = -EINVAL;
 		goto pin_done;
 	}
@@ -867,6 +870,11 @@  unlock:
 	/* Report how much was unmapped */
 	unmap->size = unmapped;
 
+	if (unmapped && iommu->external_domain)
+		blocking_notifier_call_chain(&iommu->notifier,
+					     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
+					     unmap);
+
 	return ret;
 }
 
@@ -1474,6 +1482,7 @@  static void *vfio_iommu_type1_open(unsigned long arg)
 	INIT_LIST_HEAD(&iommu->addr_space_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
+	BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
 
 	return iommu;
 }
@@ -1610,16 +1619,34 @@  static long vfio_iommu_type1_ioctl(void *iommu_data,
 	return -ENOTTY;
 }
 
+static int vfio_iommu_type1_register_notifier(void *iommu_data,
+					      struct notifier_block *nb)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	return blocking_notifier_chain_register(&iommu->notifier, nb);
+}
+
+static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
+						struct notifier_block *nb)
+{
+	struct vfio_iommu *iommu = iommu_data;
+
+	return blocking_notifier_chain_unregister(&iommu->notifier, nb);
+}
+
 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
-	.name		= "vfio-iommu-type1",
-	.owner		= THIS_MODULE,
-	.open		= vfio_iommu_type1_open,
-	.release	= vfio_iommu_type1_release,
-	.ioctl		= vfio_iommu_type1_ioctl,
-	.attach_group	= vfio_iommu_type1_attach_group,
-	.detach_group	= vfio_iommu_type1_detach_group,
-	.pin_pages	= vfio_iommu_type1_pin_pages,
-	.unpin_pages	= vfio_iommu_type1_unpin_pages,
+	.name			= "vfio-iommu-type1",
+	.owner			= THIS_MODULE,
+	.open			= vfio_iommu_type1_open,
+	.release		= vfio_iommu_type1_release,
+	.ioctl			= vfio_iommu_type1_ioctl,
+	.attach_group		= vfio_iommu_type1_attach_group,
+	.detach_group		= vfio_iommu_type1_detach_group,
+	.pin_pages		= vfio_iommu_type1_pin_pages,
+	.unpin_pages		= vfio_iommu_type1_unpin_pages,
+	.register_notifier	= vfio_iommu_type1_register_notifier,
+	.unregister_notifier	= vfio_iommu_type1_unregister_notifier,
 };
 
 static int __init vfio_iommu_type1_init(void)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ba1b64cb7d4b..dcda8fccefab 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -82,6 +82,10 @@  struct vfio_iommu_driver_ops {
 				       unsigned long *user_pfn,
 				       unsigned long *pfn,
 				       int npage);
+	int		(*register_notifier)(void *iommu_data,
+					     struct notifier_block *nb);
+	int		(*unregister_notifier)(void *iommu_data,
+					       struct notifier_block *nb);
 };
 
 extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
@@ -139,6 +143,13 @@  extern int vfio_pin_pages(struct device *dev, unsigned long *user_pfn,
 extern int vfio_unpin_pages(struct device *dev, unsigned long *user_pfn,
 			    unsigned long *pfn, int npage);
 
+#define VFIO_IOMMU_NOTIFY_DMA_UNMAP	1
+
+extern int vfio_register_notifier(struct device *dev,
+				  struct notifier_block *nb);
+
+extern int vfio_unregister_notifier(struct device *dev,
+				    struct notifier_block *nb);
 /*
  * IRQfd - generic
  */