BugLink: https://bugs.launchpad.net/bugs/2089306

This reverts commit "vfio/pci: Use unmap_mapping_range()".

The original commit rewrote the vfio_pci mmap'd MMIO fault handler to use
unmap_mapping_range() and vmf_insert_pfn() in place of vfio_pci tracking
its own mapped areas and using zap_vma_ptes() and io_remap_pfn_range().

vmf_insert_pfn() is significantly slower than the prior implementation. On
passthrough PCIe devices with large BAR regions, this causes host soft
lockup warnings if the commit "vfio/pci: Insert full vma on mmap'd MMIO
fault" is present, or an extremely slow guest boot if it is not.

Signed-off-by: Jacob Martin <jacob.martin@canonical.com>
---
 drivers/vfio/pci/vfio_pci_core.c | 267 ++++++++++++++++++++++++-------
 include/linux/vfio_pci_core.h    |   2 +
 2 files changed, 211 insertions(+), 58 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c
--- a/drivers/vfio/pci/vfio_pci_core.c
+++ b/drivers/vfio/pci/vfio_pci_core.c
@@ -1607,20 +1607,100 @@ ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *bu
}
EXPORT_SYMBOL_GPL(vfio_pci_core_write);
-static void vfio_pci_zap_bars(struct vfio_pci_core_device *vdev)
+/* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */
+static int vfio_pci_zap_and_vma_lock(struct vfio_pci_core_device *vdev, bool try)
{
- struct vfio_device *core_vdev = &vdev->vdev;
- loff_t start = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_BAR0_REGION_INDEX);
- loff_t end = VFIO_PCI_INDEX_TO_OFFSET(VFIO_PCI_ROM_REGION_INDEX);
- loff_t len = end - start;
+ struct vfio_pci_mmap_vma *mmap_vma, *tmp;
- unmap_mapping_range(core_vdev->inode->i_mapping, start, len, true);
+ /*
+ * Lock ordering:
+ * vma_lock is nested under mmap_lock for vm_ops callback paths.
+ * The memory_lock semaphore is used by both code paths calling
+ * into this function to zap vmas and the vm_ops.fault callback
+ * to protect the memory enable state of the device.
+ *
+ * When zapping vmas we need to maintain the mmap_lock => vma_lock
+ * ordering, which requires using vma_lock to walk vma_list to
+ * acquire an mm, then dropping vma_lock to get the mmap_lock and
+ * reacquiring vma_lock. This logic is derived from similar
+ * requirements in uverbs_user_mmap_disassociate().
+ *
+ * mmap_lock must always be the top-level lock when it is taken.
+ * Therefore we can only hold the memory_lock write lock when
+ * vma_list is empty, as we'd need to take mmap_lock to clear
+ * entries. vma_list can only be guaranteed empty when holding
+ * vma_lock, thus memory_lock is nested under vma_lock.
+ *
+ * This enables the vm_ops.fault callback to acquire vma_lock,
+ * followed by memory_lock read lock, while already holding
+ * mmap_lock without risk of deadlock.
+ */
+ while (1) {
+ struct mm_struct *mm = NULL;
+
+ if (try) {
+ if (!mutex_trylock(&vdev->vma_lock))
+ return 0;
+ } else {
+ mutex_lock(&vdev->vma_lock);
+ }
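+ /*
+ * Pick a vma whose mm we can still get a reference on; entries
+ * whose process has already exited (mmget_not_zero() fails) are
+ * pruned from the list as we go.
+ */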
+ while (!list_empty(&vdev->vma_list)) {
+ mmap_vma = list_first_entry(&vdev->vma_list,
+ struct vfio_pci_mmap_vma,
+ vma_next);
+ mm = mmap_vma->vma->vm_mm;
+ if (mmget_not_zero(mm))
+ break;
+
+ list_del(&mmap_vma->vma_next);
+ kfree(mmap_vma);
+ mm = NULL;
+ }
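+ /* vma_list is empty: all mappings are zapped, return with vma_lock held. */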
+ if (!mm)
+ return 1;
+ mutex_unlock(&vdev->vma_lock);
+
+ if (try) {
+ if (!mmap_read_trylock(mm)) {
+ mmput(mm);
+ return 0;
+ }
+ } else {
+ mmap_read_lock(mm);
+ }
+ if (try) {
+ if (!mutex_trylock(&vdev->vma_lock)) {
+ mmap_read_unlock(mm);
+ mmput(mm);
+ return 0;
+ }
+ } else {
+ mutex_lock(&vdev->vma_lock);
+ }
+ list_for_each_entry_safe(mmap_vma, tmp,
+ &vdev->vma_list, vma_next) {
+ struct vm_area_struct *vma = mmap_vma->vma;
+
+ if (vma->vm_mm != mm)
+ continue;
+
+ list_del(&mmap_vma->vma_next);
+ kfree(mmap_vma);
+
+ zap_vma_ptes(vma, vma->vm_start,
+ vma->vm_end - vma->vm_start);
+ }
+ mutex_unlock(&vdev->vma_lock);
+ mmap_read_unlock(mm);
+ mmput(mm);
+ }
}
void vfio_pci_zap_and_down_write_memory_lock(struct vfio_pci_core_device *vdev)
{
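+ /*
+ * vfio_pci_zap_and_vma_lock() returns with vma_lock held and vma_list
+ * empty, so taking memory_lock next respects the vma_lock =>
+ * memory_lock nesting; vma_lock can then be dropped, since new faults
+ * will block on the write-held memory_lock.
+ */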
+ vfio_pci_zap_and_vma_lock(vdev, false);
down_write(&vdev->memory_lock);
- vfio_pci_zap_bars(vdev);
+ mutex_unlock(&vdev->vma_lock);
}
u16 vfio_pci_memory_lock_and_enable(struct vfio_pci_core_device *vdev)
@@ -1642,41 +1722,99 @@ void vfio_pci_memory_unlock_and_restore(struct vfio_pci_core_device *vdev, u16 c
up_write(&vdev->memory_lock);
}
-static unsigned long vma_to_pfn(struct vm_area_struct *vma)
+/* Caller holds vma_lock */
+static int __vfio_pci_add_vma(struct vfio_pci_core_device *vdev,
+ struct vm_area_struct *vma)
{
- struct vfio_pci_core_device *vdev = vma->vm_private_data;
- int index = vma->vm_pgoff >> (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT);
- u64 pgoff;
+ struct vfio_pci_mmap_vma *mmap_vma;
- pgoff = vma->vm_pgoff &
- ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+ mmap_vma = kmalloc(sizeof(*mmap_vma), GFP_KERNEL_ACCOUNT);
+ if (!mmap_vma)
+ return -ENOMEM;
+
+ mmap_vma->vma = vma;
+ list_add(&mmap_vma->vma_next, &vdev->vma_list);
+
+ return 0;
+}
+
+/*
+ * Zap mmaps on open so that we can fault them in on access and therefore
+ * our vma_list only tracks mappings accessed since last zap.
+ */
+static void vfio_pci_mmap_open(struct vm_area_struct *vma)
+{
+ zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+}
- return (pci_resource_start(vdev->pdev, index) >> PAGE_SHIFT) + pgoff;
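+/*
+ * Stop tracking a vma on close; the mm tears down the mapping itself,
+ * so only the list entry needs to go.
+ */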
+static void vfio_pci_mmap_close(struct vm_area_struct *vma)
+{
+ struct vfio_pci_core_device *vdev = vma->vm_private_data;
+ struct vfio_pci_mmap_vma *mmap_vma;
+
+ mutex_lock(&vdev->vma_lock);
+ list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+ if (mmap_vma->vma == vma) {
+ list_del(&mmap_vma->vma_next);
+ kfree(mmap_vma);
+ break;
+ }
+ }
+ mutex_unlock(&vdev->vma_lock);
}
static vm_fault_t vfio_pci_mmap_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct vfio_pci_core_device *vdev = vma->vm_private_data;
- unsigned long pfn, pgoff = vmf->pgoff - vma->vm_pgoff;
- vm_fault_t ret = VM_FAULT_SIGBUS;
-
- pfn = vma_to_pfn(vma);
+ struct vfio_pci_mmap_vma *mmap_vma;
+ vm_fault_t ret = VM_FAULT_NOPAGE;
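+ /*
+ * Default to NOPAGE: the PTEs are installed here via
+ * io_remap_pfn_range(), or were already installed by a concurrent
+ * fault, so the core fault code has nothing left to map.
+ */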
+ mutex_lock(&vdev->vma_lock);
down_read(&vdev->memory_lock);
- if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev))
- goto out_disabled;
+ /*
+ * Memory region cannot be accessed if the low power feature is engaged
+ * or memory access is disabled.
+ */
+ if (vdev->pm_runtime_engaged || !__vfio_pci_memory_enabled(vdev)) {
+ ret = VM_FAULT_SIGBUS;
+ goto up_out;
+ }
- ret = vmf_insert_pfn(vma, vmf->address, pfn + pgoff);
+ /*
+ * We populate the whole vma on fault, so we need to test whether
+ * the vma has already been mapped, such as for concurrent faults
+ * to the same vma. io_remap_pfn_range() will trigger a BUG_ON if
+ * we ask it to fill the same range again.
+ */
+ list_for_each_entry(mmap_vma, &vdev->vma_list, vma_next) {
+ if (mmap_vma->vma == vma)
+ goto up_out;
+ }
-out_disabled:
- up_read(&vdev->memory_lock);
+ if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot)) {
+ ret = VM_FAULT_SIGBUS;
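+ /* io_remap_pfn_range() may fail midway; clear any partial mapping. */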
+ zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ goto up_out;
+ }
+ if (__vfio_pci_add_vma(vdev, vma)) {
+ ret = VM_FAULT_OOM;
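+ /* Untracked PTEs would be missed by later zaps; remove them. */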
+ zap_vma_ptes(vma, vma->vm_start, vma->vm_end - vma->vm_start);
+ }
+
+up_out:
+ up_read(&vdev->memory_lock);
+ mutex_unlock(&vdev->vma_lock);
return ret;
}
static const struct vm_operations_struct vfio_pci_mmap_ops = {
+ .open = vfio_pci_mmap_open,
+ .close = vfio_pci_mmap_close,
.fault = vfio_pci_mmap_fault,
};
@@ -1739,15 +1877,11 @@ int vfio_pci_core_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma
vma->vm_private_data = vdev;
vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
- vma->vm_page_prot = pgprot_decrypted(vma->vm_page_prot);
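+ /*
+ * Rewrite vm_pgoff to the physical pfn of the mapping so the fault
+ * handler can pass it straight to io_remap_pfn_range().
+ */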
+ vma->vm_pgoff = (pci_resource_start(pdev, index) >> PAGE_SHIFT) + pgoff;
/*
- * Set vm_flags now, they should not be changed in the fault handler.
- * We want the same flags and page protection (decrypted above) as
- * io_remap_pfn_range() would set.
- *
- * VM_ALLOW_ANY_UNCACHED: The VMA flag is implemented for ARM64,
- * allowing KVM stage 2 device mapping attributes to use Normal-NC
+ * See remap_pfn_range(), called from vfio_pci_mmap_fault(), but we
+ * can't change vm_flags within the fault handler. Set them now.
*/
vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
vma->vm_ops = &vfio_pci_mmap_ops;
@@ -2047,6 +2181,8 @@ int vfio_pci_core_init_dev(struct vfio_device *core_vdev)
mutex_init(&vdev->ioeventfds_lock);
INIT_LIST_HEAD(&vdev->dummy_resources_list);
INIT_LIST_HEAD(&vdev->ioeventfds_list);
+ mutex_init(&vdev->vma_lock);
+ INIT_LIST_HEAD(&vdev->vma_list);
INIT_LIST_HEAD(&vdev->sriov_pfs_item);
init_rwsem(&vdev->memory_lock);
xa_init(&vdev->ctx);
@@ -2062,6 +2198,7 @@ void vfio_pci_core_release_dev(struct vfio_device *core_vdev)
mutex_destroy(&vdev->igate);
mutex_destroy(&vdev->ioeventfds_lock);
+ mutex_destroy(&vdev->vma_lock);
kfree(vdev->region);
kfree(vdev->pm_save);
}
@@ -2339,15 +2476,26 @@ static int vfio_pci_dev_set_pm_runtime_get(struct vfio_device_set *dev_set)
return ret;
}
+/*
+ * We need to get memory_lock for each device, but devices can share mmap_lock,
+ * therefore we need to zap and hold the vma_lock for each device, and only then
+ * get each memory_lock.
+ */
static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
struct vfio_pci_group_info *groups,
struct iommufd_ctx *iommufd_ctx)
{
- struct vfio_pci_core_device *vdev;
+ struct vfio_pci_core_device *cur_mem;
+ struct vfio_pci_core_device *cur_vma;
+ struct vfio_pci_core_device *cur;
struct pci_dev *pdev;
+ bool is_mem = true;
int ret;
mutex_lock(&dev_set->lock);
+ cur_mem = list_first_entry(&dev_set->device_list,
+ struct vfio_pci_core_device,
+ vdev.dev_set_list);
pdev = vfio_pci_dev_set_resettable(dev_set);
if (!pdev) {
@@ -2364,7 +2512,7 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
if (ret)
goto err_unlock;
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list) {
+ list_for_each_entry(cur_vma, &dev_set->device_list, vdev.dev_set_list) {
bool owned;
/*
@@ -2388,38 +2536,38 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* Otherwise, reset is not allowed.
*/
if (iommufd_ctx) {
- int devid = vfio_iommufd_get_dev_id(&vdev->vdev,
+ int devid = vfio_iommufd_get_dev_id(&cur_vma->vdev,
iommufd_ctx);
owned = (devid > 0 || devid == -ENOENT);
} else {
- owned = vfio_dev_in_groups(&vdev->vdev, groups);
+ owned = vfio_dev_in_groups(&cur_vma->vdev, groups);
}
if (!owned) {
ret = -EINVAL;
- break;
+ goto err_undo;
}
/*
- * Take the memory write lock for each device and zap BAR
- * mappings to prevent the user accessing the device while in
- * reset. Locking multiple devices is prone to deadlock,
- * runaway and unwind if we hit contention.
+ * Locking multiple devices is prone to deadlock; run away and
+ * unwind if we hit contention.
*/
- if (!down_write_trylock(&vdev->memory_lock)) {
+ if (!vfio_pci_zap_and_vma_lock(cur_vma, true)) {
ret = -EBUSY;
- break;
+ goto err_undo;
}
-
- vfio_pci_zap_bars(vdev);
}
+ cur_vma = NULL;
- if (!list_entry_is_head(vdev,
- &dev_set->device_list, vdev.dev_set_list)) {
- vdev = list_prev_entry(vdev, vdev.dev_set_list);
- goto err_undo;
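+ /*
+ * Every vma_lock is now held with all mappings zapped; take each
+ * device's memory_lock for write. Once held, vma_lock can be
+ * released, as faults will block on the write-held memory_lock.
+ */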
+ list_for_each_entry(cur_mem, &dev_set->device_list, vdev.dev_set_list) {
+ if (!down_write_trylock(&cur_mem->memory_lock)) {
+ ret = -EBUSY;
+ goto err_undo;
+ }
+ mutex_unlock(&cur_mem->vma_lock);
}
+ cur_mem = NULL;
/*
* The pci_reset_bus() will reset all the devices in the bus.
@@ -2430,22 +2578,25 @@ static int vfio_pci_dev_set_hot_reset(struct vfio_device_set *dev_set,
* cause the PCI config space reset without restoring the original
* state (saved locally in 'vdev->pm_save').
*/
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
- vfio_pci_set_power_state(vdev, PCI_D0);
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+ vfio_pci_set_power_state(cur, PCI_D0);
ret = pci_reset_bus(pdev);
- vdev = list_last_entry(&dev_set->device_list,
- struct vfio_pci_core_device, vdev.dev_set_list);
-
err_undo:
- list_for_each_entry_from_reverse(vdev, &dev_set->device_list,
- vdev.dev_set_list)
- up_write(&vdev->memory_lock);
-
- list_for_each_entry(vdev, &dev_set->device_list, vdev.dev_set_list)
- pm_runtime_put(&vdev->pdev->dev);
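+ /*
+ * cur_vma and cur_mem mark how far the vma_lock and memory_lock
+ * passes got before failing; NULL means that pass completed.
+ * Devices before cur_mem hold memory_lock, devices from cur_mem
+ * up to cur_vma still hold vma_lock.
+ */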
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) {
+ if (cur == cur_mem)
+ is_mem = false;
+ if (cur == cur_vma)
+ break;
+ if (is_mem)
+ up_write(&cur->memory_lock);
+ else
+ mutex_unlock(&cur->vma_lock);
+ }
+ list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list)
+ pm_runtime_put(&cur->pdev->dev);
err_unlock:
mutex_unlock(&dev_set->lock);
return ret;
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -93,6 +93,8 @@ struct vfio_pci_core_device {
struct list_head sriov_pfs_item;
struct vfio_pci_core_device *sriov_pf_core_dev;
struct notifier_block nb;
+ struct mutex vma_lock; /* protects vma_list; nests under mmap_lock */
+ struct list_head vma_list; /* vfio_pci_mmap_vma entries for this device */
struct rw_semaphore memory_lock;
};