From patchwork Fri Mar 29 05:46:40 2019
X-Patchwork-Submitter: Alexey Kardashevskiy
X-Patchwork-Id: 1069166
From: Alexey Kardashevskiy
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy, David Gibson, kvm-ppc@vger.kernel.org, "Aneesh Kumar K.V", Paul Mackerras
Subject: [PATCH kernel 1/2] powerpc/mm_iommu: Prepare for less locking
Date: Fri, 29 Mar 2019 16:46:40 +1100
Message-Id: <20190329054641.48597-2-aik@ozlabs.ru>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20190329054641.48597-1-aik@ozlabs.ru>
References: <20190329054641.48597-1-aik@ozlabs.ru>
X-Mailing-List: kvm-ppc@vger.kernel.org

The next patch will reduce the amount of time spent under locks.

This adds mm_iommu_find() to check whether a region is already registered.

This removes a rather ugly union from the mm_iommu_table_group_mem_t struct
and keeps the hack local to mm_iommu_do_alloc().

This makes pageshift and hpas local variables and assigns them to the
descriptor late, as this code is about to move into a helper.

This should cause no behavioral change.

Signed-off-by: Alexey Kardashevskiy
---
 arch/powerpc/mm/mmu_context_iommu.c | 82 +++++++++++++++--------------
 1 file changed, 43 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index e7a9c4f6bfca..6b351c79713b 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -35,18 +35,8 @@ struct mm_iommu_table_group_mem_t {
         atomic64_t mapped;
         unsigned int pageshift;
         u64 ua; /* userspace address */
-        u64 entries; /* number of entries in hpas/hpages[] */
-        /*
-         * in mm_iommu_get we temporarily use this to store
-         * struct page address.
-         *
-         * We need to convert ua to hpa in real mode. Make it
-         * simpler by storing physical address.
-         */
-        union {
-                struct page **hpages; /* vmalloc'ed */
-                phys_addr_t *hpas;
-        };
+        u64 entries; /* number of entries in hpas */
+        phys_addr_t *hpas;
 #define MM_IOMMU_TABLE_INVALID_HPA ((uint64_t)-1)
         u64 dev_hpa; /* Device memory base address */
 };
@@ -91,26 +81,36 @@ bool mm_iommu_preregistered(struct mm_struct *mm)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);

+/* Must be called with &mem_list_mutex held */
+static bool mm_iommu_find(struct mm_struct *mm, unsigned long ua,
+                unsigned long entries)
+{
+        struct mm_iommu_table_group_mem_t *mem;
+
+        list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list, next) {
+                /* Overlap? */
+                if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
+                                (ua < (mem->ua + (mem->entries << PAGE_SHIFT))))
+                        return true;
+        }
+        return false;
+}
+
 static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
                 unsigned long entries, unsigned long dev_hpa,
                 struct mm_iommu_table_group_mem_t **pmem)
 {
         struct mm_iommu_table_group_mem_t *mem;
         long i, ret, locked_entries = 0;
-        unsigned int pageshift;
+        unsigned int pageshift, mem_pageshift;
+        struct page **hpages;
+        phys_addr_t *hpas;

         mutex_lock(&mem_list_mutex);

-        list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
-                        next) {
-                /* Overlap? */
-                if ((mem->ua < (ua + (entries << PAGE_SHIFT))) &&
-                                (ua < (mem->ua +
-                                       (mem->entries << PAGE_SHIFT)))) {
-                        ret = -EINVAL;
-                        goto unlock_exit;
-                }
-
+        if (mm_iommu_find(mm, ua, entries)) {
+                ret = -EINVAL;
+                goto unlock_exit;
         }

         if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
@@ -128,58 +128,60 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
         }

         if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
-                mem->pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+                mem_pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
+                hpas = NULL;
                 mem->dev_hpa = dev_hpa;
                 goto good_exit;
         }
         mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;

-        /*
-         * For a starting point for a maximum page size calculation
-         * we use @ua and @entries natural alignment to allow IOMMU pages
-         * smaller than huge pages but still bigger than PAGE_SIZE.
-         */
-        mem->pageshift = __ffs(ua | (entries << PAGE_SHIFT));
-        mem->hpas = vzalloc(array_size(entries, sizeof(mem->hpas[0])));
-        if (!mem->hpas) {
+        hpages = vzalloc(array_size(entries, sizeof(hpages[0])));
+        if (!hpages) {
                 kfree(mem);
                 ret = -ENOMEM;
                 goto unlock_exit;
         }

         down_read(&mm->mmap_sem);
-        ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, mem->hpages, NULL);
+        ret = get_user_pages_longterm(ua, entries, FOLL_WRITE, hpages, NULL);
         up_read(&mm->mmap_sem);
         if (ret != entries) {
                 /* free the reference taken */
                 for (i = 0; i < ret; i++)
-                        put_page(mem->hpages[i]);
+                        put_page(hpages[i]);

-                vfree(mem->hpas);
+                vfree(hpages);
                 kfree(mem);
                 ret = -EFAULT;
                 goto unlock_exit;
         }

+        /*
+         * For a starting point for a maximum page size calculation
+         * we use @ua and @entries natural alignment to allow IOMMU pages
+         * smaller than huge pages but still bigger than PAGE_SIZE.
+         */
+        mem_pageshift = __ffs(ua | (entries << PAGE_SHIFT));
+        hpas = (phys_addr_t *) hpages;
         pageshift = PAGE_SHIFT;
         for (i = 0; i < entries; ++i) {
-                struct page *page = mem->hpages[i];
+                struct page *page = hpages[i];

                 /*
                  * Allow to use larger than 64k IOMMU pages. Only do that
                  * if we are backed by hugetlb.
                  */
-                if ((mem->pageshift > PAGE_SHIFT) && PageHuge(page)) {
+                if ((mem_pageshift > PAGE_SHIFT) && PageHuge(page)) {
                         struct page *head = compound_head(page);

                         pageshift = compound_order(head) + PAGE_SHIFT;
                 }
-                mem->pageshift = min(mem->pageshift, pageshift);
+                mem_pageshift = min(mem_pageshift, pageshift);
                 /*
                  * We don't need struct page reference any more, switch
                  * to physical address.
                  */
-                mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
+                hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
         }

 good_exit:
@@ -188,6 +190,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
         mem->used = 1;
         mem->ua = ua;
         mem->entries = entries;
+        mem->hpas = hpas;
+        mem->pageshift = mem_pageshift;
         *pmem = mem;

         list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);

From patchwork Fri Mar 29 05:46:41 2019
X-Patchwork-Submitter: Alexey Kardashevskiy
X-Patchwork-Id: 1069167
From: Alexey Kardashevskiy
To: linuxppc-dev@lists.ozlabs.org
Cc: Alexey Kardashevskiy, David Gibson, kvm-ppc@vger.kernel.org, "Aneesh Kumar K.V", Paul Mackerras
Subject: [PATCH kernel 2/2] powerpc/mm_iommu: Fix potential deadlock
Date: Fri, 29 Mar 2019 16:46:41 +1100
Message-Id: <20190329054641.48597-3-aik@ozlabs.ru>
X-Mailer: git-send-email 2.17.1
In-Reply-To: <20190329054641.48597-1-aik@ozlabs.ru>
References: <20190329054641.48597-1-aik@ozlabs.ru>
X-Mailing-List: kvm-ppc@vger.kernel.org

Currently mm_iommu_do_alloc() is called in 2 cases:
- the VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl() for normal memory;
- vfio_pci_nvgpu_regops::mmap() for GPU memory.

One of the differences here is that mmap() is called with mm::mmap_sem
already held, while mm_iommu_do_alloc() locks mm::mmap_sem itself (when
adjusting locked_vm and when pinning pages), which can potentially cause
a deadlock. We have not hit this yet because the mmap() path does not
adjust locked_vm and does not pin pages. However, with
CONFIG_DEBUG_LOCKDEP=y there is an annoying (and slightly confusing)
warning, quoted below.

This makes a few changes to reduce the amount of time spent under a lock.

This holds mem_list_mutex only when looking at or changing the mem list.
This means the list is now checked twice for the normal memory case:
before pinning starts and before the item is added to the list.

This changes mm_iommu_do_alloc() to only allocate and add an IOMMU memory
descriptor (which is used to deal with both normal and GPU memory in a
rather messy way). As a result, mm_iommu_new() and mm_iommu_do_alloc() no
longer need to test for (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA), which
makes the code simpler.

This moves the locked_vm decrement out from under mem_list_mutex for the
same reasons.
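To make the resulting lock ordering easier to follow, here is a simplified
sketch of the new flow for the normal memory case. This is an illustration
only, not the exact hunk below: the pinning loop and the error unwinding are
elided, and mm_iommu_new_flow() is a made-up name.

static long mm_iommu_new_flow(struct mm_struct *mm, unsigned long ua,
                unsigned long entries,
                struct mm_iommu_table_group_mem_t **pmem)
{
        unsigned int mem_pageshift = PAGE_SHIFT; /* refined while pinning */
        phys_addr_t *hpas = NULL; /* filled from the pinned pages */
        long ret;

        /* Cheap early check; the list can still change once the lock is dropped */
        mutex_lock(&mem_list_mutex);
        if (mm_iommu_find(mm, ua, entries)) {
                mutex_unlock(&mem_list_mutex);
                return -EINVAL;
        }
        mutex_unlock(&mem_list_mutex);

        /* Takes mm->mmap_sem, so it must not run under mem_list_mutex */
        ret = mm_iommu_adjust_locked_vm(mm, entries, true);
        if (ret)
                return ret;

        /* ... vzalloc() the page array, get_user_pages_longterm(), build hpas ... */

        /* Re-check for overlaps and publish the descriptor in one critical section */
        mutex_lock(&mem_list_mutex);
        ret = mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
                        mem_pageshift, hpas, pmem);
        mutex_unlock(&mem_list_mutex);

        return ret; /* the real code undoes locked_vm and frees the pages on error */
}

The point is that mm::mmap_sem is only ever taken while mem_list_mutex is not
held, and the overlap check is repeated under the mutex because the list may
have changed in between.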
This is one of the lockdep warnings:

======================================================
WARNING: possible circular locking dependency detected
5.1.0-rc2-le_nv2_aikATfstn1-p1 #363 Not tainted
------------------------------------------------------
qemu-system-ppc/8038 is trying to acquire lock:
000000002ec6c453 (mem_list_mutex){+.+.}, at: mm_iommu_do_alloc+0x70/0x490

but task is already holding lock:
00000000fd7da97f (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0xf0/0x160

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&mm->mmap_sem){++++}:
       lock_acquire+0xf8/0x260
       down_write+0x44/0xa0
       mm_iommu_adjust_locked_vm.part.1+0x4c/0x190
       mm_iommu_do_alloc+0x310/0x490
       tce_iommu_ioctl.part.9+0xb84/0x1150 [vfio_iommu_spapr_tce]
       vfio_fops_unl_ioctl+0x94/0x430 [vfio]
       do_vfs_ioctl+0xe4/0x930
       ksys_ioctl+0xc4/0x110
       sys_ioctl+0x28/0x80
       system_call+0x5c/0x70

-> #0 (mem_list_mutex){+.+.}:
       __lock_acquire+0x1484/0x1900
       lock_acquire+0xf8/0x260
       __mutex_lock+0x88/0xa70
       mm_iommu_do_alloc+0x70/0x490
       vfio_pci_nvgpu_mmap+0xc0/0x130 [vfio_pci]
       vfio_pci_mmap+0x198/0x2a0 [vfio_pci]
       vfio_device_fops_mmap+0x44/0x70 [vfio]
       mmap_region+0x5d4/0x770
       do_mmap+0x42c/0x650
       vm_mmap_pgoff+0x124/0x160
       ksys_mmap_pgoff+0xdc/0x2f0
       sys_mmap+0x40/0x80
       system_call+0x5c/0x70

other info that might help us debug this:

 Possible unsafe locking scenario:

       CPU0                    CPU1
       ----                    ----
  lock(&mm->mmap_sem);
                               lock(mem_list_mutex);
                               lock(&mm->mmap_sem);
  lock(mem_list_mutex);

 *** DEADLOCK ***

1 lock held by qemu-system-ppc/8038:
 #0: 00000000fd7da97f (&mm->mmap_sem){++++}, at: vm_mmap_pgoff+0xf0/0x160

Signed-off-by: Alexey Kardashevskiy
---
 arch/powerpc/mm/mmu_context_iommu.c | 128 ++++++++++++++--------------
 1 file changed, 65 insertions(+), 63 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 6b351c79713b..36a826e23d45 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -96,50 +96,59 @@ static bool mm_iommu_find(struct mm_struct *mm, unsigned long ua,
         return false;
 }

-static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
-                unsigned long entries, unsigned long dev_hpa,
-                struct mm_iommu_table_group_mem_t **pmem)
+/* Must be called with &mem_list_mutex held */
+static int mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
+                unsigned long entries, unsigned long dev_hpa,
+                unsigned int mem_pageshift, phys_addr_t *hpas,
+                struct mm_iommu_table_group_mem_t **pmem)
 {
         struct mm_iommu_table_group_mem_t *mem;
-        long i, ret, locked_entries = 0;
+
+        if (mm_iommu_find(mm, ua, entries))
+                return -EINVAL;
+
+        mem = kzalloc(sizeof(*mem), GFP_KERNEL);
+        if (!mem)
+                return -ENOMEM;
+
+        atomic64_set(&mem->mapped, 1);
+        mem->dev_hpa = dev_hpa;
+        mem->used = 1;
+        mem->ua = ua;
+        mem->entries = entries;
+        mem->pageshift = mem_pageshift;
+        mem->hpas = hpas;
+        list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
+        *pmem = mem;
+
+        return 0;
+}
+
+long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
+                struct mm_iommu_table_group_mem_t **pmem)
+{
+        long i, ret = 0, locked_entries = 0;
         unsigned int pageshift, mem_pageshift;
         struct page **hpages;
         phys_addr_t *hpas;

         mutex_lock(&mem_list_mutex);
-
         if (mm_iommu_find(mm, ua, entries)) {
-                ret = -EINVAL;
-                goto unlock_exit;
+                mutex_unlock(&mem_list_mutex);
+                return -EINVAL;
         }
+        mutex_unlock(&mem_list_mutex);

-        if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA) {
-                ret = mm_iommu_adjust_locked_vm(mm, entries, true);
-                if (ret)
-                        goto unlock_exit;
+        ret = mm_iommu_adjust_locked_vm(mm, entries, true);
+        if (ret)
+                return ret;

-                locked_entries = entries;
-        }
-
-        mem = kzalloc(sizeof(*mem), GFP_KERNEL);
-        if (!mem) {
-                ret = -ENOMEM;
-                goto unlock_exit;
-        }
-
-        if (dev_hpa != MM_IOMMU_TABLE_INVALID_HPA) {
-                mem_pageshift = __ffs(dev_hpa | (entries << PAGE_SHIFT));
-                hpas = NULL;
-                mem->dev_hpa = dev_hpa;
-                goto good_exit;
-        }
-        mem->dev_hpa = MM_IOMMU_TABLE_INVALID_HPA;
+        locked_entries = entries;

         hpages = vzalloc(array_size(entries, sizeof(hpages[0])));
         if (!hpages) {
-                kfree(mem);
                 ret = -ENOMEM;
-                goto unlock_exit;
+                goto cleanup_exit;
         }

         down_read(&mm->mmap_sem);
@@ -149,11 +158,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
                 /* free the reference taken */
                 for (i = 0; i < ret; i++)
                         put_page(hpages[i]);
-
-                vfree(hpages);
-                kfree(mem);
                 ret = -EFAULT;
-                goto unlock_exit;
+                goto cleanup_exit;
         }

         /*
@@ -184,40 +190,35 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, unsigned long ua,
                 hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
         }

-good_exit:
-        ret = 0;
-        atomic64_set(&mem->mapped, 1);
-        mem->used = 1;
-        mem->ua = ua;
-        mem->entries = entries;
-        mem->hpas = hpas;
-        mem->pageshift = mem_pageshift;
-        *pmem = mem;
-
-        list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
-
-unlock_exit:
-        if (locked_entries && ret)
-                mm_iommu_adjust_locked_vm(mm, locked_entries, false);
-
+        mutex_lock(&mem_list_mutex);
+        ret = mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
+                        mem_pageshift, hpas, pmem);
         mutex_unlock(&mem_list_mutex);
+        if (ret)
+                goto cleanup_exit;

+        return 0;
+
+cleanup_exit:
+        mm_iommu_adjust_locked_vm(mm, locked_entries, false);
+        vfree(hpages);
+
         return ret;
 }
-
-long mm_iommu_new(struct mm_struct *mm, unsigned long ua, unsigned long entries,
-                struct mm_iommu_table_group_mem_t **pmem)
-{
-        return mm_iommu_do_alloc(mm, ua, entries, MM_IOMMU_TABLE_INVALID_HPA,
-                        pmem);
-}
 EXPORT_SYMBOL_GPL(mm_iommu_new);

 long mm_iommu_newdev(struct mm_struct *mm, unsigned long ua,
                 unsigned long entries, unsigned long dev_hpa,
                 struct mm_iommu_table_group_mem_t **pmem)
 {
-        return mm_iommu_do_alloc(mm, ua, entries, dev_hpa, pmem);
+        int ret;
+
+        mutex_lock(&mem_list_mutex);
+        ret = mm_iommu_do_alloc(mm, ua, entries, dev_hpa,
+                        __ffs(dev_hpa | (entries << PAGE_SHIFT)), NULL, pmem);
+        mutex_unlock(&mem_list_mutex);
+
+        return ret;
 }
 EXPORT_SYMBOL_GPL(mm_iommu_newdev);

@@ -270,10 +271,13 @@ static void mm_iommu_release(struct mm_iommu_table_group_mem_t *mem)
 long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
 {
         long ret = 0;
-        unsigned long entries, dev_hpa;
+        unsigned long unlock_entries = 0;

         mutex_lock(&mem_list_mutex);

+        if (mem->dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
+                unlock_entries = mem->entries;
+
         if (mem->used == 0) {
                 ret = -ENOENT;
                 goto unlock_exit;
@@ -292,16 +296,14 @@ long mm_iommu_put(struct mm_struct *mm, struct mm_iommu_table_group_mem_t *mem)
         }

         /* @mapped became 0 so now mappings are disabled, release the region */
-        entries = mem->entries;
-        dev_hpa = mem->dev_hpa;
         mm_iommu_release(mem);

-        if (dev_hpa == MM_IOMMU_TABLE_INVALID_HPA)
-                mm_iommu_adjust_locked_vm(mm, entries, false);
-
 unlock_exit:
         mutex_unlock(&mem_list_mutex);

+        if (!ret)
+                mm_iommu_adjust_locked_vm(mm, unlock_entries, false);
+
         return ret;
 }
 EXPORT_SYMBOL_GPL(mm_iommu_put);
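For context, this is roughly how the first caller listed in the commit
message (the VFIO_IOMMU_SPAPR_REGISTER_MEMORY ioctl() for normal memory) is
driven from userspace. A minimal sketch, not taken from these patches: it
assumes `container` is a VFIO container fd already opened and switched to the
sPAPR TCE IOMMU backend (that setup is omitted), and preregister() is just an
illustrative helper name.

#include <linux/vfio.h>
#include <sys/ioctl.h>
#include <stddef.h>
#include <stdio.h>

/*
 * Preregister a buffer so later DMA map/unmap calls run against
 * already-pinned memory; in the kernel this ends up in the
 * mm_iommu_new()/mm_iommu_do_alloc() path touched by these patches,
 * where the pages are pinned and locked_vm is charged.
 */
static int preregister(int container, void *buf, size_t size)
{
        struct vfio_iommu_spapr_register_memory reg = {
                .argsz = sizeof(reg),
                .flags = 0,
                .vaddr = (__u64)(unsigned long)buf,
                .size  = size,
        };

        if (ioctl(container, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg)) {
                perror("VFIO_IOMMU_SPAPR_REGISTER_MEMORY");
                return -1;
        }
        return 0;
}

The mmap() path mentioned in the commit message (vfio_pci_nvgpu_regops::mmap()
for GPU memory) reaches the same allocation code with mm::mmap_sem already
held, which is what the lock reordering above is protecting against.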