diff mbox

mm: stop leaking PageTables

Message ID alpine.LSU.2.11.1701071526090.1130@eggly.anvils (mailing list archive)
State Not Applicable
Headers show

Commit Message

Hugh Dickins Jan. 7, 2017, 11:37 p.m. UTC
4.10-rc loadtest (even on x86, even without THPCache) fails with
"fork: Cannot allocate memory" or some such; and /proc/meminfo
shows PageTables growing.

rc1 removed the freeing of an unused preallocated pagetable after
do_fault_around() has called map_pages(): which is usually a good
optimization, so that the followup doesn't have to reallocate one;
but it's not sufficient to shift the freeing into alloc_set_pte(),
since there are failure cases (most commonly VM_FAULT_RETRY) which
never reach finish_fault().

Check and free it at the outer level in do_fault(), then we don't
need to worry in alloc_set_pte(), and can restore that to how it was
(I cannot find any reason to pte_free() under lock as it was doing).

And fix a separate pagetable leak, or crash, introduced by the same
change, that could only show up on some ppc64: why does do_set_pmd()'s
failure case attempt to withdraw a pagetable when it never deposited
one, at the same time overwriting (so leaking) the vmf->prealloc_pte?
Residue of an earlier implementation, perhaps?  Delete it.

Fixes: 953c66c2b22a ("mm: THP page cache support for ppc64")
Signed-off-by: Hugh Dickins <hughd@google.com>
---

 mm/memory.c |   47 ++++++++++++++++++++---------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

Comments

Aneesh Kumar K.V Jan. 8, 2017, 6:59 a.m. UTC | #1
Hugh Dickins <hughd@google.com> writes:

> 4.10-rc loadtest (even on x86, even without THPCache) fails with
> "fork: Cannot allocate memory" or some such; and /proc/meminfo
> shows PageTables growing.
>
> rc1 removed the freeing of an unused preallocated pagetable after
> do_fault_around() has called map_pages(): which is usually a good
> optimization, so that the followup doesn't have to reallocate one;
> but it's not sufficient to shift the freeing into alloc_set_pte(),
> since there are failure cases (most commonly VM_FAULT_RETRY) which
> never reach finish_fault().
>
> Check and free it at the outer level in do_fault(), then we don't
> need to worry in alloc_set_pte(), and can restore that to how it was
> (I cannot find any reason to pte_free() under lock as it was doing).
>
> And fix a separate pagetable leak, or crash, introduced by the same
> change, that could only show up on some ppc64: why does do_set_pmd()'s
> failure case attempt to withdraw a pagetable when it never deposited
> one, at the same time overwriting (so leaking) the vmf->prealloc_pte?
> Residue of an earlier implementation, perhaps?  Delete it.

That change is part of -mm tree.

https://lkml.kernel.org/r/20161212163428.6780-1-aneesh.kumar@linux.vnet.ibm.com

>
> Fixes: 953c66c2b22a ("mm: THP page cache support for ppc64")
> Signed-off-by: Hugh Dickins <hughd@google.com>


Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

> ---
>
>  mm/memory.c |   47 ++++++++++++++++++++---------------------------
>  1 file changed, 20 insertions(+), 27 deletions(-)
>
> --- 4.10-rc2/mm/memory.c	2016-12-25 18:40:50.830453384 -0800
> +++ linux/mm/memory.c	2017-01-07 13:34:29.373381551 -0800
> @@ -3008,13 +3008,6 @@ static int do_set_pmd(struct vm_fault *v
>  	ret = 0;
>  	count_vm_event(THP_FILE_MAPPED);
>  out:
> -	/*
> -	 * If we are going to fallback to pte mapping, do a
> -	 * withdraw with pmd lock held.
> -	 */
> -	if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
> -		vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
> -								vmf->pmd);
>  	spin_unlock(vmf->ptl);
>  	return ret;
>  }
> @@ -3055,20 +3048,18 @@ int alloc_set_pte(struct vm_fault *vmf,
>
>  		ret = do_set_pmd(vmf, page);
>  		if (ret != VM_FAULT_FALLBACK)
> -			goto fault_handled;
> +			return ret;
>  	}
>
>  	if (!vmf->pte) {
>  		ret = pte_alloc_one_map(vmf);
>  		if (ret)
> -			goto fault_handled;
> +			return ret;
>  	}
>
>  	/* Re-check under ptl */
> -	if (unlikely(!pte_none(*vmf->pte))) {
> -		ret = VM_FAULT_NOPAGE;
> -		goto fault_handled;
> -	}
> +	if (unlikely(!pte_none(*vmf->pte)))
> +		return VM_FAULT_NOPAGE;
>
>  	flush_icache_page(vma, page);
>  	entry = mk_pte(page, vma->vm_page_prot);
> @@ -3088,15 +3079,8 @@ int alloc_set_pte(struct vm_fault *vmf,
>
>  	/* no need to invalidate: a not-present page won't be cached */
>  	update_mmu_cache(vma, vmf->address, vmf->pte);
> -	ret = 0;
>
> -fault_handled:
> -	/* preallocated pagetable is unused: free it */
> -	if (vmf->prealloc_pte) {
> -		pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
> -		vmf->prealloc_pte = 0;
> -	}
> -	return ret;
> +	return 0;
>  }
>
>
> @@ -3360,15 +3344,24 @@ static int do_shared_fault(struct vm_fau
>  static int do_fault(struct vm_fault *vmf)
>  {
>  	struct vm_area_struct *vma = vmf->vma;
> +	int ret;
>
>  	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
>  	if (!vma->vm_ops->fault)
> -		return VM_FAULT_SIGBUS;
> -	if (!(vmf->flags & FAULT_FLAG_WRITE))
> -		return do_read_fault(vmf);
> -	if (!(vma->vm_flags & VM_SHARED))
> -		return do_cow_fault(vmf);
> -	return do_shared_fault(vmf);
> +		ret = VM_FAULT_SIGBUS;
> +	else if (!(vmf->flags & FAULT_FLAG_WRITE))
> +		ret = do_read_fault(vmf);
> +	else if (!(vma->vm_flags & VM_SHARED))
> +		ret = do_cow_fault(vmf);
> +	else
> +		ret = do_shared_fault(vmf);
> +
> +	/* preallocated pagetable is unused: free it */
> +	if (vmf->prealloc_pte) {
> +		pte_free(vma->vm_mm, vmf->prealloc_pte);
> +		vmf->prealloc_pte = 0;
> +	}
> +	return ret;
>  }
>
>  static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
Hugh Dickins Jan. 8, 2017, 8:21 p.m. UTC | #2
On Sun, 8 Jan 2017, Aneesh Kumar K.V wrote:
> Hugh Dickins <hughd@google.com> writes:
> 
> > And fix a separate pagetable leak, or crash, introduced by the same
> > change, that could only show up on some ppc64: why does do_set_pmd()'s
> > failure case attempt to withdraw a pagetable when it never deposited
> > one, at the same time overwriting (so leaking) the vmf->prealloc_pte?
> > Residue of an earlier implementation, perhaps?  Delete it.
> 
> That change is part of -mm tree.
> 
> https://lkml.kernel.org/r/20161212163428.6780-1-aneesh.kumar@linux.vnet.ibm.com

Ah, so it is, I hadn't looked there.  That's reassuring,
I'm glad to know you reached the same conclusion on that piece of code.

It still worried me that the fix is languishing in mmotm, but it looks
not lost: akpm would have sent it in a couple of days anyway, and only
affected ppc64 (like the related khugepaged patch you have queued there).

> 
> >
> > Fixes: 953c66c2b22a ("mm: THP page cache support for ppc64")
> > Signed-off-by: Hugh Dickins <hughd@google.com>
> 
> 
> Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

Thanks, and to Linus, who already has this in for -rc3: so akpm can drop
mm-thp-pagecache-only-withdraw-page-table-after-a-successful-deposit.patch
and then later send in your
mm-thp-pagecache-collapse-free-the-pte-page-table-on-collapse-for-thp-page-cache.patch

Hugh
Kirill A. Shutemov Jan. 8, 2017, 11:29 p.m. UTC | #3
On Sat, Jan 07, 2017 at 03:37:31PM -0800, Hugh Dickins wrote:
> 4.10-rc loadtest (even on x86, even without THPCache) fails with
> "fork: Cannot allocate memory" or some such; and /proc/meminfo
> shows PageTables growing.
> 
> rc1 removed the freeing of an unused preallocated pagetable after
> do_fault_around() has called map_pages(): which is usually a good
> optimization, so that the followup doesn't have to reallocate one;
> but it's not sufficient to shift the freeing into alloc_set_pte(),
> since there are failure cases (most commonly VM_FAULT_RETRY) which
> never reach finish_fault().
> 
> Check and free it at the outer level in do_fault(), then we don't
> need to worry in alloc_set_pte(), and can restore that to how it was
> (I cannot find any reason to pte_free() under lock as it was doing).
> 
> And fix a separate pagetable leak, or crash, introduced by the same
> change, that could only show up on some ppc64: why does do_set_pmd()'s
> failure case attempt to withdraw a pagetable when it never deposited
> one, at the same time overwriting (so leaking) the vmf->prealloc_pte?
> Residue of an earlier implementation, perhaps?  Delete it.
> 
> Fixes: 953c66c2b22a ("mm: THP page cache support for ppc64")
> Signed-off-by: Hugh Dickins <hughd@google.com>

Sorry, that I missed this initially.

Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
diff mbox

Patch

--- 4.10-rc2/mm/memory.c	2016-12-25 18:40:50.830453384 -0800
+++ linux/mm/memory.c	2017-01-07 13:34:29.373381551 -0800
@@ -3008,13 +3008,6 @@  static int do_set_pmd(struct vm_fault *v
 	ret = 0;
 	count_vm_event(THP_FILE_MAPPED);
 out:
-	/*
-	 * If we are going to fallback to pte mapping, do a
-	 * withdraw with pmd lock held.
-	 */
-	if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK)
-		vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm,
-								vmf->pmd);
 	spin_unlock(vmf->ptl);
 	return ret;
 }
@@ -3055,20 +3048,18 @@  int alloc_set_pte(struct vm_fault *vmf,
 
 		ret = do_set_pmd(vmf, page);
 		if (ret != VM_FAULT_FALLBACK)
-			goto fault_handled;
+			return ret;
 	}
 
 	if (!vmf->pte) {
 		ret = pte_alloc_one_map(vmf);
 		if (ret)
-			goto fault_handled;
+			return ret;
 	}
 
 	/* Re-check under ptl */
-	if (unlikely(!pte_none(*vmf->pte))) {
-		ret = VM_FAULT_NOPAGE;
-		goto fault_handled;
-	}
+	if (unlikely(!pte_none(*vmf->pte)))
+		return VM_FAULT_NOPAGE;
 
 	flush_icache_page(vma, page);
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -3088,15 +3079,8 @@  int alloc_set_pte(struct vm_fault *vmf,
 
 	/* no need to invalidate: a not-present page won't be cached */
 	update_mmu_cache(vma, vmf->address, vmf->pte);
-	ret = 0;
 
-fault_handled:
-	/* preallocated pagetable is unused: free it */
-	if (vmf->prealloc_pte) {
-		pte_free(vmf->vma->vm_mm, vmf->prealloc_pte);
-		vmf->prealloc_pte = 0;
-	}
-	return ret;
+	return 0;
 }
 
 
@@ -3360,15 +3344,24 @@  static int do_shared_fault(struct vm_fau
 static int do_fault(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
+	int ret;
 
 	/* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
 	if (!vma->vm_ops->fault)
-		return VM_FAULT_SIGBUS;
-	if (!(vmf->flags & FAULT_FLAG_WRITE))
-		return do_read_fault(vmf);
-	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(vmf);
-	return do_shared_fault(vmf);
+		ret = VM_FAULT_SIGBUS;
+	else if (!(vmf->flags & FAULT_FLAG_WRITE))
+		ret = do_read_fault(vmf);
+	else if (!(vma->vm_flags & VM_SHARED))
+		ret = do_cow_fault(vmf);
+	else
+		ret = do_shared_fault(vmf);
+
+	/* preallocated pagetable is unused: free it */
+	if (vmf->prealloc_pte) {
+		pte_free(vma->vm_mm, vmf->prealloc_pte);
+		vmf->prealloc_pte = 0;
+	}
+	return ret;
 }
 
 static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,