[v3,14/20] mm: Provide speculative fault infrastructure

Message ID	1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com (mailing list archive)
State	Not Applicable
Headers	show Return-Path: <linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org> Gateway: Authorized Use Only! Violators will be prosecuted for <linuxppc-dev@lists.ozlabs.org> from <ldufour@linux.vnet.ibm.com>; Fri, 8 Sep 2017 19:08:00 +0100 Gateway: Authorized Use Only! Violators will be prosecuted; Fri, 8 Sep 2017 19:07:54 +0100 From: Laurent Dufour <ldufour@linux.vnet.ibm.com> To: paulmck@linux.vnet.ibm.com, peterz@infradead.org, akpm@linux-foundation.org, kirill@shutemov.name, ak@linux.intel.com, mhocko@kernel.org, dave@stgolabs.net, jack@suse.cz, Matthew Wilcox <willy@infradead.org>, benh@kernel.crashing.org, mpe@ellerman.id.au, paulus@samba.org, Thomas Gleixner <tglx@linutronix.de>, Ingo Molnar <mingo@redhat.com>, hpa@zytor.com, Will Deacon <will.deacon@arm.com>, Sergey Senozhatsky <sergey.senozhatsky@gmail.com> Subject: [PATCH v3 14/20] mm: Provide speculative fault infrastructure Date: Fri, 8 Sep 2017 20:06:58 +0200 In-Reply-To: <1504894024-2750-1-git-send-email-ldufour@linux.vnet.ibm.com> References: <1504894024-2750-1-git-send-email-ldufour@linux.vnet.ibm.com> Message-Id: <1504894024-2750-15-git-send-email-ldufour@linux.vnet.ibm.com> Precedence: list Cc: linuxppc-dev@lists.ozlabs.org, x86@kernel.org, linux-kernel@vger.kernel.org, npiggin@gmail.com, linux-mm@kvack.org, Tim Chen <tim.c.chen@linux.intel.com>, haren@linux.vnet.ibm.com, khandual@linux.vnet.ibm.com Errors-To: linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org Sender: "Linuxppc-dev" <linuxppc-dev-bounces+patchwork-incoming=ozlabs.org@lists.ozlabs.org>
Series	Speculative page faults \| expand [v3,00/20] Speculative page faults [v3,01/20] mm: Dont assume page-table invariance during faults [v3,02/20] mm: Prepare for FAULT_FLAG_SPECULATIVE [v3,03/20] mm: Introduce pte_spinlock for FAULT_FLAG_SPECULATIVE [v3,04/20] mm: VMA sequence count [v3,05/20] mm: Protect VMA modifications using VMA sequence count [v3,06/20] mm: RCU free VMAs [v3,07/20] mm: Cache some VMA fields in the vm_fault structure [v3,08/20] mm: Protect SPF handler against anon_vma changes [v3,09/20] mm/migrate: Pass vm_fault pointer to migrate_misplaced_page() [v3,10/20] mm: Introduce __lru_cache_add_active_or_unevictable [v3,11/20] mm: Introduce __maybe_mkwrite() [v3,12/20] mm: Introduce __vm_normal_page() [v3,13/20] mm: Introduce __page_add_new_anon_rmap() [v3,14/20] mm: Provide speculative fault infrastructure [v3,15/20] mm: Try spin lock in speculative path [v3,16/20] mm: Adding speculative page fault failure trace events [v3,17/20] perf: Add a speculative page fault sw event [v3,18/20] perf tools: Add support for the SPF perf event [v3,19/20] x86/mm: Add speculative pagefault handling [v3,20/20] powerpc/mm: Add speculative page fault

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h index a4e7ca0f3585..6cfdfca4cc2a 100644 --- a/include/linux/hugetlb_inline.h +++ b/include/linux/hugetlb_inline.h @@ -7,7 +7,7 @@ static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma) { - return !!(vma->vm_flags & VM_HUGETLB); + return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB); } #else diff --git a/include/linux/mm.h b/include/linux/mm.h index a2857aaa03f1..966b69f10f57 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -320,6 +320,7 @@ struct vm_fault { gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ + unsigned int sequence; pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching @@ -1342,6 +1343,10 @@ int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags); +#ifdef __HAVE_ARCH_CALL_SPF +extern int handle_speculative_fault(struct mm_struct *mm, + unsigned long address, unsigned int flags); +#endif /* __HAVE_ARCH_CALL_SPF */ extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 5bbd6780f205..832aa3ec7d00 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -451,8 +451,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma, pgoff_t pgoff; if (unlikely(is_vm_hugetlb_page(vma))) return linear_hugepage_index(vma, address); - pgoff = (address - vma->vm_start) >> PAGE_SHIFT; - pgoff += vma->vm_pgoff; + pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT; + pgoff += READ_ONCE(vma->vm_pgoff); return pgoff; } diff --git a/mm/internal.h b/mm/internal.h index 84360184eafd..4ddadc440c26 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -45,6 +45,20 @@ extern struct srcu_struct vma_srcu; extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr); +static inline bool vma_has_changed(struct vm_fault *vmf) +{ + int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb); + unsigned seq = ACCESS_ONCE(vmf->vma->vm_sequence.sequence); + + /* + * Matches both the wmb in write_seqlock_{begin,end}() and + * the wmb in vma_rb_erase(). + */ + smp_rmb(); + + return ret || seq != vmf->sequence; +} + void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/memory.c b/mm/memory.c index 479b47a8ed7c..5e98259c7ac0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -762,7 +762,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (page) dump_page(page, "bad pte"); pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", - (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); + (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma, + mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ @@ -2417,15 +2418,69 @@ static inline void wp_page_reuse(struct vm_fault *vmf) static bool pte_spinlock(struct vm_fault *vmf) { + bool ret = false; + + /* Check if vma is still valid */ + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + return true; + } + + local_irq_disable(); + if (vma_has_changed(vmf)) + goto out; + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); - return true; + + if (vma_has_changed(vmf)) { + spin_unlock(vmf->ptl); + goto out; + } + + ret = true; +out: + local_irq_enable(); + return ret; } static bool pte_map_lock(struct vm_fault *vmf) { - vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - return true; + bool ret = false; + pte_t *pte; + spinlock_t *ptl; + + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) { + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + return true; + } + + /* + * The first vma_has_changed() guarantees the page-tables are still + * valid, having IRQs disabled ensures they stay around, hence the + * second vma_has_changed() to make sure they are still valid once + * we've got the lock. After that a concurrent zap_pte_range() will + * block on the PTL and thus we're safe. + */ + local_irq_disable(); + if (vma_has_changed(vmf)) + goto out; + + pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &ptl); + if (vma_has_changed(vmf)) { + pte_unmap_unlock(pte, ptl); + goto out; + } + + vmf->pte = pte; + vmf->ptl = ptl; + ret = true; +out: + local_irq_enable(); + return ret; } /* @@ -3094,6 +3149,14 @@ static int do_anonymous_page(struct vm_fault *vmf) ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; + /* + * Don't call the userfaultfd during the speculative path. + * We already checked for the VMA to not be managed through + * userfaultfd, but it may be set in our back once we have lock + * the pte. In such a case we can ignore it this time. + */ + if (vmf->flags & FAULT_FLAG_SPECULATIVE) + goto setpte; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -3136,7 +3199,7 @@ static int do_anonymous_page(struct vm_fault *vmf) goto release; /* Deliver the page fault to userland, check inside PT lock */ - if (userfaultfd_missing(vma)) { + if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); @@ -3915,6 +3978,8 @@ static int handle_pte_fault(struct vm_fault *vmf) if (!vmf->pte) { if (vma_is_anonymous(vmf->vma)) return do_anonymous_page(vmf); + else if (vmf->flags & FAULT_FLAG_SPECULATIVE) + return VM_FAULT_RETRY; else return do_fault(vmf); } @@ -4012,6 +4077,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, vmf.pmd = pmd_alloc(mm, vmf.pud, address); if (!vmf.pmd) return VM_FAULT_OOM; + vmf.sequence = raw_read_seqcount(&vma->vm_sequence); if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) @@ -4045,6 +4111,179 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, return handle_pte_fault(&vmf); } +#ifdef __HAVE_ARCH_CALL_SPF + +#ifndef __HAVE_ARCH_PTE_SPECIAL +/* This is required by vm_normal_page() */ +#error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL" +#endif + +/* + * vm_normal_page() adds some processing which should be done while + * hodling the mmap_sem. + */ +int handle_speculative_fault(struct mm_struct *mm, unsigned long address, + unsigned int flags) +{ + struct vm_fault vmf = { + .address = address, + }; + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + int dead, seq, idx, ret = VM_FAULT_RETRY; + struct vm_area_struct *vma; +#ifdef CONFIG_NUMA + struct mempolicy *pol; +#endif + + /* Clear flags that may lead to release the mmap_sem to retry */ + flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE); + flags |= FAULT_FLAG_SPECULATIVE; + + idx = srcu_read_lock(&vma_srcu); + vma = find_vma_srcu(mm, address); + if (!vma) + goto unlock; + + /* + * Validate the VMA found by the lockless lookup. + */ + dead = RB_EMPTY_NODE(&vma->vm_rb); + seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */ + if ((seq & 1) || dead) + goto unlock; + + /* + * Can't call vm_ops service has we don't know what they would do + * with the VMA. + * This include huge page from hugetlbfs. + */ + if (vma->vm_ops) + goto unlock; + + /* + * __anon_vma_prepare() requires the mmap_sem to be held + * because vm_next and vm_prev must be safe. This can't be guaranteed + * in the speculative path. + */ + if (unlikely(!vma->anon_vma)) + goto unlock; + + vmf.vma_flags = READ_ONCE(vma->vm_flags); + vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot); + + /* Can't call userland page fault handler in the speculative path */ + if (unlikely(vmf.vma_flags & VM_UFFD_MISSING)) + goto unlock; + +#ifdef CONFIG_NUMA + /* + * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which + * are not compatible with the speculative page fault processing. + */ + pol = __get_vma_policy(vma, address); + if (!pol) + pol = get_task_policy(current); + if (pol && pol->mode == MPOL_INTERLEAVE) + goto unlock; +#endif + + if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP) + /* + * This could be detected by the check address against VMA's + * boundaries but we want to trace it as not supported instead + * of changed. + */ + goto unlock; + + if (address < READ_ONCE(vma->vm_start) + || READ_ONCE(vma->vm_end) <= address) + goto unlock; + + if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE, + flags & FAULT_FLAG_INSTRUCTION, + flags & FAULT_FLAG_REMOTE)) { + ret = VM_FAULT_SIGSEGV; + goto unlock; + } + + /* This is one is required to check that the VMA has write access set */ + if (flags & FAULT_FLAG_WRITE) { + if (unlikely(!(vmf.vma_flags & VM_WRITE))) { + ret = VM_FAULT_SIGSEGV; + goto unlock; + } + } else if (unlikely(!(vmf.vma_flags & (VM_READ|VM_EXEC|VM_WRITE)))) { + ret = VM_FAULT_SIGSEGV; + goto unlock; + } + + /* + * Do a speculative lookup of the PTE entry. + */ + local_irq_disable(); + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto out_walk; + + p4d = p4d_alloc(mm, pgd, address); + if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d))) + goto out_walk; + + pud = pud_alloc(mm, p4d, address); + if (pud_none(*pud) || unlikely(pud_bad(*pud))) + goto out_walk; + + /* Transparent huge pages are not supported. */ + if (unlikely(pud_trans_huge(*pud))) + goto out_walk; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) + goto out_walk; + + /* + * The above does not allocate/instantiate page-tables because doing so + * would lead to the possibility of instantiating page-tables after + * free_pgtables() -- and consequently leaking them. + * + * The result is that we take at least one !speculative fault per PMD + * in order to instantiate it. + */ + /* Transparent huge pages are not supported. */ + if (unlikely(pmd_trans_huge(*pmd))) + goto out_walk; + + vmf.vma = vma; + vmf.pmd = pmd; + vmf.pgoff = linear_page_index(vma, address); + vmf.gfp_mask = __get_fault_gfp_mask(vma); + vmf.sequence = seq; + vmf.flags = flags; + + local_irq_enable(); + + /* + * We need to re-validate the VMA after checking the bounds, otherwise + * we might have a false positive on the bounds. + */ + if (read_seqcount_retry(&vma->vm_sequence, seq)) + goto unlock; + + ret = handle_pte_fault(&vmf); + +unlock: + srcu_read_unlock(&vma_srcu, idx); + return ret; + +out_walk: + local_irq_enable(); + goto unlock; +} +#endif /* __HAVE_ARCH_CALL_SPF */ + /* * By the time we get here, we already hold the mm semaphore *

[v3,14/20] mm: Provide speculative fault infrastructure

Commit Message

Patch