@@ -315,6 +315,7 @@ struct vm_fault {
gfp_t gfp_mask; /* gfp mask to be used for allocations */
pgoff_t pgoff; /* Logical page offset based on vma */
unsigned long address; /* Faulting virtual address */
+ unsigned int sequence; /* Snapshot of vma->vm_sequence, for speculative faults */
pmd_t *pmd; /* Pointer to pmd entry matching
* the 'address' */
pud_t *pud; /* Pointer to pud entry matching
@@ -1286,6 +1287,8 @@ int invalidate_inode_page(struct page *page);
#ifdef CONFIG_MMU
extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+ unsigned long address, unsigned int flags);
extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked);
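The new entry point is meant to be tried from the architecture's page fault handler before taking the mmap_sem, with VM_FAULT_RETRY meaning "fall back to the regular path". As a rough sketch only (the surrounding __do_page_fault() wiring, the done label and the fallback below are illustrative assumptions, not part of these hunks):

	/*
	 * Sketch: attempt the lockless path first. Any result other than
	 * VM_FAULT_RETRY means the fault was handled (or failed) without
	 * ever taking the mmap_sem.
	 */
	fault = handle_speculative_fault(mm, address, flags);
	if (fault != VM_FAULT_RETRY)
		goto done;

	/* Fall back to the classic, mmap_sem protected path. */
	down_read(&mm->mmap_sem);
	vma = find_vma(mm, address);
	/* ... the usual vma checks ... */
	fault = handle_mm_fault(vma, address, flags);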
@@ -2245,15 +2245,69 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
static bool pte_spinlock(struct vm_fault *vmf)
{
+ bool ret = false;
+
+ /*
+ * In the regular case just take the PTL as before. The speculative
+ * case must check that the vma is still valid, both before and after
+ * taking the lock.
+ */
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+ return true;
+ }
+
+ /*
+ * As in pte_map_lock() below: with IRQs disabled the page-tables
+ * cannot be freed while the vma is re-checked under the PTL.
+ */
+ local_irq_disable();
+ if (vma_has_changed(vmf))
+ goto out;
+
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
- return true;
+
+ if (vma_has_changed(vmf)) {
+ spin_unlock(vmf->ptl);
+ goto out;
+ }
+
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
}
static bool pte_map_lock(struct vm_fault *vmf)
{
- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
- return true;
+ bool ret = false;
+ pte_t *pte;
+ spinlock_t *ptl;
+
+ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
+ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &vmf->ptl);
+ return true;
+ }
+
+ /*
+ * The first vma_has_changed() guarantees the page-tables are still
+ * valid, having IRQs disabled ensures they stay around, hence the
+ * second vma_has_changed() to make sure they are still valid once
+ * we've got the lock. After that a concurrent zap_pte_range() will
+ * block on the PTL and thus we're safe.
+ */
+ local_irq_disable();
+ if (vma_has_changed(vmf))
+ goto out;
+
+ pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address, &ptl);
+ if (vma_has_changed(vmf)) {
+ pte_unmap_unlock(pte, ptl);
+ goto out;
+ }
+
+ vmf->pte = pte;
+ vmf->ptl = ptl;
+ ret = true;
+out:
+ local_irq_enable();
+ return ret;
}
/*
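Both pte_spinlock() and pte_map_lock() above lean on vma_has_changed(), whose definition is not part of this excerpt. A minimal sketch of what that check has to do, assuming the rbtree unlinking (vm_rb) and the per-VMA vm_sequence seqcount introduced elsewhere in the series:

static inline bool vma_has_changed(struct vm_fault *vmf)
{
	int dead = RB_EMPTY_NODE(&vmf->vma->vm_rb);
	unsigned int seq = READ_ONCE(vmf->vma->vm_sequence.sequence);

	/*
	 * Pairs with the write barriers on the seqcount write side and in
	 * the rbtree erase path: if the VMA has been unlinked or its
	 * sequence count has moved on, the speculative fault must retry.
	 */
	smp_rmb();

	return dead || seq != vmf->sequence;
}

The vmf->sequence snapshot it compares against is the one taken in __handle_mm_fault() and handle_speculative_fault() below.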
@@ -2872,6 +2926,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
if (vma->vm_flags & VM_SHARED)
return VM_FAULT_SIGBUS;
+ /* Can't call userland page fault handler in the speculative path */
+ if (vmf->flags & FAULT_FLAG_SPECULATIVE && userfaultfd_missing(vma))
+ return VM_FAULT_RETRY;
+
/*
* Use pte_alloc() instead of pte_alloc_map(). We can't run
* pte_offset_map() on pmds where a huge pmd might be created
@@ -3707,6 +3765,8 @@ static int handle_pte_fault(struct vm_fault *vmf)
if (!vmf->pte) {
if (vma_is_anonymous(vmf->vma))
return do_anonymous_page(vmf);
+ else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
+ return VM_FAULT_RETRY;
else
return do_fault(vmf);
}
@@ -3802,6 +3862,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
+ vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
@@ -3829,6 +3890,122 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
return handle_pte_fault(&vmf);
}
+int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
+ unsigned int flags)
+{
+ struct vm_fault vmf = {
+ .address = address,
+ };
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ int dead, seq, idx, ret = VM_FAULT_RETRY;
+ struct vm_area_struct *vma;
+
+ /* Clear the flags that may lead to releasing the mmap_sem when retrying */
+ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
+ flags |= FAULT_FLAG_SPECULATIVE;
+
+ idx = srcu_read_lock(&vma_srcu);
+ vma = find_vma_srcu(mm, address);
+ if (!vma)
+ goto unlock;
+
+ /*
+ * Validate the VMA found by the lockless lookup.
+ */
+ dead = RB_EMPTY_NODE(&vma->vm_rb);
+ seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock, vma_rb_erase() */
+ if ((seq & 1) || dead)
+ goto unlock;
+
+ /*
+ * Check the address against the vma bounds. The sequence count is
+ * re-checked below, otherwise a concurrent update could leave us with
+ * a false positive on these bounds.
+ */
+ if (address < vma->vm_start || vma->vm_end <= address)
+ goto unlock;
+
+ /*
+ * Huge pages are not yet supported.
+ */
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ goto unlock;
+
+ /*
+ * The three following checks are copied from access_error() in
+ * arch/x86/mm/fault.c.
+ */
+ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
+ flags & FAULT_FLAG_INSTRUCTION,
+ flags & FAULT_FLAG_REMOTE))
+ goto unlock;
+
+ /* This one is required to check that the VMA has write access set */
+ if (flags & FAULT_FLAG_WRITE) {
+ if (unlikely(!(vma->vm_flags & VM_WRITE)))
+ goto unlock;
+ } else {
+ if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
+ goto unlock;
+ }
+
+ if (read_seqcount_retry(&vma->vm_sequence, seq))
+ goto unlock;
+
+ /*
+ * Do a speculative lookup of the PTE entry.
+ */
+ local_irq_disable();
+ pgd = pgd_offset(mm, address);
+ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+ goto out_walk;
+
+ p4d = p4d_offset(pgd, address);
+ if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+ goto out_walk;
+
+ pud = pud_offset(p4d, address);
+ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ goto out_walk;
+
+ pmd = pmd_offset(pud, address);
+ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+ goto out_walk;
+
+ /*
+ * The above does not allocate/instantiate page-tables because doing so
+ * would lead to the possibility of instantiating page-tables after
+ * free_pgtables() -- and consequently leaking them.
+ *
+ * The result is that we take at least one !speculative fault per PMD
+ * in order to instantiate it.
+ */
+
+ if (unlikely(pmd_huge(*pmd)))
+ goto out_walk;
+
+ vmf.vma = vma;
+ vmf.pmd = pmd;
+ vmf.pgoff = linear_page_index(vma, address);
+ vmf.gfp_mask = __get_fault_gfp_mask(vma);
+ vmf.sequence = seq;
+ vmf.flags = flags;
+
+ local_irq_enable();
+
+ ret = handle_pte_fault(&vmf);
+
+unlock:
+ srcu_read_unlock(&vma_srcu, idx);
+ return ret;
+
+out_walk:
+ local_irq_enable();
+ goto unlock;
+}
+
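handle_speculative_fault() also depends on vma_srcu and find_vma_srcu(), which come from other patches in the series. Conceptually the lookup must walk the VMA tree under SRCU and retry if it races with a writer rebalancing the tree; a sketch of that shape, assuming an mm-wide mm_seq seqlock taken around rbtree updates (an assumption, not shown here):

static struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
					    unsigned long addr)
{
	struct vm_area_struct *vma;
	unsigned int seq;

	/*
	 * The caller holds srcu_read_lock(&vma_srcu), so the VMA cannot be
	 * freed under us; the seqlock retry guards against the lookup
	 * racing with an rbtree rotation. A real implementation would also
	 * need a lookup that bypasses the per-task vma cache.
	 */
	do {
		seq = read_seqbegin(&mm->mm_seq);
		vma = find_vma(mm, addr);
	} while (read_seqretry(&mm->mm_seq, seq));

	return vma;
}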
/*
* By the time we get here, we already hold the mm semaphore
*