
[v2,07/14] mm: khugepaged: collapse_pte_mapped_thp() use pte_offset_map_rw_nolock()

Message ID c377dab2bf55950e6155ea051aba3887ed5a2773.1724310149.git.zhengqi.arch@bytedance.com (mailing list archive)
State Handled Elsewhere, archived
Series introduce pte_offset_map_{ro|rw}_nolock()

Commit Message

Qi Zheng Aug. 22, 2024, 7:13 a.m. UTC
In collapse_pte_mapped_thp(), we may modify the pte and pmd entries after
acquiring the ptl, so convert it to use pte_offset_map_rw_nolock(). At
this point the write lock of mmap_lock is not held, and no pte_same()
check is performed after the ptl is taken, so we should record pgt_pmd and
do a pmd_same() check once the ptl is held.

For the case where the ptl is released first and the pml is then acquired,
the PTE page may have been freed in the meantime, so we must do the
pmd_same() check before reacquiring the ptl.

Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
---
 mm/khugepaged.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)
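
As a rough illustration of the pattern the patch adopts (a minimal sketch,
not the code in mm/khugepaged.c; the helper example_collapse_step() is
hypothetical, while pte_offset_map_rw_nolock(), pmdp_get_lockless(),
pmd_same() and pte_unmap_unlock() are the interfaces actually used):

	/* Minimal sketch; assumes a kernel build context, not standalone. */
	static int example_collapse_step(struct mm_struct *mm, pmd_t *pmd,
					 unsigned long haddr)
	{
		pmd_t pgt_pmd;
		spinlock_t *ptl;
		pte_t *start_pte;

		/* Map the PTE table and record the pmd value seen at map time. */
		start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
		if (!start_pte)
			return -EAGAIN;

		spin_lock(ptl);
		/*
		 * The pmd may have changed (e.g. the PTE page was withdrawn)
		 * before the lock was taken: revalidate it under the ptl
		 * before modifying any pte or pmd entries.
		 */
		if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
			pte_unmap_unlock(start_pte, ptl);
			return -EAGAIN;
		}

		/* ... it is now safe to modify the pte entries ... */

		pte_unmap_unlock(start_pte, ptl);
		return 0;
	}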

Comments

Muchun Song Aug. 29, 2024, 8:10 a.m. UTC | #1
On 2024/8/22 15:13, Qi Zheng wrote:
> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
> this time, the write lock of mmap_lock is not held, and the pte_same()
> check is not performed after the PTL held. So we should get pgt_pmd and do
> pmd_same() check after the ptl held.
>
> For the case where the ptl is released first and then the pml is acquired,
> the PTE page may have been freed, so we must do pmd_same() check before
> reacquiring the ptl.
>
> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
> ---
>   mm/khugepaged.c | 16 +++++++++++++++-
>   1 file changed, 15 insertions(+), 1 deletion(-)
>
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 53bfa7f4b7f82..15d3f7f3c65f2 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1604,7 +1604,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>   	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>   		pml = pmd_lock(mm, pmd);
>   
> -	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
> +	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>   	if (!start_pte)		/* mmap_lock + page lock should prevent this */
>   		goto abort;
>   	if (!pml)
> @@ -1612,6 +1612,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>   	else if (ptl != pml)
>   		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>   
> +	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
> +		goto abort;
> +
>   	/* step 2: clear page table and adjust rmap */
>   	for (i = 0, addr = haddr, pte = start_pte;
>   	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
> @@ -1657,6 +1660,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>   	/* step 4: remove empty page table */
>   	if (!pml) {
>   		pml = pmd_lock(mm, pmd);
> +		/*
> +		 * We called pte_unmap() and release the ptl before acquiring
> +		 * the pml, which means we left the RCU critical section, so the
> +		 * PTE page may have been freed, so we must do pmd_same() check
> +		 * before reacquiring the ptl.
> +		 */
> +		if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
> +			spin_unlock(pml);
> +			goto pmd_change;

Seems we forgot to flush the TLB since we've cleared some pte entries?

> +		}
>   		if (ptl != pml)
>   			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>   	}
> @@ -1688,6 +1701,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>   		pte_unmap_unlock(start_pte, ptl);
>   	if (pml && pml != ptl)
>   		spin_unlock(pml);
> +pmd_change:
>   	if (notified)
>   		mmu_notifier_invalidate_range_end(&range);
>   drop_folio:
Qi Zheng Aug. 30, 2024, 6:54 a.m. UTC | #2
On 2024/8/29 16:10, Muchun Song wrote:
> 
> 
> On 2024/8/22 15:13, Qi Zheng wrote:
>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>> this time, the write lock of mmap_lock is not held, and the pte_same()
>> check is not performed after the PTL held. So we should get pgt_pmd 
>> and do
>> pmd_same() check after the ptl held.
>>
>> For the case where the ptl is released first and then the pml is 
>> acquired,
>> the PTE page may have been freed, so we must do pmd_same() check before
>> reacquiring the ptl.
>>
>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>> ---
>>   mm/khugepaged.c | 16 +++++++++++++++-
>>   1 file changed, 15 insertions(+), 1 deletion(-)
>>
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 53bfa7f4b7f82..15d3f7f3c65f2 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1604,7 +1604,7 @@ int collapse_pte_mapped_thp(struct mm_struct 
>> *mm, unsigned long addr,
>>       if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>           pml = pmd_lock(mm, pmd);
>> -    start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>> +    start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, 
>> &ptl);
>>       if (!start_pte)        /* mmap_lock + page lock should prevent 
>> this */
>>           goto abort;
>>       if (!pml)
>> @@ -1612,6 +1612,9 @@ int collapse_pte_mapped_thp(struct mm_struct 
>> *mm, unsigned long addr,
>>       else if (ptl != pml)
>>           spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>> +    if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>> +        goto abort;
>> +
>>       /* step 2: clear page table and adjust rmap */
>>       for (i = 0, addr = haddr, pte = start_pte;
>>            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>> @@ -1657,6 +1660,16 @@ int collapse_pte_mapped_thp(struct mm_struct 
>> *mm, unsigned long addr,
>>       /* step 4: remove empty page table */
>>       if (!pml) {
>>           pml = pmd_lock(mm, pmd);
>> +        /*
>> +         * We called pte_unmap() and release the ptl before acquiring
>> +         * the pml, which means we left the RCU critical section, so the
>> +         * PTE page may have been freed, so we must do pmd_same() check
>> +         * before reacquiring the ptl.
>> +         */
>> +        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>> +            spin_unlock(pml);
>> +            goto pmd_change;
> 
> Seems we forget to flush TLB since we've cleared some pte entry?

See comment above the ptep_clear():

/*
  * Must clear entry, or a racing truncate may re-remove it.
  * TLB flush can be left until pmdp_collapse_flush() does it.
  * PTE dirty? Shmem page is already dirty; file is read-only.
  */

The TLB flush was handed over to pmdp_collapse_flush(). If a
concurrent thread frees the PTE page at this time, the TLB will
also be flushed after its pmd_clear().
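
To restate the ownership argument as a sketch (hypothetical helper that
mirrors the step-4 hunk of this patch rather than quoting it): the path
that clears the pmd also flushes the TLB for the range, so the path that
observes !pmd_same() only needs to unlock and bail out.

	static bool example_retake_pmd_lock(struct mm_struct *mm, pmd_t *pmd,
					    pmd_t pgt_pmd, spinlock_t **pml)
	{
		*pml = pmd_lock(mm, pmd);
		/*
		 * A changed pmd means a concurrent path has already cleared it
		 * and is responsible for the TLB flush (and for freeing the
		 * PTE page), so pmdp_collapse_flush() is skipped on this side.
		 */
		if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
			spin_unlock(*pml);
			return false;
		}
		return true;	/* pmd unchanged: continue to pmdp_collapse_flush() */
	}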

> 
>> +        }
>>           if (ptl != pml)
>>               spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>       }
>> @@ -1688,6 +1701,7 @@ int collapse_pte_mapped_thp(struct mm_struct 
>> *mm, unsigned long addr,
>>           pte_unmap_unlock(start_pte, ptl);
>>       if (pml && pml != ptl)
>>           spin_unlock(pml);
>> +pmd_change:
>>       if (notified)
>>           mmu_notifier_invalidate_range_end(&range);
>>   drop_folio:
>
Muchun Song Sept. 5, 2024, 6:32 a.m. UTC | #3
> On Aug 30, 2024, at 14:54, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
> 
> 
> On 2024/8/29 16:10, Muchun Song wrote:
>> On 2024/8/22 15:13, Qi Zheng wrote:
>>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>>> this time, the write lock of mmap_lock is not held, and the pte_same()
>>> check is not performed after the PTL held. So we should get pgt_pmd and do
>>> pmd_same() check after the ptl held.
>>> 
>>> For the case where the ptl is released first and then the pml is acquired,
>>> the PTE page may have been freed, so we must do pmd_same() check before
>>> reacquiring the ptl.
>>> 
>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>> ---
>>>  mm/khugepaged.c | 16 +++++++++++++++-
>>>  1 file changed, 15 insertions(+), 1 deletion(-)
>>> 
>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>> index 53bfa7f4b7f82..15d3f7f3c65f2 100644
>>> --- a/mm/khugepaged.c
>>> +++ b/mm/khugepaged.c
>>> @@ -1604,7 +1604,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>      if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>>          pml = pmd_lock(mm, pmd);
>>> -    start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>>> +    start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>>>      if (!start_pte)        /* mmap_lock + page lock should prevent this */
>>>          goto abort;
>>>      if (!pml)
>>> @@ -1612,6 +1612,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>      else if (ptl != pml)
>>>          spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>> +    if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>>> +        goto abort;
>>> +
>>>      /* step 2: clear page table and adjust rmap */
>>>      for (i = 0, addr = haddr, pte = start_pte;
>>>           i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>>> @@ -1657,6 +1660,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>      /* step 4: remove empty page table */
>>>      if (!pml) {
>>>          pml = pmd_lock(mm, pmd);
>>> +        /*
>>> +         * We called pte_unmap() and release the ptl before acquiring
>>> +         * the pml, which means we left the RCU critical section, so the
>>> +         * PTE page may have been freed, so we must do pmd_same() check
>>> +         * before reacquiring the ptl.
>>> +         */
>>> +        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>>> +            spin_unlock(pml);
>>> +            goto pmd_change;
>> Seems we forget to flush TLB since we've cleared some pte entry?
> 
> See comment above the ptep_clear():
> 
> /*
> * Must clear entry, or a racing truncate may re-remove it.
> * TLB flush can be left until pmdp_collapse_flush() does it.
> * PTE dirty? Shmem page is already dirty; file is read-only.
> */
> 
> The TLB flush was handed over to pmdp_collapse_flush(). If a

But you skipped pmdp_collapse_flush().

> concurrent thread free the PTE page at this time, the TLB will
> also be flushed after pmd_clear().
> 
>>> +        }
>>>          if (ptl != pml)
>>>              spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>      }
>>> @@ -1688,6 +1701,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>          pte_unmap_unlock(start_pte, ptl);
>>>      if (pml && pml != ptl)
>>>          spin_unlock(pml);
>>> +pmd_change:
>>>      if (notified)
>>>          mmu_notifier_invalidate_range_end(&range);
>>>  drop_folio:
Qi Zheng Sept. 5, 2024, 6:41 a.m. UTC | #4
On 2024/9/5 14:32, Muchun Song wrote:
> 
> 
>> On Aug 30, 2024, at 14:54, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>
>>
>>
>> On 2024/8/29 16:10, Muchun Song wrote:
>>> On 2024/8/22 15:13, Qi Zheng wrote:
>>>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>>>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>>>> this time, the write lock of mmap_lock is not held, and the pte_same()
>>>> check is not performed after the PTL held. So we should get pgt_pmd and do
>>>> pmd_same() check after the ptl held.
>>>>
>>>> For the case where the ptl is released first and then the pml is acquired,
>>>> the PTE page may have been freed, so we must do pmd_same() check before
>>>> reacquiring the ptl.
>>>>
>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>> ---
>>>>   mm/khugepaged.c | 16 +++++++++++++++-
>>>>   1 file changed, 15 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>> index 53bfa7f4b7f82..15d3f7f3c65f2 100644
>>>> --- a/mm/khugepaged.c
>>>> +++ b/mm/khugepaged.c
>>>> @@ -1604,7 +1604,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>       if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>>>           pml = pmd_lock(mm, pmd);
>>>> -    start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>>>> +    start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>>>>       if (!start_pte)        /* mmap_lock + page lock should prevent this */
>>>>           goto abort;
>>>>       if (!pml)
>>>> @@ -1612,6 +1612,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>       else if (ptl != pml)
>>>>           spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>> +    if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>>>> +        goto abort;
>>>> +
>>>>       /* step 2: clear page table and adjust rmap */
>>>>       for (i = 0, addr = haddr, pte = start_pte;
>>>>            i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>>>> @@ -1657,6 +1660,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>       /* step 4: remove empty page table */
>>>>       if (!pml) {
>>>>           pml = pmd_lock(mm, pmd);
>>>> +        /*
>>>> +         * We called pte_unmap() and release the ptl before acquiring
>>>> +         * the pml, which means we left the RCU critical section, so the
>>>> +         * PTE page may have been freed, so we must do pmd_same() check
>>>> +         * before reacquiring the ptl.
>>>> +         */
>>>> +        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>>>> +            spin_unlock(pml);
>>>> +            goto pmd_change;
>>> Seems we forget to flush TLB since we've cleared some pte entry?
>>
>> See comment above the ptep_clear():
>>
>> /*
>> * Must clear entry, or a racing truncate may re-remove it.
>> * TLB flush can be left until pmdp_collapse_flush() does it.
>> * PTE dirty? Shmem page is already dirty; file is read-only.
>> */
>>
>> The TLB flush was handed over to pmdp_collapse_flush(). If a
> 
> But you skipped pmdp_collapse_flush().

I skip it only in the !pmd_same() case, at which point the pmd must already
have been cleared by another thread, and that thread is responsible for
flushing the TLB:

CPU 0				CPU 1
				pmd_clear
				spin_unlock
				flushing tlb
spin_lock
if (!pmd_same)	
	goto pmd_change;
pmdp_collapse_flush

Did I miss something?

> 
>> concurrent thread free the PTE page at this time, the TLB will
>> also be flushed after pmd_clear().
>>
>>>> +        }
>>>>           if (ptl != pml)
>>>>               spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>>       }
>>>> @@ -1688,6 +1701,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>           pte_unmap_unlock(start_pte, ptl);
>>>>       if (pml && pml != ptl)
>>>>           spin_unlock(pml);
>>>> +pmd_change:
>>>>       if (notified)
>>>>           mmu_notifier_invalidate_range_end(&range);
>>>>   drop_folio:
>
Muchun Song Sept. 5, 2024, 7:18 a.m. UTC | #5
> On Sep 5, 2024, at 14:41, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
> 
> 
> 
> On 2024/9/5 14:32, Muchun Song wrote:
>>> On Aug 30, 2024, at 14:54, Qi Zheng <zhengqi.arch@bytedance.com> wrote:
>>> 
>>> 
>>> 
>>> On 2024/8/29 16:10, Muchun Song wrote:
>>>> On 2024/8/22 15:13, Qi Zheng wrote:
>>>>> In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after
>>>>> acquring the ptl, so convert it to using pte_offset_map_rw_nolock(). At
>>>>> this time, the write lock of mmap_lock is not held, and the pte_same()
>>>>> check is not performed after the PTL held. So we should get pgt_pmd and do
>>>>> pmd_same() check after the ptl held.
>>>>> 
>>>>> For the case where the ptl is released first and then the pml is acquired,
>>>>> the PTE page may have been freed, so we must do pmd_same() check before
>>>>> reacquiring the ptl.
>>>>> 
>>>>> Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
>>>>> ---
>>>>>  mm/khugepaged.c | 16 +++++++++++++++-
>>>>>  1 file changed, 15 insertions(+), 1 deletion(-)
>>>>> 
>>>>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>>>>> index 53bfa7f4b7f82..15d3f7f3c65f2 100644
>>>>> --- a/mm/khugepaged.c
>>>>> +++ b/mm/khugepaged.c
>>>>> @@ -1604,7 +1604,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>>      if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
>>>>>          pml = pmd_lock(mm, pmd);
>>>>> -    start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
>>>>> +    start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
>>>>>      if (!start_pte)        /* mmap_lock + page lock should prevent this */
>>>>>          goto abort;
>>>>>      if (!pml)
>>>>> @@ -1612,6 +1612,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>>      else if (ptl != pml)
>>>>>          spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>>> +    if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
>>>>> +        goto abort;
>>>>> +
>>>>>      /* step 2: clear page table and adjust rmap */
>>>>>      for (i = 0, addr = haddr, pte = start_pte;
>>>>>           i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
>>>>> @@ -1657,6 +1660,16 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>>      /* step 4: remove empty page table */
>>>>>      if (!pml) {
>>>>>          pml = pmd_lock(mm, pmd);
>>>>> +        /*
>>>>> +         * We called pte_unmap() and release the ptl before acquiring
>>>>> +         * the pml, which means we left the RCU critical section, so the
>>>>> +         * PTE page may have been freed, so we must do pmd_same() check
>>>>> +         * before reacquiring the ptl.
>>>>> +         */
>>>>> +        if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
>>>>> +            spin_unlock(pml);
>>>>> +            goto pmd_change;
>>>> Seems we forget to flush TLB since we've cleared some pte entry?
>>> 
>>> See comment above the ptep_clear():
>>> 
>>> /*
>>> * Must clear entry, or a racing truncate may re-remove it.
>>> * TLB flush can be left until pmdp_collapse_flush() does it.
>>> * PTE dirty? Shmem page is already dirty; file is read-only.
>>> */
>>> 
>>> The TLB flush was handed over to pmdp_collapse_flush(). If a
>> But you skipped pmdp_collapse_flush().
> 
> I skip it only in !pmd_same() case, at which time it must be cleared
> by other thread, which will be responsible for flushing TLB:

WOW! AMAZING! You are right.

> 
> CPU 0				CPU 1
> 				pmd_clear
> 				spin_unlock
> 				flushing tlb
> spin_lock
> if (!pmd_same) 
> 	goto pmd_change;
> pmdp_collapse_flush
> 
> Did I miss something?
> 
>>> concurrent thread free the PTE page at this time, the TLB will
>>> also be flushed after pmd_clear().
>>> 
>>>>> +        }
>>>>>          if (ptl != pml)
>>>>>              spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
>>>>>      }
>>>>> @@ -1688,6 +1701,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
>>>>>          pte_unmap_unlock(start_pte, ptl);
>>>>>      if (pml && pml != ptl)
>>>>>          spin_unlock(pml);
>>>>> +pmd_change:
>>>>>      if (notified)
>>>>>          mmu_notifier_invalidate_range_end(&range);
>>>>>  drop_folio:

Patch

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 53bfa7f4b7f82..15d3f7f3c65f2 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1604,7 +1604,7 @@  int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED))
 		pml = pmd_lock(mm, pmd);
 
-	start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl);
+	start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl);
 	if (!start_pte)		/* mmap_lock + page lock should prevent this */
 		goto abort;
 	if (!pml)
@@ -1612,6 +1612,9 @@  int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	else if (ptl != pml)
 		spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 
+	if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd))))
+		goto abort;
+
 	/* step 2: clear page table and adjust rmap */
 	for (i = 0, addr = haddr, pte = start_pte;
 	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
@@ -1657,6 +1660,16 @@  int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	/* step 4: remove empty page table */
 	if (!pml) {
 		pml = pmd_lock(mm, pmd);
+		/*
+		 * We called pte_unmap() and release the ptl before acquiring
+		 * the pml, which means we left the RCU critical section, so the
+		 * PTE page may have been freed, so we must do pmd_same() check
+		 * before reacquiring the ptl.
+		 */
+		if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) {
+			spin_unlock(pml);
+			goto pmd_change;
+		}
 		if (ptl != pml)
 			spin_lock_nested(ptl, SINGLE_DEPTH_NESTING);
 	}
@@ -1688,6 +1701,7 @@  int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 		pte_unmap_unlock(start_pte, ptl);
 	if (pml && pml != ptl)
 		spin_unlock(pml);
+pmd_change:
 	if (notified)
 		mmu_notifier_invalidate_range_end(&range);
 drop_folio: