Message ID | 1481831443-22761-5-git-send-email-arbab@linux.vnet.ibm.com (mailing list archive) |
---|---|
State | Superseded, archived |
Headers | show |
Reza Arbab <arbab@linux.vnet.ibm.com> writes: > Tear down and free the four-level page tables of the linear mapping > during memory hotremove. > > We borrow the basic structure of remove_pagetable() and friends from the > identically-named x86 functions. > Can you add more details here, which explain why we don't need to follow the RCU page table free when doing memory hotunplug ? > Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com> > --- > arch/powerpc/include/asm/book3s/64/radix.h | 1 + > arch/powerpc/mm/pgtable-book3s64.c | 2 +- > arch/powerpc/mm/pgtable-radix.c | 163 +++++++++++++++++++++++++++++ > 3 files changed, 165 insertions(+), 1 deletion(-) > .... .... > +static void remove_pte_table(pte_t *pte_start, unsigned long addr, > + unsigned long end) > +{ > + unsigned long next; > + pte_t *pte; > + > + pte = pte_start + pte_index(addr); > + for (; addr < end; addr = next, pte++) { > + next = (addr + PAGE_SIZE) & PAGE_MASK; > + if (next > end) > + next = end; > + > + if (!pte_present(*pte)) > + continue; > + > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, pte); > + spin_unlock(&init_mm.page_table_lock); > + } > + > + flush_tlb_mm(&init_mm); Why call a flush here. we do that at the end of remove_page_table . Isn't that sufficient ? > +} > + > +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, > + unsigned long end, unsigned long map_page_size) > +{ > + unsigned long next; > + pte_t *pte_base; > + pmd_t *pmd; > + > + pmd = pmd_start + pmd_index(addr); > + for (; addr < end; addr = next, pmd++) { > + next = pmd_addr_end(addr, end); > + > + if (!pmd_present(*pmd)) > + continue; > + > + if (map_page_size == PMD_SIZE) { > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, (pte_t *)pmd); > + spin_unlock(&init_mm.page_table_lock); > + > + continue; > + } > + > + pte_base = (pte_t *)pmd_page_vaddr(*pmd); > + remove_pte_table(pte_base, addr, next); > + free_pte_table(pte_base, pmd); > + } > +} > + > +static void remove_pud_table(pud_t *pud_start, unsigned long addr, > + unsigned long end, unsigned long map_page_size) > +{ > + unsigned long next; > + pmd_t *pmd_base; > + pud_t *pud; > + > + pud = pud_start + pud_index(addr); > + for (; addr < end; addr = next, pud++) { > + next = pud_addr_end(addr, end); > + > + if (!pud_present(*pud)) > + continue; > + > + if (map_page_size == PUD_SIZE) { > + spin_lock(&init_mm.page_table_lock); > + pte_clear(&init_mm, addr, (pte_t *)pud); > + spin_unlock(&init_mm.page_table_lock); > + > + continue; > + } > + > + pmd_base = (pmd_t *)pud_page_vaddr(*pud); > + remove_pmd_table(pmd_base, addr, next, map_page_size); > + free_pmd_table(pmd_base, pud); > + } > +} > + > +static void remove_pagetable(unsigned long start, unsigned long end, > + unsigned long map_page_size) > +{ > + unsigned long next; > + unsigned long addr; > + pgd_t *pgd; > + pud_t *pud; > + > + for (addr = start; addr < end; addr = next) { > + next = pgd_addr_end(addr, end); > + > + pgd = pgd_offset_k(addr); > + if (!pgd_present(*pgd)) > + continue; > + > + pud = (pud_t *)pgd_page_vaddr(*pgd); > + remove_pud_table(pud, addr, next, map_page_size); > + free_pud_table(pud, pgd); > + } > + > + flush_tlb_mm(&init_mm); So we want to flush the full kernel tlb when we do a hotplug ? May be check using flush_tlb_kernel_range(). Also that flush_tlb_mm() do check for mm_is_thread_local(). Do we update init_mm correct to handle that check ? I assume we want a tlbie() here instead of tlbiel() ? > +} > + > int radix__create_section_mapping(unsigned long start, unsigned long end) > { > unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift; > @@ -482,6 +635,16 @@ int radix__create_section_mapping(unsigned long start, unsigned long end) > > return 0; > } > + > +int radix__remove_section_mapping(unsigned long start, unsigned long end) > +{ > + unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift; > + > + start = _ALIGN_DOWN(start, page_size); > + remove_pagetable(start, end, page_size); > + > + return 0; > +} > #endif /* CONFIG_MEMORY_HOTPLUG */ > > #ifdef CONFIG_SPARSEMEM_VMEMMAP -aneesh
On Mon, Dec 19, 2016 at 03:18:07PM +0530, Aneesh Kumar K.V wrote: >Reza Arbab <arbab@linux.vnet.ibm.com> writes: >> +static void remove_pte_table(pte_t *pte_start, unsigned long addr, >> + unsigned long end) >> +{ >> + unsigned long next; >> + pte_t *pte; >> + >> + pte = pte_start + pte_index(addr); >> + for (; addr < end; addr = next, pte++) { >> + next = (addr + PAGE_SIZE) & PAGE_MASK; >> + if (next > end) >> + next = end; >> + >> + if (!pte_present(*pte)) >> + continue; >> + >> + spin_lock(&init_mm.page_table_lock); >> + pte_clear(&init_mm, addr, pte); >> + spin_unlock(&init_mm.page_table_lock); >> + } >> + >> + flush_tlb_mm(&init_mm); > >Why call a flush here. we do that at the end of remove_page_table . >Isn't that sufficient ? This was carried over from the x86 version of the function, where they do flush_tlb_all(). I can experiment to make sure things work without it. >> +static void remove_pagetable(unsigned long start, unsigned long end, >> + unsigned long map_page_size) >> +{ >> + unsigned long next; >> + unsigned long addr; >> + pgd_t *pgd; >> + pud_t *pud; >> + >> + for (addr = start; addr < end; addr = next) { >> + next = pgd_addr_end(addr, end); >> + >> + pgd = pgd_offset_k(addr); >> + if (!pgd_present(*pgd)) >> + continue; >> + >> + pud = (pud_t *)pgd_page_vaddr(*pgd); >> + remove_pud_table(pud, addr, next, map_page_size); >> + free_pud_table(pud, pgd); >> + } >> + >> + flush_tlb_mm(&init_mm); > > >So we want to flush the full kernel tlb when we do a hotplug ? >May be check using flush_tlb_kernel_range(). Also that flush_tlb_mm() do >check for mm_is_thread_local(). Do we update init_mm correct to handle >that check ? I assume we want a tlbie() here instead of tlbiel() ? I'll try using flush_tlb_kernel_range() instead. That sure does seem more appropriate.
On Mon, 2016-12-19 at 15:18 +0530, Aneesh Kumar K.V wrote: > > + pte = pte_start + pte_index(addr); > > + for (; addr < end; addr = next, pte++) { > > + next = (addr + PAGE_SIZE) & PAGE_MASK; > > + if (next > end) > > + next = end; > > + > > + if (!pte_present(*pte)) > > + continue; > > + > > + spin_lock(&init_mm.page_table_lock); > > + pte_clear(&init_mm, addr, pte); > > + spin_unlock(&init_mm.page_table_lock); > > + } > > + > > + flush_tlb_mm(&init_mm); > > Why call a flush here. we do that at the end of remove_page_table . > Isn't that sufficient ? All those lock/unlock ... what for ? Can't we just do the whole page ? Also I agree, we can delay the flush of the PTEs to the end. Ben.
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 43c2571..0032b66 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void) #ifdef CONFIG_MEMORY_HOTPLUG int radix__create_section_mapping(unsigned long start, unsigned long end); +int radix__remove_section_mapping(unsigned long start, unsigned long end); #endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 2b13f6b..b798ff6 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned long end) int remove_section_mapping(unsigned long start, unsigned long end) { if (radix_enabled()) - return -ENODEV; + return radix__remove_section_mapping(start, end); return hash__remove_section_mapping(start, end); } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 8201d1f..315237c 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -466,6 +466,159 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, } #ifdef CONFIG_MEMORY_HOTPLUG +static void free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, pte_start); + spin_lock(&init_mm.page_table_lock); + pmd_clear(pmd); + spin_unlock(&init_mm.page_table_lock); +} + +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, pmd_start); + spin_lock(&init_mm.page_table_lock); + pud_clear(pud); + spin_unlock(&init_mm.page_table_lock); +} + +static void free_pud_table(pud_t *pud_start, pgd_t *pgd) +{ + pud_t *pud; + int i; + + for (i = 0; i < PTRS_PER_PUD; i++) { + pud = pud_start + i; + if (!pud_none(*pud)) + return; + } + + pud_free(&init_mm, pud_start); + spin_lock(&init_mm.page_table_lock); + pgd_clear(pgd); + spin_unlock(&init_mm.page_table_lock); +} + +static void remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, pte); + spin_unlock(&init_mm.page_table_lock); + } + + flush_tlb_mm(&init_mm); +} + +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, + unsigned long end, unsigned long map_page_size) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmd; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (map_page_size == PMD_SIZE) { + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, (pte_t *)pmd); + spin_unlock(&init_mm.page_table_lock); + + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next); + free_pte_table(pte_base, pmd); + } +} + +static void remove_pud_table(pud_t *pud_start, unsigned long addr, + unsigned long end, unsigned long map_page_size) +{ + unsigned long next; + pmd_t *pmd_base; + pud_t *pud; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (map_page_size == PUD_SIZE) { + spin_lock(&init_mm.page_table_lock); + pte_clear(&init_mm, addr, (pte_t *)pud); + spin_unlock(&init_mm.page_table_lock); + + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next, map_page_size); + free_pmd_table(pmd_base, pud); + } +} + +static void remove_pagetable(unsigned long start, unsigned long end, + unsigned long map_page_size) +{ + unsigned long next; + unsigned long addr; + pgd_t *pgd; + pud_t *pud; + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + pud = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud, addr, next, map_page_size); + free_pud_table(pud, pgd); + } + + flush_tlb_mm(&init_mm); +} + int radix__create_section_mapping(unsigned long start, unsigned long end) { unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift; @@ -482,6 +635,16 @@ int radix__create_section_mapping(unsigned long start, unsigned long end) return 0; } + +int radix__remove_section_mapping(unsigned long start, unsigned long end) +{ + unsigned long page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift; + + start = _ALIGN_DOWN(start, page_size); + remove_pagetable(start, end, page_size); + + return 0; +} #endif /* CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_SPARSEMEM_VMEMMAP
Tear down and free the four-level page tables of the linear mapping during memory hotremove. We borrow the basic structure of remove_pagetable() and friends from the identically-named x86 functions. Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com> --- arch/powerpc/include/asm/book3s/64/radix.h | 1 + arch/powerpc/mm/pgtable-book3s64.c | 2 +- arch/powerpc/mm/pgtable-radix.c | 163 +++++++++++++++++++++++++++++ 3 files changed, 165 insertions(+), 1 deletion(-)