[v5,3/4] powerpc/mm: add radix__remove_section_mapping()

Message ID 1484593666-8001-4-git-send-email-arbab@linux.vnet.ibm.com
State Accepted

Commit Message

Reza Arbab Jan. 16, 2017, 7:07 p.m. UTC
Tear down and free the four-level page tables of physical mappings
during memory hotremove.

Borrow the basic structure of remove_pagetable() and friends from the
identically-named x86 functions. Reduce the frequency of tlb flushes and
page_table_lock spinlocks by only doing them in the outermost function.
There was some question as to whether the locking is needed at all.
Leave it for now, but we could consider dropping it.

Memory must be offline to be removed, thus not in use. So there
shouldn't be the sort of concurrent page walking activity here that
might prompt us to use RCU.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
---
 arch/powerpc/include/asm/book3s/64/radix.h |   1 +
 arch/powerpc/mm/pgtable-book3s64.c         |   2 +-
 arch/powerpc/mm/pgtable-radix.c            | 133 +++++++++++++++++++++++++++++
 3 files changed, 135 insertions(+), 1 deletion(-)

Comments

Balbir Singh Jan. 17, 2017, 7:22 a.m. UTC | #1
On Mon, Jan 16, 2017 at 01:07:45PM -0600, Reza Arbab wrote:
> Tear down and free the four-level page tables of physical mappings
> during memory hotremove.
> 
> Borrow the basic structure of remove_pagetable() and friends from the
> identically-named x86 functions. Reduce the frequency of tlb flushes and
> page_table_lock spinlocks by only doing them in the outermost function.
> There was some question as to whether the locking is needed at all.
> Leave it for now, but we could consider dropping it.
> 
> Memory must be offline to be removed, thus not in use. So there
> shouldn't be the sort of concurrent page walking activity here that
> might prompt us to use RCU.
> 
> Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
> ---
>  arch/powerpc/include/asm/book3s/64/radix.h |   1 +
>  arch/powerpc/mm/pgtable-book3s64.c         |   2 +-
>  arch/powerpc/mm/pgtable-radix.c            | 133 +++++++++++++++++++++++++++++
>  3 files changed, 135 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
> index 43c2571..0032b66 100644
> --- a/arch/powerpc/include/asm/book3s/64/radix.h
> +++ b/arch/powerpc/include/asm/book3s/64/radix.h
> @@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
>  
>  #ifdef CONFIG_MEMORY_HOTPLUG
>  int radix__create_section_mapping(unsigned long start, unsigned long end);
> +int radix__remove_section_mapping(unsigned long start, unsigned long end);
>  #endif /* CONFIG_MEMORY_HOTPLUG */
>  #endif /* __ASSEMBLY__ */
>  #endif
> diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
> index 2b13f6b..b798ff6 100644
> --- a/arch/powerpc/mm/pgtable-book3s64.c
> +++ b/arch/powerpc/mm/pgtable-book3s64.c
> @@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned long end)
>  int remove_section_mapping(unsigned long start, unsigned long end)
>  {
>  	if (radix_enabled())
> -		return -ENODEV;
> +		return radix__remove_section_mapping(start, end);
>  
>  	return hash__remove_section_mapping(start, end);
>  }
> diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
> index 075b4ec..cfba666 100644
> --- a/arch/powerpc/mm/pgtable-radix.c
> +++ b/arch/powerpc/mm/pgtable-radix.c
> @@ -476,10 +476,143 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
>  }
>

Shouldn't most of these functions have __meminit?
  
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> +	pte_t *pte;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PTE; i++) {
> +		pte = pte_start + i;
> +		if (!pte_none(*pte))
> +			return;

If !pte_none(), do we fail the hotplug, or silently
leave the allocated ptes around? I guess this is
the same as x86.

> +	}
> +
> +	pte_free_kernel(&init_mm, pte_start);
> +	pmd_clear(pmd);
> +}
> +
> +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> +	pmd_t *pmd;
> +	int i;
> +
> +	for (i = 0; i < PTRS_PER_PMD; i++) {
> +		pmd = pmd_start + i;
> +		if (!pmd_none(*pmd))
> +			return;
> +	}
> +
> +	pmd_free(&init_mm, pmd_start);
> +	pud_clear(pud);
> +}
> +
> +static void remove_pte_table(pte_t *pte_start, unsigned long addr,
> +			     unsigned long end)
> +{
> +	unsigned long next;
> +	pte_t *pte;
> +
> +	pte = pte_start + pte_index(addr);
> +	for (; addr < end; addr = next, pte++) {
> +		next = (addr + PAGE_SIZE) & PAGE_MASK;
> +		if (next > end)
> +			next = end;
> +
> +		if (!pte_present(*pte))
> +			continue;
> +
> +		pte_clear(&init_mm, addr, pte);
> +	}
> +}
> +
> +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> +			     unsigned long end)
> +{
> +	unsigned long next;
> +	pte_t *pte_base;
> +	pmd_t *pmd;
> +
> +	pmd = pmd_start + pmd_index(addr);
> +	for (; addr < end; addr = next, pmd++) {
> +		next = pmd_addr_end(addr, end);
> +
> +		if (!pmd_present(*pmd))
> +			continue;
> +
> +		if (pmd_huge(*pmd)) {
> +			pte_clear(&init_mm, addr, (pte_t *)pmd);

pmd_clear()?

> +			continue;
> +		}
> +
> +		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
> +		remove_pte_table(pte_base, addr, next);
> +		free_pte_table(pte_base, pmd);
> +	}
> +}
> +
> +static void remove_pud_table(pud_t *pud_start, unsigned long addr,
> +			     unsigned long end)
> +{
> +	unsigned long next;
> +	pmd_t *pmd_base;
> +	pud_t *pud;
> +
> +	pud = pud_start + pud_index(addr);
> +	for (; addr < end; addr = next, pud++) {
> +		next = pud_addr_end(addr, end);
> +
> +		if (!pud_present(*pud))
> +			continue;
> +
> +		if (pud_huge(*pud)) {
> +			pte_clear(&init_mm, addr, (pte_t *)pud);

pud_clear()?

> +			continue;
> +		}
> +
> +		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
> +		remove_pmd_table(pmd_base, addr, next);
> +		free_pmd_table(pmd_base, pud);
> +	}
> +}
> +
> +static void remove_pagetable(unsigned long start, unsigned long end)
> +{
> +	unsigned long addr, next;
> +	pud_t *pud_base;
> +	pgd_t *pgd;
> +
> +	spin_lock(&init_mm.page_table_lock);
> +

x86 does more granular lock acquisition, taking the lock only while
clearing the relevant entries. I suppose we don't have to worry about
that here, since this is neither a fast path nor frequent.

Balbir Singh.
Reza Arbab Jan. 17, 2017, 6:36 p.m. UTC | #2
On Tue, Jan 17, 2017 at 12:52:51PM +0530, Balbir Singh wrote:
>Shouldn't most of these functions have __meminit?

I don't think so. The mapping functions are __meminit, but the unmapping 
functions are completely within #ifdef CONFIG_MEMORY_HOTPLUG already.

>On Mon, Jan 16, 2017 at 01:07:45PM -0600, Reza Arbab wrote:
>>  #ifdef CONFIG_MEMORY_HOTPLUG
>> +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> +	pte_t *pte;
>> +	int i;
>> +
>> +	for (i = 0; i < PTRS_PER_PTE; i++) {
>> +		pte = pte_start + i;
>> +		if (!pte_none(*pte))
>> +			return;
>
>If !pte_none(), do we fail the hotplug, or silently
>leave the allocated ptes around? I guess this is
>the same as x86.

The latter--it's not a failure. If you provided remove_pagetable() an 
unaligned address range, there could be a pte left unremoved at either 
end.
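
For example (hypothetical numbers, assuming a 64K page size, where one
pte table has 32 entries and so covers 2M):

	/* start is 64K-aligned, but not 2M-aligned */
	remove_pagetable(PAGE_OFFSET + 0x210000, PAGE_OFFSET + 0x400000);

	/*
	 * remove_pte_table() clears the ptes mapping the last 31 * 64K of
	 * that 2M region, but the pte for the first 64K lives in the same
	 * table and is untouched. free_pte_table() then sees !pte_none()
	 * on that entry and returns without freeing the table -- by
	 * design, not a failure.
	 */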

>> +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
>> +			     unsigned long end)
>> +{
>> +	unsigned long next;
>> +	pte_t *pte_base;
>> +	pmd_t *pmd;
>> +
>> +	pmd = pmd_start + pmd_index(addr);
>> +	for (; addr < end; addr = next, pmd++) {
>> +		next = pmd_addr_end(addr, end);
>> +
>> +		if (!pmd_present(*pmd))
>> +			continue;
>> +
>> +		if (pmd_huge(*pmd)) {
>> +			pte_clear(&init_mm, addr, (pte_t *)pmd);
>
>pmd_clear()?

I used pte_clear() to mirror what happens in radix__map_kernel_page():

		if (map_page_size == PMD_SIZE) {
			ptep = (pte_t *)pmdp;
			goto set_the_pte;
		}

		[...]

	set_the_pte:
		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));

Would pmd_clear() be equivalent, since the pointer got set like a pte?

>> +static void remove_pagetable(unsigned long start, unsigned long end)
>> +{
>> +	unsigned long addr, next;
>> +	pud_t *pud_base;
>> +	pgd_t *pgd;
>> +
>> +	spin_lock(&init_mm.page_table_lock);
>> +
>
>x86 does more granular lock acquisition, taking the lock only while
>clearing the relevant entries. I suppose we don't have to worry about
>that here, since this is neither a fast path nor frequent.

Yep. Ben thought the locking in remove_pte_table() was actually too 
granular, and Aneesh questioned what was being protected in the first 
place. So I left one lock/unlock in the outermost function for now.
Balbir Singh Jan. 18, 2017, 1:22 a.m. UTC | #3
On Tue, Jan 17, 2017 at 12:36:21PM -0600, Reza Arbab wrote:
> On Tue, Jan 17, 2017 at 12:52:51PM +0530, Balbir Singh wrote:
> > Shouldn't most of these functions have __meminit?
> 
> I don't think so. The mapping functions are __meminit, but the unmapping
> functions are completely within #ifdef CONFIG_MEMORY_HOTPLUG already.
> 
> > On Mon, Jan 16, 2017 at 01:07:45PM -0600, Reza Arbab wrote:
> > >  #ifdef CONFIG_MEMORY_HOTPLUG
> > > +static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
> > > +{
> > > +	pte_t *pte;
> > > +	int i;
> > > +
> > > +	for (i = 0; i < PTRS_PER_PTE; i++) {
> > > +		pte = pte_start + i;
> > > +		if (!pte_none(*pte))
> > > +			return;
> > 
> > If !pte_none(), do we fail the hotplug, or silently
> > leave the allocated ptes around? I guess this is
> > the same as x86.
> 
> The latter--it's not a failure. If you provided remove_pagetable() an
> unaligned address range, there could be a pte left unremoved at either end.
>

OK.
 
> > > +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
> > > +			     unsigned long end)
> > > +{
> > > +	unsigned long next;
> > > +	pte_t *pte_base;
> > > +	pmd_t *pmd;
> > > +
> > > +	pmd = pmd_start + pmd_index(addr);
> > > +	for (; addr < end; addr = next, pmd++) {
> > > +		next = pmd_addr_end(addr, end);
> > > +
> > > +		if (!pmd_present(*pmd))
> > > +			continue;
> > > +
> > > +		if (pmd_huge(*pmd)) {
> > > +			pte_clear(&init_mm, addr, (pte_t *)pmd);
> > 
> > pmd_clear()?
> 
> I used pte_clear() to mirror what happens in radix__map_kernel_page():
> 
> 		if (map_page_size == PMD_SIZE) {
> 			ptep = (pte_t *)pmdp;
> 			goto set_the_pte;
> 		}
> 
> 		[...]
> 
> 	set_the_pte:
> 		set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, flags));
> 
> Would pmd_clear() be equivalent, since the pointer got set like a pte?

But we are still setting a pmdp. pmd_clear() will set the pmd to 0,
while pte_clear() will go through the pte_update() mechanism, which is
expensive IMHO and which we may not need here.
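
Roughly, as I read the book3s64 code (a simplified sketch; the exact
call chain may differ):

	pmd_clear(pmd);			/* *pmdp = __pmd(0): a plain store */

	pte_clear(&init_mm, addr, ptep);	/* goes through pte_update(),
						 * an atomic read-modify-write
						 * on the pte */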

> 
> > > +static void remove_pagetable(unsigned long start, unsigned long end)
> > > +{
> > > +	unsigned long addr, next;
> > > +	pud_t *pud_base;
> > > +	pgd_t *pgd;
> > > +
> > > +	spin_lock(&init_mm.page_table_lock);
> > > +
> > 
> > x86 does more granular lock acquisition, taking the lock only while
> > clearing the relevant entries. I suppose we don't have to worry about
> > that here, since this is neither a fast path nor frequent.
> 
> Yep. Ben thought the locking in remove_pte_table() was actually too
> granular, and Aneesh questioned what was being protected in the first place.
> So I left one lock/unlock in the outermost function for now.
>

Fair enough

Balbir Singh.
Patch

diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h
index 43c2571..0032b66 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -294,6 +294,7 @@ static inline unsigned long radix__get_tree_size(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int radix__create_section_mapping(unsigned long start, unsigned long end);
+int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
 #endif
diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c
index 2b13f6b..b798ff6 100644
--- a/arch/powerpc/mm/pgtable-book3s64.c
+++ b/arch/powerpc/mm/pgtable-book3s64.c
@@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned long end)
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
 	if (radix_enabled())
-		return -ENODEV;
+		return radix__remove_section_mapping(start, end);
 
 	return hash__remove_section_mapping(start, end);
 }
diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c
index 075b4ec..cfba666 100644
--- a/arch/powerpc/mm/pgtable-radix.c
+++ b/arch/powerpc/mm/pgtable-radix.c
@@ -476,10 +476,143 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+	pte_t *pte;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	pte_free_kernel(&init_mm, pte_start);
+	pmd_clear(pmd);
+}
+
+static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+	pmd_t *pmd;
+	int i;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	pmd_free(&init_mm, pmd_start);
+	pud_clear(pud);
+}
+
+static void remove_pte_table(pte_t *pte_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pte_t *pte;
+
+	pte = pte_start + pte_index(addr);
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		pte_clear(&init_mm, addr, pte);
+	}
+}
+
+static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pte_t *pte_base;
+	pmd_t *pmd;
+
+	pmd = pmd_start + pmd_index(addr);
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		if (pmd_huge(*pmd)) {
+			pte_clear(&init_mm, addr, (pte_t *)pmd);
+			continue;
+		}
+
+		pte_base = (pte_t *)pmd_page_vaddr(*pmd);
+		remove_pte_table(pte_base, addr, next);
+		free_pte_table(pte_base, pmd);
+	}
+}
+
+static void remove_pud_table(pud_t *pud_start, unsigned long addr,
+			     unsigned long end)
+{
+	unsigned long next;
+	pmd_t *pmd_base;
+	pud_t *pud;
+
+	pud = pud_start + pud_index(addr);
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+
+		if (!pud_present(*pud))
+			continue;
+
+		if (pud_huge(*pud)) {
+			pte_clear(&init_mm, addr, (pte_t *)pud);
+			continue;
+		}
+
+		pmd_base = (pmd_t *)pud_page_vaddr(*pud);
+		remove_pmd_table(pmd_base, addr, next);
+		free_pmd_table(pmd_base, pud);
+	}
+}
+
+static void remove_pagetable(unsigned long start, unsigned long end)
+{
+	unsigned long addr, next;
+	pud_t *pud_base;
+	pgd_t *pgd;
+
+	spin_lock(&init_mm.page_table_lock);
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		if (!pgd_present(*pgd))
+			continue;
+
+		if (pgd_huge(*pgd)) {
+			pte_clear(&init_mm, addr, (pte_t *)pgd);
+			continue;
+		}
+
+		pud_base = (pud_t *)pgd_page_vaddr(*pgd);
+		remove_pud_table(pud_base, addr, next);
+	}
+
+	spin_unlock(&init_mm.page_table_lock);
+	radix__flush_tlb_kernel_range(start, end);
+}
+
 int __ref radix__create_section_mapping(unsigned long start, unsigned long end)
 {
 	return create_physical_mapping(start, end);
 }
+
+int radix__remove_section_mapping(unsigned long start, unsigned long end)
+{
+	remove_pagetable(start, end);
+	return 0;
+}
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP