
powerpc/mm: update arch_{add,remove}_memory() for radix

Message ID 1466699962-22412-1-git-send-email-arbab@linux.vnet.ibm.com (mailing list archive)
State Superseded, archived

Commit Message

Reza Arbab June 23, 2016, 4:39 p.m. UTC
These functions are making direct calls to the hash table APIs,
leading to a BUG() on systems using radix.

Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
move to the __meminit section.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
---
 arch/powerpc/mm/mem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

Comments

Aneesh Kumar K.V June 23, 2016, 5:17 p.m. UTC | #1
Reza Arbab <arbab@linux.vnet.ibm.com> writes:

> These functions are making direct calls to the hash table APIs,
> leading to a BUG() on systems using radix.
>
> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
> move to the __meminit section.


They are really not the same. They can possibly end up using different
base page sizes. Also, vmemmap is available only with SPARSEMEM_VMEMMAP
enabled. Does hotplug depend on sparsemem vmemmap?

>
> Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
> ---
>  arch/powerpc/mm/mem.c | 8 ++++----
>  1 file changed, 4 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
> index 2fd57fa..80c6ee7 100644
> --- a/arch/powerpc/mm/mem.c
> +++ b/arch/powerpc/mm/mem.c
> @@ -116,7 +116,7 @@ int memory_add_physaddr_to_nid(u64 start)
>  }
>  #endif
>
> -int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
> +int __meminit arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  {
>  	struct pglist_data *pgdata;
>  	struct zone *zone;
> @@ -127,7 +127,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  	pgdata = NODE_DATA(nid);
>
>  	start = (unsigned long)__va(start);
> -	rc = create_section_mapping(start, start + size);
> +	rc = vmemmap_create_mapping(start, size, __pa(start));
>  	if (rc) {
>  		pr_warning(
>  			"Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
> @@ -143,7 +143,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
>  }
>
>  #ifdef CONFIG_MEMORY_HOTREMOVE
> -int arch_remove_memory(u64 start, u64 size)
> +int __meminit arch_remove_memory(u64 start, u64 size)
>  {
>  	unsigned long start_pfn = start >> PAGE_SHIFT;
>  	unsigned long nr_pages = size >> PAGE_SHIFT;
> @@ -157,7 +157,7 @@ int arch_remove_memory(u64 start, u64 size)
>
>  	/* Remove htab bolted mappings for this section of memory */
>  	start = (unsigned long)__va(start);
> -	ret = remove_section_mapping(start, start + size);
> +	vmemmap_remove_mapping(start, size);
>
>  	/* Ensure all vmalloc mappings are flushed in case they also
>  	 * hit that section of memory
> -- 
> 1.8.3.1

We really want to look at memory hotplug to fully understand the
radix impact. The unplug operation needs to do some additional freeing.

I did put in comments around that, with the idea of coming back to it
later:

#ifdef CONFIG_MEMORY_HOTPLUG
void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
{
	/* FIXME!! intel does more. We should free page tables mapping vmemmap ? */
}
#endif

-aneesh
Nathan Fontenot June 23, 2016, 5:27 p.m. UTC | #2
On 06/23/2016 12:17 PM, Aneesh Kumar K.V wrote:
> Reza Arbab <arbab@linux.vnet.ibm.com> writes:
> 
>> These functions are making direct calls to the hash table APIs,
>> leading to a BUG() on systems using radix.
>>
>> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
>> move to the __meminit section.
> 
> 
> They are really not the same. They can possibly end up using different
> base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
> enabled. Does hotplug depend on sparsemem vmemmap ?

Sparse vmemmap is supported, and I think enabled by default for pseries,
but hotplug does not depend on sparse vmemmap.

-Nathan

Reza Arbab June 23, 2016, 7:37 p.m. UTC | #3
On Thu, Jun 23, 2016 at 10:47:20PM +0530, Aneesh Kumar K.V wrote:
>Reza Arbab <arbab@linux.vnet.ibm.com> writes:
>> These functions are making direct calls to the hash table APIs,
>> leading to a BUG() on systems using radix.
>>
>> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
>> move to the __meminit section.
>
>They are really not the same. They can possibly end up using different
>base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
>enabled. Does hotplug depend on sparsemem vmemmap ?

I'm not sure. Maybe it's best if I back up a step and explain what led
me to this patch. During hotplug, you get

...
	arch_add_memory
		create_section_mapping
			htab_bolt_mapping
				BUG_ON(!ppc_md.hpte_insert);

So it seemed to me that I needed a radix equivalent of 
create_section_mapping().

After some digging, I found hash__vmemmap_create_mapping() and 
radix__vmemmap_create_mapping() did what I needed. I did not notice the 
#ifdef SPARSEMEM_VMEMMAP around them.

Hotplug/remove are now working for me as far as I can tell, so I think
the functional change in itself was correct. Hotplug may not depend on 
sparsemem vmemmap, but we seem to need a radix create_section_mapping() 
regardless.

Could it be that the functions just need to be renamed 
hash__create_mapping()/radix__create_mapping() and moved out of #ifdef 
SPARSEMEM_VMEMMAP?
Reza Arbab June 23, 2016, 7:52 p.m. UTC | #4
On Thu, Jun 23, 2016 at 02:37:39PM -0500, Reza Arbab wrote:
>Could it be that the functions just need to be renamed 
>hash__create_mapping()/radix__create_mapping() and moved out of #ifdef 
>SPARSEMEM_VMEMMAP?

Or vice-versa, maybe the callers should have been wrapped in the first 
place:

arch_add_memory() {
	...
	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		vmemmap_create_mapping()
	...
}

arch_remove_memory() {
	...
	if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
		vmemmap_remove_mapping()
	...
}
Balbir Singh June 24, 2016, 3:22 a.m. UTC | #5
On 24/06/16 03:17, Aneesh Kumar K.V wrote:
> Reza Arbab <arbab@linux.vnet.ibm.com> writes:
> 
>> These functions are making direct calls to the hash table APIs,
>> leading to a BUG() on systems using radix.
>>
>> Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
>> move to the __meminit section.
> 
> 
> They are really not the same. They can possibly end up using different
> base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
> enabled. Does hotplug depend on sparsemem vmemmap ?

# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
        bool "Allow for memory hot-add"
        depends on SPARSEMEM || X86_64_ACPI_NUMA
        depends on ARCH_ENABLE_MEMORY_HOTPLUG

We depend on sparsemem for sure. vmemmap is just a way of getting the memory
virtually mapped. From the patch perspective, I think we need the equivalent of
just mapping the pages in the kernel. The address may differ based on whether
vmemmap is used or not, and of course on the page size.

Balbir Singh
Michael Ellerman June 28, 2016, 11:21 a.m. UTC | #6
On Thu, 2016-06-23 at 14:37 -0500, Reza Arbab wrote:
> On Thu, Jun 23, 2016 at 10:47:20PM +0530, Aneesh Kumar K.V wrote:
> > Reza Arbab <arbab@linux.vnet.ibm.com> writes:
> > > These functions are making direct calls to the hash table APIs,
> > > leading to a BUG() on systems using radix.
> > > 
> > > Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
> > > move to the __meminit section.
> > 
> > They are really not the same. They can possibly end up using different
> > base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
> > enabled. Does hotplug depend on sparsemem vmemmap ?
> 
> I'm not sure. Maybe it's best if I back up a step and explain what lead 
> me to this patch. During hotplug, you get
> 
> ...
> 	arch_add_memory
> 		create_section_mapping
> 			htab_bolt_mapping
> 				BUG_ON(!ppc_md.hpte_insert);
> 
> So it seemed to me that I needed a radix equivalent of 
> create_section_mapping().
> 
> After some digging, I found hash__vmemmap_create_mapping() and 
> radix__vmemmap_create_mapping() did what I needed. I did not notice the 
> #ifdef SPARSEMEM_VMEMMAP around them.

I think that's more by luck than design. The vmemmap routines use
mmu_vmemmap_psize which is probably but not definitely the same as
mmu_linear_psize.

> Could it be that the functions just need to be renamed 
> hash__create_mapping()/radix__create_mapping() and moved out of #ifdef 
> SPARSEMEM_VMEMMAP?

No, you need to use mmu_linear_psize for the hotplug case.

But you can probably factor out a common routine that both cases use, and hide
the hash vs radix check in that.
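
Something along these lines is the shape I have in mind (a rough, untested
sketch, assuming hash__ and radix__ variants get added; the names and file
placement are just for illustration):

/* e.g. somewhere common to both MMUs, such as pgtable-book3s64.c */
int create_section_mapping(unsigned long start, unsigned long end)
{
	if (radix_enabled())
		return radix__create_section_mapping(start, end);

	return hash__create_section_mapping(start, end);
}

int remove_section_mapping(unsigned long start, unsigned long end)
{
	if (radix_enabled())
		return radix__remove_section_mapping(start, end);

	return hash__remove_section_mapping(start, end);
}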

And probably send me a patch to make MEMORY_HOTPLUG depend on !RADIX for v4.7?

cheers
Aneesh Kumar K.V June 28, 2016, 2:03 p.m. UTC | #7
Michael Ellerman <mpe@ellerman.id.au> writes:

> On Thu, 2016-06-23 at 14:37 -0500, Reza Arbab wrote:
>> On Thu, Jun 23, 2016 at 10:47:20PM +0530, Aneesh Kumar K.V wrote:
>> > Reza Arbab <arbab@linux.vnet.ibm.com> writes:
>> > > These functions are making direct calls to the hash table APIs,
>> > > leading to a BUG() on systems using radix.
>> > > 
>> > > Switch them to the vmemmap_{create,remove}_mapping() wrappers, and
>> > > move to the __meminit section.
>> > 
>> > They are really not the same. They can possibly end up using different
>> > base page size. Also vmemmap is available only with SPARSEMEM_VMEMMAP
>> > enabled. Does hotplug depend on sparsemem vmemmap ?
>> 
>> I'm not sure. Maybe it's best if I back up a step and explain what lead 
>> me to this patch. During hotplug, you get
>> 
>> ...
>> 	arch_add_memory
>> 		create_section_mapping
>> 			htab_bolt_mapping
>> 				BUG_ON(!ppc_md.hpte_insert);
>> 
>> So it seemed to me that I needed a radix equivalent of 
>> create_section_mapping().
>> 
>> After some digging, I found hash__vmemmap_create_mapping() and 
>> radix__vmemmap_create_mapping() did what I needed. I did not notice the 
>> #ifdef SPARSEMEM_VMEMMAP around them.
>
> I think that's more by luck than design. The vmemmap routines use
> mmu_vmemmap_psize which is probably but not definitely the same as
> mmu_linear_psize.
>
>> Could it be that the functions just need to be renamed 
>> hash__create_mapping()/radix__create_mapping() and moved out of #ifdef 
>> SPARSEMEM_VMEMMAP?
>
> No, you need to use mmu_linear_psize for the hotplug case.
>
> But you can probably factor out a common routine that both cases use, and hide
> the hash vs radix check in that.
>
> And probably send me a patch to make MEMORY_HOTPLUG depend on !RADIX for v4.7?

A few other things we still need to look at from the radix point of view:
1) machine check handling/memory errors
2) kexec

-aneesh
Reza Arbab June 29, 2016, 3:37 p.m. UTC | #8
On Tue, Jun 28, 2016 at 09:21:05PM +1000, Michael Ellerman wrote:
>No, you need to use mmu_linear_psize for the hotplug case.
>
>But you can probably factor out a common routine that both cases use, and hide
>the hash vs radix check in that.

Okay, I'm trying to refactor {create,remove}_section_mapping() into
hash__ and radix__ variants. This led to a couple of questions.

Pseudocode for radix__create_section_mapping(start, end):

	page_size = 1 << mmu_psize_defs[mmu_linear_psize].shift;
	start = _ALIGN_DOWN(start, page_size);

	for (; start < end; start += page_size) {
		radix__map_kernel_page(start, __pa(start),
				       PAGE_KERNEL, page_size);
	}

Should the above use PAGE_KERNEL, like the hash table bolt, or 
(_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_KERNEL_RW), like in the radix 
vmemmap creation?

The other question is what radix__remove_section_mapping() should do.
I don't know offhand what the opposite of map_kernel_page() is. As 
Aneesh mentioned, radix vmemmap removal is currently stubbed as a FIXME 
so I couldn't use that as a reference.
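
For the hash side, I'm assuming the existing bodies can just move over
under the new names, roughly like this (untested sketch; I'm reading the
calls out of the current hash code, so the details may be off):

int hash__create_section_mapping(unsigned long start, unsigned long end)
{
	/* same body as the current create_section_mapping(), just renamed */
	return htab_bolt_mapping(start, end, __pa(start),
				 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
				 mmu_kernel_ssize);
}

int hash__remove_section_mapping(unsigned long start, unsigned long end)
{
	/* same body as the current remove_section_mapping(), just renamed */
	return htab_remove_mapping(start, end, mmu_linear_psize,
				   mmu_kernel_ssize);
}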

Patch

diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 2fd57fa..80c6ee7 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -116,7 +116,7 @@  int memory_add_physaddr_to_nid(u64 start)
 }
 #endif
 
-int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
+int __meminit arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 {
 	struct pglist_data *pgdata;
 	struct zone *zone;
@@ -127,7 +127,7 @@  int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
-	rc = create_section_mapping(start, start + size);
+	rc = vmemmap_create_mapping(start, size, __pa(start));
 	if (rc) {
 		pr_warning(
 			"Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
@@ -143,7 +143,7 @@  int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 }
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
-int arch_remove_memory(u64 start, u64 size)
+int __meminit arch_remove_memory(u64 start, u64 size)
 {
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -157,7 +157,7 @@  int arch_remove_memory(u64 start, u64 size)
 
 	/* Remove htab bolted mappings for this section of memory */
 	start = (unsigned long)__va(start);
-	ret = remove_section_mapping(start, start + size);
+	vmemmap_remove_mapping(start, size);
 
 	/* Ensure all vmalloc mappings are flushed in case they also
 	 * hit that section of memory