diff mbox

[v4,4/5] mm: make processing of movable_node arch-specific

Message ID 1475778995-1420-5-git-send-email-arbab@linux.vnet.ibm.com (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Reza Arbab Oct. 6, 2016, 6:36 p.m. UTC
Currently, CONFIG_MOVABLE_NODE depends on X86_64. In preparation for
enabling it for other arches, we need to factor a detail which is unique
to x86 out of the generic mm code.

Specifically, as documented in kernel-parameters.txt, the use of
"movable_node" should remain restricted to x86:

movable_node    [KNL,X86] Boot-time switch to enable the effects
                of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.

This option tells x86 to find movable nodes identified by the ACPI SRAT.
On other arches, it would have no benefit, only the undesired side
effect of setting bottom-up memblock allocation.

Since #ifdef CONFIG_MOVABLE_NODE will no longer be enough to restrict
this option to x86, move it to an arch-specific compilation unit
instead.

Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
---
 arch/x86/mm/numa.c  | 35 ++++++++++++++++++++++++++++++++++-
 mm/memory_hotplug.c | 31 -------------------------------
 2 files changed, 34 insertions(+), 32 deletions(-)

Comments

Aneesh Kumar K.V Oct. 7, 2016, 6:37 a.m. UTC | #1
Reza Arbab <arbab@linux.vnet.ibm.com> writes:

> Currently, CONFIG_MOVABLE_NODE depends on X86_64. In preparation to
> enable it for other arches, we need to factor a detail which is unique
> to x86 out of the generic mm code.
>
> Specifically, as documented in kernel-parameters.txt, the use of
> "movable_node" should remain restricted to x86:
>
> movable_node    [KNL,X86] Boot-time switch to enable the effects
>                 of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
>
> This option tells x86 to find movable nodes identified by the ACPI SRAT.
> On other arches, it would have no benefit, only the undesired side
> effect of setting bottom-up memblock allocation.
>
> Since #ifdef CONFIG_MOVABLE_NODE will no longer be enough to restrict
> this option to x86, move it to an arch-specific compilation unit
> instead.

Reviewed-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

>
> Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
> ---
>  arch/x86/mm/numa.c  | 35 ++++++++++++++++++++++++++++++++++-
>  mm/memory_hotplug.c | 31 -------------------------------
>  2 files changed, 34 insertions(+), 32 deletions(-)
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index fb68210..e95cab4 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -887,6 +887,38 @@ EXPORT_SYMBOL(cpumask_of_node);
>  #endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
>
>  #ifdef CONFIG_MEMORY_HOTPLUG
> +
> +static int __init cmdline_parse_movable_node(char *p)
> +{
> +#ifdef CONFIG_MOVABLE_NODE
> +	/*
> +	 * Memory used by the kernel cannot be hot-removed because Linux
> +	 * cannot migrate the kernel pages. When memory hotplug is
> +	 * enabled, we should prevent memblock from allocating memory
> +	 * for the kernel.
> +	 *
> +	 * ACPI SRAT records all hotpluggable memory ranges. But before
> +	 * SRAT is parsed, we don't know about it.
> +	 *
> +	 * The kernel image is loaded into memory at very early time. We
> +	 * cannot prevent this anyway. So on NUMA system, we set any
> +	 * node the kernel resides in as un-hotpluggable.
> +	 *
> +	 * Since on modern servers, one node could have double-digit
> +	 * gigabytes memory, we can assume the memory around the kernel
> +	 * image is also un-hotpluggable. So before SRAT is parsed, just
> +	 * allocate memory near the kernel image to try the best to keep
> +	 * the kernel away from hotpluggable memory.
> +	 */
> +	memblock_set_bottom_up(true);
> +	movable_node_enabled = true;
> +#else
> +	pr_warn("movable_node option not supported\n");
> +#endif
> +	return 0;
> +}
> +early_param("movable_node", cmdline_parse_movable_node);
> +
>  int memory_add_physaddr_to_nid(u64 start)
>  {
>  	struct numa_meminfo *mi = &numa_meminfo;
> @@ -899,4 +931,5 @@ int memory_add_physaddr_to_nid(u64 start)
>  	return nid;
>  }
>  EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
> -#endif
> +
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 9d29ba0..79c709a 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -1738,37 +1738,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
>  }
>  #endif /* CONFIG_MOVABLE_NODE */
>
> -static int __init cmdline_parse_movable_node(char *p)
> -{
> -#ifdef CONFIG_MOVABLE_NODE
> -	/*
> -	 * Memory used by the kernel cannot be hot-removed because Linux
> -	 * cannot migrate the kernel pages. When memory hotplug is
> -	 * enabled, we should prevent memblock from allocating memory
> -	 * for the kernel.
> -	 *
> -	 * ACPI SRAT records all hotpluggable memory ranges. But before
> -	 * SRAT is parsed, we don't know about it.
> -	 *
> -	 * The kernel image is loaded into memory at very early time. We
> -	 * cannot prevent this anyway. So on NUMA system, we set any
> -	 * node the kernel resides in as un-hotpluggable.
> -	 *
> -	 * Since on modern servers, one node could have double-digit
> -	 * gigabytes memory, we can assume the memory around the kernel
> -	 * image is also un-hotpluggable. So before SRAT is parsed, just
> -	 * allocate memory near the kernel image to try the best to keep
> -	 * the kernel away from hotpluggable memory.
> -	 */
> -	memblock_set_bottom_up(true);
> -	movable_node_enabled = true;
> -#else
> -	pr_warn("movable_node option not supported\n");
> -#endif
> -	return 0;
> -}
> -early_param("movable_node", cmdline_parse_movable_node);
> -
>  /* check which state of node_states will be changed when offline memory */
>  static void node_states_check_changes_offline(unsigned long nr_pages,
>  		struct zone *zone, struct memory_notify *arg)
> -- 
> 1.8.3.1
Balbir Singh Oct. 11, 2016, 12:26 p.m. UTC | #2
On 07/10/16 05:36, Reza Arbab wrote:
> Currently, CONFIG_MOVABLE_NODE depends on X86_64. In preparation to
> enable it for other arches, we need to factor a detail which is unique
> to x86 out of the generic mm code.
> 
> Specifically, as documented in kernel-parameters.txt, the use of
> "movable_node" should remain restricted to x86:
> 
> movable_node    [KNL,X86] Boot-time switch to enable the effects
>                 of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
> 
> This option tells x86 to find movable nodes identified by the ACPI SRAT.
> On other arches, it would have no benefit, only the undesired side
> effect of setting bottom-up memblock allocation.
> 
> Since #ifdef CONFIG_MOVABLE_NODE will no longer be enough to restrict
> this option to x86, move it to an arch-specific compilation unit
> instead.
> 
> Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>

Acked-by: Balbir Singh <bsingharora@gmail.com>
Balbir Singh Oct. 25, 2016, 12:15 p.m. UTC | #3
On 11/10/16 23:26, Balbir Singh wrote:
> 
> 
> On 07/10/16 05:36, Reza Arbab wrote:
>> Currently, CONFIG_MOVABLE_NODE depends on X86_64. In preparation to
>> enable it for other arches, we need to factor a detail which is unique
>> to x86 out of the generic mm code.
>>
>> Specifically, as documented in kernel-parameters.txt, the use of
>> "movable_node" should remain restricted to x86:
>>
>> movable_node    [KNL,X86] Boot-time switch to enable the effects
>>                 of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
>>
>> This option tells x86 to find movable nodes identified by the ACPI SRAT.
>> On other arches, it would have no benefit, only the undesired side
>> effect of setting bottom-up memblock allocation.
>>
>> Since #ifdef CONFIG_MOVABLE_NODE will no longer be enough to restrict
>> this option to x86, move it to an arch-specific compilation unit
>> instead.
>>
>> Signed-off-by: Reza Arbab <arbab@linux.vnet.ibm.com>
> 
> Acked-by: Balbir Singh <bsingharora@gmail.com>
> 

After the ack, I realized there were some more checks needed, IOW
questions for you :)

1. Have you checked to see if our memblock allocations spill
over to possibly hotpluggable nodes?
2. Shouldn't we be marking nodes discovered as movable via
memblock_mark_hotplug()?

Balbir Singh.
Reza Arbab Oct. 25, 2016, 3:55 p.m. UTC | #4
On Tue, Oct 25, 2016 at 11:15:40PM +1100, Balbir Singh wrote:
>After the ack, I realized there were some more checks needed, IOW
>questions for you :)

Hey! No takebacks!

The short answer is that neither of these is a concern.

Longer: if you use "movable_node", x86 can identify these nodes at boot. 
They call memblock_mark_hotplug() while parsing the SRAT. Then, when the 
zones are initialized, those markings are used to determine ZONE_MOVABLE.

We have no analog of this SRAT information, so our movable nodes can 
only be created post boot, by hotplugging and explicitly onlining with 
online_movable.

>1. Have you checked to see if our memblock allocations spill
>over to probably hotpluggable nodes?

Since our nodes don't exist at boot, we don't have that short window 
before the zones are drawn where the node has normal memory, and a 
kernel allocation might occur within.

>2. Shouldn't we be marking nodes discovered as movable via
>memblock_mark_hotplug()?

Again, this early boot marking mechanism only applies to movable_node.
Balbir Singh Oct. 25, 2016, 10:34 p.m. UTC | #5
On 26/10/16 02:55, Reza Arbab wrote:
> On Tue, Oct 25, 2016 at 11:15:40PM +1100, Balbir Singh wrote:
>> After the ack, I realized there were some more checks needed, IOW
>> questions for you :)
> 
> Hey! No takebacks!
> 

I still believe we need your changes, I was wondering if we've tested
it against normal memory nodes and checked if any memblock
allocations end up there. Michael showed me some memblock
allocations on node 1 of a two-node machine with movable_node.
I'll double-check at my end. See my question below.


> The short answer is that neither of these is a concern.
> 
> Longer; if you use "movable_node", x86 can identify these nodes at boot. They call memblock_mark_hotplug() while parsing the SRAT. Then, when the zones are initialized, those markings are used to determine ZONE_MOVABLE.
> 
> We have no analog of this SRAT information, so our movable nodes can only be created post boot, by hotplugging and explicitly onlining with online_movable.
>

Is this true for all of system memory as well or only for nodes
hotplugged later?

Balbir Singh.
Balbir Singh Oct. 25, 2016, 10:59 p.m. UTC | #6
On 26/10/16 02:55, Reza Arbab wrote:
> On Tue, Oct 25, 2016 at 11:15:40PM +1100, Balbir Singh wrote:
>> After the ack, I realized there were some more checks needed, IOW
>> questions for you :)
> 
> Hey! No takebacks!
> 

I still believe we need your changes, I was wondering if we've tested
it against normal memory nodes and checked if any memblock
allocations end up there. Michael showed me some memblock
allocations on node 1 of a two-node machine with movable_node.
I'll double-check at my end. See my question below.


> The short answer is that neither of these is a concern.
> 
> Longer; if you use "movable_node", x86 can identify these nodes at boot. They call memblock_mark_hotplug() while parsing the SRAT. Then, when the zones are initialized, those markings are used to determine ZONE_MOVABLE.
> 
> We have no analog of this SRAT information, so our movable nodes can only be created post boot, by hotplugging and explicitly onlining with online_movable.
>

Is this true for all of system memory as well or only for nodes
hotplugged later?

Balbir Singh.
Reza Arbab Oct. 26, 2016, 12:49 a.m. UTC | #7
On Wed, Oct 26, 2016 at 09:34:18AM +1100, Balbir Singh wrote:
>I still believe we need your changes, I was wondering if we've tested
>it against normal memory nodes and checked if any memblock
>allocations end up there. Michael showed me some memblock
>allocations on node 1 of a two node machine with movable_node

The movable_node option is x86-only. Both of those nodes contain normal 
memory, so allocations on both are allowed.

>> Longer; if you use "movable_node", x86 can identify these nodes at 
>> boot. They call memblock_mark_hotplug() while parsing the SRAT. Then, 
>> when the zones are initialized, those markings are used to determine 
>> ZONE_MOVABLE.
>>
>> We have no analog of this SRAT information, so our movable nodes can 
>> only be created post boot, by hotplugging and explicitly onlining 
>> with online_movable.
>
>Is this true for all of system memory as well or only for nodes
>hotplugged later?

As far as I know, power has nothing like the SRAT that tells us, at 
boot, which memory is hotpluggable. So there is nothing to wire the 
movable_node option up to.

Of course, any memory you hotplug afterwards is, by definition, 
hotpluggable. So we can still create movable nodes that way.
Michael Ellerman Oct. 26, 2016, 10:52 a.m. UTC | #8
Reza Arbab <arbab@linux.vnet.ibm.com> writes:

> On Wed, Oct 26, 2016 at 09:34:18AM +1100, Balbir Singh wrote:
>>I still believe we need your changes, I was wondering if we've tested
>>it against normal memory nodes and checked if any memblock
>>allocations end up there. Michael showed me some memblock
>>allocations on node 1 of a two node machine with movable_node
>
> The movable_node option is x86-only. Both of those nodes contain normal 
> memory, so allocations on both are allowed.
>
>>> Longer; if you use "movable_node", x86 can identify these nodes at 
>>> boot. They call memblock_mark_hotplug() while parsing the SRAT. Then, 
>>> when the zones are initialized, those markings are used to determine 
>>> ZONE_MOVABLE.
>>>
>>> We have no analog of this SRAT information, so our movable nodes can 
>>> only be created post boot, by hotplugging and explicitly onlining 
>>> with online_movable.
>>
>>Is this true for all of system memory as well or only for nodes
>>hotplugged later?
>
> As far as I know, power has nothing like the SRAT that tells us, at 
> boot, which memory is hotpluggable.

On pseries we have the ibm,dynamic-memory device tree property, which
can contain ranges of memory that are not yet "assigned to the
partition" - i.e. can be hotplugged later.

So in general that statement is not true.

But I think you're focused on bare-metal, in which case you might be
right. But that doesn't mean we couldn't have a similar property, if
skiboot/hostboot knew what the ranges of memory were going to be.

cheers
Reza Arbab Oct. 26, 2016, 5:03 p.m. UTC | #9
On Wed, Oct 26, 2016 at 09:52:53PM +1100, Michael Ellerman wrote:
>> As far as I know, power has nothing like the SRAT that tells us, at
>> boot, which memory is hotpluggable.
>
>On pseries we have the ibm,dynamic-memory device tree property, which
>can contain ranges of memory that are not yet "assigned to the
>partition" - ie. can be hotplugged later.
>
>So in general that statement is not true.
>
>But I think you're focused on bare-metal, in which case you might be
>right. But that doesn't mean we couldn't have a similar property, if
>skiboot/hostboot knew what the ranges of memory were going to be.

Yes, sorry, I should have qualified that statement to say I wasn't 
talking about pseries.

I can amend this set to actually implement movable_node on power too, 
but we'd have to settle on a name for the dt property. Is 
"linux,movable-node" too on the nose?
diff mbox

Patch

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index fb68210..e95cab4 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -887,6 +887,38 @@  EXPORT_SYMBOL(cpumask_of_node);
 #endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
+
+static int __init cmdline_parse_movable_node(char *p)
+{
+#ifdef CONFIG_MOVABLE_NODE
+	/*
+	 * Memory used by the kernel cannot be hot-removed because Linux
+	 * cannot migrate the kernel pages. When memory hotplug is
+	 * enabled, we should prevent memblock from allocating memory
+	 * for the kernel.
+	 *
+	 * ACPI SRAT records all hotpluggable memory ranges. But before
+	 * SRAT is parsed, we don't know about it.
+	 *
+	 * The kernel image is loaded into memory at very early time. We
+	 * cannot prevent this anyway. So on NUMA system, we set any
+	 * node the kernel resides in as un-hotpluggable.
+	 *
+	 * Since on modern servers, one node could have double-digit
+	 * gigabytes memory, we can assume the memory around the kernel
+	 * image is also un-hotpluggable. So before SRAT is parsed, just
+	 * allocate memory near the kernel image to try the best to keep
+	 * the kernel away from hotpluggable memory.
+	 */
+	memblock_set_bottom_up(true);
+	movable_node_enabled = true;
+#else
+	pr_warn("movable_node option not supported\n");
+#endif
+	return 0;
+}
+early_param("movable_node", cmdline_parse_movable_node);
+
 int memory_add_physaddr_to_nid(u64 start)
 {
 	struct numa_meminfo *mi = &numa_meminfo;
@@ -899,4 +931,5 @@  int memory_add_physaddr_to_nid(u64 start)
 	return nid;
 }
 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
-#endif
+
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9d29ba0..79c709a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1738,37 +1738,6 @@  static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
 }
 #endif /* CONFIG_MOVABLE_NODE */
 
-static int __init cmdline_parse_movable_node(char *p)
-{
-#ifdef CONFIG_MOVABLE_NODE
-	/*
-	 * Memory used by the kernel cannot be hot-removed because Linux
-	 * cannot migrate the kernel pages. When memory hotplug is
-	 * enabled, we should prevent memblock from allocating memory
-	 * for the kernel.
-	 *
-	 * ACPI SRAT records all hotpluggable memory ranges. But before
-	 * SRAT is parsed, we don't know about it.
-	 *
-	 * The kernel image is loaded into memory at very early time. We
-	 * cannot prevent this anyway. So on NUMA system, we set any
-	 * node the kernel resides in as un-hotpluggable.
-	 *
-	 * Since on modern servers, one node could have double-digit
-	 * gigabytes memory, we can assume the memory around the kernel
-	 * image is also un-hotpluggable. So before SRAT is parsed, just
-	 * allocate memory near the kernel image to try the best to keep
-	 * the kernel away from hotpluggable memory.
-	 */
-	memblock_set_bottom_up(true);
-	movable_node_enabled = true;
-#else
-	pr_warn("movable_node option not supported\n");
-#endif
-	return 0;
-}
-early_param("movable_node", cmdline_parse_movable_node);
-
 /* check which state of node_states will be changed when offline memory */
 static void node_states_check_changes_offline(unsigned long nr_pages,
 		struct zone *zone, struct memory_notify *arg)