[v2,2/2] numa, spapr: equally distribute memory on nodes

Message ID	20170427101259.13798-3-lvivier@redhat.com
State	New
Headers	show Return-Path: <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org> DMARC-Filter: OpenDMARC Filter v1.3.2 mx1.redhat.com 6C7B861D15 From: Laurent Vivier <lvivier@redhat.com> To: Eduardo Habkost <ehabkost@redhat.com> Date: Thu, 27 Apr 2017 12:12:59 +0200 Message-Id: <20170427101259.13798-3-lvivier@redhat.com> In-Reply-To: <20170427101259.13798-1-lvivier@redhat.com> References: <20170427101259.13798-1-lvivier@redhat.com> Subject: [Qemu-devel] [PATCH v2 2/2] numa, spapr: equally distribute memory on nodes Precedence: list Cc: Laurent Vivier <lvivier@redhat.com>, Thomas Huth <thuth@redhat.com>, qemu-devel@nongnu.org, David Gibson <david@gibson.dropbear.id.au> Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org Sender: "Qemu-devel" <qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>

Message ID

20170427101259.13798-3-lvivier@redhat.com

State

New

Headers

DMARC-Filter: OpenDMARC Filter v1.3.2 mx1.redhat.com 6C7B861D15
From: Laurent Vivier <lvivier@redhat.com>
To: Eduardo Habkost <ehabkost@redhat.com>
Date: Thu, 27 Apr 2017 12:12:59 +0200
Message-Id: <20170427101259.13798-3-lvivier@redhat.com>
In-Reply-To: <20170427101259.13798-1-lvivier@redhat.com>
References: <20170427101259.13798-1-lvivier@redhat.com>
Subject: [Qemu-devel] [PATCH v2 2/2] numa,
	spapr: equally distribute memory on nodes
Precedence: list
Cc: Laurent Vivier <lvivier@redhat.com>, Thomas Huth <thuth@redhat.com>,
	qemu-devel@nongnu.org, David Gibson <david@gibson.dropbear.id.au>
Errors-To: qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org
Sender: "Qemu-devel"
	<qemu-devel-bounces+incoming=patchwork.ozlabs.org@nongnu.org>

Commit Message

Laurent Vivier April 27, 2017, 10:12 a.m. UTC

When there are more nodes than memory available to put the minimum
allowed memory by node, all the memory is put on the last node.

This is because we put (ram_size / nb_numa_nodes) &
~((1 << mc->numa_mem_align_shift) - 1); on each node, and in this
case the value is 0. This is particularly true with pseries,
as the memory must be aligned to 256MB.

To avoid this problem, this patch uses an error diffusion algorithm [1]
to distribute equally the memory on nodes.

Example:

qemu-system-ppc64 -S -nographic  -nodefaults -monitor stdio -m 1G -smp 8 \
                  -numa node -numa node -numa node \
                  -numa node -numa node -numa node

Before:

(qemu) info numa
6 nodes
node 0 cpus: 0 6
node 0 size: 0 MB
node 1 cpus: 1 7
node 1 size: 0 MB
node 2 cpus: 2
node 2 size: 0 MB
node 3 cpus: 3
node 3 size: 0 MB
node 4 cpus: 4
node 4 size: 0 MB
node 5 cpus: 5
node 5 size: 1024 MB

After:
(qemu) info numa
6 nodes
node 0 cpus: 0 6
node 0 size: 0 MB
node 1 cpus: 1 7
node 1 size: 256 MB
node 2 cpus: 2
node 2 size: 0 MB
node 3 cpus: 3
node 3 size: 256 MB
node 4 cpus: 4
node 4 size: 256 MB
node 5 cpus: 5
node 5 size: 256 MB

[1] https://en.wikipedia.org/wiki/Error_diffusion

Signed-off-by: Laurent Vivier <lvivier@redhat.com>
---
 hw/ppc/spapr.c         | 21 ++++++++++++++++++++-
 include/hw/ppc/spapr.h |  3 ++-
 2 files changed, 22 insertions(+), 2 deletions(-)

Comments

Eduardo Habkost April 27, 2017, 8:31 p.m. UTC | #1

On Thu, Apr 27, 2017 at 12:12:59PM +0200, Laurent Vivier wrote:
> When there are more nodes than memory available to put the minimum
> allowed memory by node, all the memory is put on the last node.
> 
> This is because we put (ram_size / nb_numa_nodes) &
> ~((1 << mc->numa_mem_align_shift) - 1); on each node, and in this
> case the value is 0. This is particularly true with pseries,
> as the memory must be aligned to 256MB.
> 
> To avoid this problem, this patch uses an error diffusion algorithm [1]
> to distribute equally the memory on nodes.
> 
> Example:
> 
> qemu-system-ppc64 -S -nographic  -nodefaults -monitor stdio -m 1G -smp 8 \
>                   -numa node -numa node -numa node \
>                   -numa node -numa node -numa node
> 
> Before:
> 
> (qemu) info numa
> 6 nodes
> node 0 cpus: 0 6
> node 0 size: 0 MB
> node 1 cpus: 1 7
> node 1 size: 0 MB
> node 2 cpus: 2
> node 2 size: 0 MB
> node 3 cpus: 3
> node 3 size: 0 MB
> node 4 cpus: 4
> node 4 size: 0 MB
> node 5 cpus: 5
> node 5 size: 1024 MB
> 
> After:
> (qemu) info numa
> 6 nodes
> node 0 cpus: 0 6
> node 0 size: 0 MB
> node 1 cpus: 1 7
> node 1 size: 256 MB
> node 2 cpus: 2
> node 2 size: 0 MB
> node 3 cpus: 3
> node 3 size: 256 MB
> node 4 cpus: 4
> node 4 size: 256 MB
> node 5 cpus: 5
> node 5 size: 256 MB
> 
> [1] https://en.wikipedia.org/wiki/Error_diffusion
> 
> Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> ---
>  hw/ppc/spapr.c         | 21 ++++++++++++++++++++-
>  include/hw/ppc/spapr.h |  3 ++-
>  2 files changed, 22 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 80d12d0..be498e2 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -3106,6 +3106,23 @@ static void spapr_pic_print_info(InterruptStatsProvider *obj,
>      ics_pic_print_info(spapr->ics, mon);
>  }
>  
> +static void spapr_numa_auto_assign_ram(uint64_t *nodes, int nb_nodes,
> +                                       ram_addr_t size)
> +{
> +    int i;
> +    uint64_t usedmem = 0, node_mem;
> +    uint64_t granularity = size / nb_nodes;
> +    uint64_t propagate = 0;
> +
> +    for (i = 0; i < nb_nodes - 1; i++) {
> +        node_mem = (granularity + propagate) & ~(SPAPR_MEMORY_BLOCK_SIZE - 1);
> +        propagate = granularity + propagate - node_mem;
> +        nodes[i] = node_mem;
> +        usedmem += node_mem;
> +    }
> +    nodes[i] = ram_size - usedmem;
> +}

This new algorithm is a much better default, I would like to
enable it for all machines by default. We could move it to
core/machine.c:machine_class_init(), and set numa_auto_assign_ram
to NULL on pc-*-2.9 and spapr-2.9.

> +
>  static void spapr_machine_class_init(ObjectClass *oc, void *data)
>  {
>      MachineClass *mc = MACHINE_CLASS(oc);
> @@ -3162,7 +3179,8 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
>       * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
>       * in which LMBs are represented and hot-added
>       */
> -    mc->numa_mem_align_shift = 28;
> +    mc->numa_mem_align_shift = SPAPR_MEMORY_BLOCK_SIZE_SHIFT;
> +    mc->numa_auto_assign_ram = spapr_numa_auto_assign_ram;
>  }
>  
>  static const TypeInfo spapr_machine_info = {
> @@ -3242,6 +3260,7 @@ static void spapr_machine_2_9_class_options(MachineClass *mc)
>  {
>      spapr_machine_2_10_class_options(mc);
>      SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_9);
> +    mc->numa_auto_assign_ram = NULL;
>  }
>  
>  DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
> diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> index 5802f88..8f4a588 100644
> --- a/include/hw/ppc/spapr.h
> +++ b/include/hw/ppc/spapr.h
> @@ -653,7 +653,8 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
>  
>  int spapr_rng_populate_dt(void *fdt);
>  
> -#define SPAPR_MEMORY_BLOCK_SIZE (1 << 28) /* 256MB */
> +#define SPAPR_MEMORY_BLOCK_SIZE_SHIFT 28 /* 256MB */
> +#define SPAPR_MEMORY_BLOCK_SIZE (1 << SPAPR_MEMORY_BLOCK_SIZE_SHIFT)
>  
>  /*
>   * This defines the maximum number of DIMM slots we can have for sPAPR
> -- 
> 2.9.3
>

David Gibson May 1, 2017, 3:56 a.m. UTC | #2

On Thu, Apr 27, 2017 at 05:31:48PM -0300, Eduardo Habkost wrote:
> On Thu, Apr 27, 2017 at 12:12:59PM +0200, Laurent Vivier wrote:
> > When there are more nodes than memory available to put the minimum
> > allowed memory by node, all the memory is put on the last node.
> > 
> > This is because we put (ram_size / nb_numa_nodes) &
> > ~((1 << mc->numa_mem_align_shift) - 1); on each node, and in this
> > case the value is 0. This is particularly true with pseries,
> > as the memory must be aligned to 256MB.
> > 
> > To avoid this problem, this patch uses an error diffusion algorithm [1]
> > to distribute equally the memory on nodes.
> > 
> > Example:
> > 
> > qemu-system-ppc64 -S -nographic  -nodefaults -monitor stdio -m 1G -smp 8 \
> >                   -numa node -numa node -numa node \
> >                   -numa node -numa node -numa node
> > 
> > Before:
> > 
> > (qemu) info numa
> > 6 nodes
> > node 0 cpus: 0 6
> > node 0 size: 0 MB
> > node 1 cpus: 1 7
> > node 1 size: 0 MB
> > node 2 cpus: 2
> > node 2 size: 0 MB
> > node 3 cpus: 3
> > node 3 size: 0 MB
> > node 4 cpus: 4
> > node 4 size: 0 MB
> > node 5 cpus: 5
> > node 5 size: 1024 MB
> > 
> > After:
> > (qemu) info numa
> > 6 nodes
> > node 0 cpus: 0 6
> > node 0 size: 0 MB
> > node 1 cpus: 1 7
> > node 1 size: 256 MB
> > node 2 cpus: 2
> > node 2 size: 0 MB
> > node 3 cpus: 3
> > node 3 size: 256 MB
> > node 4 cpus: 4
> > node 4 size: 256 MB
> > node 5 cpus: 5
> > node 5 size: 256 MB
> > 
> > [1] https://en.wikipedia.org/wiki/Error_diffusion
> > 
> > Signed-off-by: Laurent Vivier <lvivier@redhat.com>
> > ---
> >  hw/ppc/spapr.c         | 21 ++++++++++++++++++++-
> >  include/hw/ppc/spapr.h |  3 ++-
> >  2 files changed, 22 insertions(+), 2 deletions(-)
> > 
> > diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> > index 80d12d0..be498e2 100644
> > --- a/hw/ppc/spapr.c
> > +++ b/hw/ppc/spapr.c
> > @@ -3106,6 +3106,23 @@ static void spapr_pic_print_info(InterruptStatsProvider *obj,
> >      ics_pic_print_info(spapr->ics, mon);
> >  }
> >  
> > +static void spapr_numa_auto_assign_ram(uint64_t *nodes, int nb_nodes,
> > +                                       ram_addr_t size)
> > +{
> > +    int i;
> > +    uint64_t usedmem = 0, node_mem;
> > +    uint64_t granularity = size / nb_nodes;
> > +    uint64_t propagate = 0;
> > +
> > +    for (i = 0; i < nb_nodes - 1; i++) {
> > +        node_mem = (granularity + propagate) & ~(SPAPR_MEMORY_BLOCK_SIZE - 1);
> > +        propagate = granularity + propagate - node_mem;
> > +        nodes[i] = node_mem;
> > +        usedmem += node_mem;
> > +    }
> > +    nodes[i] = ram_size - usedmem;
> > +}
> 
> This new algorithm is a much better default, I would like to
> enable it for all machines by default. We could move it to
> core/machine.c:machine_class_init(), and set numa_auto_assign_ram
> to NULL on pc-*-2.9 and spapr-2.9.

Right.  So, the convention that the default is the new behaviour and
we override to set the older behaviour on previous machine types is
strong enough that I think we should keep it here, even though it adds
some complications.

Note that it might be simpler not to have a "NULL" behaviour.  Instead
the core code can provide both old style and error diffusion versions
of the assign hook.  Set it to the error difusion one in the base
MachineClass, then override it to the old one in previous versioned
machine types (basically just the previous pc and spapr models,
AFAIK).

> 
> > +
> >  static void spapr_machine_class_init(ObjectClass *oc, void *data)
> >  {
> >      MachineClass *mc = MACHINE_CLASS(oc);
> > @@ -3162,7 +3179,8 @@ static void spapr_machine_class_init(ObjectClass *oc, void *data)
> >       * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
> >       * in which LMBs are represented and hot-added
> >       */
> > -    mc->numa_mem_align_shift = 28;
> > +    mc->numa_mem_align_shift = SPAPR_MEMORY_BLOCK_SIZE_SHIFT;
> > +    mc->numa_auto_assign_ram = spapr_numa_auto_assign_ram;
> >  }
> >  
> >  static const TypeInfo spapr_machine_info = {
> > @@ -3242,6 +3260,7 @@ static void spapr_machine_2_9_class_options(MachineClass *mc)
> >  {
> >      spapr_machine_2_10_class_options(mc);
> >      SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_9);
> > +    mc->numa_auto_assign_ram = NULL;
> >  }
> >  
> >  DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
> > diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
> > index 5802f88..8f4a588 100644
> > --- a/include/hw/ppc/spapr.h
> > +++ b/include/hw/ppc/spapr.h
> > @@ -653,7 +653,8 @@ int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
> >  
> >  int spapr_rng_populate_dt(void *fdt);
> >  
> > -#define SPAPR_MEMORY_BLOCK_SIZE (1 << 28) /* 256MB */
> > +#define SPAPR_MEMORY_BLOCK_SIZE_SHIFT 28 /* 256MB */
> > +#define SPAPR_MEMORY_BLOCK_SIZE (1 << SPAPR_MEMORY_BLOCK_SIZE_SHIFT)
> >  
> >  /*
> >   * This defines the maximum number of DIMM slots we can have for sPAPR
>

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 80d12d0..be498e2 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -3106,6 +3106,23 @@  static void spapr_pic_print_info(InterruptStatsProvider *obj,
     ics_pic_print_info(spapr->ics, mon);
 }
 
+static void spapr_numa_auto_assign_ram(uint64_t *nodes, int nb_nodes,
+                                       ram_addr_t size)
+{
+    int i;
+    uint64_t usedmem = 0, node_mem;
+    uint64_t granularity = size / nb_nodes;
+    uint64_t propagate = 0;
+
+    for (i = 0; i < nb_nodes - 1; i++) {
+        node_mem = (granularity + propagate) & ~(SPAPR_MEMORY_BLOCK_SIZE - 1);
+        propagate = granularity + propagate - node_mem;
+        nodes[i] = node_mem;
+        usedmem += node_mem;
+    }
+    nodes[i] = ram_size - usedmem;
+}
+
 static void spapr_machine_class_init(ObjectClass *oc, void *data)
 {
     MachineClass *mc = MACHINE_CLASS(oc);
@@ -3162,7 +3179,8 @@  static void spapr_machine_class_init(ObjectClass *oc, void *data)
      * SPAPR_MEMORY_BLOCK_SIZE (256M) since that's the granularity
      * in which LMBs are represented and hot-added
      */
-    mc->numa_mem_align_shift = 28;
+    mc->numa_mem_align_shift = SPAPR_MEMORY_BLOCK_SIZE_SHIFT;
+    mc->numa_auto_assign_ram = spapr_numa_auto_assign_ram;
 }
 
 static const TypeInfo spapr_machine_info = {
@@ -3242,6 +3260,7 @@  static void spapr_machine_2_9_class_options(MachineClass *mc)
 {
     spapr_machine_2_10_class_options(mc);
     SET_MACHINE_COMPAT(mc, SPAPR_COMPAT_2_9);
+    mc->numa_auto_assign_ram = NULL;
 }
 
 DEFINE_SPAPR_MACHINE(2_9, "2.9", false);
diff --git a/include/hw/ppc/spapr.h b/include/hw/ppc/spapr.h
index 5802f88..8f4a588 100644
--- a/include/hw/ppc/spapr.h
+++ b/include/hw/ppc/spapr.h
@@ -653,7 +653,8 @@  int spapr_rtc_import_offset(sPAPRRTCState *rtc, int64_t legacy_offset);
 
 int spapr_rng_populate_dt(void *fdt);
 
-#define SPAPR_MEMORY_BLOCK_SIZE (1 << 28) /* 256MB */
+#define SPAPR_MEMORY_BLOCK_SIZE_SHIFT 28 /* 256MB */
+#define SPAPR_MEMORY_BLOCK_SIZE (1 << SPAPR_MEMORY_BLOCK_SIZE_SHIFT)
 
 /*
  * This defines the maximum number of DIMM slots we can have for sPAPR

[v2,2/2] numa, spapr: equally distribute memory on nodes

Commit Message

Comments

Patch