diff mbox

[v18,10/14] numa: add -numa node, memdev= option

Message ID b87b2fb5255071e1f47c02f25949f2c73c856652.1392794450.git.hutao@cn.fujitsu.com
State New
Headers show

Commit Message

Hu Tao Feb. 19, 2014, 7:54 a.m. UTC
From: Paolo Bonzini <pbonzini@redhat.com>

This option provides the infrastructure for binding guest NUMA nodes
to host NUMA nodes.  For example:

 -object memory-ram,size=1024M,policy=membind,host-nodes=0,id=ram-node0 \
 -numa node,nodeid=0,cpus=0,memdev=ram-node0 \
 -object memory-ram,size=1024M,policy=interleave,host-nodes=1-3,id=ram-node1 \
 -numa node,nodeid=1,cpus=1,memdev=ram-node1

The option replaces "-numa mem".

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Conflicts:
	include/sysemu/sysemu.h
	numa.c

Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
---
 include/sysemu/sysemu.h |  2 ++
 numa.c                  | 64 +++++++++++++++++++++++++++++++++++++++++++++++--
 qapi-schema.json        |  6 ++++-
 3 files changed, 69 insertions(+), 3 deletions(-)

Comments

Igor Mammedov Feb. 19, 2014, 9:50 a.m. UTC | #1
On Wed, 19 Feb 2014 15:54:01 +0800
Hu Tao <hutao@cn.fujitsu.com> wrote:

> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> This option provides the infrastructure for binding guest NUMA nodes
> to host NUMA nodes.  For example:
> 
>  -object memory-ram,size=1024M,policy=membind,host-nodes=0,id=ram-node0 \
>  -numa node,nodeid=0,cpus=0,memdev=ram-node0 \
>  -object memory-ram,size=1024M,policy=interleave,host-nodes=1-3,id=ram-node1 \
>  -numa node,nodeid=1,cpus=1,memdev=ram-node1
> 
> The option replaces "-numa mem".
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> Conflicts:
> 	include/sysemu/sysemu.h
> 	numa.c
> 
> Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> ---
>  include/sysemu/sysemu.h |  2 ++
>  numa.c                  | 64 +++++++++++++++++++++++++++++++++++++++++++++++--
>  qapi-schema.json        |  6 ++++-
>  3 files changed, 69 insertions(+), 3 deletions(-)
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index e9da760..acfc0c7 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -12,6 +12,7 @@
>  #include "qemu/bitmap.h"
>  #include "qom/object.h"
>  #include "hw/boards.h"
> +#include "sysemu/hostmem.h"
>  
>  /* vl.c */
>  
> @@ -140,6 +141,7 @@ extern int nb_numa_nodes;
>  typedef struct node_info {
>      uint64_t node_mem;
>      DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
> +    HostMemoryBackend *node_memdev;
>  } NodeInfo;
>  extern NodeInfo numa_info[MAX_NODES];
>  void set_numa_nodes(void);
> diff --git a/numa.c b/numa.c
> index 403b08b..ca55ad7 100644
> --- a/numa.c
> +++ b/numa.c
> @@ -27,6 +27,8 @@
>  #include "qapi-visit.h"
>  #include "qapi/opts-visitor.h"
>  #include "qapi/dealloc-visitor.h"
> +#include "qapi/qmp/qerror.h"
> +
>  QemuOptsList qemu_numa_opts = {
>      .name = "numa",
>      .implied_opt_name = "type",
> @@ -34,10 +36,13 @@ QemuOptsList qemu_numa_opts = {
>      .desc = { { 0 } } /* validated with OptsVisitor */
>  };
>  
> +static int have_memdevs = -1;
> +
>  static int numa_node_parse(NumaNodeOptions *opts)
>  {
>      uint16_t nodenr;
>      uint16List *cpus = NULL;
> +    Error *local_err = NULL;
>  
>      if (opts->has_nodeid) {
>          nodenr = opts->nodeid;
> @@ -60,6 +65,19 @@ static int numa_node_parse(NumaNodeOptions *opts)
>          bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1);
>      }
>  
> +    if (opts->has_mem && opts->has_memdev) {
> +        fprintf(stderr, "qemu: cannot specify both mem= and memdev=\n");
> +        return -1;
> +    }
> +
> +    if (have_memdevs == -1) {
> +        have_memdevs = opts->has_memdev;
> +    }
> +    if (opts->has_memdev != have_memdevs) {
> +        fprintf(stderr, "qemu: memdev option must be specified for either "
> +                "all or no nodes\n");
> +    }
> +
>      if (opts->has_mem) {
>          int64_t mem_size;
>          char *endptr;
> @@ -70,7 +88,19 @@ static int numa_node_parse(NumaNodeOptions *opts)
>          }
>          numa_info[nodenr].node_mem = mem_size;
>      }
> +    if (opts->has_memdev) {
> +        Object *o;
> +        o = object_resolve_path_type(opts->memdev, TYPE_MEMORY_BACKEND, NULL);
> +        if (!o) {
> +            error_setg(&local_err, "memdev=%s is ambiguous", opts->memdev);
> +            qerror_report_err(local_err);
> +            return -1;
> +        }
>  
> +        object_ref(o);
> +        numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
> +        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
If you make numa_info a QOM object with node_memdev as a link<> property,
then the above hunk could be replaced with just setting the link.
And node_mem could be replaced with a read-only property that reads the size
directly from memdev, avoiding data duplication.

As a side effect, numa_info will also become accessible for introspection
using the QOM interface. Something like:
 qom-list /machine/memory-node[X]
 qom-get /machine/memory-node[X]/memory_size


> +    }
>      return 0;
>  }
>  
> @@ -189,12 +219,42 @@ void set_numa_modes(void)
>      }
>  }
>  
> +static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
> +                                           const char *name,
> +                                           QEMUMachineInitArgs *args)
> +{
> +    uint64_t ram_size = args->ram_size;
> +
> +    memory_region_init_ram(mr, owner, name, ram_size);
> +    vmstate_register_ram_global(mr);
> +}
> +
>  void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
>                                            const char *name,
>                                            QEMUMachineInitArgs *args)
>  {
>      uint64_t ram_size = args->ram_size;
> +    uint64_t addr = 0;
> +    int i;
>  
> -    memory_region_init_ram(mr, owner, name, ram_size);
> -    vmstate_register_ram_global(mr);
> +    if (nb_numa_nodes == 0 || !have_memdevs) {
> +        allocate_system_memory_nonnuma(mr, owner, name, args);
> +        return;
> +    }
> +
> +    memory_region_init(mr, owner, name, ram_size);
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        Error *local_err = NULL;
> +        uint64_t size = numa_info[i].node_mem;
> +        HostMemoryBackend *backend = numa_info[i].node_memdev;
> +        MemoryRegion *seg = host_memory_backend_get_memory(backend, &local_err);
> +        if (local_err) {
> +            qerror_report_err(local_err);
> +            exit(1);
> +        }
> +
> +        memory_region_add_subregion(mr, addr, seg);
> +        vmstate_register_ram_global(seg);
> +        addr += size;
> +    }
>  }
> diff --git a/qapi-schema.json b/qapi-schema.json
> index a2839b8..498ea9b 100644
> --- a/qapi-schema.json
> +++ b/qapi-schema.json
> @@ -4441,7 +4441,10 @@
>  #
>  # @cpus: #optional VCPUs belong to this node
>  #
> -# @mem: #optional memory size of this node
> +# @memdev: #optional memory backend object.  If specified for one node,
> +#          it must be specified for all nodes.
> +#
> +# @mem: #optional memory size of this node; mutually exclusive with @memdev.
>  #
>  # Since: 2.0
>  ##
> @@ -4449,4 +4452,5 @@
>    'data': {
>     '*nodeid': 'uint16',
>     '*cpus':   ['uint16'],
> +   '*memdev': 'str',
>     '*mem':    'str' }}
Paolo Bonzini Feb. 19, 2014, 11:53 a.m. UTC | #2
Il 19/02/2014 10:50, Igor Mammedov ha scritto:
>> > +        numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
>> > +        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
> If you make numa_info a QOM object with node_memdev as a link<> property,
> then the above hunk could be replaced with just setting the link.
> And node_mem could be replaced with a read-only property that reads the size
> directly from memdev, avoiding data duplication.
>
> As a side effect, numa_info will also become accessible for introspection
> using the QOM interface. Something like:
>  qom-list /machine/memory-node[X]
>  qom-get /machine/memory-node[X]/memory_size

I agree, but I think we can do it on top.

Paolo
Eric Blake March 4, 2014, 12:10 a.m. UTC | #3
On 02/19/2014 12:54 AM, Hu Tao wrote:
> From: Paolo Bonzini <pbonzini@redhat.com>
> 
> This option provides the infrastructure for binding guest NUMA nodes
> to host NUMA nodes.  For example:
> 
>  -object memory-ram,size=1024M,policy=membind,host-nodes=0,id=ram-node0 \
>  -numa node,nodeid=0,cpus=0,memdev=ram-node0 \
>  -object memory-ram,size=1024M,policy=interleave,host-nodes=1-3,id=ram-node1 \
>  -numa node,nodeid=1,cpus=1,memdev=ram-node1
> 
> The option replaces "-numa mem".
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> 
> Conflicts:
> 	include/sysemu/sysemu.h
> 	numa.c

Until this patch is taken upstream, a 'Conflicts:' section in your
commit message doesn't make sense.  That is useful for downstream or
stable branch backports, but doesn't belong on the mainline branch.

> 
> Signed-off-by: Hu Tao <hutao@cn.fujitsu.com>
> ---
>  include/sysemu/sysemu.h |  2 ++
>  numa.c                  | 64 +++++++++++++++++++++++++++++++++++++++++++++++--
>  qapi-schema.json        |  6 ++++-
>  3 files changed, 69 insertions(+), 3 deletions(-)
>
Hu Tao March 4, 2014, 2:20 a.m. UTC | #4
On Mon, Mar 03, 2014 at 05:10:20PM -0700, Eric Blake wrote:
> On 02/19/2014 12:54 AM, Hu Tao wrote:
> > From: Paolo Bonzini <pbonzini@redhat.com>
> > 
> > This option provides the infrastructure for binding guest NUMA nodes
> > to host NUMA nodes.  For example:
> > 
> >  -object memory-ram,size=1024M,policy=membind,host-nodes=0,id=ram-node0 \
> >  -numa node,nodeid=0,cpus=0,memdev=ram-node0 \
> >  -object memory-ram,size=1024M,policy=interleave,host-nodes=1-3,id=ram-node1 \
> >  -numa node,nodeid=1,cpus=1,memdev=ram-node1
> > 
> > The option replaces "-numa mem".
> > 
> > Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
> > 
> > Conflicts:
> > 	include/sysemu/sysemu.h
> > 	numa.c
> 
> Until this patch is taken upstream, a 'Conflicts:' section in your
> commit message doesn't make sense.  That is useful for downstream or
> stable branch backports, but doesn't belong on the mainline branch.

I think this was introduced during rebase. Will fix.

Thanks.
diff mbox

Patch

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index e9da760..acfc0c7 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -12,6 +12,7 @@ 
 #include "qemu/bitmap.h"
 #include "qom/object.h"
 #include "hw/boards.h"
+#include "sysemu/hostmem.h"
 
 /* vl.c */
 
@@ -140,6 +141,7 @@  extern int nb_numa_nodes;
 typedef struct node_info {
     uint64_t node_mem;
     DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS);
+    HostMemoryBackend *node_memdev;
 } NodeInfo;
 extern NodeInfo numa_info[MAX_NODES];
 void set_numa_nodes(void);
diff --git a/numa.c b/numa.c
index 403b08b..ca55ad7 100644
--- a/numa.c
+++ b/numa.c
@@ -27,6 +27,8 @@ 
 #include "qapi-visit.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/dealloc-visitor.h"
+#include "qapi/qmp/qerror.h"
+
 QemuOptsList qemu_numa_opts = {
     .name = "numa",
     .implied_opt_name = "type",
@@ -34,10 +36,13 @@  QemuOptsList qemu_numa_opts = {
     .desc = { { 0 } } /* validated with OptsVisitor */
 };
 
+static int have_memdevs = -1;
+
 static int numa_node_parse(NumaNodeOptions *opts)
 {
     uint16_t nodenr;
     uint16List *cpus = NULL;
+    Error *local_err = NULL;
 
     if (opts->has_nodeid) {
         nodenr = opts->nodeid;
@@ -60,6 +65,19 @@  static int numa_node_parse(NumaNodeOptions *opts)
         bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1);
     }
 
+    if (opts->has_mem && opts->has_memdev) {
+        fprintf(stderr, "qemu: cannot specify both mem= and memdev=\n");
+        return -1;
+    }
+
+    if (have_memdevs == -1) {
+        have_memdevs = opts->has_memdev;
+    }
+    if (opts->has_memdev != have_memdevs) {
+        fprintf(stderr, "qemu: memdev option must be specified for either "
+                "all or no nodes\n");
+    }
+
     if (opts->has_mem) {
         int64_t mem_size;
         char *endptr;
@@ -70,7 +88,19 @@  static int numa_node_parse(NumaNodeOptions *opts)
         }
         numa_info[nodenr].node_mem = mem_size;
     }
+    if (opts->has_memdev) {
+        Object *o;
+        o = object_resolve_path_type(opts->memdev, TYPE_MEMORY_BACKEND, NULL);
+        if (!o) {
+            error_setg(&local_err, "memdev=%s is ambiguous", opts->memdev);
+            qerror_report_err(local_err);
+            return -1;
+        }
 
+        object_ref(o);
+        numa_info[nodenr].node_mem = object_property_get_int(o, "size", NULL);
+        numa_info[nodenr].node_memdev = MEMORY_BACKEND(o);
+    }
     return 0;
 }
 
@@ -189,12 +219,42 @@  void set_numa_modes(void)
     }
 }
 
+static void allocate_system_memory_nonnuma(MemoryRegion *mr, Object *owner,
+                                           const char *name,
+                                           QEMUMachineInitArgs *args)
+{
+    uint64_t ram_size = args->ram_size;
+
+    memory_region_init_ram(mr, owner, name, ram_size);
+    vmstate_register_ram_global(mr);
+}
+
 void memory_region_allocate_system_memory(MemoryRegion *mr, Object *owner,
                                           const char *name,
                                           QEMUMachineInitArgs *args)
 {
     uint64_t ram_size = args->ram_size;
+    uint64_t addr = 0;
+    int i;
 
-    memory_region_init_ram(mr, owner, name, ram_size);
-    vmstate_register_ram_global(mr);
+    if (nb_numa_nodes == 0 || !have_memdevs) {
+        allocate_system_memory_nonnuma(mr, owner, name, args);
+        return;
+    }
+
+    memory_region_init(mr, owner, name, ram_size);
+    for (i = 0; i < nb_numa_nodes; i++) {
+        Error *local_err = NULL;
+        uint64_t size = numa_info[i].node_mem;
+        HostMemoryBackend *backend = numa_info[i].node_memdev;
+        MemoryRegion *seg = host_memory_backend_get_memory(backend, &local_err);
+        if (local_err) {
+            qerror_report_err(local_err);
+            exit(1);
+        }
+
+        memory_region_add_subregion(mr, addr, seg);
+        vmstate_register_ram_global(seg);
+        addr += size;
+    }
 }
diff --git a/qapi-schema.json b/qapi-schema.json
index a2839b8..498ea9b 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -4441,7 +4441,10 @@ 
 #
 # @cpus: #optional VCPUs belong to this node
 #
-# @mem: #optional memory size of this node
+# @memdev: #optional memory backend object.  If specified for one node,
+#          it must be specified for all nodes.
+#
+# @mem: #optional memory size of this node; mutually exclusive with @memdev.
 #
 # Since: 2.0
 ##
@@ -4449,4 +4452,5 @@ 
   'data': {
    '*nodeid': 'uint16',
    '*cpus':   ['uint16'],
+   '*memdev': 'str',
    '*mem':    'str' }}