diff mbox

[V16,09/11] NUMA: set guest numa nodes memory policy

Message ID 1385086118-11699-11-git-send-email-gaowanlong@cn.fujitsu.com
State New
Headers show

Commit Message

Wanlong Gao Nov. 22, 2013, 2:08 a.m. UTC
Set the guest numa nodes memory policies using the mbind(2)
system call node by node.
After this patch, we are able to set guest nodes memory policies
through the QEMU options; this aims to solve the guest cross-node
memory access performance issue.
And as you all know, if PCI-passthrough is used,
direct-attached-device uses DMA transfer between device and qemu process.
All pages of the guest will be pinned by get_user_pages().

KVM_ASSIGN_PCI_DEVICE ioctl
  kvm_vm_ioctl_assign_device()
    =>kvm_assign_device()
      => kvm_iommu_map_memslots()
        => kvm_iommu_map_pages()
           => kvm_pin_pages()

So, with a direct-attached device, the page count of every guest page will be
incremented by 1, and any page migration will not work. AutoNUMA won't work either.

So, we should set the guest nodes memory allocation policies before
the pages are really mapped.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com>
---
 numa.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

Comments

Paolo Bonzini Nov. 27, 2013, 2:35 p.m. UTC | #1
Il 22/11/2013 03:08, Wanlong Gao ha scritto:
> +static int set_node_mem_policy(int nodeid)
> +{
> +#ifdef __linux__
> +    void *ram_ptr;
> +    RAMBlock *block;
> +    ram_addr_t len, ram_offset = 0;
> +    int bind_mode;
> +    int i;
> +
> +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
> +        if (!strcmp(block->mr->name, "pc.ram")) {

This is not acceptable, "pc.ram" is a board-specific name.

I think instead set_node_mem_policy could be something like

int memory_region_set_mem_policy(MemoryRegion *mr,
                                 uint64_t start, uint64_t length,
                                 uint64_t offset);

that applies the NUMA policies specified for [offset, offset+length) to
the host physical address [ptr+start, ptr+start+length), where ptr is
memory_region_get_ram_ptr(mr).

Each board then can call the function after it adds RAM with
memory_region_add_subregion.

Paolo

> +            break;
> +        }
> +    }
> +
> +    if (block->host == NULL) {
> +        return -1;
> +    }
> +
> +    ram_ptr = block->host;
> +    for (i = 0; i < nodeid; i++) {
> +        len = numa_info[i].node_mem;
> +        ram_offset += len;
> +    }
> +
> +    len = numa_info[nodeid].node_mem;
> +    bind_mode = node_parse_bind_mode(nodeid);
> +    unsigned long *nodes = numa_info[nodeid].host_mem;
> +
> +    /* This is a workaround for a long standing bug in Linux'
> +     * mbind implementation, which cuts off the last specified
> +     * node. To stay compatible should this bug be fixed, we
> +     * specify one more node and zero this one out.
> +     */
> +    unsigned long maxnode = find_last_bit(nodes, MAX_NODES);
> +    if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode,
> +                nodes, maxnode + 2, 0)) {
> +            perror("mbind");
> +            return -1;
> +    }

Also, it's still not clear to me why we're not using libnuma.

> +#endif
> +
> +    return 0;
> +}
> +
>  void set_numa_modes(void)
>  {
>      CPUState *cpu;
> @@ -240,4 +319,11 @@ void set_numa_modes(void)
>              }
>          }
>      }
> +
> +    for (i = 0; i < nb_numa_nodes; i++) {
> +        if (set_node_mem_policy(i) == -1) {
> +            fprintf(stderr,
> +                    "qemu: can not set host memory policy for node%d\n", i);
> +        }
> +    }
>  }
>
Wanlong Gao Nov. 27, 2013, 3:09 p.m. UTC | #2
On 11/27/2013 10:35 PM, Paolo Bonzini wrote:
> Il 22/11/2013 03:08, Wanlong Gao ha scritto:
>> +static int set_node_mem_policy(int nodeid)
>> +{
>> +#ifdef __linux__
>> +    void *ram_ptr;
>> +    RAMBlock *block;
>> +    ram_addr_t len, ram_offset = 0;
>> +    int bind_mode;
>> +    int i;
>> +
>> +    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
>> +        if (!strcmp(block->mr->name, "pc.ram")) {
> 
> This is not acceptable, "pc.ram" is a board-specific name.
> 
> I think instead set_node_mem_policy could be something like
> 
> int memory_region_set_mem_policy(MemoryRegion *mr,
>                                  uint64_t start, uint64_t length,
>                                  uint64_t offset);
> 
> that applies the NUMA policies specified for [offset, offset+length) to
> the host physical address [ptr+start, ptr+start+length), where ptr is
> memory_region_get_ram_ptr(mr).
> 
> Each board then can call the function after it adds RAM with
> memory_region_add_subregion.

Got it, thank you.

Thanks,
Wanlong Gao


> 
> Paolo
> 
>> +            break;
>> +        }
>> +    }
>> +
>> +    if (block->host == NULL) {
>> +        return -1;
>> +    }
>> +
>> +    ram_ptr = block->host;
>> +    for (i = 0; i < nodeid; i++) {
>> +        len = numa_info[i].node_mem;
>> +        ram_offset += len;
>> +    }
>> +
>> +    len = numa_info[nodeid].node_mem;
>> +    bind_mode = node_parse_bind_mode(nodeid);
>> +    unsigned long *nodes = numa_info[nodeid].host_mem;
>> +
>> +    /* This is a workaround for a long standing bug in Linux'
>> +     * mbind implementation, which cuts off the last specified
>> +     * node. To stay compatible should this bug be fixed, we
>> +     * specify one more node and zero this one out.
>> +     */
>> +    unsigned long maxnode = find_last_bit(nodes, MAX_NODES);
>> +    if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode,
>> +                nodes, maxnode + 2, 0)) {
>> +            perror("mbind");
>> +            return -1;
>> +    }
> 
> Also, it's still not clear to me why we're not using libnuma.
> 
>> +#endif
>> +
>> +    return 0;
>> +}
>> +
>>  void set_numa_modes(void)
>>  {
>>      CPUState *cpu;
>> @@ -240,4 +319,11 @@ void set_numa_modes(void)
>>              }
>>          }
>>      }
>> +
>> +    for (i = 0; i < nb_numa_nodes; i++) {
>> +        if (set_node_mem_policy(i) == -1) {
>> +            fprintf(stderr,
>> +                    "qemu: can not set host memory policy for node%d\n", i);
>> +        }
>> +    }
>>  }
>>
> 
>
Andrew Jones Nov. 27, 2013, 4:19 p.m. UTC | #3
On Wed, Nov 27, 2013 at 03:35:53PM +0100, Paolo Bonzini wrote:
> > +
> > +    len = numa_info[nodeid].node_mem;
> > +    bind_mode = node_parse_bind_mode(nodeid);
> > +    unsigned long *nodes = numa_info[nodeid].host_mem;
> > +
> > +    /* This is a workaround for a long standing bug in Linux'
> > +     * mbind implementation, which cuts off the last specified
> > +     * node. To stay compatible should this bug be fixed, we
> > +     * specify one more node and zero this one out.
> > +     */
> > +    unsigned long maxnode = find_last_bit(nodes, MAX_NODES);
> > +    if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode,
> > +                nodes, maxnode + 2, 0)) {
> > +            perror("mbind");
> > +            return -1;
> > +    }
> 
> Also, it's still not clear to me why we're not using libnuma.
>

There only seem to be two reasons to use it right now; 1)
it provides a wrapper around mbind 2) it exists. IMHO libnuma
needs some work before it would be suitable for qemu to marry
it. Its interfaces don't buy a lot of elegance, it only reads
system information at link time (i.e. no good if you want to
consider hotplug of cpus and nodes), and it's currently linux
specific anyway.  It's a good idea, but is it worth yet another
qemu build dependency?

drew
diff mbox

Patch

diff --git a/numa.c b/numa.c
index da4dbbd..915a67a 100644
--- a/numa.c
+++ b/numa.c
@@ -27,6 +27,16 @@ 
 #include "qapi-visit.h"
 #include "qapi/opts-visitor.h"
 #include "qapi/dealloc-visitor.h"
+#include "exec/memory.h"
+
+#ifdef __linux__
+#include <sys/syscall.h>
+#ifndef MPOL_F_RELATIVE_NODES
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_F_STATIC_NODES   (1 << 15)
+#endif
+#endif
+
 QemuOptsList qemu_numa_opts = {
     .name = "numa",
     .implied_opt_name = "type",
@@ -228,6 +238,75 @@  void set_numa_nodes(void)
     }
 }
 
+#ifdef __linux__
+static int node_parse_bind_mode(unsigned int nodeid)
+{
+    int bind_mode;
+
+    switch (numa_info[nodeid].policy) {
+    case NUMA_NODE_POLICY_DEFAULT:
+    case NUMA_NODE_POLICY_PREFERRED:
+    case NUMA_NODE_POLICY_MEMBIND:
+    case NUMA_NODE_POLICY_INTERLEAVE:
+        bind_mode = numa_info[nodeid].policy;
+        break;
+    default:
+        bind_mode = NUMA_NODE_POLICY_DEFAULT;
+        return bind_mode;
+    }
+
+    bind_mode |= numa_info[nodeid].relative ?
+        MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES;
+
+    return bind_mode;
+}
+#endif
+
+static int set_node_mem_policy(int nodeid)
+{
+#ifdef __linux__
+    void *ram_ptr;
+    RAMBlock *block;
+    ram_addr_t len, ram_offset = 0;
+    int bind_mode;
+    int i;
+
+    QTAILQ_FOREACH(block, &ram_list.blocks, next) {
+        if (!strcmp(block->mr->name, "pc.ram")) {
+            break;
+        }
+    }
+
+    if (block->host == NULL) {
+        return -1;
+    }
+
+    ram_ptr = block->host;
+    for (i = 0; i < nodeid; i++) {
+        len = numa_info[i].node_mem;
+        ram_offset += len;
+    }
+
+    len = numa_info[nodeid].node_mem;
+    bind_mode = node_parse_bind_mode(nodeid);
+    unsigned long *nodes = numa_info[nodeid].host_mem;
+
+    /* This is a workaround for a long standing bug in Linux'
+     * mbind implementation, which cuts off the last specified
+     * node. To stay compatible should this bug be fixed, we
+     * specify one more node and zero this one out.
+     */
+    unsigned long maxnode = find_last_bit(nodes, MAX_NODES);
+    if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode,
+                nodes, maxnode + 2, 0)) {
+            perror("mbind");
+            return -1;
+    }
+#endif
+
+    return 0;
+}
+
 void set_numa_modes(void)
 {
     CPUState *cpu;
@@ -240,4 +319,11 @@  void set_numa_modes(void)
             }
         }
     }
+
+    for (i = 0; i < nb_numa_nodes; i++) {
+        if (set_node_mem_policy(i) == -1) {
+            fprintf(stderr,
+                    "qemu: can not set host memory policy for node%d\n", i);
+        }
+    }
 }