Message ID | 1377231003-2816-9-git-send-email-gaowanlong@cn.fujitsu.com |
---|---|
State | New |
Headers | show |
----- Original Message ----- > Set the guest numa nodes memory policies using the mbind(2) > system call node by node. > After this patch, we are able to set guest nodes memory policies > through the QEMU options, this aims to solve the guest cross > nodes memory access performance issue. > And as you all know, if PCI-passthrough is used, > direct-attached-device uses DMA transfer between device and qemu process. > All pages of the guest will be pinned by get_user_pages(). > > KVM_ASSIGN_PCI_DEVICE ioctl > kvm_vm_ioctl_assign_device() > =>kvm_assign_device() > => kvm_iommu_map_memslots() > => kvm_iommu_map_pages() > => kvm_pin_pages() > > So, with direct-attached-device, every guest page's page count will be +1 and > any page migration will not work. AutoNUMA won't either. > > So, we should set the guest nodes memory allocation policies before > the pages are really mapped. > > Signed-off-by: Andre Przywara <andre.przywara@amd.com> > Signed-off-by: Wanlong Gao <gaowanlong@cn.fujitsu.com> > --- > numa.c | 90 > ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 90 insertions(+) > > diff --git a/numa.c b/numa.c > index 4ccc6cb..4a9c368 100644 > --- a/numa.c > +++ b/numa.c > @@ -28,6 +28,16 @@ > #include "qapi-visit.h" > #include "qapi/opts-visitor.h" > #include "qapi/dealloc-visitor.h" > +#include "exec/memory.h" > + > +#ifdef CONFIG_NUMA > +#include <numa.h> > +#include <numaif.h> > +#ifndef MPOL_F_RELATIVE_NODES > +#define MPOL_F_RELATIVE_NODES (1 << 14) > +#define MPOL_F_STATIC_NODES (1 << 15) > +#endif > +#endif > > QemuOptsList qemu_numa_opts = { > .name = "numa", > @@ -219,6 +229,79 @@ void set_numa_nodes(void) > } > } > > +#ifdef CONFIG_NUMA > +static int node_parse_bind_mode(unsigned int nodeid) > +{ > + int bind_mode; > + > + switch (numa_info[nodeid].policy) { > + case NUMA_NODE_POLICY_MEMBIND: > + bind_mode = MPOL_BIND; > + break; > + case NUMA_NODE_POLICY_INTERLEAVE: > + bind_mode = MPOL_INTERLEAVE; > + break; > + case 
NUMA_NODE_POLICY_PREFERRED: > + bind_mode = MPOL_PREFERRED; > + break; > + case NUMA_NODE_POLICY_DEFAULT: > + default: > + bind_mode = MPOL_DEFAULT; > + return bind_mode; > + } > + > + bind_mode |= numa_info[nodeid].relative ? > + MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; > + > + return bind_mode; > +} > +#endif > + > +static int set_node_mem_policy(int nodeid) > +{ > +#ifdef CONFIG_NUMA > + void *ram_ptr; > + RAMBlock *block; > + ram_addr_t len, ram_offset = 0; > + int bind_mode; > + int i; > + > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > + if (!strcmp(block->mr->name, "pc.ram")) { > + break; > + } > + } > + > + if (block->host == NULL) { > + return -1; > + } > + > + ram_ptr = block->host; > + for (i = 0; i < nodeid; i++) { > + len = numa_info[i].node_mem; > + ram_offset += len; > + } > + > + len = numa_info[nodeid].node_mem; > + bind_mode = node_parse_bind_mode(nodeid); > + unsigned long *nodes = numa_info[nodeid].host_mem; > + > + /* This is a workaround for a long standing bug in Linux' > + * mbind implementation, which cuts off the last specified > + * node. To stay compatible should this bug be fixed, we > + * specify one more node and zero this one out. > + */ > + unsigned long maxnode = find_last_bit(nodes, MAX_CPUMASK_BITS); > + clear_bit(maxnode + 1, nodes); This clear_bit() isn't necessary. We know that maxnode+1 is certainly already clear, because find_last_bit() just returned maxnode as the last "non-clear" bit. > + if (mbind(ram_ptr + ram_offset, len, bind_mode, nodes, maxnode + 1, 0)) > { > + perror("mbind"); > + return -1; > + } > +#endif > + > + return 0; > +} > + > void set_numa_modes(void) > { > CPUState *cpu; > @@ -231,4 +314,11 @@ void set_numa_modes(void) > } > } > } > + > + for (i = 0; i < nb_numa_nodes; i++) { > + if (set_node_mem_policy(i) == -1) { > + fprintf(stderr, > + "qemu: can not set host memory policy for node%d\n", i); > + } > + } > } > -- > 1.8.4.rc4 > > >
diff --git a/numa.c b/numa.c index 4ccc6cb..4a9c368 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef CONFIG_NUMA +#include <numa.h> +#include <numaif.h> +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif QemuOptsList qemu_numa_opts = { .name = "numa", @@ -219,6 +229,79 @@ void set_numa_nodes(void) } } +#ifdef CONFIG_NUMA +static int node_parse_bind_mode(unsigned int nodeid) +{ + int bind_mode; + + switch (numa_info[nodeid].policy) { + case NUMA_NODE_POLICY_MEMBIND: + bind_mode = MPOL_BIND; + break; + case NUMA_NODE_POLICY_INTERLEAVE: + bind_mode = MPOL_INTERLEAVE; + break; + case NUMA_NODE_POLICY_PREFERRED: + bind_mode = MPOL_PREFERRED; + break; + case NUMA_NODE_POLICY_DEFAULT: + default: + bind_mode = MPOL_DEFAULT; + return bind_mode; + } + + bind_mode |= numa_info[nodeid].relative ? + MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + + return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef CONFIG_NUMA + void *ram_ptr; + RAMBlock *block; + ram_addr_t len, ram_offset = 0; + int bind_mode; + int i; + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + if (!strcmp(block->mr->name, "pc.ram")) { + break; + } + } + + if (block->host == NULL) { + return -1; + } + + ram_ptr = block->host; + for (i = 0; i < nodeid; i++) { + len = numa_info[i].node_mem; + ram_offset += len; + } + + len = numa_info[nodeid].node_mem; + bind_mode = node_parse_bind_mode(nodeid); + unsigned long *nodes = numa_info[nodeid].host_mem; + + /* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. 
+ */ + unsigned long maxnode = find_last_bit(nodes, MAX_CPUMASK_BITS); + clear_bit(maxnode + 1, nodes); + if (mbind(ram_ptr + ram_offset, len, bind_mode, nodes, maxnode + 1, 0)) { + perror("mbind"); + return -1; + } +#endif + + return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -231,4 +314,11 @@ void set_numa_modes(void) } } } + + for (i = 0; i < nb_numa_nodes; i++) { + if (set_node_mem_policy(i) == -1) { + fprintf(stderr, + "qemu: can not set host memory policy for node%d\n", i); + } + } }