Message ID | 1385086118-11699-11-git-send-email-gaowanlong@cn.fujitsu.com |
---|---|
State | New |
Headers | show |
Il 22/11/2013 03:08, Wanlong Gao ha scritto: > +static int set_node_mem_policy(int nodeid) > +{ > +#ifdef __linux__ > + void *ram_ptr; > + RAMBlock *block; > + ram_addr_t len, ram_offset = 0; > + int bind_mode; > + int i; > + > + QTAILQ_FOREACH(block, &ram_list.blocks, next) { > + if (!strcmp(block->mr->name, "pc.ram")) { This is not acceptable, "pc.ram" is a board-specific name. I think instead set_node_mem_policy could be something like int memory_region_set_mem_policy(MemoryRegion *mr, uint64_t start, uint64_t length, uint64_t offset); that applies the NUMA policies specified for [offset, offset+length) to the host physical address [ptr+start, ptr+start+length), where ptr is memory_region_get_ram_ptr(mr). Each board then can call the function after it adds RAM with memory_region_add_subregion. Paolo > + break; > + } > + } > + > + if (block->host == NULL) { > + return -1; > + } > + > + ram_ptr = block->host; > + for (i = 0; i < nodeid; i++) { > + len = numa_info[i].node_mem; > + ram_offset += len; > + } > + > + len = numa_info[nodeid].node_mem; > + bind_mode = node_parse_bind_mode(nodeid); > + unsigned long *nodes = numa_info[nodeid].host_mem; > + > + /* This is a workaround for a long standing bug in Linux' > + * mbind implementation, which cuts off the last specified > + * node. To stay compatible should this bug be fixed, we > + * specify one more node and zero this one out. > + */ > + unsigned long maxnode = find_last_bit(nodes, MAX_NODES); > + if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, > + nodes, maxnode + 2, 0)) { > + perror("mbind"); > + return -1; > + } Also, it's still not clear to me why we're not using libnuma. > +#endif > + > + return 0; > +} > + > void set_numa_modes(void) > { > CPUState *cpu; > @@ -240,4 +319,11 @@ void set_numa_modes(void) > } > } > } > + > + for (i = 0; i < nb_numa_nodes; i++) { > + if (set_node_mem_policy(i) == -1) { > + fprintf(stderr, > + "qemu: can not set host memory policy for node%d\n", i); > + } > + } > } >
On 11/27/2013 10:35 PM, Paolo Bonzini wrote: > Il 22/11/2013 03:08, Wanlong Gao ha scritto: >> +static int set_node_mem_policy(int nodeid) >> +{ >> +#ifdef __linux__ >> + void *ram_ptr; >> + RAMBlock *block; >> + ram_addr_t len, ram_offset = 0; >> + int bind_mode; >> + int i; >> + >> + QTAILQ_FOREACH(block, &ram_list.blocks, next) { >> + if (!strcmp(block->mr->name, "pc.ram")) { > > This is not acceptable, "pc.ram" is a board-specific name. > > I think instead set_node_mem_policy could be something like > > int memory_region_set_mem_policy(MemoryRegion *mr, > uint64_t start, uint64_t length, > uint64_t offset); > > that applies the NUMA policies specified for [offset, offset+length) to > the host physical address [ptr+start, ptr+start+length), where ptr is > memory_region_get_ram_ptr(mr). > > Each board then can call the function after it adds RAM with > memory_region_add_subregion. Got it, than you. Thanks, Wanlong Gao > > Paolo > >> + break; >> + } >> + } >> + >> + if (block->host == NULL) { >> + return -1; >> + } >> + >> + ram_ptr = block->host; >> + for (i = 0; i < nodeid; i++) { >> + len = numa_info[i].node_mem; >> + ram_offset += len; >> + } >> + >> + len = numa_info[nodeid].node_mem; >> + bind_mode = node_parse_bind_mode(nodeid); >> + unsigned long *nodes = numa_info[nodeid].host_mem; >> + >> + /* This is a workaround for a long standing bug in Linux' >> + * mbind implementation, which cuts off the last specified >> + * node. To stay compatible should this bug be fixed, we >> + * specify one more node and zero this one out. >> + */ >> + unsigned long maxnode = find_last_bit(nodes, MAX_NODES); >> + if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, >> + nodes, maxnode + 2, 0)) { >> + perror("mbind"); >> + return -1; >> + } > > Also, it's still not clear to me why we're not using libnuma. > >> +#endif >> + >> + return 0; >> +} >> + >> void set_numa_modes(void) >> { >> CPUState *cpu; >> @@ -240,4 +319,11 @@ void set_numa_modes(void) >> } >> } >> } >> + >> + for (i = 0; i < nb_numa_nodes; i++) { >> + if (set_node_mem_policy(i) == -1) { >> + fprintf(stderr, >> + "qemu: can not set host memory policy for node%d\n", i); >> + } >> + } >> } >> > >
On Wed, Nov 27, 2013 at 03:35:53PM +0100, Paolo Bonzini wrote: > > + > > + len = numa_info[nodeid].node_mem; > > + bind_mode = node_parse_bind_mode(nodeid); > > + unsigned long *nodes = numa_info[nodeid].host_mem; > > + > > + /* This is a workaround for a long standing bug in Linux' > > + * mbind implementation, which cuts off the last specified > > + * node. To stay compatible should this bug be fixed, we > > + * specify one more node and zero this one out. > > + */ > > + unsigned long maxnode = find_last_bit(nodes, MAX_NODES); > > + if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, > > + nodes, maxnode + 2, 0)) { > > + perror("mbind"); > > + return -1; > > + } > > Also, it's still not clear to me why we're not using libnuma. > There only seem to be two reasons to use it right now; 1) it provides a wrapper around mbind 2) it exists. IMHO libnuma needs some work before it would be suitable for qemu to marry it. Its interfaces don't buy a lot of elegance, it only reads system information on link time (i.e. no good if you want to consider hotplug of cpus and nodes), and it's currently linux specific anyway. It's a good idea, but is it worth yet another qemu build dependency? drew
diff --git a/numa.c b/numa.c index da4dbbd..915a67a 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include <sys/syscall.h> +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ + int bind_mode; + + switch (numa_info[nodeid].policy) { + case NUMA_NODE_POLICY_DEFAULT: + case NUMA_NODE_POLICY_PREFERRED: + case NUMA_NODE_POLICY_MEMBIND: + case NUMA_NODE_POLICY_INTERLEAVE: + bind_mode = numa_info[nodeid].policy; + break; + default: + bind_mode = NUMA_NODE_POLICY_DEFAULT; + return bind_mode; + } + + bind_mode |= numa_info[nodeid].relative ? + MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + + return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ + void *ram_ptr; + RAMBlock *block; + ram_addr_t len, ram_offset = 0; + int bind_mode; + int i; + + QTAILQ_FOREACH(block, &ram_list.blocks, next) { + if (!strcmp(block->mr->name, "pc.ram")) { + break; + } + } + + if (block->host == NULL) { + return -1; + } + + ram_ptr = block->host; + for (i = 0; i < nodeid; i++) { + len = numa_info[i].node_mem; + ram_offset += len; + } + + len = numa_info[nodeid].node_mem; + bind_mode = node_parse_bind_mode(nodeid); + unsigned long *nodes = numa_info[nodeid].host_mem; + + /* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ + unsigned long maxnode = find_last_bit(nodes, MAX_NODES); + if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, + nodes, maxnode + 2, 0)) { + perror("mbind"); + return -1; + } +#endif + + return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -240,4 +319,11 @@ void set_numa_modes(void) } } } + + for (i = 0; i < nb_numa_nodes; i++) { + if (set_node_mem_policy(i) == -1) { + fprintf(stderr, + "qemu: can not set host memory policy for node%d\n", i); + } + } }