@@ -1178,6 +1178,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram,
0, below_4g_mem_size);
memory_region_add_subregion(system_memory, 0, ram_below_4g);
+ if (memory_region_set_mem_policy(ram_below_4g, 0, below_4g_mem_size, 0)) {
+ fprintf(stderr, "qemu: set below 4g memory policy failed\n");
+ exit(1);
+ }
e820_add_entry(0, below_4g_mem_size, E820_RAM);
if (above_4g_mem_size > 0) {
ram_above_4g = g_malloc(sizeof(*ram_above_4g));
@@ -1185,6 +1189,11 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory,
below_4g_mem_size, above_4g_mem_size);
memory_region_add_subregion(system_memory, 0x100000000ULL,
ram_above_4g);
+ if (memory_region_set_mem_policy(ram_above_4g, 0, above_4g_mem_size,
+ below_4g_mem_size)) {
+ fprintf(stderr, "qemu: set above 4g memory policy failed\n");
+ exit(1);
+ }
e820_add_entry(0x100000000ULL, above_4g_mem_size, E820_RAM);
}
@@ -905,6 +905,21 @@ void memory_region_transaction_begin(void);
void memory_region_transaction_commit(void);
/**
+ * memory_region_set_mem_policy: Set memory policy
+ *
+ * Set the memory policy for the specified area.
+ *
+ * @mr: a MemoryRegion we are setting memory policy for
+ * @start: the start offset of the specific region in this MemoryRegion
+ * @length: the specific memory area length
+ * @offset: the start offset of the specific area in NUMA setting
+ */
+int memory_region_set_mem_policy(MemoryRegion *mr,
+ ram_addr_t start,
+ ram_addr_t length,
+ ram_addr_t offset);
+
+/**
* memory_listener_register: register callbacks to be called when memory
* sections are mapped or unmapped into an address
* space
@@ -27,6 +27,16 @@
#include "qapi-visit.h"
#include "qapi/opts-visitor.h"
#include "qapi/dealloc-visitor.h"
+#include "exec/memory.h"
+
+#ifdef __linux__
+#include <sys/syscall.h>
+#ifndef MPOL_F_RELATIVE_NODES
+#define MPOL_F_RELATIVE_NODES (1 << 14)
+#define MPOL_F_STATIC_NODES (1 << 15)
+#endif
+#endif
+
QemuOptsList qemu_numa_opts = {
.name = "numa",
.implied_opt_name = "type",
@@ -228,6 +238,95 @@ void set_numa_nodes(void)
}
}
+#ifdef __linux__
+static int node_parse_bind_mode(unsigned int nodeid)
+{
+ int bind_mode;
+
+ switch (numa_info[nodeid].policy) {
+ case NUMA_NODE_POLICY_DEFAULT:
+ case NUMA_NODE_POLICY_PREFERRED:
+ case NUMA_NODE_POLICY_MEMBIND:
+ case NUMA_NODE_POLICY_INTERLEAVE:
+ bind_mode = numa_info[nodeid].policy;
+ break;
+ default:
+ bind_mode = NUMA_NODE_POLICY_DEFAULT;
+ return bind_mode;
+ }
+
+ bind_mode |= numa_info[nodeid].relative ?
+ MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES;
+
+ return bind_mode;
+}
+
+static int node_set_mem_policy(void *ram_ptr, ram_addr_t length, int nodeid)
+{
+ int bind_mode = node_parse_bind_mode(nodeid);
+ unsigned long *nodes = numa_info[nodeid].host_mem;
+
+ /* This is a workaround for a long standing bug in Linux'
+ * mbind implementation, which cuts off the last specified
+ * node. To stay compatible should this bug be fixed, we
+ * specify one more node and zero this one out.
+ */
+ unsigned long maxnode = find_last_bit(nodes, MAX_NODES);
+ if (syscall(SYS_mbind, ram_ptr, length, bind_mode,
+ nodes, maxnode + 2, 0)) {
+ perror("mbind");
+ return -1;
+ }
+
+ return 0;
+}
+#endif
+
+int memory_region_set_mem_policy(MemoryRegion *mr,
+ ram_addr_t start, ram_addr_t length,
+ ram_addr_t offset)
+{
+#ifdef __linux__
+ ram_addr_t len = 0;
+ int i;
+ for (i = 0; i < nb_numa_nodes; i++) {
+ len += numa_info[i].node_mem;
+ if (offset < len) {
+ break;
+ }
+ }
+ if (i == nb_numa_nodes) {
+ return -1;
+ }
+
+ void *ptr = memory_region_get_ram_ptr(mr);
+ for (; i < nb_numa_nodes; i++ ) {
+ if (offset + length <= len) {
+ if (node_set_mem_policy(ptr + start, length, i)) {
+ return -1;
+ }
+ break;
+ } else {
+ ram_addr_t tmp_len = len - offset;
+ offset += tmp_len;
+ length -= tmp_len;
+ if (node_set_mem_policy(ptr + start, tmp_len, i)) {
+ return -1;
+ }
+ start += tmp_len;
+ }
+
+ len += numa_info[i].node_mem;
+ }
+
+ if (i == nb_numa_nodes) {
+ return -1;
+ }
+#endif
+
+ return 0;
+}
+
void set_numa_modes(void)
{
CPUState *cpu;