diff mbox series

[v1,2/2] osdep: support mempolicy for preallocation in os_mem_prealloc

Message ID 20211207070607.1422670-2-d-tatianin@yandex-team.ru
State New
Headers show
Series [v1,1/2] hostmem: use a static size for maxnode, validate policy everywhere | expand

Commit Message

Daniil Tatianin Dec. 7, 2021, 7:06 a.m. UTC
This is needed for cases where we want to make sure that a shared memory
region gets allocated from a specific NUMA node. This is impossible to do
with mbind(2) because it ignores the policy for memory mapped with
MAP_SHARED. We work around this by calling set_mempolicy from prealloc
threads instead.

Signed-off-by: Daniil Tatianin <d-tatianin@yandex-team.ru>
---
 backends/hostmem.c   |  6 ++++--
 include/qemu/osdep.h |  3 ++-
 util/meson.build     |  2 ++
 util/oslib-posix.c   | 29 ++++++++++++++++++++++++++---
 util/oslib-win32.c   |  3 ++-
 5 files changed, 36 insertions(+), 7 deletions(-)

Comments

David Hildenbrand Dec. 7, 2021, 8:13 a.m. UTC | #1
On 07.12.21 08:06, Daniil Tatianin wrote:
> This is needed for cases where we want to make sure that a shared memory
> region gets allocated from a specific NUMA node. This is impossible to do
> with mbind(2) because it ignores the policy for memory mapped with
> MAP_SHARED. We work around this by calling set_mempolicy from prealloc
> threads instead.

That's not quite true I think, how were you able to observe this? Do you
have a reproducer?

While the man page says:

"
The specified policy will be ignored for any  MAP_SHARED  mappings  in
the  specified  memory range. Rather  the pages will be allocated
according to the memory policy of the thread that caused the page to be
allocated.  Again, this may not be the thread that called mbind().
"

What it really means is that as long as we access that memory via the
*VMA* for which we called mbind(), which is the case when *not* using
fallocate() to preallocate memory, we end up using the correct policy.


I did experiments a while ago with hugetlbfs shared memory and it
properly allocated from the right node. So I'd be curious how you
trigger this.
David Hildenbrand Dec. 7, 2021, 3:05 p.m. UTC | #2
On 07.12.21 14:58, Daniil Tatianin wrote:
> I believe you're right. Looking at the implementation of
> shmem_alloc_page, it uses the inode policy, which is set via
> vma->set_policy (from the mbind() call in this case). set_mempolicy is
> both useless and redundant here, as thread's
> policy is only ever used in case vma->get_policy returns NULL (which it
> doesn't in our case).
> Sorry for the confusion.

Hi Danlil,

not an issue, the man page is really confusing ... so I was similarly
confused a few months ago until I actually started to dig :)
diff mbox series

Patch

diff --git a/backends/hostmem.c b/backends/hostmem.c
index 392026efe6..0c508ed9df 100644
--- a/backends/hostmem.c
+++ b/backends/hostmem.c
@@ -269,7 +269,8 @@  static void host_memory_backend_set_prealloc(Object *obj, bool value,
         void *ptr = memory_region_get_ram_ptr(&backend->mr);
         uint64_t sz = memory_region_size(&backend->mr);
 
-        os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err);
+        os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, backend->policy,
+                        backend->host_nodes, MAX_NODES + 1, &local_err);
         if (local_err) {
             error_propagate(errp, local_err);
             return;
@@ -415,7 +416,8 @@  host_memory_backend_memory_complete(UserCreatable *uc, Error **errp)
          */
         if (backend->prealloc) {
             os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz,
-                            backend->prealloc_threads, &local_err);
+                            backend->prealloc_threads, backend->policy,
+                            backend->host_nodes, MAX_NODES + 1, &local_err);
             if (local_err) {
                 goto out;
             }
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 60718fc342..abf88aeb0e 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -688,7 +688,8 @@  unsigned long qemu_getauxval(unsigned long type);
 void qemu_set_tty_echo(int fd, bool echo);
 
 void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus,
-                     Error **errp);
+                     int policy, unsigned long *node_bitmap,
+                     unsigned long max_node, Error **errp);
 
 /**
  * qemu_get_pid_name:
diff --git a/util/meson.build b/util/meson.build
index 05b593055a..25f9fca379 100644
--- a/util/meson.build
+++ b/util/meson.build
@@ -87,3 +87,5 @@  if have_block
                                         if_false: files('filemonitor-stub.c'))
   util_ss.add(when: 'CONFIG_LINUX', if_true: files('vfio-helpers.c'))
 endif
+
+util_ss.add(when: 'CONFIG_NUMA', if_true: numa)
diff --git a/util/oslib-posix.c b/util/oslib-posix.c
index e8bdb02e1d..bca25698c5 100644
--- a/util/oslib-posix.c
+++ b/util/oslib-posix.c
@@ -38,11 +38,13 @@ 
 #include "qemu/sockets.h"
 #include "qemu/thread.h"
 #include <libgen.h>
+#include "qemu/bitmap.h"
 #include "qemu/cutils.h"
 #include "qemu/compiler.h"
 
 #ifdef CONFIG_LINUX
 #include <sys/syscall.h>
+#include <numaif.h>
 #endif
 
 #ifdef __FreeBSD__
@@ -79,6 +81,9 @@  struct MemsetThread {
     size_t hpagesize;
     QemuThread pgthread;
     sigjmp_buf env;
+    int policy;
+    unsigned long *node_bitmap;
+    unsigned long max_node;
 };
 typedef struct MemsetThread MemsetThread;
 
@@ -464,6 +469,18 @@  static void *do_touch_pages(void *arg)
     }
     qemu_mutex_unlock(&page_mutex);
 
+#ifdef CONFIG_NUMA
+    if (memset_args->max_node &&
+        !bitmap_empty(memset_args->node_bitmap, memset_args->max_node)) {
+        long ret = set_mempolicy(memset_args->policy, memset_args->node_bitmap,
+                                 memset_args->max_node);
+        if (ret < 0) {
+            memset_thread_failed = true;
+            return NULL;
+        }
+    }
+#endif
+
     /* unblock SIGBUS */
     sigemptyset(&set);
     sigaddset(&set, SIGBUS);
@@ -510,7 +527,8 @@  static inline int get_memset_num_threads(int smp_cpus)
 }
 
 static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
-                            int smp_cpus)
+                            int smp_cpus, int policy,
+                            unsigned long *node_bitmap, unsigned long max_node)
 {
     static gsize initialized = 0;
     size_t numpages_per_thread, leftover;
@@ -533,6 +551,9 @@  static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
         memset_thread[i].addr = addr;
         memset_thread[i].numpages = numpages_per_thread + (i < leftover);
         memset_thread[i].hpagesize = hpagesize;
+        memset_thread[i].policy = policy;
+        memset_thread[i].node_bitmap = node_bitmap;
+        memset_thread[i].max_node = max_node;
         qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
                            do_touch_pages, &memset_thread[i],
                            QEMU_THREAD_JOINABLE);
@@ -554,7 +575,8 @@  static bool touch_all_pages(char *area, size_t hpagesize, size_t numpages,
 }
 
 void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
-                     Error **errp)
+                     int policy, unsigned long *node_bitmap,
+                     unsigned long max_node, Error **errp)
 {
     int ret;
     struct sigaction act, oldact;
@@ -573,7 +595,8 @@  void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
     }
 
     /* touch pages simultaneously */
-    if (touch_all_pages(area, hpagesize, numpages, smp_cpus)) {
+    if (touch_all_pages(area, hpagesize, numpages, smp_cpus, policy,
+                        node_bitmap, max_node)) {
         error_setg(errp, "os_mem_prealloc: Insufficient free host memory "
             "pages available to allocate guest RAM");
     }
diff --git a/util/oslib-win32.c b/util/oslib-win32.c
index af559ef339..3e56bf9f09 100644
--- a/util/oslib-win32.c
+++ b/util/oslib-win32.c
@@ -371,7 +371,8 @@  int getpagesize(void)
 }
 
 void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
-                     Error **errp)
+                     int policy, unsigned long *node_bitmap,
+                     unsigned long max_node, Error **errp)
 {
     int i;
     size_t pagesize = qemu_real_host_page_size;