@@ -57,6 +57,10 @@
#include <sys/eventfd.h>
#endif
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
/* KVM uses PAGE_SIZE in its definition of KVM_COALESCED_MMIO_MAX. We
* need to use the real host PAGE_SIZE, as that's what KVM will use.
*/
@@ -1211,6 +1215,9 @@ static void kvm_unpoison_all(void *param)
{
HWPoisonPage *page, *next_page;
+#ifdef CONFIG_HUGETLBFS_RAS
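+ /* forget the replaced large pages before poisoned ranges are remapped */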
+ hugetlbfs_ras_empty();
+#endif
QLIST_FOREACH_SAFE(page, &hwpoison_page_list, list, next_page) {
QLIST_REMOVE(page, list);
qemu_ram_remap(page->ram_addr, page->page_size);
@@ -3029,6 +3029,8 @@ if host_os == 'windows'
endif
endif
+config_host_data.set('CONFIG_HUGETLBFS_RAS', host_os == 'linux')
+
########################
# Target configuration #
########################
new file mode 100644
@@ -0,0 +1,546 @@
+/*
+ * Deal with hugetlbfs large page memory errors in userspace.
+ *
+ * Copyright (c) 2024 Oracle and/or its affiliates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * (at your option) any later version.
+ */
+
+#include "qemu/osdep.h"
+#include <unistd.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <poll.h>
+#include <errno.h>
+#include <string.h>
+
+#include "exec/address-spaces.h"
+#include "exec/memory.h"
+#include "exec/ramblock.h"
+#include "qemu/thread.h"
+#include "qemu/queue.h"
+#include "qemu/error-report.h"
+#include "block/thread-pool.h"
+#include "sysemu/runstate.h"
+#include "sysemu/kvm.h"
+#include "qemu/main-loop.h"
+#include "block/aio-wait.h"
+
+#include "hugetlbfs_ras.h"
+
+/* #define DEBUG_HUGETLBFS_RAS */
+
+#ifdef DEBUG_HUGETLBFS_RAS
+#define DPRINTF(fmt, ...) \
+ do { fprintf(stderr, "lpg_ras[%s]: " fmt, __func__, ## __VA_ARGS__); \
+ } while (0)
+#else
+#define DPRINTF(fmt, ...) do {} while (0)
+#endif
+
+/*
+ * We track the Large Poisoned Pages to be able to:
+ * - Identify if a faulting page is already under replacement
+ * - Trigger a replacement for new pages
+ * - Inform the suspended signal handlers that they can continue
+ */
+typedef enum LPP_state {
+ LPP_SUBMITTED = 1,
+ LPP_PREPARING,
+ LPP_DONE,
+ LPP_FAILED,
+} LPP_state;
+
+typedef struct LargeHWPoisonPage {
+ void *page_addr; /* hva of the poisoned large page */
+ size_t page_size;
+ LPP_state page_state;
+ void *first_poison; /* location of the first poison found */
+ struct timespec creation_time;
+ QLIST_ENTRY(LargeHWPoisonPage) list;
+} LargeHWPoisonPage;
+
+static QLIST_HEAD(, LargeHWPoisonPage) large_hwpoison_page_list =
+ QLIST_HEAD_INITIALIZER(large_hwpoison_page_list);
+static int large_hwpoison_vm_stop; /* set while the VM is being stopped */
+static QemuCond large_hwpoison_cv;
+static QemuCond large_hwpoison_new;
+static QemuCond large_hwpoison_vm_running;
+static QemuMutex large_hwpoison_mtx;
+static QemuThread thread;
+static void *hugetlbfs_ras_listener(void *arg);
+static int vm_running;
+static bool hugetlbfs_ras_initialized;
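+/* host standard page size and shift, set in hugetlbfs_ras_init() */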
+static int _PAGE_SIZE = 4096;
+static int _PAGE_SHIFT = 12;
+
+/* return the shift (log2) of the given size, or -1 if not a power of 2 */
+static int
+shift(int sz)
+{
+ int e, s = 0;
+
+ for (e = 0; (s < sz) && (e < 31); e++) {
+ s = (1 << e);
+ if (s == sz) {
+ return e;
+ }
+ }
+ return -1;
+}
+
+static int
+hugetlbfs_ras_init(void)
+{
+ _PAGE_SIZE = qemu_real_host_page_size();
+ _PAGE_SHIFT = shift(_PAGE_SIZE);
+ if (_PAGE_SHIFT < 0) {
+ warn_report("No support for hugetlbfs largepage errors: "
+ "Bad page size (%d)", _PAGE_SIZE);
+ return -EIO;
+ }
+ qemu_cond_init(&large_hwpoison_cv);
+ qemu_cond_init(&large_hwpoison_new);
+ qemu_cond_init(&large_hwpoison_vm_running);
+ qemu_mutex_init(&large_hwpoison_mtx);
+
+ qemu_thread_create(&thread, "hugetlbfs_error", hugetlbfs_ras_listener,
+ NULL, QEMU_THREAD_DETACHED);
+
+ hugetlbfs_ras_initialized = true;
+ return 0;
+}
+
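+/*
+ * Report whether hugetlbfs large page error handling is in use,
+ * initializing it on the first call.
+ */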
+bool
+hugetlbfs_ras_use(void)
+{
+ static bool answered;
+
+ if (answered) {
+ return hugetlbfs_ras_initialized;
+ }
+
+ /* XXX we could verify whether the RAS feature should be used (on ARM) */
+
+ /* CAP_SYS_ADMIN capability needed for madvise(MADV_HWPOISON) */
+ if (getuid() != 0) {
+ warn_report("Priviledges needed to deal with hugetlbfs memory poison");
+ } else {
+ hugetlbfs_ras_init();
+ }
+
+ answered = true;
+ return hugetlbfs_ras_initialized;
+}
+
+/* return the backend real page size used for the given address */
+static size_t
+hugetlbfs_ras_backend_sz(void *addr)
+{
+ ram_addr_t offset;
+ RAMBlock *rb;
+
+ rb = qemu_ram_block_from_host(addr, false, &offset);
+ if (!rb) {
+ warn_report("No associated RAMBlock to addr: %p", addr);
+ return _PAGE_SIZE;
+ }
+ return rb->page_size;
+}
+
+/*
+ * Report whether the access to this std page address of the given
+ * faulted large page should be retried, or if the current signal
+ * handler should continue to deal with it.
+ * Once the mapping is replaced, we retry the errors that appeared
+ * before the 'page struct' creation, to deal with previous errors
+ * that haven't been taken into account yet.
+ * But the 4k pages of the mapping can also experience HW errors
+ * during their lifetime.
+ */
+static int
+hugetlbfs_ras_retry(void *addr, LargeHWPoisonPage *page,
+ struct timespec *entry_time)
+{
+ if (addr == page->first_poison) {
+ return 0;
+ }
+ if (entry_time->tv_sec < page->creation_time.tv_sec) {
+ return 1;
+ }
+ if ((entry_time->tv_sec == page->creation_time.tv_sec) &&
+ (entry_time->tv_nsec <= page->creation_time.tv_nsec)) {
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * If the given address is a large page, we try to replace it
+ * with a set of standard sized pages where we copy what remains
+ * valid from the failed large page.
+ * We reset the two values pointed to by paddr and psz to point
+ * to the first poisoned page in the new set, and the size
+ * of this poisoned page.
+ * Return true when it's done; the handler continues with the
+ * possibly corrected values.
+ * Returning false means that there is no further action for the
+ * signal handler to take; the handler should exit.
+ */
+bool
+hugetlbfs_ras_correct(void **paddr, size_t *psz, int code)
+{
+ void *p, *reported_addr;
+ size_t reported_sz, real_sz;
+ LargeHWPoisonPage *page;
+ int found = 0;
+ struct timespec et;
+
+ assert(psz != NULL && paddr != NULL);
+
+ DPRINTF("SIGBUS (%s) at %p (size: %lu)\n",
+ (code == BUS_MCEERR_AR ? "AR" : "AO"), *paddr, *psz);
+
+ if (!hugetlbfs_ras_initialized) {
+ return true;
+ }
+
+ /*
+ * XXX The kernel-provided size is not reliable, as
+ * kvm_send_hwpoison_signal() uses a hard-coded PAGE_SHIFT
+ * value in the generated hwpoison signal.
+ * So in the case of a std page size, we must identify the actual
+ * size to consider (from the mapping block size, or whether we
+ * have already reduced the page to 4k chunks).
+ */
+ reported_sz = *psz;
+
+ p = *paddr;
+ reported_addr = p;
+
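+ /*
+ * Record the handler entry time: hugetlbfs_ras_retry() compares it
+ * with the replacement mapping creation time to decide on a retry.
+ */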
+ if (clock_gettime(CLOCK_MONOTONIC, &et) != 0) {
+ et.tv_sec = 0;
+ et.tv_nsec = 1;
+ }
+ qemu_mutex_lock(&large_hwpoison_mtx);
+
+ if (large_hwpoison_vm_stop) {
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+ return false;
+ }
+
+ QLIST_FOREACH(page, &large_hwpoison_page_list, list) {
+ if (page->page_addr <= p &&
+ page->page_addr + page->page_size > p) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ if (reported_sz > _PAGE_SIZE) {
+ /* we trust the kernel in this case */
+ real_sz = reported_sz;
+ } else {
+ real_sz = hugetlbfs_ras_backend_sz(p);
+ if (real_sz <= _PAGE_SIZE) {
+ /* not part of a large page */
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+ return true;
+ }
+ }
+ page = g_new0(LargeHWPoisonPage, 1);
+ p = (void *)ROUND_DOWN((uintptr_t)p, real_sz);
+ page->page_addr = p;
+ page->page_size = real_sz;
+ page->page_state = LPP_SUBMITTED;
+ QLIST_INSERT_HEAD(&large_hwpoison_page_list, page, list);
+ qemu_cond_signal(&large_hwpoison_new);
+ } else {
+ if ((code == BUS_MCEERR_AR) && (reported_sz <= _PAGE_SIZE) &&
+ hugetlbfs_ras_retry(p, page, &et)) {
+ *paddr = NULL;
+ }
+ }
+
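+ /* wait for the listener thread to complete the page replacement */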
+ while (page->page_state < LPP_DONE && !large_hwpoison_vm_stop) {
+ qemu_cond_wait(&large_hwpoison_cv, &large_hwpoison_mtx);
+ }
+
+ if (large_hwpoison_vm_stop) {
+ DPRINTF("Handler exit requested as on page %p\n", page->page_addr);
+ *paddr = NULL;
+ }
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+
+ if (page->page_state == LPP_FAILED) {
+ warn_report("Failed recovery for page: %p (error at %p)",
+ page->page_addr, reported_addr);
+ return *paddr != NULL;
+ }
+
+ *psz = (size_t)_PAGE_SIZE;
+
+ DPRINTF("SIGBUS (%s) corrected from %p to %p (size %ld to %ld)\n",
+ (code == BUS_MCEERR_AR ? "AR" : "AO"),
+ reported_addr, *paddr, reported_sz, *psz);
+
+ return *paddr != NULL;
+}
+
+/*
+ * Sequentially read the valid data from the failed large page's (shared)
+ * backend file and copy it into our set of standard sized pages.
+ * Any error reading this file (not only EIO) means that we can't retrieve
+ * valid data for the read location, so the corresponding standard page
+ * is marked as poisoned.
+ * And if this file mapping is not set up with "share=on", we can't rely on
+ * the content of the backend file, so the entire replacing set of pages
+ * is poisoned in this case.
+ */
+static int take_valid_data_lpg(LargeHWPoisonPage *page, const char **err)
+{
+ int fd, i, ps = _PAGE_SIZE, slot_num, poison_count = 0;
+ ram_addr_t offset;
+ RAMBlock *rb;
+ uint64_t fd_offset;
+ ssize_t count, retrieved;
+
+ /* find the backend to get the associated fd and offset */
+ rb = qemu_ram_block_from_host(page->page_addr, false, &offset);
+ if (!rb) {
+ if (err) {
+ *err = "No associated RAMBlock";
+ }
+ return -1;
+ }
+ fd = qemu_ram_get_fd(rb);
+ fd_offset = rb->fd_offset;
+ offset += fd_offset;
+ assert(page->page_size == qemu_ram_pagesize(rb));
+ slot_num = page->page_size / ps;
+
+ if (!qemu_ram_is_shared(rb)) { /* we can't use the backend file */
+ if (madvise(page->page_addr, page->page_size, MADV_HWPOISON) == 0) {
+ page->first_poison = page->page_addr;
+ warn_report("Large memory error, unrecoverable section "
+ "(unshared hugetlbfs): start:%p length: %ld",
+ page->page_addr, page->page_size);
+ return 0;
+ } else {
+ if (err) {
+ *err = "large poison injection failed";
+ }
+ return -1;
+ }
+ }
+
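+ /* copy back each standard-sized chunk of still-valid data */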
+ for (i = 0; i < slot_num; i++) {
+ retrieved = 0;
+ while (retrieved < ps) {
+ count = pread(fd, page->page_addr + i * ps + retrieved,
+ ps - retrieved, offset + i * ps + retrieved);
+ if (count == 0) {
+ DPRINTF("read reach end of the file\n");
+ break;
+ } else if (count < 0) {
+ DPRINTF("read backend failed: %s\n", strerror(errno));
+ break;
+ }
+ retrieved += count;
+ }
+ if (retrieved < ps) { /* consider this page as poisoned */
+ if (madvise(page->page_addr + i * ps, ps, MADV_HWPOISON)) {
+ if (err) {
+ *err = "poison injection failed";
+ }
+ return -1;
+ }
+ if (page->first_poison == NULL) {
+ page->first_poison = page->page_addr + i * ps;
+ }
+ poison_count++;
+ DPRINTF("Found a poison at index %d = addr %p\n",
+ i, page->page_addr + i * ps);
+ }
+ }
+
+ /*
+ * A large page without at least one poisoned 4k page will not have
+ * an entry in the hwpoison_page_list, so it won't be correctly
+ * replaced with a new large page on VM reset with qemu_ram_remap().
+ * Any new error on this area would then fail to be handled.
+ */
+ if (poison_count == 0) {
+ if (err) {
+ *err = "No Poison found";
+ }
+ return -1;
+ }
+
+ DPRINTF("Num poison for page %p : %d / %d\n",
+ page->page_addr, poison_count, i);
+ return 0;
+}
+
+/*
+ * Empty the large_hwpoison_page_list -- to be called when the address
+ * space poison is cleaned up, outside of any concurrent memory access.
+ */
+void hugetlbfs_ras_empty(void)
+{
+ LargeHWPoisonPage *page, *next_page;
+
+ if (!hugetlbfs_ras_initialized) {
+ return;
+ }
+ qemu_mutex_lock(&large_hwpoison_mtx);
+ QLIST_FOREACH_SAFE(page, &large_hwpoison_page_list, list, next_page) {
+ QLIST_REMOVE(page, list);
+ g_free(page);
+ }
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+}
+
+/*
+ * Replace the given poisoned large page with a set of standard pages,
+ * copying back any data that is still valid.
+ */
+static void
+hugetlbfs_ras_transform_page(LargeHWPoisonPage *page, const char **err_info)
+{
+ const char *err_msg;
+ int fd;
+ ram_addr_t offset;
+ RAMBlock *rb;
+
+ /* find the backend to get the associated fd and offset */
+ rb = qemu_ram_block_from_host(page->page_addr, false, &offset);
+ if (!rb) {
+ DPRINTF("No associated RAMBlock to %p\n", page->page_addr);
+ err_msg = "qemu_ram_block_from_host error";
+ goto err;
+ }
+ fd = qemu_ram_get_fd(rb);
+
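+ /* flush the large page content to the backend file before unmapping */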
+ if (sync_file_range(fd, offset, page->page_size,
+ SYNC_FILE_RANGE_WAIT_AFTER) != 0) {
+ err_msg = "sync_file_range error on the backend";
+ perror("sync_file_range");
+ goto err;
+ }
+ if (fsync(fd) != 0) {
+ err_msg = "fsync error on the backend";
+ perror("fsync");
+ goto err;
+ }
+ if (msync(page->page_addr, page->page_size, MS_SYNC) != 0) {
+ err_msg = "msync error on the backend";
+ perror("msync");
+ goto err;
+ }
+ page->page_state = LPP_PREPARING;
+
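+ /* remove the failed large page mapping */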
+ if (munmap(page->page_addr, page->page_size) != 0) {
+ err_msg = "Failed to unmap";
+ perror("munmap");
+ goto err;
+ }
+
+ /* replace the large page with standard pages */
+ if (mmap(page->page_addr, page->page_size, PROT_READ | PROT_WRITE,
+ MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0)
+ == MAP_FAILED) {
+ err_msg = "Failed to map std page";
+ perror("mmap");
+ goto err;
+ }
+
+ /* take a copy of still valid data and mark the failed pages as poisoned */
+ if (take_valid_data_lpg(page, &err_msg) != 0) {
+ goto err;
+ }
+
+ if (clock_gettime(CLOCK_MONOTONIC, &page->creation_time) != 0) {
+ err_msg = "Failed to set creation time";
+ perror("clock_gettime");
+ goto err;
+ }
+
+ page->page_state = LPP_DONE;
+ return;
+
+err:
+ if (err_info) {
+ *err_info = err_msg;
+ }
+ page->page_state = LPP_FAILED;
+}
+
+/* bottom half run in the main loop thread to vm_stop() the entire VM */
+static void coroutine_hugetlbfs_ras_vmstop_bh(void *opaque)
+{
+ vm_stop(RUN_STATE_PAUSED);
+ DPRINTF("VM STOPPED\n");
+ qemu_mutex_lock(&large_hwpoison_mtx);
+ vm_running = 0;
+ qemu_cond_signal(&large_hwpoison_vm_running);
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+}
+
+static void coroutine_hugetlbfs_ras_vmstart_bh(void *opaque)
+{
+ vm_start();
+}
+
+static void *
+hugetlbfs_ras_listener(void *arg)
+{
+ LargeHWPoisonPage *page;
+ int new;
+ const char *err;
+
+ /* monitor any newly submitted element in the list */
+ qemu_mutex_lock(&large_hwpoison_mtx);
+ while (1) {
+ new = 0;
+ QLIST_FOREACH(page, &large_hwpoison_page_list, list) {
+ if (page->page_state == LPP_SUBMITTED) {
+ new++;
+ vm_running = 1;
+ DPRINTF("Stopping the VM\n");
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ coroutine_hugetlbfs_ras_vmstop_bh, NULL);
+ /* inform all SIGBUS threads that they have to return */
+ large_hwpoison_vm_stop++;
+ qemu_cond_broadcast(&large_hwpoison_cv);
+
+ /* wait until VM is stopped */
+ while (vm_running) {
+ DPRINTF("waiting for vm to stop\n");
+ qemu_cond_wait(&large_hwpoison_vm_running,
+ &large_hwpoison_mtx);
+ }
+
+ hugetlbfs_ras_transform_page(page, &err);
+ if (page->page_state == LPP_FAILED) {
+ error_report("fatal: unrecoverable hugepage memory error"
+ " at %p (%s)", page->page_addr, err);
+ exit(1);
+ }
+
+ large_hwpoison_vm_stop--;
+
+ DPRINTF("Restarting the VM\n");
+ aio_bh_schedule_oneshot(qemu_get_aio_context(),
+ coroutine_hugetlbfs_ras_vmstart_bh, NULL);
+ }
+ }
+ if (new) {
+ qemu_cond_broadcast(&large_hwpoison_cv);
+ }
+
+ qemu_cond_wait(&large_hwpoison_new, &large_hwpoison_mtx);
+ }
+ qemu_mutex_unlock(&large_hwpoison_mtx);
+ return NULL;
+}
new file mode 100644
@@ -0,0 +1,8 @@
+#ifndef HUGETLBFS_RAS_H
+#define HUGETLBFS_RAS_H
+
+bool hugetlbfs_ras_use(void);
+bool hugetlbfs_ras_correct(void **paddr, size_t *psz, int code);
+void hugetlbfs_ras_empty(void);
+
+#endif /* HUGETLBFS_RAS_H */
@@ -37,4 +37,5 @@ system_ss.add(when: 'CONFIG_DEVICE_TREE',
if_false: files('device_tree-stub.c'))
if host_os == 'linux'
system_ss.add(files('async-teardown.c'))
+ system_ss.add(files('hugetlbfs_ras.c'))
endif
@@ -82,6 +82,10 @@
#include <daxctl/libdaxctl.h>
#endif
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
//#define DEBUG_SUBPAGE
/* ram_list is read under rcu_read_lock()/rcu_read_unlock(). Writes
@@ -2061,6 +2065,19 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
return NULL;
}
+#ifdef CONFIG_HUGETLBFS_RAS
+ {
+ QemuFsType ftyp = qemu_fd_getfs(fd);
+
+ if (ftyp == QEMU_FS_TYPE_HUGETLBFS) {
+ if (hugetlbfs_ras_use() && !(ram_flags & RAM_SHARED)) {
+ warn_report("'share=on' option must be set to support "
+ "hugetlbfs memory error handling");
+ }
+ }
+ }
+#endif
+
block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, offset, errp);
if (!block) {
if (created) {
@@ -40,6 +40,10 @@
#include "hw/acpi/ghes.h"
#include "target/arm/gtimer.h"
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
KVM_CAP_LAST_INFO
};
@@ -2356,6 +2360,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr, short addr_lsb)
assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
+#ifdef CONFIG_HUGETLBFS_RAS
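+ /* on a large page error, addr and sz may be adjusted to a std page */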
+ if (!hugetlbfs_ras_correct(&addr, &sz, code)) {
+ return;
+ }
+#endif
+
if (acpi_ghes_present() && addr) {
ram_addr = qemu_ram_addr_from_host(addr);
if (ram_addr != RAM_ADDR_INVALID &&
@@ -69,6 +69,10 @@
#include "exec/memattrs.h"
#include "trace.h"
+#ifdef CONFIG_HUGETLBFS_RAS
+#include "system/hugetlbfs_ras.h"
+#endif
+
#include CONFIG_DEVICES
//#define DEBUG_KVM
@@ -729,6 +733,12 @@ void kvm_arch_on_sigbus_vcpu(CPUState *c, int code, void *addr, short addr_lsb)
*/
assert(code == BUS_MCEERR_AR || code == BUS_MCEERR_AO);
+#ifdef CONFIG_HUGETLBFS_RAS
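+ /* on a large page error, addr and sz may be adjusted to a std page */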
+ if (!hugetlbfs_ras_correct(&addr, &sz, code)) {
+ return;
+ }
+#endif
+
if ((env->mcg_cap & MCG_SER_P) && addr) {
ram_addr = qemu_ram_addr_from_host(addr);
if (ram_addr != RAM_ADDR_INVALID &&