@@ -341,6 +341,7 @@ struct SpaprMachineState {
#define H_P7 -60
#define H_P8 -61
#define H_P9 -62
+#define H_UNSUPPORTED -67
#define H_OVERLAP -68
#define H_UNSUPPORTED_FLAG -256
#define H_MULTI_THREADS_ACTIVE -9005
@@ -559,8 +560,9 @@ struct SpaprMachineState {
#define H_SCM_UNBIND_ALL 0x3FC
#define H_SCM_HEALTH 0x400
#define H_RPT_INVALIDATE 0x448
+#define H_SCM_FLUSH 0x44C
-#define MAX_HCALL_OPCODE H_RPT_INVALIDATE
+#define MAX_HCALL_OPCODE H_SCM_FLUSH
/* The hcalls above are standardized in PAPR and implemented by pHyp
* as well.
@@ -21,5 +21,6 @@ void spapr_dt_persistent_memory(SpaprMachineState *spapr, void *fdt);
bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp);
void spapr_add_nvdimm(DeviceState *dev, uint64_t slot);
+void spapr_nvdimm_finish_flushes(void);
#endif
@@ -1634,6 +1634,8 @@ static void spapr_machine_reset(MachineState *machine)
spapr->ov5_cas = spapr_ovec_clone(spapr->ov5);
}
+ spapr_nvdimm_finish_flushes();
+
/* DRC reset may cause a device to be unplugged. This will cause troubles
* if this device is used by another device (eg, a running vhost backend
* will crash QEMU if the DIMM holding the vring goes away). To avoid such
@@ -22,6 +22,7 @@
* THE SOFTWARE.
*/
#include "qemu/osdep.h"
+#include "qemu/cutils.h"
#include "qapi/error.h"
#include "hw/ppc/spapr_drc.h"
#include "hw/ppc/spapr_nvdimm.h"
@@ -30,6 +31,9 @@
#include "hw/ppc/fdt.h"
#include "qemu/range.h"
#include "hw/ppc/spapr_numa.h"
+#include "block/thread-pool.h"
+#include "migration/vmstate.h"
+#include "qemu/pmem.h"
/* DIMM health bitmap bitmap indicators. Taken from kernel's papr_scm.c */
/* SCM device is unable to persist memory contents */
@@ -47,6 +51,14 @@
/* Have an explicit check for alignment */
QEMU_BUILD_BUG_ON(SPAPR_MINIMUM_SCM_BLOCK_SIZE % SPAPR_MEMORY_BLOCK_SIZE);
+#define TYPE_SPAPR_NVDIMM "spapr-nvdimm"
+OBJECT_DECLARE_TYPE(SpaprNVDIMMDevice, SPAPRNVDIMMClass, SPAPR_NVDIMM)
+
+struct SPAPRNVDIMMClass {
+ /* private */
+ NVDIMMClass parent_class;
+};
+
bool spapr_nvdimm_validate(HotplugHandler *hotplug_dev, NVDIMMDevice *nvdimm,
uint64_t size, Error **errp)
{
@@ -375,6 +387,253 @@ static target_ulong h_scm_bind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
return H_SUCCESS;
}
+typedef struct SpaprNVDIMMDeviceFlushState {
+ uint64_t continue_token;
+ int64_t hcall_ret;
+ uint32_t drcidx;
+
+ QLIST_ENTRY(SpaprNVDIMMDeviceFlushState) node;
+} SpaprNVDIMMDeviceFlushState;
+
+typedef struct SpaprNVDIMMDevice SpaprNVDIMMDevice;
+struct SpaprNVDIMMDevice {
+ NVDIMMDevice parent_obj;
+
+ uint64_t nvdimm_flush_token;
+ QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) pending_nvdimm_flush_states;
+ QLIST_HEAD(, SpaprNVDIMMDeviceFlushState) completed_nvdimm_flush_states;
+};
+
+static int flush_worker_cb(void *opaque)
+{
+ SpaprNVDIMMDeviceFlushState *state = opaque;
+ SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
+ PCDIMMDevice *dimm = PC_DIMM(drc->dev);
+ HostMemoryBackend *backend = MEMORY_BACKEND(dimm->hostmem);
+ int backend_fd = memory_region_get_fd(&backend->mr);
+
+ if (object_property_get_bool(OBJECT(backend), "pmem", NULL)) {
+ MemoryRegion *mr = host_memory_backend_get_memory(dimm->hostmem);
+ void *ptr = memory_region_get_ram_ptr(mr);
+ size_t size = object_property_get_uint(OBJECT(dimm), PC_DIMM_SIZE_PROP,
+ NULL);
+
+ /* flush pmem backend */
+ pmem_persist(ptr, size);
+ } else {
+ /* flush raw backing image */
+ if (qemu_fdatasync(backend_fd) < 0) {
+ error_report("papr_scm: Could not sync nvdimm to backend file: %s",
+ strerror(errno));
+ return H_HARDWARE;
+ }
+ }
+
+ return H_SUCCESS;
+}
+
+static void spapr_nvdimm_flush_completion_cb(void *opaque, int hcall_ret)
+{
+ SpaprNVDIMMDeviceFlushState *state = opaque;
+ SpaprDrc *drc = spapr_drc_by_index(state->drcidx);
+ SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(drc->dev);
+
+ state->hcall_ret = hcall_ret;
+ QLIST_REMOVE(state, node);
+ QLIST_INSERT_HEAD(&s_nvdimm->completed_nvdimm_flush_states, state, node);
+}
+
+static int spapr_nvdimm_flush_post_load(void *opaque, int version_id)
+{
+ SpaprNVDIMMDevice *s_nvdimm = (SpaprNVDIMMDevice *)opaque;
+ SpaprNVDIMMDeviceFlushState *state;
+ ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+
+ QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
+ thread_pool_submit_aio(pool, flush_worker_cb, state,
+ spapr_nvdimm_flush_completion_cb, state);
+ }
+
+ return 0;
+}
+
+static const VMStateDescription vmstate_spapr_nvdimm_flush_state = {
+ .name = "spapr_nvdimm_flush_state",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(continue_token, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_INT64(hcall_ret, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_UINT32(drcidx, SpaprNVDIMMDeviceFlushState),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+const VMStateDescription vmstate_spapr_nvdimm_states = {
+ .name = "spapr_nvdimm_states",
+ .version_id = 1,
+ .minimum_version_id = 1,
+ .post_load = spapr_nvdimm_flush_post_load,
+ .fields = (VMStateField[]) {
+ VMSTATE_UINT64(nvdimm_flush_token, SpaprNVDIMMDevice),
+ VMSTATE_QLIST_V(completed_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
+ vmstate_spapr_nvdimm_flush_state,
+ SpaprNVDIMMDeviceFlushState, node),
+ VMSTATE_QLIST_V(pending_nvdimm_flush_states, SpaprNVDIMMDevice, 1,
+ vmstate_spapr_nvdimm_flush_state,
+ SpaprNVDIMMDeviceFlushState, node),
+ VMSTATE_END_OF_LIST()
+ },
+};
+
+/*
+ * Assign a token and reserve it for the new flush state.
+ */
+static SpaprNVDIMMDeviceFlushState *spapr_nvdimm_init_new_flush_state(
+ SpaprNVDIMMDevice *spapr_nvdimm)
+{
+ SpaprNVDIMMDeviceFlushState *state;
+
+ state = g_malloc0(sizeof(*state));
+
+ spapr_nvdimm->nvdimm_flush_token++;
+ /* Token zero is presumed as no job pending. Assert on overflow to zero */
+ g_assert(spapr_nvdimm->nvdimm_flush_token != 0);
+
+ state->continue_token = spapr_nvdimm->nvdimm_flush_token;
+
+ QLIST_INSERT_HEAD(&spapr_nvdimm->pending_nvdimm_flush_states, state, node);
+
+ return state;
+}
+
+/*
+ * spapr_nvdimm_finish_flushes
+ * Waits for all pending flush requests to complete
+ * their execution and free the states
+ */
+void spapr_nvdimm_finish_flushes(void)
+{
+ SpaprNVDIMMDeviceFlushState *state, *next;
+ GSList *list, *nvdimms;
+
+ /*
+ * Called on reset path, the main loop thread which calls
+ * the pending BHs has gotten out running in the reset path,
+ * finally reaching here. Other code path being guest
+ * h_client_architecture_support, thats early boot up.
+ */
+ nvdimms = nvdimm_get_device_list();
+ for (list = nvdimms; list; list = list->next) {
+ NVDIMMDevice *nvdimm = list->data;
+ if (object_dynamic_cast(OBJECT(nvdimm), TYPE_SPAPR_NVDIMM)) {
+ SpaprNVDIMMDevice *s_nvdimm = SPAPR_NVDIMM(nvdimm);
+ while (!QLIST_EMPTY(&s_nvdimm->pending_nvdimm_flush_states)) {
+ aio_poll(qemu_get_aio_context(), true);
+ }
+
+ QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
+ node, next) {
+ QLIST_REMOVE(state, node);
+ g_free(state);
+ }
+ }
+ }
+ g_slist_free(nvdimms);
+}
+
+/*
+ * spapr_nvdimm_get_flush_status
+ * Fetches the status of the hcall worker and returns
+ * H_LONG_BUSY_ORDER_10_MSEC if the worker is still running.
+ */
+static int spapr_nvdimm_get_flush_status(SpaprNVDIMMDevice *s_nvdimm,
+ uint64_t token)
+{
+ SpaprNVDIMMDeviceFlushState *state, *node;
+
+ QLIST_FOREACH(state, &s_nvdimm->pending_nvdimm_flush_states, node) {
+ if (state->continue_token == token) {
+ return H_LONG_BUSY_ORDER_10_MSEC;
+ }
+ }
+
+ QLIST_FOREACH_SAFE(state, &s_nvdimm->completed_nvdimm_flush_states,
+ node, node) {
+ if (state->continue_token == token) {
+ int ret = state->hcall_ret;
+ QLIST_REMOVE(state, node);
+ g_free(state);
+ return ret;
+ }
+ }
+
+ /* If not found in complete list too, invalid token */
+ return H_P2;
+}
+
+/*
+ * H_SCM_FLUSH
+ * Input: drc_index, continue-token
+ * Out: continue-token
+ * Return Value: H_SUCCESS, H_Parameter, H_P2, H_LONG_BUSY_ORDER_10_MSEC,
+ * H_UNSUPPORTED
+ *
+ * Given a DRC Index Flush the data to backend NVDIMM device. The hcall returns
+ * H_LONG_BUSY_ORDER_10_MSEC when the flush takes longer time and the hcall
+ * needs to be issued multiple times in order to be completely serviced. The
+ * continue-token from the output to be passed in the argument list of
+ * subsequent hcalls until the hcall is completely serviced at which point
+ * H_SUCCESS or other error is returned.
+ */
+static target_ulong h_scm_flush(PowerPCCPU *cpu, SpaprMachineState *spapr,
+ target_ulong opcode, target_ulong *args)
+{
+ int ret;
+ uint32_t drc_index = args[0];
+ uint64_t continue_token = args[1];
+ SpaprDrc *drc = spapr_drc_by_index(drc_index);
+ PCDIMMDevice *dimm;
+ HostMemoryBackend *backend = NULL;
+ SpaprNVDIMMDeviceFlushState *state;
+ ThreadPool *pool = aio_get_thread_pool(qemu_get_aio_context());
+ int fd;
+
+ if (!drc || !drc->dev ||
+ spapr_drc_type(drc) != SPAPR_DR_CONNECTOR_TYPE_PMEM) {
+ return H_PARAMETER;
+ }
+
+ dimm = PC_DIMM(drc->dev);
+ if (continue_token == 0) {
+ backend = MEMORY_BACKEND(dimm->hostmem);
+ fd = memory_region_get_fd(&backend->mr);
+
+ if (fd < 0) {
+ return H_UNSUPPORTED;
+ }
+
+ state = spapr_nvdimm_init_new_flush_state(SPAPR_NVDIMM(dimm));
+ if (!state) {
+ return H_HARDWARE;
+ }
+
+ state->drcidx = drc_index;
+
+ thread_pool_submit_aio(pool, flush_worker_cb, state,
+ spapr_nvdimm_flush_completion_cb, state);
+
+ continue_token = state->continue_token;
+ }
+
+ ret = spapr_nvdimm_get_flush_status(SPAPR_NVDIMM(dimm), continue_token);
+ if (H_IS_LONG_BUSY(ret)) {
+ args[0] = continue_token;
+ }
+
+ return ret;
+}
+
static target_ulong h_scm_unbind_mem(PowerPCCPU *cpu, SpaprMachineState *spapr,
target_ulong opcode, target_ulong *args)
{
@@ -523,6 +782,7 @@ static void spapr_scm_register_types(void)
spapr_register_hypercall(H_SCM_UNBIND_MEM, h_scm_unbind_mem);
spapr_register_hypercall(H_SCM_UNBIND_ALL, h_scm_unbind_all);
spapr_register_hypercall(H_SCM_HEALTH, h_scm_health);
+ spapr_register_hypercall(H_SCM_FLUSH, h_scm_flush);
}
type_init(spapr_scm_register_types)