[5/5] hw/nvme: flexible data placement emulation

Message ID 20230216164806.233533-6-jwd@defmacro.it
State New
Series: Support Flexible Data Placement (FDP)

Commit Message

Jesper Wendel Devantier Feb. 16, 2023, 4:48 p.m. UTC
From: Jesper Devantier <j.devantier@samsung.com>

Add emulation of TP4146 ("Flexible Data Placement").

Signed-off-by: Jesper Devantier <j.devantier@samsung.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/nvme/ctrl.c       | 703 ++++++++++++++++++++++++++++++++++++++++++-
 hw/nvme/ns.c         | 141 +++++++++
 hw/nvme/nvme.h       |  85 +++++-
 hw/nvme/subsys.c     |  94 +++++-
 hw/nvme/trace-events |   5 +
 include/block/nvme.h | 168 ++++++++++-
 6 files changed, 1186 insertions(+), 10 deletions(-)
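
For reference, enabling the emulation could look something like this (the fdp.* parameters are the ones added by this series; the remaining options are just a usual nvme setup for illustration):

  -device nvme-subsys,id=nvme-subsys-0,fdp=on,fdp.runs=96M,fdp.nrg=1,fdp.nruh=8
  -device nvme,serial=deadbeef,subsys=nvme-subsys-0
  -drive id=nvm-1,file=nvm-1.img,format=raw,if=none
  -device nvme-ns,drive=nvm-1,nsid=1,fdp.ruhs='0;1;2'

fdp.ruhs takes a semicolon-separated list of reclaim unit handle identifiers (quote it so the shell does not split on the semicolons); if it is omitted, the namespace gets a controller-assigned reclaim unit handle.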

Comments

Keith Busch Feb. 16, 2023, 6:26 p.m. UTC | #1
This mostly looks fine. I need to clarify some spec decisions, though.

By default, FDP feature is disabled: "The default value of this Feature shall
be 0h." You can't change the value as long as namespaces exist within the
group, so FDP requires NS Mgmt be supported if you're going to enable it. The
spec, however, doesn't seem to explicitly say that, and NS Mgmt isn't currently
supported in this controller.

We can look past that for this implementation to allow static settings even if
that doesn't perfectly align with this feature (NS Mgmt is kind of difficult
here). In order to match what the spec says is possible though, we can't have a
namespace with more than 128 placement handles since that's the most you can
provide when you create the namespace.

Does that make sense, or am I totally misunderstanding the details?
Jesper Wendel Devantier Feb. 16, 2023, 6:36 p.m. UTC | #2
Correct. We deliberately allow enabling FDP as a startup parameter to side-step the need for NS Mgmt.

You are also correct on the second point - the NS placement handle list can hold at most 128 elements.
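
To make that concrete, here is a sketch of what the parsing in nvme_ns_init_fdp() ends up with (illustrative values, not part of the patch):

    /*
     * e.g. -device nvme-ns,...,fdp.ruhs="2;3;5" results in
     *
     *   ns->fdp.nphs   = 3
     *   ns->fdp.phs[0] = 2    placement handle 0 -> reclaim unit handle 2
     *   ns->fdp.phs[1] = 3    placement handle 1 -> reclaim unit handle 3
     *   ns->fdp.phs[2] = 5    placement handle 2 -> reclaim unit handle 5
     *
     * and it is this phs array (placement handles indexing into the reclaim
     * unit handles of the endurance group) that is bounded by the 128
     * element limit.
     */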

> On 16 Feb 2023, at 19.26, Keith Busch <kbusch@kernel.org> wrote:
> 
> This mostly looks fine. I need to clarify some spec decisions, though.
> 
> By default, FDP feature is disabled: "The default value of this Feature shall
> be 0h." You can't change the value as long as namespaces exist within the
> group, so FDP requires NS Mgmt be supported if you're going to enable it. The
> spec, however, doesn't seem to explicitly say that, and NS Mgmt isn't currently
> supported in this controller.
> 
> We can look past that for this implementation to allow static settings even if
> that doesn't perfectly align with this feature (NS Mgmt is kind of difficult
> here). In order to match what the spec says is possible though, we can't have a
> namespace with more than 128 placement handles since that's the most you can
> provide when you create the namespace.
> 
> Does that make sense, or am I totally misunderstanding the details?
Keith Busch Feb. 16, 2023, 7:32 p.m. UTC | #3
On Thu, Feb 16, 2023 at 05:48:06PM +0100, Jesper Devantier wrote:
> +static bool nvme_ns_init_fdp(NvmeNamespace *ns, Error **errp)
> +{
> +    NvmeEnduranceGroup *endgrp = ns->endgrp;
> +    NvmeRuHandle *ruh;
> +    uint8_t lbafi = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> +    unsigned int *ruhid, *ruhids;
> +    char *r, *p, *token;
> +    uint16_t *ph;
> +
> +    if (!ns->params.fdp.ruhs) {
> +        ns->fdp.nphs = 1;
> +        ph = ns->fdp.phs = g_new(uint16_t, 1);
> +
> +        ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_CTRL, ph);
> +        if (!ruh) {
> +            ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_UNUSED, ph);
> +            if (!ruh) {
> +                error_setg(errp, "no unused reclaim unit handles left");
> +                return false;
> +            }
> +
> +            ruh->ruha = NVME_RUHA_CTRL;
> +            ruh->lbafi = lbafi;
> +            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
> +
> +            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
> +                ruh->rus[rg].ruamw = ruh->ruamw;
> +            }
> +        } else if (ruh->lbafi != lbafi) {
> +            error_setg(errp, "lba format index of controller assigned "
> +                       "reclaim unit handle does not match namespace lba "
> +                       "format index");
> +            return false;
> +        }
> +
> +        return true;
> +    }
> +
> +    ruhid = ruhids = g_new0(unsigned int, endgrp->fdp.nruh);
> +    r = p = strdup(ns->params.fdp.ruhs);
> +
> +    /* parse the reclaim unit handle identifiers */
> +    while ((token = qemu_strsep(&p, ";")) != NULL) {
> +        if (++ns->fdp.nphs == endgrp->fdp.nruh) {

Since a namespace can't have more than 128 placement handles, and the endurance
group can have more, I think the 128 limit needs to be checked here.
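
I.e., something like this, I think (untested sketch; NVME_FDP_MAXPIDS is the
128 limit from nvme.h):

    if (++ns->fdp.nphs == endgrp->fdp.nruh ||
        ns->fdp.nphs > NVME_FDP_MAXPIDS) {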

> +            error_setg(errp, "too many placement handles");
> +            free(r);
> +            return false;
> +        }
> +
> +        if (qemu_strtoui(token, NULL, 0, ruhid++) < 0) {
> +            error_setg(errp, "cannot parse reclaim unit handle identifier");
> +            free(r);
> +            return false;
> +        }
> +    }

Patch

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 17e6b430e2..b894bda326 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -238,6 +238,8 @@  static const bool nvme_feature_support[NVME_FID_MAX] = {
     [NVME_TIMESTAMP]                = true,
     [NVME_HOST_BEHAVIOR_SUPPORT]    = true,
     [NVME_COMMAND_SET_PROFILE]      = true,
+    [NVME_FDP_MODE]                 = true,
+    [NVME_FDP_EVENTS]               = true,
 };
 
 static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
@@ -249,6 +251,8 @@  static const uint32_t nvme_feature_cap[NVME_FID_MAX] = {
     [NVME_TIMESTAMP]                = NVME_FEAT_CAP_CHANGE,
     [NVME_HOST_BEHAVIOR_SUPPORT]    = NVME_FEAT_CAP_CHANGE,
     [NVME_COMMAND_SET_PROFILE]      = NVME_FEAT_CAP_CHANGE,
+    [NVME_FDP_MODE]                 = NVME_FEAT_CAP_CHANGE,
+    [NVME_FDP_EVENTS]               = NVME_FEAT_CAP_CHANGE | NVME_FEAT_CAP_NS,
 };
 
 static const uint32_t nvme_cse_acs[256] = {
@@ -281,6 +285,8 @@  static const uint32_t nvme_cse_iocs_nvm[256] = {
     [NVME_CMD_VERIFY]               = NVME_CMD_EFF_CSUPP,
     [NVME_CMD_COPY]                 = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
     [NVME_CMD_COMPARE]              = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_IO_MGMT_RECV]         = NVME_CMD_EFF_CSUPP,
+    [NVME_CMD_IO_MGMT_SEND]         = NVME_CMD_EFF_CSUPP | NVME_CMD_EFF_LBCC,
 };
 
 static const uint32_t nvme_cse_iocs_zoned[256] = {
@@ -299,12 +305,66 @@  static const uint32_t nvme_cse_iocs_zoned[256] = {
 
 static void nvme_process_sq(void *opaque);
 static void nvme_ctrl_reset(NvmeCtrl *n, NvmeResetType rst);
+static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n);
 
 static uint16_t nvme_sqid(NvmeRequest *req)
 {
     return le16_to_cpu(req->sq->sqid);
 }
 
+static inline uint16_t nvme_make_pid(NvmeNamespace *ns, uint16_t rg,
+                                     uint16_t ph)
+{
+    uint16_t rgif = ns->endgrp->fdp.rgif;
+
+    if (!rgif) {
+        return ph;
+    }
+
+    return (rg << (16 - rgif)) | ph;
+}
+
+static inline bool nvme_ph_valid(NvmeNamespace *ns, uint16_t ph)
+{
+    return ph < ns->fdp.nphs;
+}
+
+static inline bool nvme_rg_valid(NvmeEnduranceGroup *endgrp, uint16_t rg)
+{
+    return rg < endgrp->fdp.nrg;
+}
+
+static inline uint16_t nvme_pid2ph(NvmeNamespace *ns, uint16_t pid)
+{
+    uint16_t rgif = ns->endgrp->fdp.rgif;
+
+    if (!rgif) {
+        return pid;
+    }
+
+    return pid & ((1 << (15 - rgif)) - 1);
+}
+
+static inline uint16_t nvme_pid2rg(NvmeNamespace *ns, uint16_t pid)
+{
+    uint16_t rgif = ns->endgrp->fdp.rgif;
+
+    if (!rgif) {
+        return 0;
+    }
+
+    return pid >> (16 - rgif);
+}
+
+static inline bool nvme_parse_pid(NvmeNamespace *ns, uint16_t pid,
+                                  uint16_t *ph, uint16_t *rg)
+{
+    *rg = nvme_pid2rg(ns, pid);
+    *ph = nvme_pid2ph(ns, pid);
+
+    return nvme_ph_valid(ns, *ph) && nvme_rg_valid(ns->endgrp, *rg);
+}
+
 static void nvme_assign_zone_state(NvmeNamespace *ns, NvmeZone *zone,
                                    NvmeZoneState state)
 {
@@ -378,6 +438,69 @@  static uint16_t nvme_aor_check(NvmeNamespace *ns, uint32_t act, uint32_t opn)
     return nvme_zns_check_resources(ns, act, opn, 0);
 }
 
+static NvmeFdpEvent *nvme_fdp_alloc_event(NvmeCtrl *n, NvmeFdpEventBuffer *ebuf)
+{
+    NvmeFdpEvent *ret = NULL;
+    bool is_full = ebuf->next == ebuf->start && ebuf->nelems;
+
+    ret = &ebuf->events[ebuf->next++];
+    if (unlikely(ebuf->next == NVME_FDP_MAX_EVENTS)) {
+        ebuf->next = 0;
+    }
+    if (is_full) {
+        ebuf->start = ebuf->next;
+    } else {
+        ebuf->nelems++;
+    }
+
+    memset(ret, 0, sizeof(NvmeFdpEvent));
+    ret->timestamp = nvme_get_timestamp(n);
+
+    return ret;
+}
+
+static inline int log_event(NvmeRuHandle *ruh, uint8_t event_type)
+{
+    return (ruh->event_filter >> nvme_fdp_evf_shifts[event_type]) & 0x1;
+}
+
+static bool nvme_update_ruh(NvmeCtrl *n, NvmeNamespace *ns, uint16_t pid)
+{
+    NvmeEnduranceGroup *endgrp = ns->endgrp;
+    NvmeRuHandle *ruh;
+    NvmeReclaimUnit *ru;
+    NvmeFdpEvent *e = NULL;
+    uint16_t ph, rg, ruhid;
+
+    if (!nvme_parse_pid(ns, pid, &ph, &rg)) {
+        return false;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+
+    ruh = &endgrp->fdp.ruhs[ruhid];
+    ru = &ruh->rus[rg];
+
+    if (ru->ruamw) {
+        if (log_event(ruh, FDP_EVT_RU_NOT_FULLY_WRITTEN)) {
+            e = nvme_fdp_alloc_event(n, &endgrp->fdp.host_events);
+            e->type = FDP_EVT_RU_NOT_FULLY_WRITTEN;
+            e->flags = FDPEF_PIV | FDPEF_NSIDV | FDPEF_LV;
+            e->pid = cpu_to_le16(pid);
+            e->nsid = cpu_to_le32(ns->params.nsid);
+            e->rgid = cpu_to_le16(rg);
+            e->ruhid = cpu_to_le16(ruhid);
+        }
+
+        /* log (eventual) GC overhead of prematurely swapping the RU */
+        nvme_fdp_stat_inc(&endgrp->fdp.mbmw, nvme_l2b(ns, ru->ruamw));
+    }
+
+    ru->ruamw = ruh->ruamw;
+
+    return true;
+}
+
 static bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
 {
     hwaddr hi, lo;
@@ -3322,6 +3445,44 @@  invalid:
     return status | NVME_DNR;
 }
 
+static void nvme_do_write_fdp(NvmeCtrl *n, NvmeRequest *req, uint64_t slba,
+                              uint32_t nlb)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+    uint64_t data_size = nvme_l2b(ns, nlb);
+    uint32_t dw12 = le32_to_cpu(req->cmd.cdw12);
+    uint8_t dtype = (dw12 >> 20) & 0xf;
+    uint16_t pid = le16_to_cpu(rw->dspec);
+    uint16_t ph, rg, ruhid;
+    NvmeReclaimUnit *ru;
+
+    if (dtype != NVME_DIRECTIVE_DATA_PLACEMENT
+        || !nvme_parse_pid(ns, pid, &ph, &rg)) {
+        ph = 0;
+        rg = 0;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ru = &ns->endgrp->fdp.ruhs[ruhid].rus[rg];
+
+    nvme_fdp_stat_inc(&ns->endgrp->fdp.hbmw, data_size);
+    nvme_fdp_stat_inc(&ns->endgrp->fdp.mbmw, data_size);
+
+    //trace_pci_nvme_fdp_ruh_write(ruh->rgid, ruh->ruhid, ruh->nlb_ruamw, nlb);
+
+    while (nlb) {
+        if (nlb < ru->ruamw) {
+            ru->ruamw -= nlb;
+            break;
+        }
+
+        nlb -= ru->ruamw;
+        //trace_pci_nvme_fdp_ruh_change(ruh->rgid, ruh->ruhid);
+        nvme_update_ruh(n, ns, pid);
+    }
+}
+
 static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
                               bool wrz)
 {
@@ -3431,6 +3592,8 @@  static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
         if (!(zone->d.za & NVME_ZA_ZRWA_VALID)) {
             zone->w_ptr += nlb;
         }
+    } else if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        nvme_do_write_fdp(n, req, slba, nlb);
     }
 
     data_offset = nvme_l2b(ns, slba);
@@ -4088,6 +4251,126 @@  static uint16_t nvme_zone_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
     return status;
 }
 
+static uint16_t nvme_io_mgmt_recv_ruhs(NvmeCtrl *n, NvmeRequest *req,
+                                       size_t len)
+{
+    NvmeNamespace *ns = req->ns;
+    NvmeEnduranceGroup *endgrp;
+    NvmeRuhStatus *hdr;
+    NvmeRuhStatusDescr *ruhsd;
+    unsigned int nruhsd;
+    uint16_t rg, ph, *ruhid;
+    size_t trans_len;
+    g_autofree uint8_t *buf = NULL;
+
+    if (!n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (ns->params.nsid == 0 || ns->params.nsid == 0xffffffff) {
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+
+    if (!n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    endgrp = ns->endgrp;
+
+    nruhsd = ns->fdp.nphs * endgrp->fdp.nrg;
+    trans_len = sizeof(NvmeRuhStatus) + nruhsd * sizeof(NvmeRuhStatusDescr);
+    buf = g_malloc(trans_len);
+
+    trans_len = MIN(trans_len, len);
+
+    hdr = (NvmeRuhStatus *)buf;
+    ruhsd = (NvmeRuhStatusDescr *)(buf + sizeof(NvmeRuhStatus));
+
+    hdr->nruhsd = cpu_to_le16(nruhsd);
+
+    ruhid = ns->fdp.phs;
+
+    for (ph = 0; ph < ns->fdp.nphs; ph++, ruhid++) {
+        NvmeRuHandle *ruh = &endgrp->fdp.ruhs[*ruhid];
+
+        for (rg = 0; rg < endgrp->fdp.nrg; rg++, ruhsd++) {
+            uint16_t pid = nvme_make_pid(ns, rg, ph);
+
+            ruhsd->pid = cpu_to_le16(pid);
+            ruhsd->ruhid = *ruhid;
+            ruhsd->earutr = 0;
+            ruhsd->ruamw = cpu_to_le64(ruh->rus[rg].ruamw);
+        }
+    }
+
+    return nvme_c2h(n, buf, trans_len, req);
+}
+
+static uint16_t nvme_io_mgmt_recv(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+    uint32_t numd = le32_to_cpu(cmd->cdw11);
+    uint8_t mo = (cdw10 & 0xff);
+    size_t len = (numd + 1) << 2;
+
+    switch (mo) {
+    case NVME_IOMR_MO_NOP:
+        return 0;
+    case NVME_IOMR_MO_RUH_STATUS:
+        return nvme_io_mgmt_recv_ruhs(n, req, len);
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    };
+}
+
+static uint16_t nvme_io_mgmt_send_ruh_update(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    NvmeNamespace *ns = req->ns;
+    uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+    uint16_t ret = NVME_SUCCESS;
+    uint32_t npid = (cdw10 >> 1) + 1;
+    unsigned int i = 0;
+    g_autofree uint16_t *pids = NULL;
+    uint32_t maxnpid = n->subsys->endgrp.fdp.nrg * n->subsys->endgrp.fdp.nruh;
+
+    if (unlikely(npid >= MIN(NVME_FDP_MAXPIDS, maxnpid))) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    pids = g_new(uint16_t, npid);
+
+    ret = nvme_h2c(n, pids, npid * sizeof(uint16_t), req);
+    if (ret) {
+        return ret;
+    }
+
+    for (; i < npid; i++) {
+        if (!nvme_update_ruh(n, ns, pids[i])) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+    }
+
+    return ret;
+}
+
+static uint16_t nvme_io_mgmt_send(NvmeCtrl *n, NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw10 = le32_to_cpu(cmd->cdw10);
+    uint8_t mo = (cdw10 & 0xff);
+
+    switch (mo) {
+    case NVME_IOMS_MO_NOP:
+        return 0;
+    case NVME_IOMS_MO_RUH_UPDATE:
+        return nvme_io_mgmt_send_ruh_update(n, req);
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    };
+}
+
 static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns;
@@ -4164,6 +4447,10 @@  static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
         return nvme_zone_mgmt_send(n, req);
     case NVME_CMD_ZONE_MGMT_RECV:
         return nvme_zone_mgmt_recv(n, req);
+    case NVME_CMD_IO_MGMT_RECV:
+        return nvme_io_mgmt_recv(n, req);
+    case NVME_CMD_IO_MGMT_SEND:
+        return nvme_io_mgmt_send(n, req);
     default:
         assert(false);
     }
@@ -4619,6 +4906,209 @@  static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
 }
 
+static size_t sizeof_fdp_conf_descr(size_t nruh, size_t vss)
+{
+    size_t entry_siz = sizeof(NvmeFdpDescrHdr) + nruh * sizeof(NvmeRuhDescr)
+                       + vss;
+    return ROUND_UP(entry_siz, 8);
+}
+
+static uint16_t nvme_fdp_confs(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
+                               uint64_t off, NvmeRequest *req)
+{
+    uint32_t log_size, trans_len;
+    g_autofree uint8_t *buf = NULL;
+    NvmeFdpDescrHdr *hdr;
+    NvmeRuhDescr *ruhd;
+    NvmeEnduranceGroup *endgrp;
+    NvmeFdpConfsHdr *log;
+    size_t nruh, fdp_descr_size;
+    int i;
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (endgrp->fdp.enabled) {
+        nruh = endgrp->fdp.nruh;
+    } else {
+        nruh = 1;
+    }
+
+    fdp_descr_size = sizeof_fdp_conf_descr(nruh, FDPVSS);
+    log_size = sizeof(NvmeFdpConfsHdr) + fdp_descr_size;
+
+    if (off >= log_size) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    trans_len = MIN(log_size - off, buf_len);
+
+    buf = g_malloc0(log_size);
+    log = (NvmeFdpConfsHdr *)buf;
+    hdr = (NvmeFdpDescrHdr *)(log + 1);
+    ruhd = (NvmeRuhDescr *)(buf + sizeof(*log) + sizeof(*hdr));
+
+    log->num_confs = cpu_to_le16(0);
+    log->size = cpu_to_le32(log_size);
+
+    hdr->descr_size = cpu_to_le16(fdp_descr_size);
+    if (endgrp->fdp.enabled) {
+        hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, VALID, 1);
+        hdr->fdpa = FIELD_DP8(hdr->fdpa, FDPA, RGIF, endgrp->fdp.rgif);
+        hdr->nrg = cpu_to_le16(endgrp->fdp.nrg);
+        hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
+        hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
+        hdr->nnss = cpu_to_le32(NVME_MAX_NAMESPACES);
+        hdr->runs = cpu_to_le64(endgrp->fdp.runs);
+
+        for (i = 0; i < nruh; i++) {
+            ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
+            ruhd++;
+        }
+    } else {
+        /* 1 bit for RUH in PIF -> 2 RUHs max. */
+        hdr->nrg = cpu_to_le16(1);
+        hdr->nruh = cpu_to_le16(1);
+        hdr->maxpids = cpu_to_le16(NVME_FDP_MAXPIDS - 1);
+        hdr->nnss = cpu_to_le32(1);
+        hdr->runs = cpu_to_le64(96 * MiB);
+
+        ruhd->ruht = NVME_RUHT_INITIALLY_ISOLATED;
+    }
+
+    return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_ruh_usage(NvmeCtrl *n, uint32_t endgrpid,
+                                   uint32_t dw10, uint32_t dw12,
+                                   uint32_t buf_len, uint64_t off,
+                                   NvmeRequest *req)
+{
+    NvmeRuHandle *ruh;
+    NvmeRuhuLog *hdr;
+    NvmeRuhuDescr *ruhud;
+    NvmeEnduranceGroup *endgrp;
+    g_autofree uint8_t *buf = NULL;
+    uint32_t log_size, trans_len;
+    uint16_t i;
+
+    trace_pci_nvme_fdp_log_ruh_usage(endgrpid);
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (!endgrp->fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    log_size = sizeof(NvmeRuhuLog) + endgrp->fdp.nruh * sizeof(NvmeRuhuDescr);
+
+    if (off >= log_size) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    trans_len = MIN(log_size - off, buf_len);
+
+    buf = g_malloc0(log_size);
+    hdr = (NvmeRuhuLog *)buf;
+    ruhud = (NvmeRuhuDescr *)(hdr + 1);
+
+    ruh = endgrp->fdp.ruhs;
+    hdr->nruh = cpu_to_le16(endgrp->fdp.nruh);
+
+    for (i = 0; i < endgrp->fdp.nruh; i++, ruhud++, ruh++) {
+        ruhud->ruha = ruh->ruha;
+    }
+
+    return nvme_c2h(n, (uint8_t *)buf + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_stats(NvmeCtrl *n, uint32_t endgrpid, uint32_t buf_len,
+                               uint64_t off, NvmeRequest *req)
+{
+    NvmeEnduranceGroup *endgrp;
+    NvmeFdpStatsLog log = {};
+    uint32_t trans_len;
+
+    if (off >= sizeof(NvmeFdpStatsLog)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    if (!n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    trans_len = MIN(sizeof(log) - off, buf_len);
+
+    /* spec value is 128 bits; we only use 64 bits */
+    log.hbmw[0] = cpu_to_le64(endgrp->fdp.hbmw);
+    log.mbmw[0] = cpu_to_le64(endgrp->fdp.mbmw);
+    log.mbe[0] = cpu_to_le64(endgrp->fdp.mbe);
+
+    return nvme_c2h(n, (uint8_t *)&log + off, trans_len, req);
+}
+
+static uint16_t nvme_fdp_events(NvmeCtrl *n, uint32_t endgrpid,
+                                uint32_t buf_len, uint64_t off,
+                                NvmeRequest *req)
+{
+    NvmeEnduranceGroup *endgrp;
+    NvmeCmd *cmd = &req->cmd;
+    bool host_events = (cmd->cdw10 >> 8) & 0x1;
+    uint32_t log_size, trans_len;
+    NvmeFdpEventBuffer *ebuf;
+    g_autofree NvmeFdpEventsLog *elog = NULL;
+    NvmeFdpEvent *event;
+
+    if (endgrpid != 1 || !n->subsys) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    endgrp = &n->subsys->endgrp;
+
+    if (!endgrp->fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (host_events) {
+        ebuf = &endgrp->fdp.host_events;
+    } else {
+        ebuf = &endgrp->fdp.ctrl_events;
+    }
+
+    log_size = sizeof(NvmeFdpEventsLog) + ebuf->nelems * sizeof(NvmeFdpEvent);
+    trans_len = MIN(log_size - off, buf_len);
+    elog = g_malloc0(log_size);
+    elog->num_events = cpu_to_le32(ebuf->nelems);
+    event = (NvmeFdpEvent *)(elog + 1);
+
+    if (ebuf->nelems && ebuf->start == ebuf->next) {
+        unsigned int nelems = (NVME_FDP_MAX_EVENTS - ebuf->start);
+        /* wrap around: copy [start, NVME_FDP_MAX_EVENTS) and [0, next) */
+        memcpy(event, &ebuf->events[ebuf->start],
+               sizeof(NvmeFdpEvent) * nelems);
+        memcpy(event + nelems, ebuf->events,
+               sizeof(NvmeFdpEvent) * ebuf->next);
+    } else if (ebuf->start < ebuf->next) {
+        memcpy(event, &ebuf->events[ebuf->start],
+               sizeof(NvmeFdpEvent) * (ebuf->next - ebuf->start));
+    }
+
+    return nvme_c2h(n, (uint8_t *)elog + off, trans_len, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -4631,13 +5121,14 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
     uint8_t  lsp = (dw10 >> 8) & 0xf;
     uint8_t  rae = (dw10 >> 15) & 0x1;
     uint8_t  csi = le32_to_cpu(cmd->cdw14) >> 24;
-    uint32_t numdl, numdu;
+    uint32_t numdl, numdu, lspi;
     uint64_t off, lpol, lpou;
     size_t   len;
     uint16_t status;
 
     numdl = (dw10 >> 16);
     numdu = (dw11 & 0xffff);
+    lspi = (dw11 >> 16);
     lpol = dw12;
     lpou = dw13;
 
@@ -4668,6 +5159,14 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
         return nvme_cmd_effects(n, csi, len, off, req);
     case NVME_LOG_ENDGRP:
         return nvme_endgrp_info(n, rae, len, off, req);
+    case NVME_LOG_FDP_CONFS:
+        return nvme_fdp_confs(n, lspi, len, off, req);
+    case NVME_LOG_FDP_RUH_USAGE:
+        return nvme_fdp_ruh_usage(n, lspi, dw10, dw12, len, off, req);
+    case NVME_LOG_FDP_STATS:
+        return nvme_fdp_stats(n, lspi, len, off, req);
+    case NVME_LOG_FDP_EVENTS:
+        return nvme_fdp_events(n, lspi, len, off, req);
     default:
         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
         return NVME_INVALID_FIELD | NVME_DNR;
@@ -5254,6 +5753,84 @@  static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
     return nvme_c2h(n, (uint8_t *)&timestamp, sizeof(timestamp), req);
 }
 
+static int nvme_get_feature_fdp(NvmeCtrl *n, uint32_t endgrpid,
+                                uint32_t *result)
+{
+    *result = 0;
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    *result = FIELD_DP16(0, FEAT_FDP, FDPE, 1);
+    *result = FIELD_DP16(*result, FEAT_FDP, CONF_NDX, 0);
+
+    return NVME_SUCCESS;
+}
+
+static uint16_t nvme_get_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+                                            NvmeRequest *req, uint32_t *result)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+    uint16_t ph = cdw11 & 0xffff;
+    uint8_t noet = (cdw11 >> 16) & 0xff;
+    uint16_t ruhid, ret;
+    uint32_t nentries = 0;
+    uint8_t s_events_ndx = 0;
+    size_t s_events_siz = sizeof(NvmeFdpEventDescr) * noet;
+    g_autofree NvmeFdpEventDescr *s_events = g_malloc0(s_events_siz);
+    NvmeRuHandle *ruh;
+    NvmeFdpEventDescr *s_event;
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (!nvme_ph_valid(ns, ph)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+    assert(ruh);
+
+    if (unlikely(noet == 0)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    for (uint8_t event_type = 0; event_type < FDP_EVT_MAX; event_type++) {
+        uint8_t shift = nvme_fdp_evf_shifts[event_type];
+        if (!shift && event_type) {
+            /*
+             * only the first entry (event_type == 0) has a shift value of 0;
+             * other entries are simply unpopulated.
+             */
+            continue;
+        }
+
+        nentries++;
+
+        s_event = &s_events[s_events_ndx];
+        s_event->evt = event_type;
+        s_event->evta = (ruh->event_filter >> shift) & 0x1;
+
+        /* break if all `noet` entries are filled */
+        if ((++s_events_ndx) == noet) {
+            break;
+        }
+    }
+
+    ret = nvme_c2h(n, s_events, s_events_siz, req);
+    if (ret) {
+        return ret;
+    }
+
+    *result = nentries;
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -5266,6 +5843,7 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
     uint16_t iv;
     NvmeNamespace *ns;
     int i;
+    uint16_t endgrpid = 0, ret = NVME_SUCCESS;
 
     static const uint32_t nvme_feature_default[NVME_FID_MAX] = {
         [NVME_ARBITRATION] = NVME_ARB_AB_NOLIMIT,
@@ -5363,6 +5941,33 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
     case NVME_HOST_BEHAVIOR_SUPPORT:
         return nvme_c2h(n, (uint8_t *)&n->features.hbs,
                         sizeof(n->features.hbs), req);
+    case NVME_FDP_MODE:
+        endgrpid = dw11 & 0xff;
+
+        if (endgrpid != 0x1) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp(n, endgrpid, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
+    case NVME_FDP_EVENTS:
+        if (!nvme_nsid_valid(n, nsid)) {
+            return NVME_INVALID_NSID | NVME_DNR;
+        }
+
+        ns = nvme_ns(n, nsid);
+        if (unlikely(!ns)) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp_events(n, ns, req, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
     default:
         break;
     }
@@ -5395,6 +6000,20 @@  defaults:
         if (iv == n->admin_cq.vector) {
             result |= NVME_INTVC_NOCOALESCING;
         }
+        break;
+    case NVME_FDP_MODE:
+        endgrpid = dw11 & 0xff;
+
+        if (endgrpid != 0x1) {
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+        ret = nvme_get_feature_fdp(n, endgrpid, &result);
+        if (ret) {
+            return ret;
+        }
+        goto out;
+
         break;
     default:
         result = nvme_feature_default[fid];
@@ -5403,7 +6022,7 @@  defaults:
 
 out:
     req->cqe.result = cpu_to_le32(result);
-    return NVME_SUCCESS;
+    return ret;
 }
 
 static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
@@ -5421,6 +6040,51 @@  static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_set_feature_fdp_events(NvmeCtrl *n, NvmeNamespace *ns,
+                                            NvmeRequest *req)
+{
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t cdw11 = le32_to_cpu(cmd->cdw11);
+    uint16_t ph = cdw11 & 0xffff;
+    uint8_t noet = (cdw11 >> 16) & 0xff;
+    uint16_t ret, ruhid;
+    uint8_t enable = le32_to_cpu(cmd->cdw12) & 0x1;
+    uint8_t event_mask = 0;
+    unsigned int i;
+    g_autofree uint8_t *events = g_malloc0(noet);
+    NvmeRuHandle *ruh = NULL;
+
+    assert(ns);
+
+    if (!n->subsys || !n->subsys->endgrp.fdp.enabled) {
+        return NVME_FDP_DISABLED | NVME_DNR;
+    }
+
+    if (!nvme_ph_valid(ns, ph)) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    ruhid = ns->fdp.phs[ph];
+    ruh = &n->subsys->endgrp.fdp.ruhs[ruhid];
+
+    ret = nvme_h2c(n, events, noet, req);
+    if (ret) {
+        return ret;
+    }
+
+    for (i = 0; i < noet; i++) {
+        event_mask |= (1 << nvme_fdp_evf_shifts[events[i]]);
+    }
+
+    if (enable) {
+        ruh->event_filter |= event_mask;
+    } else {
+        ruh->event_filter = ruh->event_filter & ~event_mask;
+    }
+
+    return NVME_SUCCESS;
+}
+
 static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = NULL;
@@ -5580,6 +6244,11 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest *req)
             return NVME_CMD_SET_CMB_REJECTED | NVME_DNR;
         }
         break;
+    case NVME_FDP_MODE:
+        /* spec: abort with cmd seq err if there are namespaces in the endgrp */
+        return NVME_CMD_SEQ_ERROR | NVME_DNR;
+    case NVME_FDP_EVENTS:
+        return nvme_set_feature_fdp_events(n, ns, req);
     default:
         return NVME_FEAT_NOT_CHANGEABLE | NVME_DNR;
     }
@@ -6155,6 +6824,7 @@  static uint16_t nvme_directive_send(NvmeCtrl *n, NvmeRequest *req)
 
 static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
 {
+    NvmeNamespace *ns;
     uint32_t dw10 = le32_to_cpu(req->cmd.cdw10);
     uint32_t dw11 = le32_to_cpu(req->cmd.cdw11);
     uint32_t nsid = le32_to_cpu(req->cmd.nsid);
@@ -6176,7 +6846,30 @@  static uint16_t nvme_directive_receive(NvmeCtrl *n, NvmeRequest *req)
         return NVME_INVALID_FIELD | NVME_DNR;
     }
 
-    return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
+    ns = nvme_ns(n, nsid);
+    if (!ns) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    switch (dtype) {
+    case NVME_DIRECTIVE_IDENTIFY:
+        switch (doper) {
+        case NVME_DIRECTIVE_RETURN_PARAMS:
+            if (ns->endgrp->fdp.enabled) {
+                id.supported |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+                id.enabled |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+                id.persistent |= 1 << NVME_DIRECTIVE_DATA_PLACEMENT;
+            }
+
+            return nvme_c2h(n, (uint8_t *)&id, trans_len, req);
+
+        default:
+            return NVME_INVALID_FIELD | NVME_DNR;
+        }
+
+    default:
+        return NVME_INVALID_FIELD;
+    }
 }
 
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeRequest *req)
@@ -7541,6 +8234,10 @@  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
         ctratt |= NVME_CTRATT_ENDGRPS;
 
         id->endgidmax = cpu_to_le16(0x1);
+
+        if (n->subsys->endgrp.fdp.enabled) {
+            ctratt |= NVME_CTRATT_FDPS;
+        }
     }
 
     id->ctratt = cpu_to_le32(ctratt);
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 23872a174c..c690c48da8 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -14,8 +14,10 @@ 
 
 #include "qemu/osdep.h"
 #include "qemu/units.h"
+#include "qemu/cutils.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "qemu/bitops.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/block-backend.h"
 
@@ -377,6 +379,128 @@  static void nvme_zoned_ns_shutdown(NvmeNamespace *ns)
     assert(ns->nr_open_zones == 0);
 }
 
+static NvmeRuHandle *nvme_find_ruh_by_attr(NvmeEnduranceGroup *endgrp,
+                                           uint8_t ruha, uint16_t *ruhid)
+{
+    for (uint16_t i = 0; i < endgrp->fdp.nruh; i++) {
+        NvmeRuHandle *ruh = &endgrp->fdp.ruhs[i];
+
+        if (ruh->ruha == ruha) {
+            *ruhid = i;
+            return ruh;
+        }
+    }
+
+    return NULL;
+}
+
+static bool nvme_ns_init_fdp(NvmeNamespace *ns, Error **errp)
+{
+    NvmeEnduranceGroup *endgrp = ns->endgrp;
+    NvmeRuHandle *ruh;
+    uint8_t lbafi = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+    unsigned int *ruhid, *ruhids;
+    char *r, *p, *token;
+    uint16_t *ph;
+
+    if (!ns->params.fdp.ruhs) {
+        ns->fdp.nphs = 1;
+        ph = ns->fdp.phs = g_new(uint16_t, 1);
+
+        ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_CTRL, ph);
+        if (!ruh) {
+            ruh = nvme_find_ruh_by_attr(endgrp, NVME_RUHA_UNUSED, ph);
+            if (!ruh) {
+                error_setg(errp, "no unused reclaim unit handles left");
+                return false;
+            }
+
+            ruh->ruha = NVME_RUHA_CTRL;
+            ruh->lbafi = lbafi;
+            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
+
+            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
+                ruh->rus[rg].ruamw = ruh->ruamw;
+            }
+        } else if (ruh->lbafi != lbafi) {
+            error_setg(errp, "lba format index of controller assigned "
+                       "reclaim unit handle does not match namespace lba "
+                       "format index");
+            return false;
+        }
+
+        return true;
+    }
+
+    ruhid = ruhids = g_new0(unsigned int, endgrp->fdp.nruh);
+    r = p = strdup(ns->params.fdp.ruhs);
+
+    /* parse the reclaim unit handle identifiers */
+    while ((token = qemu_strsep(&p, ";")) != NULL) {
+        if (++ns->fdp.nphs == endgrp->fdp.nruh) {
+            error_setg(errp, "too many placement handles");
+            free(r);
+            return false;
+        }
+
+        if (qemu_strtoui(token, NULL, 0, ruhid++) < 0) {
+            error_setg(errp, "cannot parse reclaim unit handle identifier");
+            free(r);
+            return false;
+        }
+    }
+
+    free(r);
+
+    ph = ns->fdp.phs = g_new(uint16_t, ns->fdp.nphs);
+
+    ruhid = ruhids;
+
+    /* verify the identifiers */
+    for (unsigned int i = 0; i < ns->fdp.nphs; i++, ruhid++, ph++) {
+        if (*ruhid >= endgrp->fdp.nruh) {
+            error_setg(errp, "invalid reclaim unit handle identifier");
+            return false;
+        }
+
+        ruh = &endgrp->fdp.ruhs[*ruhid];
+
+        switch (ruh->ruha) {
+        case NVME_RUHA_UNUSED:
+            ruh->ruha = NVME_RUHA_HOST;
+            ruh->lbafi = lbafi;
+            ruh->ruamw = endgrp->fdp.runs >> ns->lbaf.ds;
+
+            for (uint16_t rg = 0; rg < endgrp->fdp.nrg; rg++) {
+                ruh->rus[rg].ruamw = ruh->ruamw;
+            }
+
+            break;
+
+        case NVME_RUHA_HOST:
+            if (ruh->lbafi != lbafi) {
+                error_setg(errp, "lba format index of host assigned "
+                           "reclaim unit handle does not match namespace "
+                           "lba format index");
+                return false;
+            }
+
+            break;
+
+        case NVME_RUHA_CTRL:
+            error_setg(errp, "reclaim unit handle is controller assigned");
+            return false;
+
+        default:
+            abort();
+        }
+
+        *ph = *ruhid;
+    }
+
+    return true;
+}
+
 static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
 {
     unsigned int pi_size;
@@ -417,6 +541,11 @@  static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
         return -1;
     }
 
+    if (ns->params.zoned && ns->endgrp && ns->endgrp->fdp.enabled) {
+        error_setg(errp, "cannot be a zoned namespace in an FDP configuration");
+        return -1;
+    }
+
     if (ns->params.zoned) {
         if (ns->params.max_active_zones) {
             if (ns->params.max_open_zones > ns->params.max_active_zones) {
@@ -502,6 +631,12 @@  int nvme_ns_setup(NvmeNamespace *ns, Error **errp)
         nvme_ns_init_zoned(ns);
     }
 
+    if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        if (!nvme_ns_init_fdp(ns, errp)) {
+            return -1;
+        }
+    }
+
     return 0;
 }
 
@@ -525,6 +660,10 @@  void nvme_ns_cleanup(NvmeNamespace *ns)
         g_free(ns->zone_array);
         g_free(ns->zd_extensions);
     }
+
+    if (ns->endgrp && ns->endgrp->fdp.enabled) {
+        g_free(ns->fdp.phs);
+    }
 }
 
 static void nvme_ns_unrealize(DeviceState *dev)
@@ -562,6 +701,7 @@  static void nvme_ns_realize(DeviceState *dev, Error **errp)
             return;
         }
         ns->subsys = subsys;
+        ns->endgrp = &subsys->endgrp;
     }
 
     if (nvme_ns_setup(ns, errp)) {
@@ -648,6 +788,7 @@  static Property nvme_ns_props[] = {
     DEFINE_PROP_SIZE("zoned.zrwafg", NvmeNamespace, params.zrwafg, -1),
     DEFINE_PROP_BOOL("eui64-default", NvmeNamespace, params.eui64_default,
                      false),
+    DEFINE_PROP_STRING("fdp.ruhs", NvmeNamespace, params.fdp.ruhs),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index e489830478..209e8f5b4c 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -27,6 +27,8 @@ 
 #define NVME_MAX_CONTROLLERS 256
 #define NVME_MAX_NAMESPACES  256
 #define NVME_EUI64_DEFAULT ((uint64_t)0x5254000000000000)
+#define NVME_FDP_MAX_EVENTS 63
+#define NVME_FDP_MAXPIDS 128
 
 QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);
 
@@ -45,8 +47,47 @@  typedef struct NvmeBus {
     OBJECT_CHECK(NvmeSubsystem, (obj), TYPE_NVME_SUBSYS)
 #define SUBSYS_SLOT_RSVD (void *)0xFFFF
 
+typedef struct NvmeReclaimUnit {
+    uint64_t ruamw;
+} NvmeReclaimUnit;
+
+typedef struct NvmeRuHandle {
+    uint8_t  ruht;
+    uint8_t  ruha;
+    uint64_t event_filter;
+    uint8_t  lbafi;
+    uint64_t ruamw;
+
+    /* reclaim units indexed by reclaim group */
+    NvmeReclaimUnit *rus;
+} NvmeRuHandle;
+
+typedef struct NvmeFdpEventBuffer {
+    NvmeFdpEvent     events[NVME_FDP_MAX_EVENTS];
+    unsigned int     nelems;
+    unsigned int     start;
+    unsigned int     next;
+} NvmeFdpEventBuffer;
+
 typedef struct NvmeEnduranceGroup {
     uint8_t event_conf;
+
+    struct {
+        NvmeFdpEventBuffer host_events, ctrl_events;
+
+        uint16_t nruh;
+        uint16_t nrg;
+        uint8_t  rgif;
+        uint64_t runs;
+
+        uint64_t hbmw;
+        uint64_t mbmw;
+        uint64_t mbe;
+
+        bool enabled;
+
+        NvmeRuHandle *ruhs;
+    } fdp;
 } NvmeEnduranceGroup;
 
 typedef struct NvmeSubsystem {
@@ -55,11 +96,19 @@  typedef struct NvmeSubsystem {
     uint8_t     subnqn[256];
     char        *serial;
 
-    NvmeCtrl      *ctrls[NVME_MAX_CONTROLLERS];
-    NvmeNamespace *namespaces[NVME_MAX_NAMESPACES + 1];
+    NvmeCtrl           *ctrls[NVME_MAX_CONTROLLERS];
+    NvmeNamespace      *namespaces[NVME_MAX_NAMESPACES + 1];
+    NvmeEnduranceGroup endgrp;
 
     struct {
         char *nqn;
+
+        struct {
+            bool     enabled;
+            uint64_t runs;
+            uint16_t nruh;
+            uint32_t nrg;
+        } fdp;
     } params;
 } NvmeSubsystem;
 
@@ -100,6 +149,21 @@  typedef struct NvmeZone {
     QTAILQ_ENTRY(NvmeZone) entry;
 } NvmeZone;
 
+#define FDP_EVT_MAX 0xff
+#define NVME_FDP_MAX_NS_RUHS 32u
+#define FDPVSS 0
+
+static const uint8_t nvme_fdp_evf_shifts[FDP_EVT_MAX] = {
+    /* Host events */
+    [FDP_EVT_RU_NOT_FULLY_WRITTEN]      = 0,
+    [FDP_EVT_RU_ATL_EXCEEDED]           = 1,
+    [FDP_EVT_CTRL_RESET_RUH]            = 2,
+    [FDP_EVT_INVALID_PID]               = 3,
+    /* CTRL events */
+    [FDP_EVT_MEDIA_REALLOC]             = 32,
+    [FDP_EVT_RUH_IMPLICIT_RU_CHANGE]    = 33,
+};
+
 typedef struct NvmeNamespaceParams {
     bool     detached;
     bool     shared;
@@ -129,6 +193,10 @@  typedef struct NvmeNamespaceParams {
     uint32_t numzrwa;
     uint64_t zrwas;
     uint64_t zrwafg;
+
+    struct {
+        char *ruhs;
+    } fdp;
 } NvmeNamespaceParams;
 
 typedef struct NvmeNamespace {
@@ -172,10 +240,17 @@  typedef struct NvmeNamespace {
 
     NvmeNamespaceParams params;
     NvmeSubsystem *subsys;
+    NvmeEnduranceGroup *endgrp;
 
     struct {
         uint32_t err_rec;
     } features;
+
+    struct {
+        uint16_t nphs;
+        /* reclaim unit handle identifiers indexed by placement handle */
+        uint16_t *phs;
+    } fdp;
 } NvmeNamespace;
 
 static inline uint32_t nvme_nsid(NvmeNamespace *ns)
@@ -279,6 +354,12 @@  static inline void nvme_aor_dec_active(NvmeNamespace *ns)
     assert(ns->nr_active_zones >= 0);
 }
 
+static inline void nvme_fdp_stat_inc(uint64_t *a, uint64_t b)
+{
+    uint64_t ret = *a + b;
+    *a = ret < *a ? UINT64_MAX : ret;
+}
+
 void nvme_ns_init_format(NvmeNamespace *ns);
 int nvme_ns_setup(NvmeNamespace *ns, Error **errp);
 void nvme_ns_drain(NvmeNamespace *ns);
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 9d2643678b..24ddec860e 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -7,10 +7,13 @@ 
  */
 
 #include "qemu/osdep.h"
+#include "qemu/units.h"
 #include "qapi/error.h"
 
 #include "nvme.h"
 
+#define NVME_DEFAULT_RU_SIZE (96 * MiB)
+
 static int nvme_subsys_reserve_cntlids(NvmeCtrl *n, int start, int num)
 {
     NvmeSubsystem *subsys = n->subsys;
@@ -109,13 +112,95 @@  void nvme_subsys_unregister_ctrl(NvmeSubsystem *subsys, NvmeCtrl *n)
     n->cntlid = -1;
 }
 
-static void nvme_subsys_setup(NvmeSubsystem *subsys)
+static bool nvme_calc_rgif(uint16_t nruh, uint16_t nrg, uint8_t *rgif)
+{
+    uint16_t val;
+    unsigned int i;
+
+    if (unlikely(nrg == 1)) {
+        /* PIDRG_NORGI scenario, all of pid is used for PHID */
+        *rgif = 0;
+        return true;
+    }
+
+    val = nrg;
+    i = 0;
+    while (val) {
+        val >>= 1;
+        i++;
+    }
+    *rgif = i;
+
+    /* ensure remaining bits suffice to represent number of phids in a RG */
+    if (unlikely((UINT16_MAX >> i) < nruh)) {
+        *rgif = 0;
+        return false;
+    }
+
+    return true;
+}
+
+static bool nvme_subsys_setup_fdp(NvmeSubsystem *subsys, Error **errp)
+{
+    NvmeEnduranceGroup *endgrp = &subsys->endgrp;
+
+    if (!subsys->params.fdp.runs) {
+        error_setg(errp, "fdp.runs must be non-zero");
+        return false;
+    }
+
+    endgrp->fdp.runs = subsys->params.fdp.runs;
+
+    if (!subsys->params.fdp.nrg) {
+        error_setg(errp, "fdp.nrg must be non-zero");
+        return false;
+    }
+
+    endgrp->fdp.nrg = subsys->params.fdp.nrg;
+
+    if (!subsys->params.fdp.nruh) {
+        error_setg(errp, "fdp.nruh must be non-zero");
+        return false;
+    }
+
+    endgrp->fdp.nruh = subsys->params.fdp.nruh;
+
+    if (!nvme_calc_rgif(endgrp->fdp.nruh, endgrp->fdp.nrg, &endgrp->fdp.rgif)) {
+        error_setg(errp,
+                   "cannot derive a valid rgif (nruh %"PRIu16" nrg %"PRIu32")",
+                   endgrp->fdp.nruh, endgrp->fdp.nrg);
+        return false;
+    }
+
+    endgrp->fdp.ruhs = g_new(NvmeRuHandle, endgrp->fdp.nruh);
+
+    for (uint16_t ruhid = 0; ruhid < endgrp->fdp.nruh; ruhid++) {
+        endgrp->fdp.ruhs[ruhid] = (NvmeRuHandle) {
+            .ruht = NVME_RUHT_INITIALLY_ISOLATED,
+            .ruha = NVME_RUHA_UNUSED,
+        };
+
+        endgrp->fdp.ruhs[ruhid].rus = g_new(NvmeReclaimUnit, endgrp->fdp.nrg);
+    }
+
+    endgrp->fdp.enabled = true;
+
+    return true;
+}
+
+static bool nvme_subsys_setup(NvmeSubsystem *subsys, Error **errp)
 {
     const char *nqn = subsys->params.nqn ?
         subsys->params.nqn : subsys->parent_obj.id;
 
     snprintf((char *)subsys->subnqn, sizeof(subsys->subnqn),
              "nqn.2019-08.org.qemu:%s", nqn);
+
+    if (subsys->params.fdp.enabled && !nvme_subsys_setup_fdp(subsys, errp)) {
+        return false;
+    }
+
+    return true;
 }
 
 static void nvme_subsys_realize(DeviceState *dev, Error **errp)
@@ -124,11 +209,16 @@  static void nvme_subsys_realize(DeviceState *dev, Error **errp)
 
     qbus_init(&subsys->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
 
-    nvme_subsys_setup(subsys);
+    nvme_subsys_setup(subsys, errp);
 }
 
 static Property nvme_subsystem_props[] = {
     DEFINE_PROP_STRING("nqn", NvmeSubsystem, params.nqn),
+    DEFINE_PROP_BOOL("fdp", NvmeSubsystem, params.fdp.enabled, false),
+    DEFINE_PROP_SIZE("fdp.runs", NvmeSubsystem, params.fdp.runs,
+                     NVME_DEFAULT_RU_SIZE),
+    DEFINE_PROP_UINT32("fdp.nrg", NvmeSubsystem, params.fdp.nrg, 1),
+    DEFINE_PROP_UINT16("fdp.nruh", NvmeSubsystem, params.fdp.nruh, 0),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index b16f2260b4..0eba68844f 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -117,6 +117,11 @@  pci_nvme_clear_ns_reset(uint32_t state, uint64_t slba) "zone state=%"PRIu32", sl
 pci_nvme_zoned_zrwa_implicit_flush(uint64_t zslba, uint32_t nlb) "zslba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_pci_reset(void) "PCI Function Level Reset"
 pci_nvme_virt_mngmt(uint16_t cid, uint16_t act, uint16_t cntlid, const char* rt, uint16_t nr) "cid %"PRIu16", act=0x%"PRIx16", ctrlid=%"PRIu16" %s nr=%"PRIu16""
+pci_nvme_fdp_init(uint64_t nru, uint64_t ru_size, uint16_t nrg, uint8_t nruhrg) "FDP init, nru=%"PRIu64", ru_size=%"PRIu64", nrg=%"PRIu16", nruhrg=%"PRIu8""
+pci_nvme_fdp_ruh_change(uint16_t rgid, uint16_t ruhid) "change RU on RUH rgid=%"PRIu16", ruhid=%"PRIu16""
+pci_nvme_fdp_ruh_write(uint16_t rgid, uint16_t ruhid, uint32_t ruamw, uint32_t nlb) "write RUH rgid=%"PRIu16", ruhid=%"PRIu16", ruamw=%"PRIu32", nlb=%"PRIu32""
+pci_nvme_fdp_get_feature_fdp(uint16_t result) "result=0x%"PRIx16""
+pci_nvme_fdp_log_ruh_usage(uint32_t endgrpid) "log RUH usage for endgrp %"PRIu32""
 
 # error conditions
 pci_nvme_err_mdts(size_t len) "len %zu"
diff --git a/include/block/nvme.h b/include/block/nvme.h
index d463d0ef5f..b0863687d4 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1,6 +1,8 @@ 
 #ifndef BLOCK_NVME_H
 #define BLOCK_NVME_H
 
+#include "hw/registerfields.h"
+
 typedef struct QEMU_PACKED NvmeBar {
     uint64_t    cap;
     uint32_t    vs;
@@ -631,7 +633,9 @@  enum NvmeIoCommands {
     NVME_CMD_WRITE_ZEROES       = 0x08,
     NVME_CMD_DSM                = 0x09,
     NVME_CMD_VERIFY             = 0x0c,
+    NVME_CMD_IO_MGMT_RECV       = 0x12,
     NVME_CMD_COPY               = 0x19,
+    NVME_CMD_IO_MGMT_SEND       = 0x1d,
     NVME_CMD_ZONE_MGMT_SEND     = 0x79,
     NVME_CMD_ZONE_MGMT_RECV     = 0x7a,
     NVME_CMD_ZONE_APPEND        = 0x7d,
@@ -724,7 +728,9 @@  typedef struct QEMU_PACKED NvmeRwCmd {
     uint64_t    slba;
     uint16_t    nlb;
     uint16_t    control;
-    uint32_t    dsmgmt;
+    uint8_t     dsmgmt;
+    uint8_t     rsvd;
+    uint16_t    dspec;
     uint32_t    reftag;
     uint16_t    apptag;
     uint16_t    appmask;
@@ -895,6 +901,8 @@  enum NvmeStatusCodes {
     NVME_INVALID_PRP_OFFSET     = 0x0013,
     NVME_CMD_SET_CMB_REJECTED   = 0x002b,
     NVME_INVALID_CMD_SET        = 0x002c,
+    NVME_FDP_DISABLED           = 0x0029,
+    NVME_INVALID_PHID_LIST      = 0x002a,
     NVME_LBA_RANGE              = 0x0080,
     NVME_CAP_EXCEEDED           = 0x0081,
     NVME_NS_NOT_READY           = 0x0082,
@@ -1031,6 +1039,10 @@  enum NvmeLogIdentifier {
     NVME_LOG_CHANGED_NSLIST             = 0x04,
     NVME_LOG_CMD_EFFECTS                = 0x05,
     NVME_LOG_ENDGRP                     = 0x09,
+    NVME_LOG_FDP_CONFS                  = 0x20,
+    NVME_LOG_FDP_RUH_USAGE              = 0x21,
+    NVME_LOG_FDP_STATS                  = 0x22,
+    NVME_LOG_FDP_EVENTS                 = 0x23,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -1160,6 +1172,7 @@  enum NvmeIdCtrlOaes {
 enum NvmeIdCtrlCtratt {
     NVME_CTRATT_ENDGRPS = 1 <<  4,
     NVME_CTRATT_ELBAS   = 1 << 15,
+    NVME_CTRATT_FDPS    = 1 << 19,
 };
 
 enum NvmeIdCtrlOacs {
@@ -1273,6 +1286,8 @@  enum NvmeFeatureIds {
     NVME_TIMESTAMP                  = 0xe,
     NVME_HOST_BEHAVIOR_SUPPORT      = 0x16,
     NVME_COMMAND_SET_PROFILE        = 0x19,
+    NVME_FDP_MODE                   = 0x1d,
+    NVME_FDP_EVENTS                 = 0x1e,
     NVME_SOFTWARE_PROGRESS_MARKER   = 0x80,
     NVME_FID_MAX                    = 0x100,
 };
@@ -1652,7 +1667,9 @@  typedef struct NvmeDirectiveIdentify {
     uint8_t unused1[31];
     uint8_t enabled;
     uint8_t unused33[31];
-    uint8_t rsvd64[4032];
+    uint8_t persistent;
+    uint8_t unused65[31];
+    uint8_t rsvd64[4000];
 } NvmeDirectiveIdentify;
 
 enum NvmeDirective {
@@ -1661,13 +1678,158 @@  enum NvmeDirective {
 };
 
 enum NvmeDirectiveTypes {
-    NVME_DIRECTIVE_IDENTIFY = 0x0,
+    NVME_DIRECTIVE_IDENTIFY       = 0x0,
+    NVME_DIRECTIVE_DATA_PLACEMENT = 0x2,
 };
 
 enum NvmeDirectiveOperations {
     NVME_DIRECTIVE_RETURN_PARAMS = 0x1,
 };
 
+typedef struct QEMU_PACKED NvmeFdpConfsHdr {
+    uint16_t num_confs;
+    uint8_t  version;
+    uint8_t  rsvd3;
+    uint32_t size;
+    uint8_t  rsvd8[8];
+} NvmeFdpConfsHdr;
+
+REG8(FDPA, 0x0)
+    FIELD(FDPA, RGIF, 0, 4)
+    FIELD(FDPA, VWC, 4, 1)
+    FIELD(FDPA, VALID, 7, 1);
+
+typedef struct QEMU_PACKED NvmeFdpDescrHdr {
+    uint16_t descr_size;
+    uint8_t  fdpa;
+    uint8_t  vss;
+    uint32_t nrg;
+    uint16_t nruh;
+    uint16_t maxpids;
+    uint32_t nnss;
+    uint64_t runs;
+    uint32_t erutl;
+    uint8_t  rsvd28[36];
+} NvmeFdpDescrHdr;
+
+enum NvmeRuhType {
+    NVME_RUHT_INITIALLY_ISOLATED = 1,
+    NVME_RUHT_PERSISTENTLY_ISOLATED = 2,
+};
+
+typedef struct QEMU_PACKED NvmeRuhDescr {
+    uint8_t ruht;
+    uint8_t rsvd1[3];
+} NvmeRuhDescr;
+
+typedef struct QEMU_PACKED NvmeRuhuLog {
+    uint16_t nruh;
+    uint8_t  rsvd2[6];
+} NvmeRuhuLog;
+
+enum NvmeRuhAttributes {
+    NVME_RUHA_UNUSED = 0,
+    NVME_RUHA_HOST = 1,
+    NVME_RUHA_CTRL = 2,
+};
+
+typedef struct QEMU_PACKED NvmeRuhuDescr {
+    uint8_t ruha;
+    uint8_t rsvd1[7];
+} NvmeRuhuDescr;
+
+typedef struct QEMU_PACKED NvmeFdpStatsLog {
+    uint64_t hbmw[2];
+    uint64_t mbmw[2];
+    uint64_t mbe[2];
+    uint8_t  rsvd48[16];
+} NvmeFdpStatsLog;
+
+typedef struct QEMU_PACKED NvmeFdpEventsLog {
+    uint32_t num_events;
+    uint8_t  rsvd4[60];
+} NvmeFdpEventsLog;
+
+enum NvmeFdpEventType {
+    FDP_EVT_RU_NOT_FULLY_WRITTEN = 0x0,
+    FDP_EVT_RU_ATL_EXCEEDED = 0x1,
+    FDP_EVT_CTRL_RESET_RUH = 0x2,
+    FDP_EVT_INVALID_PID = 0x3,
+    FDP_EVT_MEDIA_REALLOC = 0x80,
+    FDP_EVT_RUH_IMPLICIT_RU_CHANGE = 0x81,
+};
+
+enum NvmeFdpEventFlags {
+    FDPEF_PIV = 1 << 0,
+    FDPEF_NSIDV = 1 << 1,
+    FDPEF_LV = 1 << 2,
+};
+
+typedef struct QEMU_PACKED NvmeFdpEvent {
+    uint8_t  type;
+    uint8_t  flags;
+    uint16_t pid;
+    uint64_t timestamp;
+    uint32_t nsid;
+    uint64_t type_specific[2];
+    uint16_t rgid;
+    uint8_t  ruhid;
+    uint8_t  rsvd35[5];
+    uint64_t vendor[3];
+} NvmeFdpEvent;
+
+typedef struct QEMU_PACKED NvmePhidList {
+    uint16_t nnruhd;
+    uint8_t  rsvd2[6];
+} NvmePhidList;
+
+typedef struct QEMU_PACKED NvmePhidDescr {
+    uint8_t  ruht;
+    uint8_t  rsvd1;
+    uint16_t ruhid;
+} NvmePhidDescr;
+
+REG32(FEAT_FDP, 0x0)
+    FIELD(FEAT_FDP, FDPE, 0, 1)
+    FIELD(FEAT_FDP, CONF_NDX, 8, 8);
+
+typedef struct QEMU_PACKED NvmeFdpEventDescr {
+    uint8_t evt;
+    uint8_t evta;
+} NvmeFdpEventDescr;
+
+REG32(NVME_IOMR, 0x0)
+    FIELD(NVME_IOMR, MO, 0, 8)
+    FIELD(NVME_IOMR, MOS, 16, 16);
+
+enum NvmeIomr2Mo {
+    NVME_IOMR_MO_NOP = 0x0,
+    NVME_IOMR_MO_RUH_STATUS = 0x1,
+    NVME_IOMR_MO_VENDOR_SPECIFIC = 0x255,
+};
+
+typedef struct QEMU_PACKED NvmeRuhStatus {
+    uint8_t  rsvd0[14];
+    uint16_t nruhsd;
+} NvmeRuhStatus;
+
+typedef struct QEMU_PACKED NvmeRuhStatusDescr {
+    uint16_t pid;
+    uint16_t ruhid;
+    uint32_t earutr;
+    uint64_t ruamw;
+    uint8_t  rsvd16[16];
+} NvmeRuhStatusDescr;
+
+REG32(NVME_IOMS, 0x0)
+    FIELD(NVME_IOMS, MO, 0, 8)
+    FIELD(NVME_IOMS, MOS, 16, 16);
+
+enum NvmeIoms2Mo {
+    NVME_IOMS_MO_NOP = 0x0,
+    NVME_IOMS_MO_RUH_UPDATE = 0x1,
+};
+
 static inline void _nvme_check_size(void)
 {
     QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);