@@ -20,6 +20,21 @@ The nvme device (-device nvme) emulates an NVM Express Controller.
`zns.mor`; Specifies the number of open resources available. This is a 0s
based value.
+ `zns.pstate`; This parameter specifies another blockdev to be used for
+ storing zone state persistently.
+
+ -drive id=zns-pstate,file=zns-pstate.img,format=raw
+ -device nvme-ns,zns.pstate=zns-pstate,...
+
+ To reset (or initialize) state, the blockdev image should be of zero size:
+
+ qemu-img create -f raw zns-pstate.img 0
+
+ The image will be initialized with a file format header and truncated to
+ the required size. If the pstate given is of non-zero size, it will be
+ assumed to already contain zone state information and the header will be
+ checked.
+
Reference Specifications
------------------------
@@ -19,6 +19,15 @@
#define NVME_NS(obj) \
OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
+/*
+ * Magic value identifying a zone pstate header. The header code stores it
+ * with cpu_to_le32(), so the bytes in the image are 'Z', 'N', 'S', 0x00.
+ */
+#define NVME_ZONE_PSTATE_MAGIC ((0x00 << 24) | ('S' << 16) | ('N' << 8) | 'Z')
+/* Format version written into, and accepted from, the pstate header. */
+#define NVME_ZONE_PSTATE_V1 1
+
+/*
+ * Header of the zone pstate image. The fields add up to 4 + 4 + 4088 = 4096
+ * bytes and the struct is packed, so no padding is inserted.
+ *
+ * magic   - NVME_ZONE_PSTATE_MAGIC
+ * version - format version (NVME_ZONE_PSTATE_V1)
+ * rsvd8   - reserved; pads the header to its full size
+ */
+typedef struct NvmeZonePStateHeader {
+ uint32_t magic;
+ uint32_t version;
+ uint8_t rsvd8[4088];
+} QEMU_PACKED NvmeZonePStateHeader;
+
typedef struct NvmeNamespaceParams {
uint32_t nsid;
uint8_t iocs;
@@ -74,6 +83,8 @@ typedef struct NvmeNamespace {
QTAILQ_HEAD(, NvmeZone) lru_open;
QTAILQ_HEAD(, NvmeZone) lru_active;
} resources;
+
+ BlockBackend *pstate;
} zns;
} NvmeNamespace;
@@ -186,4 +197,9 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp);
void nvme_ns_drain(NvmeNamespace *ns);
void nvme_ns_flush(NvmeNamespace *ns);
+/*
+ * Container for a build-time size check: the build is expected to fail if
+ * NvmeZonePStateHeader is not exactly 4096 bytes. The function takes no
+ * arguments and returns nothing.
+ */
+static inline void _nvme_ns_check_size(void)
+{
+ QEMU_BUILD_BUG_ON(sizeof(NvmeZonePStateHeader) != 4096);
+}
+
#endif /* NVME_NS_H */
@@ -50,6 +50,31 @@ const char *nvme_zs_to_str(NvmeZoneState zs)
return "UNKNOWN";
}
+/*
+ * Resize the block backend @blk to @len bytes.
+ *
+ * BLK_PERM_RESIZE is added to the permissions currently held on @blk for the
+ * duration of the truncate and dropped again afterwards. The permissions are
+ * also restored when the truncate itself fails, so a failed call does not
+ * leave the backend holding a permission it did not have before.
+ *
+ * Returns 0 on success or a negative value on failure. On failure @errp is
+ * set by the block layer call that failed.
+ */
+static int nvme_blk_truncate(BlockBackend *blk, size_t len, Error **errp)
+{
+    uint64_t perm, shared_perm;
+    int ret;
+
+    blk_get_perm(blk, &perm, &shared_perm);
+
+    ret = blk_set_perm(blk, perm | BLK_PERM_RESIZE, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = blk_truncate(blk, len, false, PREALLOC_MODE_OFF, 0, errp);
+    if (ret < 0) {
+        /*
+         * Best effort: give back BLK_PERM_RESIZE. @errp already carries the
+         * truncate error, so it must not be passed (and set) a second time.
+         */
+        blk_set_perm(blk, perm, shared_perm, NULL);
+        return ret;
+    }
+
+    ret = blk_set_perm(blk, perm, shared_perm, errp);
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
static void nvme_ns_zns_init_zones(NvmeNamespace *ns)
{
NvmeZone *zone;
@@ -153,6 +178,176 @@ static int nvme_ns_init(NvmeNamespace *ns, Error **errp)
return 0;
}
+/*
+ * Walk every zone of @ns, attach it to its descriptor (and descriptor
+ * extension, if extensions are configured) and adjust the zone state read
+ * from the descriptors.
+ *
+ * Returns 0 on success. Returns -1 and sets @errp if a zone reports a state
+ * that is not handled by the switch below.
+ */
+static int nvme_ns_zns_restore_zone_state(NvmeNamespace *ns, Error **errp)
+{
+ for (int i = 0; i < ns->zns.num_zones; i++) {
+ NvmeZone *zone = &ns->zns.zones[i];
+ zone->zd = &ns->zns.zd[i];
+ if (ns->params.zns.zdes) {
+ /*
+ * NOTE(review): indexes zde by zone number. If zde is a byte buffer
+ * holding nvme_ns_zdes_bytes(ns) bytes per zone, the stride here
+ * needs confirming against the declaration of ns->zns.zde.
+ */
+ zone->zde = &ns->zns.zde[i];
+ }
+
+ switch (nvme_zs(zone)) {
+ /* These states are kept exactly as loaded. */
+ case NVME_ZS_ZSE:
+ case NVME_ZS_ZSF:
+ case NVME_ZS_ZSRO:
+ case NVME_ZS_ZSO:
+ break;
+
+ case NVME_ZS_ZSC:
+ /*
+ * A closed zone whose write pointer is still at the zone start and
+ * that does not have the ZDEV attribute set is turned into ZSE.
+ */
+ if (nvme_wp(zone) == nvme_zslba(zone) &&
+ !(zone->zd->za & NVME_ZA_ZDEV)) {
+ nvme_zs_set(zone, NVME_ZS_ZSE);
+ break;
+ }
+
+ /*
+ * Otherwise the zone stays closed if the active resource counter is
+ * non-zero: the counter is decremented and the zone is appended to
+ * the lru_active list.
+ */
+ if (ns->zns.resources.active) {
+ ns->zns.resources.active--;
+ QTAILQ_INSERT_TAIL(&ns->zns.resources.lru_active, zone,
+ lru_entry);
+ break;
+ }
+
+ /* No active resource left: handled like the open states below. */
+
+ /* fallthrough */
+
+ case NVME_ZS_ZSIO:
+ case NVME_ZS_ZSEO:
+ /* Write pointer is reset to the zone start and the state set to ZSF. */
+ zone->zd->wp = zone->zd->zslba;
+ nvme_zs_set(zone, NVME_ZS_ZSF);
+ break;
+
+ default:
+ error_setg(errp, "invalid zone state");
+ return -1;
+ }
+
+ /* Start the staging write pointer at the (possibly adjusted) wp. */
+ zone->wp_staging = nvme_wp(zone);
+ }
+
+ return 0;
+}
+
+/*
+ * Initialize an empty zone pstate image for @ns.
+ *
+ * The image is resized to hold, in this order, the zone descriptors, the
+ * zone descriptor extensions and the pstate header. The zones are then
+ * initialized in memory, and the descriptors and the header are written out.
+ * The descriptor extension area is not written here; it is left as created
+ * by the resize.
+ *
+ * Returns 0 on success or a negative value on failure, with @errp set.
+ */
+static int nvme_ns_zns_init_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    NvmeZonePStateHeader header;
+    size_t zd_len, zde_len;
+    int ret;
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+    ret = nvme_blk_truncate(blk, zd_len + zde_len + sizeof(header), errp);
+    if (ret < 0) {
+        /*
+         * @errp has already been set by nvme_blk_truncate(). Setting it a
+         * second time is not allowed by the Error API, so only add context
+         * to the existing message.
+         */
+        error_prepend(errp, "could not truncate zone pstate: ");
+        return ret;
+    }
+
+    nvme_ns_zns_init_zones(ns);
+
+    ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone descriptors to "
+                         "zone pstate");
+        return ret;
+    }
+
+    header = (NvmeZonePStateHeader) {
+        .magic = cpu_to_le32(NVME_ZONE_PSTATE_MAGIC),
+        .version = cpu_to_le32(NVME_ZONE_PSTATE_V1),
+    };
+
+    /* The header sits at the very end, after descriptors and extensions. */
+    ret = blk_pwrite(blk, zd_len + zde_len, &header, sizeof(header), 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone pstate header");
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Load zone state for @ns from an existing zone pstate image of @len bytes.
+ *
+ * The image must be large enough to hold the zone descriptors, the zone
+ * descriptor extensions and the header for the configured number of zones.
+ * The header is read from the last sizeof(NvmeZonePStateHeader) bytes and
+ * its magic and version are checked. Descriptors and extensions are then
+ * read from the start of the image, the zone states are restored, and the
+ * (possibly adjusted) descriptors are written back.
+ *
+ * Returns 0 on success or a negative value on failure, with @errp set.
+ */
+static int nvme_ns_zns_load_pstate(NvmeNamespace *ns, size_t len, Error **errp)
+{
+    BlockBackend *blk = ns->zns.pstate;
+    NvmeZonePStateHeader header;
+    size_t zd_len, zde_len;
+    int ret;
+
+    zd_len = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    zde_len = ns->zns.num_zones * nvme_ns_zdes_bytes(ns);
+
+    /*
+     * Reject images that cannot contain the expected state. This also keeps
+     * the header offset computation below from wrapping around when the
+     * image is smaller than the header itself.
+     */
+    if (len < zd_len + zde_len + sizeof(header)) {
+        error_setg(errp, "zone pstate image is too small");
+        return -1;
+    }
+
+    ret = blk_pread(blk, len - sizeof(header), &header, sizeof(header));
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read zone pstate header");
+        return ret;
+    }
+
+    if (le32_to_cpu(header.magic) != NVME_ZONE_PSTATE_MAGIC) {
+        error_setg(errp, "invalid zone pstate header");
+        return -1;
+    } else if (le32_to_cpu(header.version) > NVME_ZONE_PSTATE_V1) {
+        error_setg(errp, "unsupported zone pstate version");
+        return -1;
+    }
+
+    ret = blk_pread(blk, 0, ns->zns.zd, zd_len);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not read zone descriptors from "
+                         "zone pstate");
+        return ret;
+    }
+
+    if (zde_len) {
+        ret = blk_pread(blk, zd_len, ns->zns.zde, zde_len);
+        if (ret < 0) {
+            error_setg_errno(errp, -ret, "could not read zone descriptor "
+                             "extensions from zone pstate");
+            return ret;
+        }
+    }
+
+    if (nvme_ns_zns_restore_zone_state(ns, errp)) {
+        return -1;
+    }
+
+    /* Persist any state changes made while restoring the zones. */
+    ret = blk_pwrite(blk, 0, ns->zns.zd, zd_len, 0);
+    if (ret < 0) {
+        error_setg_errno(errp, -ret, "could not write zone descriptors to "
+                         "zone pstate");
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
+ * Prepare the zone pstate blockdev of @ns.
+ *
+ * Read/write permissions are requested on the backend while sharing all
+ * permissions. An image of zero length is initialized from scratch; any
+ * other image is loaded as existing zone state.
+ *
+ * Returns 0 on success or a non-zero value on failure, with @errp set.
+ */
+static int nvme_ns_zns_setup_pstate(NvmeNamespace *ns, Error **errp)
+{
+    BlockBackend *pstate = ns->zns.pstate;
+    ssize_t size;
+    int rc;
+
+    rc = blk_set_perm(pstate, BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE,
+                      BLK_PERM_ALL, errp);
+    if (rc != 0) {
+        return rc;
+    }
+
+    size = blk_getlength(pstate);
+    if (size < 0) {
+        error_setg_errno(errp, -size, "could not determine zone pstate size");
+        return size;
+    }
+
+    if (size != 0) {
+        return nvme_ns_zns_load_pstate(ns, size, errp);
+    }
+
+    return nvme_ns_zns_init_pstate(ns, errp);
+}
+
static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
{
if (!blkconf_blocksizes(&ns->blkconf, errp)) {
@@ -236,7 +431,13 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
}
if (nvme_ns_zoned(ns)) {
- nvme_ns_zns_init_zones(ns);
+ if (ns->zns.pstate) {
+ if (nvme_ns_zns_setup_pstate(ns, errp)) {
+ return -1;
+ }
+ } else {
+ nvme_ns_zns_init_zones(ns);
+ }
}
if (nvme_register_namespace(n, ns, errp)) {
@@ -249,11 +450,19 @@ int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
+/*
+ * Call blk_drain() on the namespace's blockdev and, when a zone pstate
+ * blockdev is configured, on that one as well.
+ */
void nvme_ns_drain(NvmeNamespace *ns)
{
blk_drain(ns->blkconf.blk);
+
+ if (ns->zns.pstate) {
+ blk_drain(ns->zns.pstate);
+ }
}
+/*
+ * Call blk_flush() on the namespace's blockdev and, when a zone pstate
+ * blockdev is configured, on that one as well. Return values of the flush
+ * calls are not checked.
+ */
void nvme_ns_flush(NvmeNamespace *ns)
{
blk_flush(ns->blkconf.blk);
+
+ if (ns->zns.pstate) {
+ blk_flush(ns->zns.pstate);
+ }
}
static void nvme_ns_realize(DeviceState *dev, Error **errp)
@@ -283,6 +492,7 @@ static Property nvme_ns_props[] = {
DEFINE_PROP_UINT8("zns.zdes", NvmeNamespace, params.zns.zdes, 0),
DEFINE_PROP_UINT32("zns.mar", NvmeNamespace, params.zns.mar, 0xffffffff),
DEFINE_PROP_UINT32("zns.mor", NvmeNamespace, params.zns.mor, 0xffffffff),
+ DEFINE_PROP_DRIVE("zns.pstate", NvmeNamespace, zns.pstate),
DEFINE_PROP_END_OF_LIST(),
};
@@ -1023,6 +1023,46 @@ static uint16_t nvme_check_dulbe(NvmeNamespace *ns, uint64_t slba,
return NVME_SUCCESS;
}
+/*
+ * Write the descriptor of @zone to the zone pstate blockdev of @ns.
+ *
+ * Descriptors are stored back to back from offset 0, indexed by zone. Does
+ * nothing and returns 0 when no pstate blockdev is configured; otherwise
+ * returns the result of blk_pwrite().
+ */
+static int nvme_zns_commit_zone(NvmeNamespace *ns, NvmeZone *zone)
+{
+    BlockBackend *pstate = ns->zns.pstate;
+    int64_t pos;
+
+    if (pstate == NULL) {
+        return 0;
+    }
+
+    trace_pci_nvme_zns_commit_zone(nvme_nsid(ns), nvme_zslba(zone));
+
+    pos = nvme_ns_zone_idx(ns, nvme_zslba(zone)) * sizeof(NvmeZoneDescriptor);
+
+    return blk_pwrite(pstate, pos, zone->zd, sizeof(NvmeZoneDescriptor), 0);
+}
+
+/*
+ * Write the descriptor extension of @zone to the zone pstate blockdev of @ns.
+ *
+ * Extensions are stored after the area holding all zone descriptors, one
+ * nvme_ns_zdes_bytes(ns)-sized slot per zone. Does nothing and returns 0
+ * when no pstate blockdev is configured; otherwise returns the result of
+ * blk_pwrite().
+ */
+static int nvme_zns_commit_zde(NvmeNamespace *ns, NvmeZone *zone)
+{
+    size_t ext_base, ext_size;
+    int64_t pos;
+    int idx;
+
+    if (ns->zns.pstate == NULL) {
+        return 0;
+    }
+
+    trace_pci_nvme_zns_commit_zde(nvme_nsid(ns), nvme_zslba(zone));
+
+    /* The extension area begins right after the last zone descriptor. */
+    ext_base = ns->zns.num_zones * sizeof(NvmeZoneDescriptor);
+    idx = nvme_ns_zone_idx(ns, nvme_zslba(zone));
+    ext_size = nvme_ns_zdes_bytes(ns);
+    pos = ext_base + idx * ext_size;
+
+    return blk_pwrite(ns->zns.pstate, pos, zone->zde, ext_size, 0);
+}
+
static inline void nvme_zone_reset_wp(NvmeZone *zone)
{
zone->zd->wp = zone->zd->zslba;
@@ -1058,6 +1098,10 @@ static uint16_t nvme_zrm_release_open(NvmeNamespace *ns)
return status;
}
+ if (nvme_zns_commit_zone(ns, candidate) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1252,6 +1296,10 @@ static uint16_t __nvme_zns_advance_wp(NvmeNamespace *ns, NvmeZone *zone,
if (status) {
return status;
}
+
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
}
return NVME_SUCCESS;
@@ -1307,6 +1355,10 @@ static void nvme_aio_err(NvmeRequest *req, int ret, NvmeZone *zone)
NVME_ZS_ZSRO : NVME_ZS_ZSO;
nvme_zrm_transition(ns, zone, zs);
+
+ if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+ req->status = NVME_INTERNAL_DEV_ERROR;
+ }
}
/*
@@ -1618,6 +1670,10 @@ static void nvme_aio_zone_reset_cb(void *opaque, int ret)
nvme_aio_err(req, ret, zone);
}
+ if (nvme_zns_commit_zone(req->ns, zone) < 0) {
+ req->status = NVME_INTERNAL_DEV_ERROR;
+ }
+
(*resets)--;
if (*resets) {
@@ -1657,6 +1713,10 @@ static uint16_t nvme_zone_mgmt_send_close(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1678,6 +1738,10 @@ static uint16_t nvme_zone_mgmt_send_finish(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1699,6 +1763,10 @@ static uint16_t nvme_zone_mgmt_send_open(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -1754,6 +1822,10 @@ static uint16_t nvme_zone_mgmt_send_offline(NvmeCtrl *n, NvmeRequest *req,
case NVME_ZS_ZSRO:
nvme_zrm_transition(ns, zone, NVME_ZS_ZSO);
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
/* fallthrough */
case NVME_ZS_ZSO:
@@ -1793,6 +1865,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, NvmeRequest *req,
return status;
}
+ if (nvme_zns_commit_zde(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
status = nvme_zrm_transition(ns, zone, NVME_ZS_ZSC);
if (status) {
return status;
@@ -1800,6 +1876,10 @@ static uint16_t nvme_zone_mgmt_send_set_zde(NvmeCtrl *n, NvmeRequest *req,
NVME_ZA_SET(zone->zd->za, NVME_ZA_ZDEV);
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ return NVME_INTERNAL_DEV_ERROR;
+ }
+
return NVME_SUCCESS;
}
@@ -2502,6 +2582,11 @@ static uint16_t nvme_write(NvmeCtrl *n, NvmeRequest *req)
goto invalid;
}
+ if (nvme_zns_commit_zone(ns, zone) < 0) {
+ status = NVME_INTERNAL_DEV_ERROR;
+ goto invalid;
+ }
+
break;
}
@@ -3778,6 +3863,8 @@ static void nvme_ctrl_shutdown(NvmeCtrl *n)
nvme_zrm_transition(ns, zone, NVME_ZS_ZSE);
}
+ nvme_zns_commit_zone(ns, zone);
+
/* fallthrough */
default:
@@ -96,6 +96,8 @@ pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "c
pci_nvme_zrm_transition(uint32_t nsid, uint64_t zslba, const char *s_from, uint8_t from, const char *s_to, uint8_t to) "nsid %"PRIu32" zslba 0x%"PRIx64" from '%s' (%"PRIu8") to '%s' (%"PRIu8")"
pci_nvme_zrm_release_open(uint32_t nsid) "nsid %"PRIu32""
pci_nvme_zns_advance_wp(uint32_t nsid, uint64_t zslba, uint64_t wp_orig, uint32_t nlb) "nsid 0x%"PRIx32" zslba 0x%"PRIx64" wp_orig 0x%"PRIx64" nlb %"PRIu32""
+pci_nvme_zns_commit_zone(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 0x%"PRIx64""
+pci_nvme_zns_commit_zde(uint32_t nsid, uint64_t zslba) "nsid 0x%"PRIx32" zslba 0x%"PRIx64""
pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
pci_nvme_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) "cqid %"PRIu16" new_head %"PRIu16""