@@ -358,13 +358,263 @@ unmap:
return status;
}
-static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
- uint64_t prp1, uint64_t prp2, DMADirection dir,
+/*
+ * Map 'nsgld' data descriptors from 'segment'. The function subtracts the
+ * number of bytes mapped from *len.
+ */
+static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
+ QEMUIOVector *iov,
+ NvmeSglDescriptor *segment, uint64_t nsgld,
+ size_t *len, NvmeRequest *req)
+{
+ dma_addr_t addr, trans_len;
+ uint32_t dlen;
+ uint16_t status;
+
+ for (int i = 0; i < nsgld; i++) {
+ uint8_t type = NVME_SGL_TYPE(segment[i].type);
+
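+ /*
+ * Only Data Block descriptors are expected here; segment chaining is
+ * handled by the caller, so a (Last) Segment descriptor at this point
+ * is an error.
+ */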
+ switch (type) {
+ case NVME_SGL_DESCR_TYPE_DATA_BLOCK:
+ break;
+ case NVME_SGL_DESCR_TYPE_SEGMENT:
+ case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+ trace_nvme_dev_err_invalid_num_sgld(nvme_cid(req), type);
+ return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
+ default:
+ trace_nvme_dev_err_invalid_sgld(nvme_cid(req), type);
+ return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
+ }
+
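+ /* a zero length Data Block descriptor maps no data and is skipped */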
+ dlen = le32_to_cpu(segment[i].len);
+ if (!dlen) {
+ continue;
+ }
+
+ if (*len == 0) {
+ /*
+ * All data has been mapped, but the SGL still contains additional
+ * segments and/or descriptors. If the controller advertises
+ * support for SGLs of excess length (see the SGLS field), ignore
+ * the rest of the SGL; otherwise this is an error.
+ */
+ uint32_t sgls = le32_to_cpu(n->id_ctrl.sgls);
+ if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
+ break;
+ }
+
+ trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
+ return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+ }
+
+ trans_len = MIN(*len, dlen);
+ addr = le64_to_cpu(segment[i].addr);
+
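+ /* reject a descriptor whose address range would wrap around */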
+ if (UINT64_MAX - addr < dlen) {
+ return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+ }
+
+ status = nvme_map_addr(n, qsg, iov, addr, trans_len);
+ if (status) {
+ return status;
+ }
+
+ *len -= trans_len;
+ }
+
+ return NVME_SUCCESS;
+}
+
+static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+ NvmeSglDescriptor sgl, size_t len,
NvmeRequest *req)
+{
+ /*
+ * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
+ * dynamically allocating a potentially huge SGL. The spec allows the SGL
+ * (that is, the number of bytes required to describe all descriptors and
+ * the segment chain) to be larger than the command transfer size, so its
+ * total size is not bounded by MDTS.
+ */
+ const int SEG_CHUNK_SIZE = 256;
+
+ NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *sgld, *last_sgld;
+ uint64_t nsgld;
+ uint32_t seg_len;
+ uint16_t status;
+ bool sgl_in_cmb = false;
+ hwaddr addr;
+ int ret;
+
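+ /* start from the initial descriptor supplied by the caller */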
+ sgld = &sgl;
+ addr = le64_to_cpu(sgl.addr);
+
+ trace_nvme_dev_map_sgl(nvme_cid(req), NVME_SGL_TYPE(sgl.type), req->nlb,
+ len);
+
+ /*
+ * If the entire transfer can be described with a single data block it can
+ * be mapped directly.
+ */
+ if (NVME_SGL_TYPE(sgl.type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+ status = nvme_map_sgl_data(n, qsg, iov, sgld, 1, &len, req);
+ if (status) {
+ goto unmap;
+ }
+
+ goto out;
+ }
+
+ /*
+ * If the segment is located in the CMB, the submission queue of the
+ * request must also reside there.
+ */
+ if (nvme_addr_is_cmb(n, addr)) {
+ if (!nvme_addr_is_cmb(n, req->sq->dma_addr)) {
+ return NVME_INVALID_USE_OF_CMB | NVME_DNR;
+ }
+
+ sgl_in_cmb = true;
+ }
+
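+ /*
+ * Walk the segment chain; each iteration reads the descriptors of one
+ * (Last) Segment, maps its Data Blocks and follows the chain.
+ */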
+ for (;;) {
+ switch (NVME_SGL_TYPE(sgld->type)) {
+ case NVME_SGL_DESCR_TYPE_SEGMENT:
+ case NVME_SGL_DESCR_TYPE_LAST_SEGMENT:
+ break;
+ default:
+ status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+ goto unmap;
+ }
+
+ seg_len = le32_to_cpu(sgld->len);
+
+ /* check the length of the (Last) Segment descriptor */
+ if (!seg_len || seg_len & 0xf) {
+ status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+ goto unmap;
+ }
+
+ if (UINT64_MAX - addr < seg_len) {
+ status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+ goto unmap;
+ }
+
+ nsgld = seg_len / sizeof(NvmeSglDescriptor);
+
+ while (nsgld > SEG_CHUNK_SIZE) {
+ if (nvme_addr_read(n, addr, segment, sizeof(segment))) {
+ trace_nvme_dev_err_addr_read(addr);
+ status = NVME_DATA_TRANSFER_ERROR;
+ goto unmap;
+ }
+
+ status = nvme_map_sgl_data(n, qsg, iov, segment, SEG_CHUNK_SIZE,
+ &len, req);
+ if (status) {
+ goto unmap;
+ }
+
+ nsgld -= SEG_CHUNK_SIZE;
+ addr += SEG_CHUNK_SIZE * sizeof(NvmeSglDescriptor);
+ }
+
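+ /* read the remaining descriptors of the segment (at most SEG_CHUNK_SIZE) */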
+ ret = nvme_addr_read(n, addr, segment, nsgld *
+ sizeof(NvmeSglDescriptor));
+ if (ret) {
+ trace_nvme_dev_err_addr_read(addr);
+ status = NVME_DATA_TRANSFER_ERROR;
+ goto unmap;
+ }
+
+ last_sgld = &segment[nsgld - 1];
+
+ /* if the segment ends with a Data Block, then we are done */
+ if (NVME_SGL_TYPE(last_sgld->type) == NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
+ status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld, &len, req);
+ if (status) {
+ goto unmap;
+ }
+
+ goto out;
+ }
+
+ /*
+ * If the last descriptor was not a Data Block, then the current
+ * segment must not be a Last Segment.
+ */
+ if (NVME_SGL_TYPE(sgld->type) == NVME_SGL_DESCR_TYPE_LAST_SEGMENT) {
+ status = NVME_INVALID_SGL_SEG_DESCR | NVME_DNR;
+ goto unmap;
+ }
+
+ sgld = last_sgld;
+ addr = le64_to_cpu(sgld->addr);
+
+ /*
+ * Do not map the last descriptor; it will be a Segment or Last Segment
+ * descriptor and is handled by the next iteration.
+ */
+ status = nvme_map_sgl_data(n, qsg, iov, segment, nsgld - 1, &len, req);
+ if (status) {
+ goto unmap;
+ }
+
+ /*
+ * If the next segment is in the CMB, make sure that the SGL was
+ * already located there (and vice versa); the SGL must not mix CMB
+ * and host memory segments.
+ */
+ if (sgl_in_cmb != nvme_addr_is_cmb(n, addr)) {
+ status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
+ goto unmap;
+ }
+ }
+
+out:
+ /* if there is any residual left in len, the SGL was too short */
+ if (len) {
+ status = NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
+ goto unmap;
+ }
+
+ return NVME_SUCCESS;
+
+unmap:
+ if (iov->iov) {
+ qemu_iovec_destroy(iov);
+ }
+
+ if (qsg->sg) {
+ qemu_sglist_destroy(qsg);
+ }
+
+ return status;
+}
+
+static uint16_t nvme_map(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
+ size_t len, NvmeRequest *req)
+{
+ uint64_t prp1, prp2;
+
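+ /* the PSDT field selects whether the data pointer holds PRPs or an SGL */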
+ switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
+ case PSDT_PRP:
+ prp1 = le64_to_cpu(req->cmd.dptr.prp1);
+ prp2 = le64_to_cpu(req->cmd.dptr.prp2);
+
+ return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
+ case PSDT_SGL_MPTR_CONTIGUOUS:
+ case PSDT_SGL_MPTR_SGL:
+ /* SGLs shall not be used for Admin commands in NVMe over PCIe */
+ if (!req->sq->sqid) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ return nvme_map_sgl(n, qsg, iov, req->cmd.dptr.sgl, len, req);
+ default:
+ return NVME_INVALID_FIELD;
+ }
+}
+
+static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
+ DMADirection dir, NvmeRequest *req)
{
uint16_t status = NVME_SUCCESS;
- status = nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
+ status = nvme_map(n, &req->qsg, &req->iov, len, req);
if (status) {
return status;
}
@@ -400,16 +650,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
return status;
}
-static uint16_t nvme_map(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
- size_t len, NvmeRequest *req)
-{
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
- return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
-}
-
static void nvme_aio_destroy(NvmeAIO *aio)
{
g_free(aio);
@@ -1016,10 +1256,7 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
uint64_t off, NvmeRequest *req)
{
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
- uint32_t nsid = le32_to_cpu(cmd->nsid);
+ uint32_t nsid = le32_to_cpu(req->cmd.nsid);
uint32_t trans_len;
time_t current_ms;
@@ -1068,17 +1305,14 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
nvme_clear_events(n, NVME_AER_TYPE_SMART);
}
- return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
- DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, (uint8_t *) &smart + off, trans_len,
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
NvmeRequest *req)
{
uint32_t trans_len;
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
NvmeFwSlotInfoLog fw_log;
if (off > sizeof(fw_log)) {
@@ -1089,17 +1323,14 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, uint32_t buf_len, uint64_t off,
trans_len = MIN(sizeof(fw_log) - off, buf_len);
- return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
- DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, (uint8_t *) &fw_log + off, trans_len,
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
uint64_t off, NvmeRequest *req)
{
uint32_t trans_len;
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
NvmeErrorLog errlog;
if (!rae) {
@@ -1114,8 +1345,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
trans_len = MIN(sizeof(errlog) - off, buf_len);
- return nvme_dma_prp(n, (uint8_t *)&errlog, trans_len, prp1, prp2,
- DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, (uint8_t *)&errlog, trans_len,
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
@@ -1266,14 +1497,10 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeRequest *req)
{
- NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
- uint64_t prp1 = le64_to_cpu(c->prp1);
- uint64_t prp2 = le64_to_cpu(c->prp2);
-
trace_nvme_dev_identify_ctrl();
- return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
- prp2, DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
@@ -1281,8 +1508,6 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
NvmeNamespace *ns;
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
uint32_t nsid = le32_to_cpu(c->nsid);
- uint64_t prp1 = le64_to_cpu(c->prp1);
- uint64_t prp2 = le64_to_cpu(c->prp2);
trace_nvme_dev_identify_ns(nsid);
@@ -1293,8 +1518,8 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeRequest *req)
ns = &n->namespaces[nsid - 1];
- return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
- prp2, DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns),
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
@@ -1302,8 +1527,6 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
static const int data_len = NVME_IDENTIFY_DATA_SIZE;
uint32_t min_nsid = le32_to_cpu(c->nsid);
- uint64_t prp1 = le64_to_cpu(c->prp1);
- uint64_t prp2 = le64_to_cpu(c->prp2);
uint32_t *list;
uint16_t ret;
int i, j = 0;
@@ -1320,8 +1543,8 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeRequest *req)
break;
}
}
- ret = nvme_dma_prp(n, (uint8_t *)list, data_len, prp1, prp2,
- DMA_DIRECTION_FROM_DEVICE, req);
+ ret = nvme_dma(n, (uint8_t *)list, data_len, DMA_DIRECTION_FROM_DEVICE,
+ req);
g_free(list);
return ret;
}
@@ -1330,8 +1553,6 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
{
NvmeIdentify *c = (NvmeIdentify *)&req->cmd;
uint32_t nsid = le32_to_cpu(c->nsid);
- uint64_t prp1 = le64_to_cpu(c->prp1);
- uint64_t prp2 = le64_to_cpu(c->prp2);
uint8_t list[NVME_IDENTIFY_DATA_SIZE];
@@ -1361,8 +1582,8 @@ static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeRequest *req)
ns_descrs->uuid.hdr.nidl = NVME_NIDT_UUID_LEN;
stl_be_p(&ns_descrs->uuid.v, nsid);
- return nvme_dma_prp(n, list, NVME_IDENTIFY_DATA_SIZE, prp1, prp2,
- DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, list, NVME_IDENTIFY_DATA_SIZE,
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_identify(NvmeCtrl *n, NvmeRequest *req)
@@ -1438,14 +1659,10 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl *n)
static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
-
uint64_t timestamp = nvme_get_timestamp(n);
- return nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
- prp2, DMA_DIRECTION_FROM_DEVICE, req);
+ return nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
+ DMA_DIRECTION_FROM_DEVICE, req);
}
static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest *req)
@@ -1528,12 +1745,9 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, NvmeRequest *req)
{
uint16_t ret;
uint64_t timestamp;
- NvmeCmd *cmd = &req->cmd;
- uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
- uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
- ret = nvme_dma_prp(n, (uint8_t *)&timestamp, sizeof(timestamp), prp1,
- prp2, DMA_DIRECTION_TO_DEVICE, req);
+ ret = nvme_dma(n, (uint8_t *)&timestamp, sizeof(timestamp),
+ DMA_DIRECTION_TO_DEVICE, req);
if (ret != NVME_SUCCESS) {
return ret;
}
@@ -2323,6 +2537,8 @@ static void nvme_init_ctrl(NvmeCtrl *n)
id->nn = cpu_to_le32(n->num_namespaces);
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROES | NVME_ONCS_TIMESTAMP);
+ id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORTED_NO_ALIGNMENT);
+
pstrcpy((char *) id->subnqn, sizeof(id->subnqn), "nqn.2019-08.org.qemu:");
pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
@@ -34,6 +34,7 @@ nvme_dev_irq_pin(void) "pulsing IRQ pin"
nvme_dev_irq_masked(void) "IRQ is masked"
nvme_dev_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
nvme_dev_map_prp(uint16_t cid, uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "cid %"PRIu16" trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
+nvme_dev_map_sgl(uint16_t cid, uint8_t typ, uint32_t nlb, uint64_t len) "cid %"PRIu16" type 0x%"PRIx8" nlb %"PRIu32" len %"PRIu64""
nvme_dev_req_add_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t count, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" count %"PRIu64" opc \"%s\" req %p"
nvme_dev_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p"
nvme_dev_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
@@ -89,6 +90,9 @@ nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %"P
nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
nvme_dev_err_addr_read(uint64_t addr) "addr 0x%"PRIx64""
nvme_dev_err_addr_write(uint64_t addr) "addr 0x%"PRIx64""
+nvme_dev_err_invalid_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
+nvme_dev_err_invalid_num_sgld(uint16_t cid, uint8_t typ) "cid %"PRIu16" type 0x%"PRIx8""
+nvme_dev_err_invalid_sgl_excess_length(uint16_t cid) "cid %"PRIu16""
nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""