@@ -25,6 +25,8 @@
* Default: 64
* cmb_size_mb=<uint32> : Size of Controller Memory Buffer in MBs.
* Default: 0 (disabled)
+ * mdts=<uint8> : Maximum Data Transfer Size (power of two, in units of
+ * the minimum memory page size)
+ * Default: 7
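+ * (e.g., with the default 4096 byte page size, mdts=7 caps a
+ * single transfer at 4096 << 7 = 512 KiB)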
*/
#include "qemu/osdep.h"
@@ -56,6 +58,7 @@
} while (0)
static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
{
@@ -197,7 +200,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, uint64_t prp1,
}
if (nvme_addr_is_cmb(n, prp1)) {
- req->is_cmb = true;
+ nvme_req_set_cmb(req);
}
pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
@@ -255,8 +258,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, uint64_t prp1,
}
addr_is_cmb = nvme_addr_is_cmb(n, prp_ent);
- if ((req->is_cmb && !addr_is_cmb) ||
- (!req->is_cmb && addr_is_cmb)) {
+ if ((nvme_req_is_cmb(req) && !addr_is_cmb) ||
+ (!nvme_req_is_cmb(req) && addr_is_cmb)) {
status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
goto unmap;
}
@@ -269,8 +272,8 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, uint64_t prp1,
}
} else {
bool addr_is_cmb = nvme_addr_is_cmb(n, prp2);
- if ((req->is_cmb && !addr_is_cmb) ||
- (!req->is_cmb && addr_is_cmb)) {
+ if ((nvme_req_is_cmb(req) && !addr_is_cmb) ||
+ (!nvme_req_is_cmb(req) && addr_is_cmb)) {
status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
goto unmap;
}
@@ -312,7 +315,7 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
return status;
}
- if (req->is_cmb) {
+ if (nvme_req_is_cmb(req)) {
QEMUIOVector iov;
qemu_iovec_init(&iov, qsg.nsg);
@@ -341,19 +344,18 @@ static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
uint64_t prp1, uint64_t prp2, NvmeRequest *req)
{
- QEMUSGList qsg;
uint16_t status = NVME_SUCCESS;
- status = nvme_map_prp(n, &qsg, prp1, prp2, len, req);
+ status = nvme_map_prp(n, &req->qsg, prp1, prp2, len, req);
if (status) {
return status;
}
- if (req->is_cmb) {
+ if (nvme_req_is_cmb(req)) {
QEMUIOVector iov;
- qemu_iovec_init(&iov, qsg.nsg);
- dma_to_cmb(n, &qsg, &iov);
+ qemu_iovec_init(&iov, req->qsg.nsg);
+ dma_to_cmb(n, &req->qsg, &iov);
if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
trace_nvme_err_invalid_dma();
@@ -365,17 +367,137 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
goto out;
}
- if (unlikely(dma_buf_read(ptr, len, &qsg))) {
+ if (unlikely(dma_buf_read(ptr, len, &req->qsg))) {
trace_nvme_err_invalid_dma();
status = NVME_INVALID_FIELD | NVME_DNR;
}
out:
- qemu_sglist_destroy(&qsg);
+ qemu_sglist_destroy(&req->qsg);
return status;
}
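+
+/*
+ * Map the data described by the command's PRPs into req->qsg. The transfer
+ * length is derived from the logical block count and the LBA data size of
+ * the request's namespace.
+ */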
+static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+
+ uint32_t len = req->nlb << nvme_ns_lbads(ns);
+ uint64_t prp1 = le64_to_cpu(cmd->prp1);
+ uint64_t prp2 = le64_to_cpu(cmd->prp2);
+
+ return nvme_map_prp(n, &req->qsg, prp1, prp2, len, req);
+}
+
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+ if (aio->iov.nalloc) {
+ qemu_iovec_destroy(&aio->iov);
+ }
+
+ g_free(aio);
+}
+
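+/*
+ * Allocate and initialize an NvmeAIO. If the request transfers data through
+ * the Controller Memory Buffer, the scatter/gather list is converted to an
+ * iovec up front; CMB addresses resolve to controller memory and are
+ * accessed directly rather than through the DMA helpers.
+ */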
+static NvmeAIO *nvme_aio_new(BlockBackend *blk, int64_t offset,
+ QEMUSGList *qsg, NvmeRequest *req, NvmeAIOCompletionFunc *cb)
+{
+ NvmeAIO *aio = g_malloc0(sizeof(*aio));
+
+ *aio = (NvmeAIO) {
+ .blk = blk,
+ .offset = offset,
+ .req = req,
+ .qsg = qsg,
+ .cb = cb,
+ };
+
+ if (qsg && nvme_req_is_cmb(req)) {
+ NvmeCtrl *n = nvme_ctrl(req);
+
+ qemu_iovec_init(&aio->iov, qsg->nsg);
+ dma_to_cmb(n, qsg, &aio->iov);
+
+ aio->qsg = NULL;
+ }
+
+ return aio;
+}
+
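+/*
+ * Tag the aio with its opcode and, if it is associated with a request, add
+ * it to the request's list of outstanding aios. The request is not
+ * completed until this list is empty again (see nvme_aio_cb).
+ */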
+static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
+ NvmeAIOOp opc)
+{
+ aio->opc = opc;
+
+ trace_nvme_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
+ aio->offset, aio->qsg ? aio->qsg->size : aio->iov.size,
+ nvme_aio_opc_str(aio), req);
+
+ if (req) {
+ QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
+ }
+}
+
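+/*
+ * Submit the aio to the block backend. DMA transfers go through the
+ * dma_blk_{read,write} helpers; CMB transfers use the iovec based
+ * blk_aio_p{read,write}v functions.
+ */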
+static void nvme_aio(NvmeAIO *aio)
+{
+ BlockBackend *blk = aio->blk;
+ BlockAcctCookie *acct = &aio->acct;
+ BlockAcctStats *stats = blk_get_stats(blk);
+
+ bool is_write, dma;
+
+ switch (aio->opc) {
+ case NVME_AIO_OPC_NONE:
+ break;
+
+ case NVME_AIO_OPC_FLUSH:
+ block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+ aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+ break;
+
+ case NVME_AIO_OPC_WRITE_ZEROES:
+ block_acct_start(stats, acct, aio->iov.size, BLOCK_ACCT_WRITE);
+ aio->aiocb = blk_aio_pwrite_zeroes(aio->blk, aio->offset,
+ aio->iov.size, BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
+ break;
+
+ case NVME_AIO_OPC_READ:
+ case NVME_AIO_OPC_WRITE:
+ dma = aio->qsg != NULL;
+ is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+ block_acct_start(stats, acct,
+ dma ? aio->qsg->size : aio->iov.size,
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+ if (dma) {
+ aio->aiocb = is_write ?
+ dma_blk_write(blk, aio->qsg, aio->offset,
+ BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
+ dma_blk_read(blk, aio->qsg, aio->offset,
+ BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+
+ return;
+ }
+
+ aio->aiocb = is_write ?
+ blk_aio_pwritev(blk, aio->offset, &aio->iov, 0,
+ nvme_aio_cb, aio) :
+ blk_aio_preadv(blk, aio->offset, &aio->iov, 0,
+ nvme_aio_cb, aio);
+
+ break;
+ }
+}
+
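+/* issue a read or write aio, depending on the request opcode */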
+static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, QEMUSGList *qsg,
+ NvmeRequest *req)
+{
+ NvmeAIO *aio = nvme_aio_new(blk, offset, qsg, req, NULL);
+ nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
+ NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
+ nvme_aio(aio);
+}
+
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@@ -411,6 +533,11 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
assert(cq->cqid == req->sq->cqid);
trace_nvme_enqueue_req_completion(req->cid, cq->cqid, req->status);
+
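+ /*
+ * The request scatter/gather list is no longer needed once the command
+ * completes; all aios issued for the request have finished at this
+ * point, so it can be destroyed here.
+ */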
+ if (req->qsg.nalloc) {
+ qemu_sglist_destroy(&req->qsg);
+ }
+
QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
@@ -456,135 +583,266 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
}
}
-static void nvme_rw_cb(void *opaque, int ret)
+static void nvme_rw_cb(NvmeRequest *req, void *opaque)
{
- NvmeRequest *req = opaque;
NvmeSQueue *sq = req->sq;
NvmeCtrl *n = sq->ctrl;
NvmeCQueue *cq = n->cq[sq->cqid];
- NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
+
+ trace_nvme_rw_cb(req->cid, req->cmd.nsid);
+
+ nvme_enqueue_req_completion(cq, req);
+}
+
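+/*
+ * Completion callback for all aios submitted through nvme_aio. Accounts
+ * the operation, records the first error encountered (an internal error
+ * trumps others) and, when the last outstanding aio for the request has
+ * finished, completes the request through its callback or by enqueuing
+ * the completion directly.
+ */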
+static void nvme_aio_cb(void *opaque, int ret)
+{
+ NvmeAIO *aio = opaque;
+ NvmeRequest *req = aio->req;
+
+ BlockBackend *blk = aio->blk;
+ BlockAcctCookie *acct = &aio->acct;
+ BlockAcctStats *stats = blk_get_stats(blk);
+
+ Error *local_err = NULL;
+
+ trace_nvme_aio_cb(nvme_cid(req), aio, blk_name(aio->blk), aio->offset,
+ nvme_aio_opc_str(aio), req);
+
+ if (req) {
+ QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);
+ }
if (!ret) {
- block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
- req->status = NVME_SUCCESS;
+ block_acct_done(stats, acct);
+
+ if (aio->cb) {
+ aio->cb(aio, aio->cb_arg);
+ }
} else {
- block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
- nvme_set_error_page(n, sq->sqid, cpu_to_le16(req->cid),
- NVME_INTERNAL_DEV_ERROR, offsetof(NvmeRwCmd, slba), rw->slba, 1);
- req->status = NVME_INTERNAL_DEV_ERROR | NVME_MORE;
- }
+ block_acct_failed(stats, acct);
+
+ if (req) {
+ NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
+ NvmeSQueue *sq = req->sq;
+ NvmeCtrl *n = sq->ctrl;
+ uint16_t status;
+
+ switch (aio->opc) {
+ case NVME_AIO_OPC_READ:
+ status = NVME_UNRECOVERED_READ;
+ break;
+ case NVME_AIO_OPC_WRITE:
+ case NVME_AIO_OPC_WRITE_ZEROES:
+ status = NVME_WRITE_FAULT;
+ break;
+ default:
+ status = NVME_INTERNAL_DEV_ERROR;
+ break;
+ }
- if (req->qsg.nalloc) {
- qemu_sglist_destroy(&req->qsg);
+ trace_nvme_err_aio(nvme_cid(req), aio, blk_name(aio->blk),
+ aio->offset, nvme_aio_opc_str(aio), req, status);
+
+ nvme_set_error_page(n, sq->sqid, cpu_to_le16(req->cid), status,
+ offsetof(NvmeRwCmd, slba), rw->slba, 1);
+
+ error_setg_errno(&local_err, -ret, "aio failed");
+ error_report_err(local_err);
+
+ /*
+ * An Internal Error trumps all other errors. For other errors,
+ * only set the first error encountered. Any additional errors will
+ * be recorded in the error information log page.
+ */
+ if (!req->status ||
+ nvme_is_error(status, NVME_INTERNAL_DEV_ERROR)) {
+ req->status = status;
+ }
+ }
}
- if (req->iov.nalloc) {
- qemu_iovec_destroy(&req->iov);
+
+ if (req && QTAILQ_EMPTY(&req->aio_tailq)) {
+ if (req->cb) {
+ req->cb(req, req->cb_arg);
+ } else {
+ NvmeSQueue *sq = req->sq;
+ NvmeCtrl *n = sq->ctrl;
+ NvmeCQueue *cq = n->cq[sq->cqid];
+
+ nvme_enqueue_req_completion(cq, req);
+ }
}
- nvme_enqueue_req_completion(cq, req);
+ nvme_aio_destroy(aio);
}
-static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
+static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
NvmeRequest *req)
{
- block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
- BLOCK_ACCT_FLUSH);
- req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
+ uint8_t mdts = n->params.mdts;
- return NVME_NO_COMPLETE;
+ if (mdts && len > n->page_size << mdts) {
+ trace_nvme_err_mdts(nvme_cid(req), n->page_size << mdts, len);
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ return NVME_SUCCESS;
}
-static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
- NvmeRequest *req)
+static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeRequest *req)
{
- NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
- const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
- const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
- uint64_t slba = le64_to_cpu(rw->slba);
- uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
- uint64_t offset = slba << data_shift;
- uint32_t count = nlb << data_shift;
+ NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
+ NvmeNamespace *ns = req->ns;
+
+ uint16_t ctrl = le16_to_cpu(rw->control);
+
+ if ((ctrl & NVME_RW_PRINFO_PRACT) && !(ns->id_ns.dps & DPS_TYPE_MASK)) {
+ trace_nvme_err_prinfo(nvme_cid(req), ctrl);
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ return NVME_SUCCESS;
+}
- if (unlikely(slba + nlb > ns->id_ns.nsze)) {
- trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
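+/*
+ * Verify that the command does not reference logical blocks beyond the end
+ * of the namespace. Invalid accounting is left to the callers, which know
+ * the access type.
+ */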
+static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+ uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
+
+ if (unlikely((req->slba + req->nlb) > nsze)) {
+ trace_nvme_err_invalid_lba_range(req->slba, req->nlb, nsze);
return NVME_LBA_RANGE | NVME_DNR;
}
- block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
- BLOCK_ACCT_WRITE);
- req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
- BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
+ return NVME_SUCCESS;
+}
+
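+/* run the per-command checks (MDTS, protection info, bounds) in order */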
+static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+ size_t len = req->nlb << nvme_ns_lbads(ns);
+ uint16_t status;
+
+ status = nvme_check_mdts(n, len, req);
+ if (status) {
+ return status;
+ }
+
+ status = nvme_check_prinfo(n, req);
+ if (status) {
+ return status;
+ }
+
+ status = nvme_check_bounds(n, req);
+ if (status) {
+ return status;
+ }
+
+ return NVME_SUCCESS;
+}
+
+static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+ NvmeAIO *aio = nvme_aio_new(n->conf.blk, 0x0, NULL, req, NULL);
+
+ nvme_req_register_aio(req, aio, NVME_AIO_OPC_FLUSH);
+ nvme_aio(aio);
+
return NVME_NO_COMPLETE;
}
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
- NvmeRequest *req)
+static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
- NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
- uint32_t nlb = le32_to_cpu(rw->nlb) + 1;
- uint64_t slba = le64_to_cpu(rw->slba);
- uint64_t prp1 = le64_to_cpu(rw->prp1);
- uint64_t prp2 = le64_to_cpu(rw->prp2);
+ NvmeAIO *aio;
- uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
- uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
- uint64_t data_size = (uint64_t)nlb << data_shift;
- uint64_t data_offset = slba << data_shift;
- int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
- enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+ NvmeNamespace *ns = req->ns;
+ NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
- trace_nvme_rw(is_write ? "write" : "read", nlb, data_size, slba);
+ int64_t offset;
+ size_t count;
+ uint16_t status;
- if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
- block_acct_invalid(blk_get_stats(n->conf.blk), acct);
- trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
- return NVME_LBA_RANGE | NVME_DNR;
+ req->slba = le64_to_cpu(rw->slba);
+ req->nlb = le16_to_cpu(rw->nlb) + 1;
+
+ trace_nvme_write_zeros(req->cid, le32_to_cpu(cmd->nsid), req->slba,
+ req->nlb);
+
+ status = nvme_check_bounds(n, req);
+ if (unlikely(status)) {
+ block_acct_invalid(blk_get_stats(n->conf.blk), BLOCK_ACCT_WRITE);
+ return status;
}
- if (nvme_map_prp(n, &req->qsg, prp1, prp2, data_size, req)) {
+ offset = req->slba << nvme_ns_lbads(ns);
+ count = req->nlb << nvme_ns_lbads(ns);
+
+ aio = nvme_aio_new(n->conf.blk, offset, NULL, req, NULL);
+
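+ /* no data is mapped for write zeroes; set the size for accounting only */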
+ aio->iov.size = count;
+
+ nvme_req_register_aio(req, aio, NVME_AIO_OPC_WRITE_ZEROES);
+ nvme_aio(aio);
+
+ return NVME_NO_COMPLETE;
+}
+
+static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+ NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
+ NvmeNamespace *ns = req->ns;
+ int status;
+
+ enum BlockAcctType acct =
+ nvme_req_is_write(req) ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
+
+ req->nlb = le16_to_cpu(rw->nlb) + 1;
+ req->slba = le64_to_cpu(rw->slba);
+
+ trace_nvme_rw(nvme_req_is_write(req) ? "write" : "read", req->nlb,
+ req->nlb << nvme_ns_lbads(req->ns), req->slba);
+
+ status = nvme_check_rw(n, req);
+ if (status) {
block_acct_invalid(blk_get_stats(n->conf.blk), acct);
- return NVME_INVALID_FIELD | NVME_DNR;
+ return status;
}
- dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
- if (!req->is_cmb) {
- req->aiocb = is_write ?
- dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
- nvme_rw_cb, req) :
- dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
- nvme_rw_cb, req);
- } else {
- qemu_iovec_init(&req->iov, req->qsg.nsg);
- dma_to_cmb(n, &req->qsg, &req->iov);
- req->aiocb = is_write ?
- blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
- req) :
- blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
- req);
+ status = nvme_map(n, cmd, req);
+ if (status) {
+ block_acct_invalid(blk_get_stats(n->conf.blk), acct);
+ return status;
}
+ nvme_rw_aio(n->conf.blk, req->slba << nvme_ns_lbads(ns), &req->qsg, req);
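+ /* complete the request through nvme_rw_cb when the last aio finishes */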
+ nvme_req_set_cb(req, nvme_rw_cb, NULL);
+
return NVME_NO_COMPLETE;
}
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
- NvmeNamespace *ns;
uint32_t nsid = le32_to_cpu(cmd->nsid);
+ trace_nvme_io_cmd(req->cid, nsid, req->sq->sqid, cmd->opcode);
+
if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
trace_nvme_err_invalid_ns(nsid, n->num_namespaces);
return NVME_INVALID_NSID | NVME_DNR;
}
- ns = &n->namespaces[nsid - 1];
+ req->ns = &n->namespaces[nsid - 1];
+
switch (cmd->opcode) {
case NVME_CMD_FLUSH:
- return nvme_flush(n, ns, cmd, req);
+ return nvme_flush(n, cmd, req);
case NVME_CMD_WRITE_ZEROS:
- return nvme_write_zeros(n, ns, cmd, req);
+ return nvme_write_zeros(n, cmd, req);
case NVME_CMD_WRITE:
case NVME_CMD_READ:
- return nvme_rw(n, ns, cmd, req);
+ return nvme_rw(n, cmd, req);
default:
trace_nvme_err_invalid_opc(cmd->opcode);
return NVME_INVALID_OPCODE | NVME_DNR;
@@ -609,6 +867,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
NvmeRequest *req, *next;
NvmeSQueue *sq;
NvmeCQueue *cq;
+ NvmeAIO *aio;
uint16_t qid = le16_to_cpu(c->qid);
if (unlikely(!qid || nvme_check_sqid(n, qid))) {
@@ -621,8 +880,11 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
sq = n->sq[qid];
while (!QTAILQ_EMPTY(&sq->out_req_list)) {
req = QTAILQ_FIRST(&sq->out_req_list);
- assert(req->aiocb);
- blk_aio_cancel(req->aiocb);
+ while (!QTAILQ_EMPTY(&req->aio_tailq)) {
+ aio = QTAILQ_FIRST(&req->aio_tailq);
+ assert(aio->aiocb);
+ blk_aio_cancel(aio->aiocb);
+ }
}
if (!nvme_check_cqid(n, sq->cqid)) {
cq = n->cq[sq->cqid];
@@ -659,6 +921,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
QTAILQ_INIT(&sq->out_req_list);
for (i = 0; i < sq->size; i++) {
sq->io_req[i].sq = sq;
+ QTAILQ_INIT(&(sq->io_req[i].aio_tailq));
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
@@ -819,6 +1082,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
uint32_t numdl, numdu;
uint64_t off, lpol, lpou;
size_t len;
+ uint16_t status;
numdl = (dw10 >> 16);
numdu = (dw11 & 0xffff);
@@ -834,6 +1098,11 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
trace_nvme_get_log(req->cid, lid, lsp, rae, len, off);
+ status = nvme_check_mdts(n, len, req);
+ if (status) {
+ return status;
+ }
+
switch (lid) {
case NVME_LOG_ERROR_INFO:
return nvme_error_info(n, cmd, rae, len, off, req);
@@ -1352,8 +1621,9 @@ static void nvme_init_req(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
memcpy(&req->cmd, cmd, sizeof(NvmeCmd));
req->status = NVME_SUCCESS;
- req->is_cmb = false;
- req->is_write = false;
+ req->flags = 0x0;
+ req->cb = NULL;
+ req->cb_arg = NULL;
}
static void nvme_process_sq(void *opaque)
@@ -1952,6 +2222,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
id->ieee[0] = 0x00;
id->ieee[1] = 0x02;
id->ieee[2] = 0xb3;
+ id->mdts = params->mdts;
id->ver = cpu_to_le32(0x00010300);
id->oacs = cpu_to_le16(0);
id->acl = 3;
@@ -8,7 +8,8 @@
DEFINE_PROP_UINT32("cmb_size_mb", _state, _props.cmb_size_mb, 0), \
DEFINE_PROP_UINT32("num_queues", _state, _props.num_queues, 64), \
DEFINE_PROP_UINT8("elpe", _state, _props.elpe, 24), \
- DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3)
+ DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3), \
+ DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7)
typedef struct NvmeParams {
char *serial;
@@ -16,6 +17,7 @@ typedef struct NvmeParams {
uint32_t cmb_size_mb;
uint8_t elpe;
uint8_t aerl;
+ uint8_t mdts;
} NvmeParams;
typedef struct NvmeAsyncEvent {
@@ -23,23 +25,124 @@ typedef struct NvmeAsyncEvent {
NvmeAerResult result;
} NvmeAsyncEvent;
-typedef struct NvmeRequest {
- struct NvmeSQueue *sq;
- BlockAIOCB *aiocb;
- uint16_t status;
- uint16_t cid;
- bool is_cmb;
- bool is_write;
- NvmeCqe cqe;
- BlockAcctCookie acct;
- QEMUSGList qsg;
- QEMUIOVector iov;
- NvmeCmd cmd;
- QTAILQ_ENTRY(NvmeRequest)entry;
-} NvmeRequest;
-
-typedef struct NvmeSQueue {
- struct NvmeCtrl *ctrl;
+typedef enum NvmeAIOOp {
+ NVME_AIO_OPC_NONE = 0x0,
+ NVME_AIO_OPC_FLUSH = 0x1,
+ NVME_AIO_OPC_READ = 0x2,
+ NVME_AIO_OPC_WRITE = 0x3,
+ NVME_AIO_OPC_WRITE_ZEROES = 0x4,
+} NvmeAIOOp;
+
+typedef struct NvmeRequest NvmeRequest;
+typedef struct NvmeAIO NvmeAIO;
+typedef void NvmeAIOCompletionFunc(NvmeAIO *aio, void *opaque);
+
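+/*
+ * An NvmeAIO tracks a single operation submitted to the block backend on
+ * behalf of a request; a request may have multiple aios in flight.
+ */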
+struct NvmeAIO {
+ NvmeRequest *req;
+
+ NvmeAIOOp opc;
+ int64_t offset;
+ BlockBackend *blk;
+ BlockAIOCB *aiocb;
+ BlockAcctCookie acct;
+
+ NvmeAIOCompletionFunc *cb;
+ void *cb_arg;
+
+ QEMUSGList *qsg;
+ QEMUIOVector iov;
+
+ QTAILQ_ENTRY(NvmeAIO) tailq_entry;
+};
+
+static inline const char *nvme_aio_opc_str(NvmeAIO *aio)
+{
+ switch (aio->opc) {
+ case NVME_AIO_OPC_NONE: return "NVME_AIO_OP_NONE";
+ case NVME_AIO_OPC_FLUSH: return "NVME_AIO_OP_FLUSH";
+ case NVME_AIO_OPC_READ: return "NVME_AIO_OP_READ";
+ case NVME_AIO_OPC_WRITE: return "NVME_AIO_OP_WRITE";
+ case NVME_AIO_OPC_WRITE_ZEROES: return "NVME_AIO_OP_WRITE_ZEROES";
+ default: return "NVME_AIO_OP_UNKNOWN";
+ }
+}
+
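+/* request data is transferred either by DMA or through the CMB */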
+#define NVME_REQ_TRANSFER_DMA 0x1
+#define NVME_REQ_TRANSFER_CMB 0x2
+#define NVME_REQ_TRANSFER_MASK 0x3
+
+typedef struct NvmeSQueue NvmeSQueue;
+typedef struct NvmeNamespace NvmeNamespace;
+typedef void NvmeRequestCompletionFunc(NvmeRequest *req, void *opaque);
+
+struct NvmeRequest {
+ NvmeSQueue *sq;
+ NvmeNamespace *ns;
+ NvmeCqe cqe;
+ NvmeCmd cmd;
+
+ uint64_t slba;
+ uint32_t nlb;
+ uint16_t status;
+ uint16_t cid;
+ int flags;
+
+ NvmeRequestCompletionFunc *cb;
+ void *cb_arg;
+
+ QEMUSGList qsg;
+
+ QTAILQ_HEAD(, NvmeAIO) aio_tailq;
+ QTAILQ_ENTRY(NvmeRequest) entry;
+};
+
+static inline void nvme_req_set_cb(NvmeRequest *req,
+ NvmeRequestCompletionFunc *cb, void *cb_arg)
+{
+ req->cb = cb;
+ req->cb_arg = cb_arg;
+}
+
+static inline void nvme_req_clear_cb(NvmeRequest *req)
+{
+ req->cb = req->cb_arg = NULL;
+}
+
+static inline uint16_t nvme_cid(NvmeRequest *req)
+{
+ if (req) {
+ return req->cid;
+ }
+
+ return 0xffff;
+}
+
+static inline bool nvme_req_is_cmb(NvmeRequest *req)
+{
+ return (req->flags & NVME_REQ_TRANSFER_MASK) == NVME_REQ_TRANSFER_CMB;
+}
+
+static inline void nvme_req_set_cmb(NvmeRequest *req)
+{
+ req->flags = NVME_REQ_TRANSFER_CMB;
+}
+
+static inline bool nvme_req_is_write(NvmeRequest *req)
+{
+ switch (req->cmd.opcode) {
+ case NVME_CMD_WRITE:
+ case NVME_CMD_WRITE_UNCOR:
+ case NVME_CMD_WRITE_ZEROS:
+ return true;
+ default:
+ return false;
+ }
+}
+
+typedef struct NvmeCtrl NvmeCtrl;
+
+struct NvmeSQueue {
+ NvmeCtrl *ctrl;
uint16_t sqid;
uint16_t cqid;
uint32_t head;
@@ -51,10 +154,12 @@ typedef struct NvmeSQueue {
QTAILQ_HEAD(, NvmeRequest) req_list;
QTAILQ_HEAD(, NvmeRequest) out_req_list;
QTAILQ_ENTRY(NvmeSQueue) entry;
-} NvmeSQueue;
+};
-typedef struct NvmeCQueue {
- struct NvmeCtrl *ctrl;
+typedef struct NvmeCQueue NvmeCQueue;
+
+struct NvmeCQueue {
+ NvmeCtrl *ctrl;
uint8_t phase;
uint16_t cqid;
uint16_t irq_enabled;
@@ -66,11 +171,11 @@ typedef struct NvmeCQueue {
QEMUTimer *timer;
QTAILQ_HEAD(, NvmeSQueue) sq_list;
QTAILQ_HEAD(, NvmeRequest) req_list;
-} NvmeCQueue;
+};
-typedef struct NvmeNamespace {
+struct NvmeNamespace {
NvmeIdNs id_ns;
-} NvmeNamespace;
+};
#define TYPE_NVME "nvme"
#define NVME(obj) \
@@ -122,6 +227,17 @@ typedef struct NvmeCtrl {
NvmeIdCtrl id_ctrl;
} NvmeCtrl;
+static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
+{
+ return req->sq->ctrl;
+}
+
+static inline bool nvme_is_error(uint16_t status, uint16_t err)
+{
+ /* strip DNR and MORE */
+ return (status & 0xfff) == err;
+}
+
static inline NvmeLBAF nvme_ns_lbaf(NvmeNamespace *ns)
{
NvmeIdNs *id_ns = &ns->id_ns;
@@ -143,5 +259,4 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
return n->ns_size >> nvme_ns_lbads(ns);
}
-
#endif /* HW_NVME_H */
@@ -34,7 +34,12 @@ nvme_irq_pin(void) "pulsing IRQ pin"
nvme_irq_masked(void) "IRQ is masked"
nvme_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
nvme_map_prp(uint16_t cid, uint8_t opc, uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "cid %"PRIu16" opc 0x%"PRIx8" trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
+nvme_req_register_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t count, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" count %"PRIu64" opc \"%s\" req %p"
+nvme_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p"
+nvme_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
nvme_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_rw_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
+nvme_write_zeros(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
@@ -74,6 +79,9 @@ nvme_mmio_shutdown_set(void) "shutdown bit set"
nvme_mmio_shutdown_cleared(void) "shutdown bit cleared"
# nvme traces for error conditions
+nvme_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %zu len %zu"
+nvme_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
+nvme_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
nvme_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
nvme_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
nvme_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""