diff mbox series

[v3,08/18] hw/block/nvme: add support for the asynchronous event request command

Message ID 20200706061303.246057-9-its@irrelevant.dk
State New
Headers show
Series hw/block/nvme: bump to v1.3 | expand

Commit Message

Klaus Jensen July 6, 2020, 6:12 a.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

Add support for the Asynchronous Event Request command. Required for
compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.2
("Asynchronous Event Request command").

Mostly imported from Keith's qemu-nvme tree. Modified with a max number
of queued events (controllable with the aer_max_queued device
parameter). The spec states that the controller *should* retain
events, so we do best effort here.

Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Acked-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
---
 hw/block/nvme.c       | 180 ++++++++++++++++++++++++++++++++++++++++--
 hw/block/nvme.h       |  10 ++-
 hw/block/trace-events |   9 +++
 include/block/nvme.h  |   8 +-
 4 files changed, 198 insertions(+), 9 deletions(-)

Comments

Maxim Levitsky July 29, 2020, 10:43 a.m. UTC | #1
On Mon, 2020-07-06 at 08:12 +0200, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
> 
> Add support for the Asynchronous Event Request command. Required for
> compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.2
> ("Asynchronous Event Request command").
> 
> Mostly imported from Keith's qemu-nvme tree. Modified with a max number
> of queued events (controllable with the aer_max_queued device
> parameter). The spec states that the controller *should* retain
> events, so we do best effort here.
> 
> Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> Acked-by: Keith Busch <kbusch@kernel.org>
> Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
> Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> ---
>  hw/block/nvme.c       | 180 ++++++++++++++++++++++++++++++++++++++++--
>  hw/block/nvme.h       |  10 ++-
>  hw/block/trace-events |   9 +++
>  include/block/nvme.h  |   8 +-
>  4 files changed, 198 insertions(+), 9 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 7cb3787638f6..80c7285bc1cf 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -356,6 +356,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>      timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>  }
>  
> +static void nvme_process_aers(void *opaque)
> +{
> +    NvmeCtrl *n = opaque;
> +    NvmeAsyncEvent *event, *next;
> +
> +    trace_pci_nvme_process_aers(n->aer_queued);
> +
> +    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
> +        NvmeRequest *req;
> +        NvmeAerResult *result;
> +
> +        /* can't post cqe if there is nothing to complete */
> +        if (!n->outstanding_aers) {
> +            trace_pci_nvme_no_outstanding_aers();
> +            break;
> +        }
> +
> +        /* ignore if masked (cqe posted, but event not cleared) */
> +        if (n->aer_mask & (1 << event->result.event_type)) {
> +            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
> +            continue;
> +        }
> +
> +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> +        n->aer_queued--;
> +
> +        n->aer_mask |= 1 << event->result.event_type;
> +        n->outstanding_aers--;
> +
> +        req = n->aer_reqs[n->outstanding_aers];
> +
> +        result = (NvmeAerResult *) &req->cqe.result;
> +        result->event_type = event->result.event_type;
> +        result->event_info = event->result.event_info;
> +        result->log_page = event->result.log_page;
> +        g_free(event);
> +
> +        req->status = NVME_SUCCESS;
> +
> +        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
> +                                    result->log_page);
> +
> +        nvme_enqueue_req_completion(&n->admin_cq, req);
> +    }
> +}
> +
> +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
> +                               uint8_t event_info, uint8_t log_page)
> +{
> +    NvmeAsyncEvent *event;
> +
> +    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
> +
> +    if (n->aer_queued == n->params.aer_max_queued) {
> +        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
> +        return;
> +    }
> +
> +    event = g_new(NvmeAsyncEvent, 1);
> +    event->result = (NvmeAerResult) {
> +        .event_type = event_type,
> +        .event_info = event_info,
> +        .log_page   = log_page,
> +    };
> +
> +    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
> +    n->aer_queued++;
> +
> +    nvme_process_aers(n);
> +}
> +
> +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
> +{
> +    n->aer_mask &= ~(1 << event_type);
> +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> +        nvme_process_aers(n);
> +    }
> +}
> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>      NvmeRequest *req = opaque;
> @@ -606,8 +685,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
>      return NVME_SUCCESS;
>  }
>  
> -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> -                                uint64_t off, NvmeRequest *req)
> +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> +                                uint32_t buf_len, uint64_t off,
> +                                NvmeRequest *req)
>  {
>      uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
>      uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> @@ -655,6 +735,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
>      smart.power_on_hours[0] =
>          cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
>  
> +    if (!rae) {
> +        nvme_clear_events(n, NVME_AER_TYPE_SMART);
> +    }
> +
>      return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
>                               prp2);
>  }
> @@ -681,14 +765,19 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
>                               prp2);
>  }
>  
> -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> -                                uint64_t off, NvmeRequest *req)
> +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> +                                uint32_t buf_len, uint64_t off,
> +                                NvmeRequest *req)
>  {
>      uint32_t trans_len;
>      uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
>      uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>      NvmeErrorLog errlog;
>  
> +    if (!rae) {
> +        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
> +    }
> +
>      if (off > sizeof(errlog)) {
>          return NVME_INVALID_FIELD | NVME_DNR;
>      }
> @@ -729,9 +818,9 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>  
>      switch (lid) {
>      case NVME_LOG_ERROR_INFO:
> -        return nvme_error_info(n, cmd, len, off, req);
> +        return nvme_error_info(n, cmd, rae, len, off, req);
>      case NVME_LOG_SMART_INFO:
> -        return nvme_smart_info(n, cmd, len, off, req);
> +        return nvme_smart_info(n, cmd, rae, len, off, req);
>      case NVME_LOG_FW_SLOT_INFO:
>          return nvme_fw_log_info(n, cmd, len, off, req);
>      default:
> @@ -1013,6 +1102,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>              ((n->params.max_ioqpairs - 1) << 16);
>          trace_pci_nvme_getfeat_numq(result);
>          break;
> +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> +        result = n->features.async_config;
> +        break;
>      case NVME_TIMESTAMP:
>          return nvme_get_feature_timestamp(n, cmd);
>      default:
> @@ -1064,6 +1156,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>              return NVME_INVALID_FIELD | NVME_DNR;
>          }
>  
> +        if (((n->temperature >= n->features.temp_thresh_hi) ||
> +            (n->temperature <= n->features.temp_thresh_low)) &&
> +            NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
> +            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
> +                               NVME_AER_INFO_SMART_TEMP_THRESH,
> +                               NVME_LOG_SMART_INFO);
> +        }
> +
>          break;
>      case NVME_VOLATILE_WRITE_CACHE:
>          blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
> @@ -1076,6 +1176,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
>                                        ((n->params.max_ioqpairs - 1) << 16));
>          break;
> +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> +        n->features.async_config = dw11;
> +        break;
>      case NVME_TIMESTAMP:
>          return nvme_set_feature_timestamp(n, cmd);
>      default:
> @@ -1085,6 +1188,25 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>      return NVME_SUCCESS;
>  }
>  
> +static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +    trace_pci_nvme_aer(nvme_cid(req));
> +
> +    if (n->outstanding_aers > n->params.aerl) {
> +        trace_pci_nvme_aer_aerl_exceeded();
> +        return NVME_AER_LIMIT_EXCEEDED;
> +    }
> +
> +    n->aer_reqs[n->outstanding_aers] = req;
> +    n->outstanding_aers++;
> +
> +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> +        nvme_process_aers(n);
> +    }
> +
> +    return NVME_NO_COMPLETE;
> +}

Looks good so far

> +
>  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>  {
>      trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode);
> @@ -1108,6 +1230,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>          return nvme_set_feature(n, cmd, req);
>      case NVME_ADM_CMD_GET_FEATURES:
>          return nvme_get_feature(n, cmd, req);
> +    case NVME_ADM_CMD_ASYNC_EV_REQ:
> +        return nvme_aer(n, cmd, req);
>      default:
>          trace_pci_nvme_err_invalid_admin_opc(cmd->opcode);
>          return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -1162,6 +1286,15 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
>          }
>      }
>  
> +    while (!QTAILQ_EMPTY(&n->aer_queue)) {
> +        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
> +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> +        g_free(event);
> +    }
> +
> +    n->aer_queued = 0;
> +    n->outstanding_aers = 0;
> +
>      blk_flush(n->conf.blk);
>      n->bar.cc = 0;
>  }
> @@ -1258,6 +1391,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
>  
>      nvme_set_timestamp(n, 0ULL);
>  
> +    QTAILQ_INIT(&n->aer_queue);
> +
>      return 0;
>  }
>  
> @@ -1479,6 +1614,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             "completion queue doorbell write"
>                             " for nonexistent queue,"
>                             " sqid=%"PRIu32", ignoring", qid);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> +                                   NVME_LOG_ERROR_INFO);
> +            }
To be honest I would move the check for outstanding AERs to nvme_enqueue_event.

Also the logic seems a bit off. The code checks that we have outstanding AER requests,
however we do have internal AER queue for this situation.
It seems that SMART events are generated without this check but ERROR events only when
outstanding AERs exist.
Could you explain? I am probably forgot something from the spec which I haven't read for long time.


> +
>              return;
>          }
>  
> @@ -1489,6 +1631,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             " beyond queue size, sqid=%"PRIu32","
>                             " new_head=%"PRIu16", ignoring",
>                             qid, new_head);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
> +                                   NVME_LOG_ERROR_INFO);
> +            }
> +
>              return;
>          }
>  
> @@ -1519,6 +1668,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             "submission queue doorbell write"
>                             " for nonexistent queue,"
>                             " sqid=%"PRIu32", ignoring", qid);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> +                                   NVME_LOG_ERROR_INFO);
> +            }
> +
>              return;
>          }
>  
> @@ -1529,6 +1685,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
>                             " beyond queue size, sqid=%"PRIu32","
>                             " new_tail=%"PRIu16", ignoring",
>                             qid, new_tail);
> +
> +            if (n->outstanding_aers) {
> +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> +                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
> +                                   NVME_LOG_ERROR_INFO);
> +            }
> +
>              return;
>          }
>  
> @@ -1650,6 +1813,7 @@ static void nvme_init_state(NvmeCtrl *n)
>      n->temperature = NVME_TEMPERATURE;
>      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
>  }
>  
>  static void nvme_init_blk(NvmeCtrl *n, Error **errp)
> @@ -1805,6 +1969,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
>       * inconsequential.
>       */
>      id->acl = 3;
> +    id->aerl = n->params.aerl;
Name a tiny bit unclear. I know that this is from the spec but still.

>      id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
>      id->lpa = NVME_LPA_EXTENDED;
>  
> @@ -1879,6 +2044,7 @@ static void nvme_exit(PCIDevice *pci_dev)
>      g_free(n->namespaces);
>      g_free(n->cq);
>      g_free(n->sq);
> +    g_free(n->aer_reqs);
>  
>      if (n->params.cmb_size_mb) {
>          g_free(n->cmbuf);
> @@ -1899,6 +2065,8 @@ static Property nvme_props[] = {
>      DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
>      DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
>      DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
> +    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
So this is number of AERs that we allow the user to be outstanding

> +    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
And this is the number of AERs that we keep in our internal AER queue untill user posts and AER so that we
can complete it.

>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 8228978e93de..1837233617bb 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -9,10 +9,12 @@ typedef struct NvmeParams {
>      uint32_t max_ioqpairs;
>      uint16_t msix_qsize;
>      uint32_t cmb_size_mb;
> +    uint8_t  aerl;
> +    uint32_t aer_max_queued;
>  } NvmeParams;
>  
>  typedef struct NvmeAsyncEvent {
> -    QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry;
> +    QTAILQ_ENTRY(NvmeAsyncEvent) entry;
>      NvmeAerResult result;
>  } NvmeAsyncEvent;
>  
> @@ -94,6 +96,7 @@ typedef struct NvmeCtrl {
>      uint32_t    num_namespaces;
>      uint32_t    max_q_ents;
>      uint64_t    ns_size;
> +    uint8_t     outstanding_aers;
>      uint8_t     *cmbuf;
>      uint32_t    irq_status;
>      uint64_t    host_timestamp;                 /* Timestamp sent by the host */
> @@ -103,6 +106,11 @@ typedef struct NvmeCtrl {
>  
>      HostMemoryBackend *pmrdev;
>  
> +    uint8_t     aer_mask;
> +    NvmeRequest **aer_reqs;
> +    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
> +    int         aer_queued;
> +
>      NvmeNamespace   *namespaces;
>      NvmeSQueue      **sq;
>      NvmeCQueue      **cq;
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 3330d74e48db..091af16ca7d7 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -51,6 +51,15 @@ pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
>  pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
>  pci_nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
>  pci_nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
> +pci_nvme_process_aers(int queued) "queued %d"
> +pci_nvme_aer(uint16_t cid) "cid %"PRIu16""
> +pci_nvme_aer_aerl_exceeded(void) "aerl exceeded"
> +pci_nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8""
> +pci_nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
> +pci_nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
> +pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
> +pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
> +pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
>  pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
>  pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
>  pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 49ce97ae1ab4..2101292ed5e8 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -597,8 +597,8 @@ enum NvmeAsyncEventRequest {
>      NVME_AER_TYPE_SMART                     = 1,
>      NVME_AER_TYPE_IO_SPECIFIC               = 6,
>      NVME_AER_TYPE_VENDOR_SPECIFIC           = 7,
> -    NVME_AER_INFO_ERR_INVALID_SQ            = 0,
> -    NVME_AER_INFO_ERR_INVALID_DB            = 1,
> +    NVME_AER_INFO_ERR_INVALID_DB_REGISTER   = 0,
> +    NVME_AER_INFO_ERR_INVALID_DB_VALUE      = 1,
>      NVME_AER_INFO_ERR_DIAG_FAIL             = 2,
>      NVME_AER_INFO_ERR_PERS_INTERNAL_ERR     = 3,
>      NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR    = 4,
> @@ -899,6 +899,10 @@ typedef struct NvmeFeatureVal {
>  
>  #define NVME_TEMP_TMPTH(temp) ((temp >>  0) & 0xffff)
>  
> +#define NVME_AEC_SMART(aec)         (aec & 0xff)
> +#define NVME_AEC_NS_ATTR(aec)       ((aec >> 8) & 0x1)
> +#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
> +
>  enum NvmeFeatureIds {
>      NVME_ARBITRATION                = 0x1,
>      NVME_POWER_MANAGEMENT           = 0x2,


Best regards,
	Maxim Levitsky
Klaus Jensen July 29, 2020, 1:37 p.m. UTC | #2
On Jul 29 13:43, Maxim Levitsky wrote:
> On Mon, 2020-07-06 at 08:12 +0200, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jensen@samsung.com>
> > 
> > Add support for the Asynchronous Event Request command. Required for
> > compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.2
> > ("Asynchronous Event Request command").
> > 
> > Mostly imported from Keith's qemu-nvme tree. Modified with a max number
> > of queued events (controllable with the aer_max_queued device
> > parameter). The spec states that the controller *should* retain
> > events, so we do best effort here.
> > 
> > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > Acked-by: Keith Busch <kbusch@kernel.org>
> > Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
> > Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > ---
> >  hw/block/nvme.c       | 180 ++++++++++++++++++++++++++++++++++++++++--
> >  hw/block/nvme.h       |  10 ++-
> >  hw/block/trace-events |   9 +++
> >  include/block/nvme.h  |   8 +-
> >  4 files changed, 198 insertions(+), 9 deletions(-)
> > 
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 7cb3787638f6..80c7285bc1cf 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -356,6 +356,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
> >      timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> >  }
> >  
> > +static void nvme_process_aers(void *opaque)
> > +{
> > +    NvmeCtrl *n = opaque;
> > +    NvmeAsyncEvent *event, *next;
> > +
> > +    trace_pci_nvme_process_aers(n->aer_queued);
> > +
> > +    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
> > +        NvmeRequest *req;
> > +        NvmeAerResult *result;
> > +
> > +        /* can't post cqe if there is nothing to complete */
> > +        if (!n->outstanding_aers) {
> > +            trace_pci_nvme_no_outstanding_aers();
> > +            break;
> > +        }
> > +
> > +        /* ignore if masked (cqe posted, but event not cleared) */
> > +        if (n->aer_mask & (1 << event->result.event_type)) {
> > +            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
> > +            continue;
> > +        }
> > +
> > +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> > +        n->aer_queued--;
> > +
> > +        n->aer_mask |= 1 << event->result.event_type;
> > +        n->outstanding_aers--;
> > +
> > +        req = n->aer_reqs[n->outstanding_aers];
> > +
> > +        result = (NvmeAerResult *) &req->cqe.result;
> > +        result->event_type = event->result.event_type;
> > +        result->event_info = event->result.event_info;
> > +        result->log_page = event->result.log_page;
> > +        g_free(event);
> > +
> > +        req->status = NVME_SUCCESS;
> > +
> > +        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
> > +                                    result->log_page);
> > +
> > +        nvme_enqueue_req_completion(&n->admin_cq, req);
> > +    }
> > +}
> > +
> > +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
> > +                               uint8_t event_info, uint8_t log_page)
> > +{
> > +    NvmeAsyncEvent *event;
> > +
> > +    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
> > +
> > +    if (n->aer_queued == n->params.aer_max_queued) {
> > +        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
> > +        return;
> > +    }
> > +
> > +    event = g_new(NvmeAsyncEvent, 1);
> > +    event->result = (NvmeAerResult) {
> > +        .event_type = event_type,
> > +        .event_info = event_info,
> > +        .log_page   = log_page,
> > +    };
> > +
> > +    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
> > +    n->aer_queued++;
> > +
> > +    nvme_process_aers(n);
> > +}
> > +
> > +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
> > +{
> > +    n->aer_mask &= ~(1 << event_type);
> > +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> > +        nvme_process_aers(n);
> > +    }
> > +}
> > +
> >  static void nvme_rw_cb(void *opaque, int ret)
> >  {
> >      NvmeRequest *req = opaque;
> > @@ -606,8 +685,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
> >      return NVME_SUCCESS;
> >  }
> >  
> > -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> > -                                uint64_t off, NvmeRequest *req)
> > +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> > +                                uint32_t buf_len, uint64_t off,
> > +                                NvmeRequest *req)
> >  {
> >      uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> >      uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> > @@ -655,6 +735,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> >      smart.power_on_hours[0] =
> >          cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
> >  
> > +    if (!rae) {
> > +        nvme_clear_events(n, NVME_AER_TYPE_SMART);
> > +    }
> > +
> >      return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
> >                               prp2);
> >  }
> > @@ -681,14 +765,19 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> >                               prp2);
> >  }
> >  
> > -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> > -                                uint64_t off, NvmeRequest *req)
> > +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> > +                                uint32_t buf_len, uint64_t off,
> > +                                NvmeRequest *req)
> >  {
> >      uint32_t trans_len;
> >      uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> >      uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> >      NvmeErrorLog errlog;
> >  
> > +    if (!rae) {
> > +        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
> > +    }
> > +
> >      if (off > sizeof(errlog)) {
> >          return NVME_INVALID_FIELD | NVME_DNR;
> >      }
> > @@ -729,9 +818,9 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >  
> >      switch (lid) {
> >      case NVME_LOG_ERROR_INFO:
> > -        return nvme_error_info(n, cmd, len, off, req);
> > +        return nvme_error_info(n, cmd, rae, len, off, req);
> >      case NVME_LOG_SMART_INFO:
> > -        return nvme_smart_info(n, cmd, len, off, req);
> > +        return nvme_smart_info(n, cmd, rae, len, off, req);
> >      case NVME_LOG_FW_SLOT_INFO:
> >          return nvme_fw_log_info(n, cmd, len, off, req);
> >      default:
> > @@ -1013,6 +1102,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >              ((n->params.max_ioqpairs - 1) << 16);
> >          trace_pci_nvme_getfeat_numq(result);
> >          break;
> > +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> > +        result = n->features.async_config;
> > +        break;
> >      case NVME_TIMESTAMP:
> >          return nvme_get_feature_timestamp(n, cmd);
> >      default:
> > @@ -1064,6 +1156,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >              return NVME_INVALID_FIELD | NVME_DNR;
> >          }
> >  
> > +        if (((n->temperature >= n->features.temp_thresh_hi) ||
> > +            (n->temperature <= n->features.temp_thresh_low)) &&
> > +            NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
> > +            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
> > +                               NVME_AER_INFO_SMART_TEMP_THRESH,
> > +                               NVME_LOG_SMART_INFO);
> > +        }
> > +
> >          break;
> >      case NVME_VOLATILE_WRITE_CACHE:
> >          blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
> > @@ -1076,6 +1176,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >          req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> >                                        ((n->params.max_ioqpairs - 1) << 16));
> >          break;
> > +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> > +        n->features.async_config = dw11;
> > +        break;
> >      case NVME_TIMESTAMP:
> >          return nvme_set_feature_timestamp(n, cmd);
> >      default:
> > @@ -1085,6 +1188,25 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >      return NVME_SUCCESS;
> >  }
> >  
> > +static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > +{
> > +    trace_pci_nvme_aer(nvme_cid(req));
> > +
> > +    if (n->outstanding_aers > n->params.aerl) {
> > +        trace_pci_nvme_aer_aerl_exceeded();
> > +        return NVME_AER_LIMIT_EXCEEDED;
> > +    }
> > +
> > +    n->aer_reqs[n->outstanding_aers] = req;
> > +    n->outstanding_aers++;
> > +
> > +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> > +        nvme_process_aers(n);
> > +    }
> > +
> > +    return NVME_NO_COMPLETE;
> > +}
> 
> Looks good so far
> 
> > +
> >  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >  {
> >      trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode);
> > @@ -1108,6 +1230,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >          return nvme_set_feature(n, cmd, req);
> >      case NVME_ADM_CMD_GET_FEATURES:
> >          return nvme_get_feature(n, cmd, req);
> > +    case NVME_ADM_CMD_ASYNC_EV_REQ:
> > +        return nvme_aer(n, cmd, req);
> >      default:
> >          trace_pci_nvme_err_invalid_admin_opc(cmd->opcode);
> >          return NVME_INVALID_OPCODE | NVME_DNR;
> > @@ -1162,6 +1286,15 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
> >          }
> >      }
> >  
> > +    while (!QTAILQ_EMPTY(&n->aer_queue)) {
> > +        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
> > +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> > +        g_free(event);
> > +    }
> > +
> > +    n->aer_queued = 0;
> > +    n->outstanding_aers = 0;
> > +
> >      blk_flush(n->conf.blk);
> >      n->bar.cc = 0;
> >  }
> > @@ -1258,6 +1391,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> >  
> >      nvme_set_timestamp(n, 0ULL);
> >  
> > +    QTAILQ_INIT(&n->aer_queue);
> > +
> >      return 0;
> >  }
> >  
> > @@ -1479,6 +1614,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >                             "completion queue doorbell write"
> >                             " for nonexistent queue,"
> >                             " sqid=%"PRIu32", ignoring", qid);
> > +
> > +            if (n->outstanding_aers) {
> > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > +                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> > +                                   NVME_LOG_ERROR_INFO);
> > +            }
> To be honest I would move the check for outstanding AERs to nvme_enqueue_event.
> 
> Also the logic seems a bit off. The code checks that we have outstanding AER requests,
> however we do have internal AER queue for this situation.
> It seems that SMART events are generated without this check but ERROR events only when
> outstanding AERs exist.
> Could you explain? I am probably forgot something from the spec which I haven't read for long time.
> 

I'm pretty sure this has been mentioned before, but I can't find it
anywhere, maybe it was an internal review...

Anyway, I'm interpreting the AER logic as a special case for doorbell writes:

NVM Express v1.3d, Section 4.1 state: "If host software writes an
invalid value to the Submission Queue Tail Doorbell or Completion Queue
Head Doorbell regiter and an Asynchronous Event Request command is
outstanding, then an asynchronous event is posted to the Admin
Completion Queue with a status code of Invalid Doorbell Write Value."

> 
> > +
> >              return;
> >          }
> >  
> > @@ -1489,6 +1631,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >                             " beyond queue size, sqid=%"PRIu32","
> >                             " new_head=%"PRIu16", ignoring",
> >                             qid, new_head);
> > +
> > +            if (n->outstanding_aers) {
> > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > +                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
> > +                                   NVME_LOG_ERROR_INFO);
> > +            }
> > +
> >              return;
> >          }
> >  
> > @@ -1519,6 +1668,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >                             "submission queue doorbell write"
> >                             " for nonexistent queue,"
> >                             " sqid=%"PRIu32", ignoring", qid);
> > +
> > +            if (n->outstanding_aers) {
> > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > +                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> > +                                   NVME_LOG_ERROR_INFO);
> > +            }
> > +
> >              return;
> >          }
> >  
> > @@ -1529,6 +1685,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> >                             " beyond queue size, sqid=%"PRIu32","
> >                             " new_tail=%"PRIu16", ignoring",
> >                             qid, new_tail);
> > +
> > +            if (n->outstanding_aers) {
> > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > +                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
> > +                                   NVME_LOG_ERROR_INFO);
> > +            }
> > +
> >              return;
> >          }
> >  
> > @@ -1650,6 +1813,7 @@ static void nvme_init_state(NvmeCtrl *n)
> >      n->temperature = NVME_TEMPERATURE;
> >      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
> >      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> > +    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
> >  }
> >  
> >  static void nvme_init_blk(NvmeCtrl *n, Error **errp)
> > @@ -1805,6 +1969,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> >       * inconsequential.
> >       */
> >      id->acl = 3;
> > +    id->aerl = n->params.aerl;
> Name a tiny bit unclear. I know that this is from the spec but still.
> 

Yes I know, but I really prefer the spec names if possible (makes it
easy to look them up).

> >      id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
> >      id->lpa = NVME_LPA_EXTENDED;
> >  
> > @@ -1879,6 +2044,7 @@ static void nvme_exit(PCIDevice *pci_dev)
> >      g_free(n->namespaces);
> >      g_free(n->cq);
> >      g_free(n->sq);
> > +    g_free(n->aer_reqs);
> >  
> >      if (n->params.cmb_size_mb) {
> >          g_free(n->cmbuf);
> > @@ -1899,6 +2065,8 @@ static Property nvme_props[] = {
> >      DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
> >      DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
> >      DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
> > +    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
> So this is number of AERs that we allow the user to be outstanding

Yeah, and per the spec, 0's based.

> 
> > +    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> And this is the number of AERs that we keep in our internal AER queue untill user posts and AER so that we
> can complete it.
> 

Correct.
Maxim Levitsky July 29, 2020, 6:45 p.m. UTC | #3
On Wed, 2020-07-29 at 15:37 +0200, Klaus Jensen wrote:
> On Jul 29 13:43, Maxim Levitsky wrote:
> > On Mon, 2020-07-06 at 08:12 +0200, Klaus Jensen wrote:
> > > From: Klaus Jensen <k.jensen@samsung.com>
> > > 
> > > Add support for the Asynchronous Event Request command. Required for
> > > compliance with NVMe revision 1.3d. See NVM Express 1.3d, Section 5.2
> > > ("Asynchronous Event Request command").
> > > 
> > > Mostly imported from Keith's qemu-nvme tree. Modified with a max number
> > > of queued events (controllable with the aer_max_queued device
> > > parameter). The spec states that the controller *should* retain
> > > events, so we do best effort here.
> > > 
> > > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > > Acked-by: Keith Busch <kbusch@kernel.org>
> > > Reviewed-by: Maxim Levitsky <mlevitsk@redhat.com>
> > > Reviewed-by: Dmitry Fomichev <dmitry.fomichev@wdc.com>
> > > ---
> > >  hw/block/nvme.c       | 180 ++++++++++++++++++++++++++++++++++++++++--
> > >  hw/block/nvme.h       |  10 ++-
> > >  hw/block/trace-events |   9 +++
> > >  include/block/nvme.h  |   8 +-
> > >  4 files changed, 198 insertions(+), 9 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 7cb3787638f6..80c7285bc1cf 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -356,6 +356,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
> > >      timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> > >  }
> > >  
> > > +static void nvme_process_aers(void *opaque)
> > > +{
> > > +    NvmeCtrl *n = opaque;
> > > +    NvmeAsyncEvent *event, *next;
> > > +
> > > +    trace_pci_nvme_process_aers(n->aer_queued);
> > > +
> > > +    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
> > > +        NvmeRequest *req;
> > > +        NvmeAerResult *result;
> > > +
> > > +        /* can't post cqe if there is nothing to complete */
> > > +        if (!n->outstanding_aers) {
> > > +            trace_pci_nvme_no_outstanding_aers();
> > > +            break;
> > > +        }
> > > +
> > > +        /* ignore if masked (cqe posted, but event not cleared) */
> > > +        if (n->aer_mask & (1 << event->result.event_type)) {
> > > +            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
> > > +            continue;
> > > +        }
> > > +
> > > +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> > > +        n->aer_queued--;
> > > +
> > > +        n->aer_mask |= 1 << event->result.event_type;
> > > +        n->outstanding_aers--;
> > > +
> > > +        req = n->aer_reqs[n->outstanding_aers];
> > > +
> > > +        result = (NvmeAerResult *) &req->cqe.result;
> > > +        result->event_type = event->result.event_type;
> > > +        result->event_info = event->result.event_info;
> > > +        result->log_page = event->result.log_page;
> > > +        g_free(event);
> > > +
> > > +        req->status = NVME_SUCCESS;
> > > +
> > > +        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
> > > +                                    result->log_page);
> > > +
> > > +        nvme_enqueue_req_completion(&n->admin_cq, req);
> > > +    }
> > > +}
> > > +
> > > +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
> > > +                               uint8_t event_info, uint8_t log_page)
> > > +{
> > > +    NvmeAsyncEvent *event;
> > > +
> > > +    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
> > > +
> > > +    if (n->aer_queued == n->params.aer_max_queued) {
> > > +        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
> > > +        return;
> > > +    }
> > > +
> > > +    event = g_new(NvmeAsyncEvent, 1);
> > > +    event->result = (NvmeAerResult) {
> > > +        .event_type = event_type,
> > > +        .event_info = event_info,
> > > +        .log_page   = log_page,
> > > +    };
> > > +
> > > +    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
> > > +    n->aer_queued++;
> > > +
> > > +    nvme_process_aers(n);
> > > +}
> > > +
> > > +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
> > > +{
> > > +    n->aer_mask &= ~(1 << event_type);
> > > +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> > > +        nvme_process_aers(n);
> > > +    }
> > > +}
> > > +
> > >  static void nvme_rw_cb(void *opaque, int ret)
> > >  {
> > >      NvmeRequest *req = opaque;
> > > @@ -606,8 +685,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
> > >      return NVME_SUCCESS;
> > >  }
> > >  
> > > -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> > > -                                uint64_t off, NvmeRequest *req)
> > > +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> > > +                                uint32_t buf_len, uint64_t off,
> > > +                                NvmeRequest *req)
> > >  {
> > >      uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> > >      uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> > > @@ -655,6 +735,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> > >      smart.power_on_hours[0] =
> > >          cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
> > >  
> > > +    if (!rae) {
> > > +        nvme_clear_events(n, NVME_AER_TYPE_SMART);
> > > +    }
> > > +
> > >      return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
> > >                               prp2);
> > >  }
> > > @@ -681,14 +765,19 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> > >                               prp2);
> > >  }
> > >  
> > > -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> > > -                                uint64_t off, NvmeRequest *req)
> > > +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> > > +                                uint32_t buf_len, uint64_t off,
> > > +                                NvmeRequest *req)
> > >  {
> > >      uint32_t trans_len;
> > >      uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> > >      uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> > >      NvmeErrorLog errlog;
> > >  
> > > +    if (!rae) {
> > > +        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
> > > +    }
> > > +
> > >      if (off > sizeof(errlog)) {
> > >          return NVME_INVALID_FIELD | NVME_DNR;
> > >      }
> > > @@ -729,9 +818,9 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >  
> > >      switch (lid) {
> > >      case NVME_LOG_ERROR_INFO:
> > > -        return nvme_error_info(n, cmd, len, off, req);
> > > +        return nvme_error_info(n, cmd, rae, len, off, req);
> > >      case NVME_LOG_SMART_INFO:
> > > -        return nvme_smart_info(n, cmd, len, off, req);
> > > +        return nvme_smart_info(n, cmd, rae, len, off, req);
> > >      case NVME_LOG_FW_SLOT_INFO:
> > >          return nvme_fw_log_info(n, cmd, len, off, req);
> > >      default:
> > > @@ -1013,6 +1102,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >              ((n->params.max_ioqpairs - 1) << 16);
> > >          trace_pci_nvme_getfeat_numq(result);
> > >          break;
> > > +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> > > +        result = n->features.async_config;
> > > +        break;
> > >      case NVME_TIMESTAMP:
> > >          return nvme_get_feature_timestamp(n, cmd);
> > >      default:
> > > @@ -1064,6 +1156,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >              return NVME_INVALID_FIELD | NVME_DNR;
> > >          }
> > >  
> > > +        if (((n->temperature >= n->features.temp_thresh_hi) ||
> > > +            (n->temperature <= n->features.temp_thresh_low)) &&
> > > +            NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
> > > +            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
> > > +                               NVME_AER_INFO_SMART_TEMP_THRESH,
> > > +                               NVME_LOG_SMART_INFO);
> > > +        }
> > > +
> > >          break;
> > >      case NVME_VOLATILE_WRITE_CACHE:
> > >          blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
> > > @@ -1076,6 +1176,9 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >          req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> > >                                        ((n->params.max_ioqpairs - 1) << 16));
> > >          break;
> > > +    case NVME_ASYNCHRONOUS_EVENT_CONF:
> > > +        n->features.async_config = dw11;
> > > +        break;
> > >      case NVME_TIMESTAMP:
> > >          return nvme_set_feature_timestamp(n, cmd);
> > >      default:
> > > @@ -1085,6 +1188,25 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >      return NVME_SUCCESS;
> > >  }
> > >  
> > > +static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > > +{
> > > +    trace_pci_nvme_aer(nvme_cid(req));
> > > +
> > > +    if (n->outstanding_aers > n->params.aerl) {
> > > +        trace_pci_nvme_aer_aerl_exceeded();
> > > +        return NVME_AER_LIMIT_EXCEEDED;
> > > +    }
> > > +
> > > +    n->aer_reqs[n->outstanding_aers] = req;
> > > +    n->outstanding_aers++;
> > > +
> > > +    if (!QTAILQ_EMPTY(&n->aer_queue)) {
> > > +        nvme_process_aers(n);
> > > +    }
> > > +
> > > +    return NVME_NO_COMPLETE;
> > > +}
> > 
> > Looks good so far
> > 
> > > +
> > >  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >  {
> > >      trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode);
> > > @@ -1108,6 +1230,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > >          return nvme_set_feature(n, cmd, req);
> > >      case NVME_ADM_CMD_GET_FEATURES:
> > >          return nvme_get_feature(n, cmd, req);
> > > +    case NVME_ADM_CMD_ASYNC_EV_REQ:
> > > +        return nvme_aer(n, cmd, req);
> > >      default:
> > >          trace_pci_nvme_err_invalid_admin_opc(cmd->opcode);
> > >          return NVME_INVALID_OPCODE | NVME_DNR;
> > > @@ -1162,6 +1286,15 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
> > >          }
> > >      }
> > >  
> > > +    while (!QTAILQ_EMPTY(&n->aer_queue)) {
> > > +        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
> > > +        QTAILQ_REMOVE(&n->aer_queue, event, entry);
> > > +        g_free(event);
> > > +    }
> > > +
> > > +    n->aer_queued = 0;
> > > +    n->outstanding_aers = 0;
> > > +
> > >      blk_flush(n->conf.blk);
> > >      n->bar.cc = 0;
> > >  }
> > > @@ -1258,6 +1391,8 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> > >  
> > >      nvme_set_timestamp(n, 0ULL);
> > >  
> > > +    QTAILQ_INIT(&n->aer_queue);
> > > +
> > >      return 0;
> > >  }
> > >  
> > > @@ -1479,6 +1614,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > >                             "completion queue doorbell write"
> > >                             " for nonexistent queue,"
> > >                             " sqid=%"PRIu32", ignoring", qid);
> > > +
> > > +            if (n->outstanding_aers) {
> > > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > > +                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> > > +                                   NVME_LOG_ERROR_INFO);
> > > +            }
> > To be honest I would move the check for outstanding AERs to nvme_enqueue_event.
> > 
> > Also the logic seems a bit off. The code checks that we have outstanding AER requests,
> > however we do have internal AER queue for this situation.
> > It seems that SMART events are generated without this check but ERROR events only when
> > outstanding AERs exist.
> > Could you explain? I am probably forgot something from the spec which I haven't read for long time.
> > 
> 
> I'm pretty sure this has been mentioned before, but I can't find it
> anywhere, maybe it was an internal review...
> 
> Anyway, I'm interpreting the AER logic as a special case for doorbell writes:
> 
> NVM Express v1.3d, Section 4.1 state: "If host software writes an
> invalid value to the Submission Queue Tail Doorbell or Completion Queue
> Head Doorbell regiter and an Asynchronous Event Request command is
> outstanding, then an asynchronous event is posted to the Admin
> Completion Queue with a status code of Invalid Doorbell Write Value."
Ah, they indeed mention this. So let it be like that, it probably really 
doesn't matter anyway.
If you respin the patches, you could add the above as a comment to why this is done
to avoid confusions.

> 
> > > +
> > >              return;
> > >          }
> > >  
> > > @@ -1489,6 +1631,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > >                             " beyond queue size, sqid=%"PRIu32","
> > >                             " new_head=%"PRIu16", ignoring",
> > >                             qid, new_head);
> > > +
> > > +            if (n->outstanding_aers) {
> > > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > > +                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
> > > +                                   NVME_LOG_ERROR_INFO);
> > > +            }
> > > +
> > >              return;
> > >          }
> > >  
> > > @@ -1519,6 +1668,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > >                             "submission queue doorbell write"
> > >                             " for nonexistent queue,"
> > >                             " sqid=%"PRIu32", ignoring", qid);
> > > +
> > > +            if (n->outstanding_aers) {
> > > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > > +                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
> > > +                                   NVME_LOG_ERROR_INFO);
> > > +            }
> > > +
> > >              return;
> > >          }
> > >  
> > > @@ -1529,6 +1685,13 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
> > >                             " beyond queue size, sqid=%"PRIu32","
> > >                             " new_tail=%"PRIu16", ignoring",
> > >                             qid, new_tail);
> > > +
> > > +            if (n->outstanding_aers) {
> > > +                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
> > > +                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
> > > +                                   NVME_LOG_ERROR_INFO);
> > > +            }
> > > +
> > >              return;
> > >          }
> > >  
> > > @@ -1650,6 +1813,7 @@ static void nvme_init_state(NvmeCtrl *n)
> > >      n->temperature = NVME_TEMPERATURE;
> > >      n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
> > >      n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> > > +    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
> > >  }
> > >  
> > >  static void nvme_init_blk(NvmeCtrl *n, Error **errp)
> > > @@ -1805,6 +1969,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> > >       * inconsequential.
> > >       */
> > >      id->acl = 3;
> > > +    id->aerl = n->params.aerl;
> > Name a tiny bit unclear. I know that this is from the spec but still.
> > 
> 
> Yes I know, but I really prefer the spec names if possible (makes it
> easy to look them up).
I understand. I only complained a bit about that name beeing exposed to
outside as a device property. It doesn't matter that much to me though.
> 
> > >      id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
> > >      id->lpa = NVME_LPA_EXTENDED;
> > >  
> > > @@ -1879,6 +2044,7 @@ static void nvme_exit(PCIDevice *pci_dev)
> > >      g_free(n->namespaces);
> > >      g_free(n->cq);
> > >      g_free(n->sq);
> > > +    g_free(n->aer_reqs);
> > >  
> > >      if (n->params.cmb_size_mb) {
> > >          g_free(n->cmbuf);
> > > @@ -1899,6 +2065,8 @@ static Property nvme_props[] = {
> > >      DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
> > >      DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
> > >      DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
> > > +    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
> > So this is number of AERs that we allow the user to be outstanding
> 
> Yeah, and per the spec, 0's based.
> 
> > > +    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> > And this is the number of AERs that we keep in our internal AER queue untill user posts and AER so that we
> > can complete it.
> > 
> 
> Correct.

Yep - this is what I understood after examining all of the patch, but from the names itself it is hard to understand this.
Maybe a comment next to property to at least make it easier for advanced user (e.g user that reads code)
to understand?

(I often end up reading source to understand various qemu device parameters).

Best regards,
	Maxim Levitsky
Klaus Jensen July 29, 2020, 8:08 p.m. UTC | #4
On Jul 29 21:45, Maxim Levitsky wrote:
> On Wed, 2020-07-29 at 15:37 +0200, Klaus Jensen wrote:
> > On Jul 29 13:43, Maxim Levitsky wrote:
> > > On Mon, 2020-07-06 at 08:12 +0200, Klaus Jensen wrote:
> > > > +    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
> > > So this is number of AERs that we allow the user to be outstanding
> > 
> > Yeah, and per the spec, 0's based.
> > 
> > > > +    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> > > And this is the number of AERs that we keep in our internal AER queue untill user posts and AER so that we
> > > can complete it.
> > > 
> > 
> > Correct.
> 
> Yep - this is what I understood after examining all of the patch, but from the names itself it is hard to understand this.
> Maybe a comment next to property to at least make it easier for advanced user (e.g user that reads code)
> to understand?
> 
> (I often end up reading source to understand various qemu device parameters).
> 

I should add this in docs/specs/nvme.txt (which shows up in one of my
next series when I add a new PCI id for the device). For now, I will add
it to the top of the file like the rest of the parameters.

Subsequent series contains a lot more additions of new parameters that
is directly from the spec and to me it really only makes sense that they
share the names if they can.

We could consider having them under a "spec namespace"? So, say, we do
DEFINE_PROP_UINT("spec.aerl", ...)?
Maxim Levitsky July 30, 2020, 8:50 a.m. UTC | #5
On Wed, 2020-07-29 at 22:08 +0200, Klaus Jensen wrote:
> On Jul 29 21:45, Maxim Levitsky wrote:
> > On Wed, 2020-07-29 at 15:37 +0200, Klaus Jensen wrote:
> > > On Jul 29 13:43, Maxim Levitsky wrote:
> > > > On Mon, 2020-07-06 at 08:12 +0200, Klaus Jensen wrote:
> > > > > +    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
> > > > So this is number of AERs that we allow the user to be outstanding
> > > 
> > > Yeah, and per the spec, 0's based.
> > > 
> > > > > +    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
> > > > And this is the number of AERs that we keep in our internal AER queue untill user posts and AER so that we
> > > > can complete it.
> > > > 
> > > 
> > > Correct.
> > 
> > Yep - this is what I understood after examining all of the patch, but from the names itself it is hard to understand this.
> > Maybe a comment next to property to at least make it easier for advanced user (e.g user that reads code)
> > to understand?
> > 
> > (I often end up reading source to understand various qemu device parameters).
> > 
> 
> I should add this in docs/specs/nvme.txt (which shows up in one of my
> next series when I add a new PCI id for the device). For now, I will add
> it to the top of the file like the rest of the parameters.
This is a good idea!
> 
> Subsequent series contains a lot more additions of new parameters that
> is directly from the spec and to me it really only makes sense that they
> share the names if they can.
> 
> We could consider having them under a "spec namespace"? So, say, we do
> DEFINE_PROP_UINT("spec.aerl", ...)?
I personally tend to think that it won't make it much more readable.

Best regards,
	Maxim Levitsky
>
diff mbox series

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 7cb3787638f6..80c7285bc1cf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -356,6 +356,85 @@  static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
     timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
 }
 
+static void nvme_process_aers(void *opaque)
+{
+    NvmeCtrl *n = opaque;
+    NvmeAsyncEvent *event, *next;
+
+    trace_pci_nvme_process_aers(n->aer_queued);
+
+    QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
+        NvmeRequest *req;
+        NvmeAerResult *result;
+
+        /* can't post cqe if there is nothing to complete */
+        if (!n->outstanding_aers) {
+            trace_pci_nvme_no_outstanding_aers();
+            break;
+        }
+
+        /* ignore if masked (cqe posted, but event not cleared) */
+        if (n->aer_mask & (1 << event->result.event_type)) {
+            trace_pci_nvme_aer_masked(event->result.event_type, n->aer_mask);
+            continue;
+        }
+
+        QTAILQ_REMOVE(&n->aer_queue, event, entry);
+        n->aer_queued--;
+
+        n->aer_mask |= 1 << event->result.event_type;
+        n->outstanding_aers--;
+
+        req = n->aer_reqs[n->outstanding_aers];
+
+        result = (NvmeAerResult *) &req->cqe.result;
+        result->event_type = event->result.event_type;
+        result->event_info = event->result.event_info;
+        result->log_page = event->result.log_page;
+        g_free(event);
+
+        req->status = NVME_SUCCESS;
+
+        trace_pci_nvme_aer_post_cqe(result->event_type, result->event_info,
+                                    result->log_page);
+
+        nvme_enqueue_req_completion(&n->admin_cq, req);
+    }
+}
+
+static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
+                               uint8_t event_info, uint8_t log_page)
+{
+    NvmeAsyncEvent *event;
+
+    trace_pci_nvme_enqueue_event(event_type, event_info, log_page);
+
+    if (n->aer_queued == n->params.aer_max_queued) {
+        trace_pci_nvme_enqueue_event_noqueue(n->aer_queued);
+        return;
+    }
+
+    event = g_new(NvmeAsyncEvent, 1);
+    event->result = (NvmeAerResult) {
+        .event_type = event_type,
+        .event_info = event_info,
+        .log_page   = log_page,
+    };
+
+    QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
+    n->aer_queued++;
+
+    nvme_process_aers(n);
+}
+
+static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
+{
+    n->aer_mask &= ~(1 << event_type);
+    if (!QTAILQ_EMPTY(&n->aer_queue)) {
+        nvme_process_aers(n);
+    }
+}
+
 static void nvme_rw_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -606,8 +685,9 @@  static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
     return NVME_SUCCESS;
 }
 
-static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
-                                uint64_t off, NvmeRequest *req)
+static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
+                                uint32_t buf_len, uint64_t off,
+                                NvmeRequest *req)
 {
     uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
     uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
@@ -655,6 +735,10 @@  static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
     smart.power_on_hours[0] =
         cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
 
+    if (!rae) {
+        nvme_clear_events(n, NVME_AER_TYPE_SMART);
+    }
+
     return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
                              prp2);
 }
@@ -681,14 +765,19 @@  static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
                              prp2);
 }
 
-static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
-                                uint64_t off, NvmeRequest *req)
+static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
+                                uint32_t buf_len, uint64_t off,
+                                NvmeRequest *req)
 {
     uint32_t trans_len;
     uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
     uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
     NvmeErrorLog errlog;
 
+    if (!rae) {
+        nvme_clear_events(n, NVME_AER_TYPE_ERROR);
+    }
+
     if (off > sizeof(errlog)) {
         return NVME_INVALID_FIELD | NVME_DNR;
     }
@@ -729,9 +818,9 @@  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 
     switch (lid) {
     case NVME_LOG_ERROR_INFO:
-        return nvme_error_info(n, cmd, len, off, req);
+        return nvme_error_info(n, cmd, rae, len, off, req);
     case NVME_LOG_SMART_INFO:
-        return nvme_smart_info(n, cmd, len, off, req);
+        return nvme_smart_info(n, cmd, rae, len, off, req);
     case NVME_LOG_FW_SLOT_INFO:
         return nvme_fw_log_info(n, cmd, len, off, req);
     default:
@@ -1013,6 +1102,9 @@  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
             ((n->params.max_ioqpairs - 1) << 16);
         trace_pci_nvme_getfeat_numq(result);
         break;
+    case NVME_ASYNCHRONOUS_EVENT_CONF:
+        result = n->features.async_config;
+        break;
     case NVME_TIMESTAMP:
         return nvme_get_feature_timestamp(n, cmd);
     default:
@@ -1064,6 +1156,14 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
             return NVME_INVALID_FIELD | NVME_DNR;
         }
 
+        if (((n->temperature >= n->features.temp_thresh_hi) ||
+            (n->temperature <= n->features.temp_thresh_low)) &&
+            NVME_AEC_SMART(n->features.async_config) & NVME_SMART_TEMPERATURE) {
+            nvme_enqueue_event(n, NVME_AER_TYPE_SMART,
+                               NVME_AER_INFO_SMART_TEMP_THRESH,
+                               NVME_LOG_SMART_INFO);
+        }
+
         break;
     case NVME_VOLATILE_WRITE_CACHE:
         blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
@@ -1076,6 +1176,9 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
                                       ((n->params.max_ioqpairs - 1) << 16));
         break;
+    case NVME_ASYNCHRONOUS_EVENT_CONF:
+        n->features.async_config = dw11;
+        break;
     case NVME_TIMESTAMP:
         return nvme_set_feature_timestamp(n, cmd);
     default:
@@ -1085,6 +1188,25 @@  static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
     return NVME_SUCCESS;
 }
 
+static uint16_t nvme_aer(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+    trace_pci_nvme_aer(nvme_cid(req));
+
+    if (n->outstanding_aers > n->params.aerl) {
+        trace_pci_nvme_aer_aerl_exceeded();
+        return NVME_AER_LIMIT_EXCEEDED;
+    }
+
+    n->aer_reqs[n->outstanding_aers] = req;
+    n->outstanding_aers++;
+
+    if (!QTAILQ_EMPTY(&n->aer_queue)) {
+        nvme_process_aers(n);
+    }
+
+    return NVME_NO_COMPLETE;
+}
+
 static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
 {
     trace_pci_nvme_admin_cmd(nvme_cid(req), nvme_sqid(req), cmd->opcode);
@@ -1108,6 +1230,8 @@  static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
         return nvme_set_feature(n, cmd, req);
     case NVME_ADM_CMD_GET_FEATURES:
         return nvme_get_feature(n, cmd, req);
+    case NVME_ADM_CMD_ASYNC_EV_REQ:
+        return nvme_aer(n, cmd, req);
     default:
         trace_pci_nvme_err_invalid_admin_opc(cmd->opcode);
         return NVME_INVALID_OPCODE | NVME_DNR;
@@ -1162,6 +1286,15 @@  static void nvme_clear_ctrl(NvmeCtrl *n)
         }
     }
 
+    while (!QTAILQ_EMPTY(&n->aer_queue)) {
+        NvmeAsyncEvent *event = QTAILQ_FIRST(&n->aer_queue);
+        QTAILQ_REMOVE(&n->aer_queue, event, entry);
+        g_free(event);
+    }
+
+    n->aer_queued = 0;
+    n->outstanding_aers = 0;
+
     blk_flush(n->conf.blk);
     n->bar.cc = 0;
 }
@@ -1258,6 +1391,8 @@  static int nvme_start_ctrl(NvmeCtrl *n)
 
     nvme_set_timestamp(n, 0ULL);
 
+    QTAILQ_INIT(&n->aer_queue);
+
     return 0;
 }
 
@@ -1479,6 +1614,13 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            "completion queue doorbell write"
                            " for nonexistent queue,"
                            " sqid=%"PRIu32", ignoring", qid);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
+                                   NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1489,6 +1631,13 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            " beyond queue size, sqid=%"PRIu32","
                            " new_head=%"PRIu16", ignoring",
                            qid, new_head);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
+                                   NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1519,6 +1668,13 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            "submission queue doorbell write"
                            " for nonexistent queue,"
                            " sqid=%"PRIu32", ignoring", qid);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                                   NVME_AER_INFO_ERR_INVALID_DB_REGISTER,
+                                   NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1529,6 +1685,13 @@  static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
                            " beyond queue size, sqid=%"PRIu32","
                            " new_tail=%"PRIu16", ignoring",
                            qid, new_tail);
+
+            if (n->outstanding_aers) {
+                nvme_enqueue_event(n, NVME_AER_TYPE_ERROR,
+                                   NVME_AER_INFO_ERR_INVALID_DB_VALUE,
+                                   NVME_LOG_ERROR_INFO);
+            }
+
             return;
         }
 
@@ -1650,6 +1813,7 @@  static void nvme_init_state(NvmeCtrl *n)
     n->temperature = NVME_TEMPERATURE;
     n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
     n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
+    n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
 }
 
 static void nvme_init_blk(NvmeCtrl *n, Error **errp)
@@ -1805,6 +1969,7 @@  static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
      * inconsequential.
      */
     id->acl = 3;
+    id->aerl = n->params.aerl;
     id->frmw = (NVME_NUM_FW_SLOTS << 1) | NVME_FRMW_SLOT1_RO;
     id->lpa = NVME_LPA_EXTENDED;
 
@@ -1879,6 +2044,7 @@  static void nvme_exit(PCIDevice *pci_dev)
     g_free(n->namespaces);
     g_free(n->cq);
     g_free(n->sq);
+    g_free(n->aer_reqs);
 
     if (n->params.cmb_size_mb) {
         g_free(n->cmbuf);
@@ -1899,6 +2065,8 @@  static Property nvme_props[] = {
     DEFINE_PROP_UINT32("num_queues", NvmeCtrl, params.num_queues, 0),
     DEFINE_PROP_UINT32("max_ioqpairs", NvmeCtrl, params.max_ioqpairs, 64),
     DEFINE_PROP_UINT16("msix_qsize", NvmeCtrl, params.msix_qsize, 65),
+    DEFINE_PROP_UINT8("aerl", NvmeCtrl, params.aerl, 3),
+    DEFINE_PROP_UINT32("aer_max_queued", NvmeCtrl, params.aer_max_queued, 64),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 8228978e93de..1837233617bb 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -9,10 +9,12 @@  typedef struct NvmeParams {
     uint32_t max_ioqpairs;
     uint16_t msix_qsize;
     uint32_t cmb_size_mb;
+    uint8_t  aerl;
+    uint32_t aer_max_queued;
 } NvmeParams;
 
 typedef struct NvmeAsyncEvent {
-    QSIMPLEQ_ENTRY(NvmeAsyncEvent) entry;
+    QTAILQ_ENTRY(NvmeAsyncEvent) entry;
     NvmeAerResult result;
 } NvmeAsyncEvent;
 
@@ -94,6 +96,7 @@  typedef struct NvmeCtrl {
     uint32_t    num_namespaces;
     uint32_t    max_q_ents;
     uint64_t    ns_size;
+    uint8_t     outstanding_aers;
     uint8_t     *cmbuf;
     uint32_t    irq_status;
     uint64_t    host_timestamp;                 /* Timestamp sent by the host */
@@ -103,6 +106,11 @@  typedef struct NvmeCtrl {
 
     HostMemoryBackend *pmrdev;
 
+    uint8_t     aer_mask;
+    NvmeRequest **aer_reqs;
+    QTAILQ_HEAD(, NvmeAsyncEvent) aer_queue;
+    int         aer_queued;
+
     NvmeNamespace   *namespaces;
     NvmeSQueue      **sq;
     NvmeCQueue      **cq;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 3330d74e48db..091af16ca7d7 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -51,6 +51,15 @@  pci_nvme_getfeat_numq(int result) "get feature number of queues, result=%d"
 pci_nvme_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
 pci_nvme_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
 pci_nvme_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
+pci_nvme_process_aers(int queued) "queued %d"
+pci_nvme_aer(uint16_t cid) "cid %"PRIu16""
+pci_nvme_aer_aerl_exceeded(void) "aerl exceeded"
+pci_nvme_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 0x%"PRIx8""
+pci_nvme_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
+pci_nvme_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
+pci_nvme_enqueue_event_noqueue(int queued) "queued %d"
+pci_nvme_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
+pci_nvme_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
 pci_nvme_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
 pci_nvme_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
 pci_nvme_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 0x%"PRIx64""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 49ce97ae1ab4..2101292ed5e8 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -597,8 +597,8 @@  enum NvmeAsyncEventRequest {
     NVME_AER_TYPE_SMART                     = 1,
     NVME_AER_TYPE_IO_SPECIFIC               = 6,
     NVME_AER_TYPE_VENDOR_SPECIFIC           = 7,
-    NVME_AER_INFO_ERR_INVALID_SQ            = 0,
-    NVME_AER_INFO_ERR_INVALID_DB            = 1,
+    NVME_AER_INFO_ERR_INVALID_DB_REGISTER   = 0,
+    NVME_AER_INFO_ERR_INVALID_DB_VALUE      = 1,
     NVME_AER_INFO_ERR_DIAG_FAIL             = 2,
     NVME_AER_INFO_ERR_PERS_INTERNAL_ERR     = 3,
     NVME_AER_INFO_ERR_TRANS_INTERNAL_ERR    = 4,
@@ -899,6 +899,10 @@  typedef struct NvmeFeatureVal {
 
 #define NVME_TEMP_TMPTH(temp) ((temp >>  0) & 0xffff)
 
+#define NVME_AEC_SMART(aec)         (aec & 0xff)
+#define NVME_AEC_NS_ATTR(aec)       ((aec >> 8) & 0x1)
+#define NVME_AEC_FW_ACTIVATION(aec) ((aec >> 9) & 0x1)
+
 enum NvmeFeatureIds {
     NVME_ARBITRATION                = 0x1,
     NVME_POWER_MANAGEMENT           = 0x2,