
[RFC,1/4] hw/block/nvme: convert dsm to aiocb

Message ID 20210302111040.289244-2-its@irrelevant.dk
State New
Series hw/block/nvme: convert ad-hoc aio tracking to aiocbs

Commit Message

Klaus Jensen March 2, 2021, 11:10 a.m. UTC
From: Klaus Jensen <k.jensen@samsung.com>

Convert Dataset Management (DSM) from ad-hoc tracking of multiple AIOs
to standard QEMU AIOCB processing.

Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
---
 hw/block/nvme.c       | 187 ++++++++++++++++++++++++++++--------------
 hw/block/trace-events |   2 +-
 2 files changed, 125 insertions(+), 64 deletions(-)

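For readers new to the pattern this series adopts: a QEMU AIOCB-based
operation embeds a BlockAIOCB as the first member of a driver-specific
struct, allocates it with blk_aio_get(), and defers final completion to
a bottom half so the caller's callback never runs from inside an AIO
callback. A minimal sketch of that lifecycle, using only APIs that also
appear in the patch below (the MyAIOCB names are illustrative, not part
of this series):

    /* illustrative sketch only -- not code from this series */
    typedef struct MyAIOCB {
        BlockAIOCB common;  /* must be first: blk_aio_get() returns it */
        BlockAIOCB *aiocb;  /* currently outstanding child request */
        QEMUBH *bh;         /* defers completion out of the aio callback */
        int ret;
    } MyAIOCB;

    static const AIOCBInfo my_aiocb_info = {
        .aiocb_size   = sizeof(MyAIOCB),
        /* .cancel_async = my_cancel,  -- cf. nvme_dsm_cancel() below */
    };

    static void my_bh(void *opaque)
    {
        MyAIOCB *iocb = opaque;

        /* complete towards the caller, then drop the BH and our ref */
        iocb->common.cb(iocb->common.opaque, iocb->ret);
        qemu_bh_delete(iocb->bh);
        iocb->bh = NULL;
        qemu_aio_unref(iocb);
    }

nvme_dsm_bh() and nvme_dsm_aiocb_info in the patch follow exactly this
shape, with the extra state needed to walk the DSM range list.
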
Comments

Stefan Hajnoczi March 8, 2021, 4:37 p.m. UTC | #1
On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> +{
> +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> +
> +    /* break loop */
> +    iocb->curr.len = 0;
> +    iocb->curr.idx = iocb->nr;
> +
> +    iocb->ret = -ECANCELED;
> +
> +    if (iocb->aiocb) {
> +        blk_aio_cancel_async(iocb->aiocb);
> +        iocb->aiocb = NULL;
> +    }
> +}

Is the case where iocb->aiocb == NULL just there in case nvme_dsm_cancel()
is called after the last discard has completed but before the BH runs? I
want to make sure there are no other cases, because otherwise nothing
would call iocb->common.cb().

>  static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
>  {
>      NvmeNamespace *ns = req->ns;
>      NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> -
>      uint32_t attr = le32_to_cpu(dsm->attributes);
>      uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> -
>      uint16_t status = NVME_SUCCESS;
>  
>      trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
>  
>      if (attr & NVME_DSMGMT_AD) {
> -        int64_t offset;
> -        size_t len;
> -        NvmeDsmRange range[nr];
> -        uintptr_t *discards = (uintptr_t *)&req->opaque;
> +        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
> +                                         nvme_misc_cb, req);
>  
> -        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
> +        iocb->req = req;
> +        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
> +        iocb->ret = 0;
> +        iocb->range = g_new(NvmeDsmRange, nr);
> +        iocb->nr = nr;
> +        iocb->curr.len = 0;
> +        iocb->curr.idx = 0;
> +
> +        status = nvme_dma(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
>                            DMA_DIRECTION_TO_DEVICE, req);
>          if (status) {
>              return status;
>          }
>  
> -        /*
> -         * AIO callbacks may be called immediately, so initialize discards to 1
> -         * to make sure the the callback does not complete the request before
> -         * all discards have been issued.
> -         */
> -        *discards = 1;
> +        nvme_dsm_aio_cb(iocb, 0);
> +        req->aiocb = &iocb->common;

Want to move this line up one just in case something in
nvme_dsm_aio_cb() accesses req->aiocb?
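
The window behind Stefan's first question is visible in the patch's
completion path: when the final discard finishes, nvme_dsm_aio_cb()
takes its done: label, clears iocb->aiocb and schedules the BH. A
cancellation landing in that window finds iocb->aiocb == NULL and must
rely on the already-scheduled nvme_dsm_bh() to invoke iocb->common.cb().
The relevant excerpt (quoted from the full patch below):

    done:
        iocb->aiocb = NULL;
        if (iocb->bh) {
            qemu_bh_schedule(iocb->bh);
        }
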
Klaus Jensen March 8, 2021, 6:05 p.m. UTC | #2
On Mar  8 16:37, Stefan Hajnoczi wrote:
> On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> > +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> > +{
> > +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> > +
> > +    /* break loop */
> > +    iocb->curr.len = 0;
> > +    iocb->curr.idx = iocb->nr;
> > +
> > +    iocb->ret = -ECANCELED;
> > +
> > +    if (iocb->aiocb) {
> > +        blk_aio_cancel_async(iocb->aiocb);
> > +        iocb->aiocb = NULL;
> > +    }
> > +}
> 
> Is the case where iocb->aiocb == NULL just there in case nvme_dsm_cancel()
> is called after the last discard has completed but before the BH runs? I
> want to make sure there are no other cases, because otherwise nothing
> would call iocb->common.cb().
> 

Yes - that case *can* happen, right?

I modeled this after the approach in the IDE trim code (hw/ide/core.c).

> >  static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      NvmeNamespace *ns = req->ns;
> >      NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
> > -
> >      uint32_t attr = le32_to_cpu(dsm->attributes);
> >      uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
> > -
> >      uint16_t status = NVME_SUCCESS;
> >  
> >      trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
> >  
> >      if (attr & NVME_DSMGMT_AD) {
> > -        int64_t offset;
> > -        size_t len;
> > -        NvmeDsmRange range[nr];
> > -        uintptr_t *discards = (uintptr_t *)&req->opaque;
> > +        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
> > +                                         nvme_misc_cb, req);
> >  
> > -        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
> > +        iocb->req = req;
> > +        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
> > +        iocb->ret = 0;
> > +        iocb->range = g_new(NvmeDsmRange, nr);
> > +        iocb->nr = nr;
> > +        iocb->curr.len = 0;
> > +        iocb->curr.idx = 0;
> > +
> > +        status = nvme_dma(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
> >                            DMA_DIRECTION_TO_DEVICE, req);
> >          if (status) {
> >              return status;
> >          }
> >  
> > -        /*
> > -         * AIO callbacks may be called immediately, so initialize discards to 1
> > -         * to make sure the the callback does not complete the request before
> > -         * all discards have been issued.
> > -         */
> > -        *discards = 1;
> > +        nvme_dsm_aio_cb(iocb, 0);
> > +        req->aiocb = &iocb->common;
> 
> Want to move this line up one just in case something in
> nvme_dsm_aio_cb() accesses req->aiocb?

Sounds reasonable! Thanks!
Stefan Hajnoczi March 9, 2021, 4:03 p.m. UTC | #3
On Mon, Mar 08, 2021 at 07:05:40PM +0100, Klaus Jensen wrote:
> On Mar  8 16:37, Stefan Hajnoczi wrote:
> > On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> > > +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> > > +{
> > > +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> > > +
> > > +    /* break loop */
> > > +    iocb->curr.len = 0;
> > > +    iocb->curr.idx = iocb->nr;
> > > +
> > > +    iocb->ret = -ECANCELED;
> > > +
> > > +    if (iocb->aiocb) {
> > > +        blk_aio_cancel_async(iocb->aiocb);
> > > +        iocb->aiocb = NULL;
> > > +    }
> > > +}
> > 
> > Is the case where iocb->aiocb == NULL just there in case nvme_dsm_cancel()
> > is called after the last discard has completed but before the BH runs? I
> > want to make sure there are no other cases, because otherwise nothing
> > would call iocb->common.cb().
> > 
> 
> Yes - that case *can* happen, right?
> 
> I modeled this after the approach in the IDE trim code (hw/ide/core.c).

Yes, nvme_dsm_bh() may run after other event loop activity. Therefore we
have to take the iocb->aiocb == NULL case into account because some
event loop activity could call nvme_dsm_cancel() before the BH runs.

Another (wild?) possibility is that nvme_dsm_cancel() is called twice.
That's okay, nvme_dsm_cancel() supports that nicely.

But I wasn't sure if there are any other cases where iocb->aiocb can be
NULL? It could be nice to include an assertion or comment to clarify
this. For example:

  if (iocb->aiocb) {
      blk_aio_cancel_async(iocb->aiocb);
      iocb->aiocb = NULL;
  } else {
      /*
       * We only get here if nvme_dsm_cancel() was already called or
       * nvme_dsm_bh() is about to run.
       */
      assert(iocb->curr.idx == iocb->nr);
  }

  /* break loop */
  iocb->curr.len = 0;
  iocb->curr.idx = iocb->nr;

  iocb->ret = -ECANCELED;

(I'm not sure if my assert is correct, but hopefully this explains what
I mean.)

The reason this assertion is important is that nvme_dsm_cancel()
does not support other iocb->aiocb == NULL cases. The cancelled request
could hang if nothing completes it. The assertion will complain loudly
if this ever happens (maybe not now, but if someone changes the code in
the future).

Stefan
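
Folding that suggestion back in, a v2 nvme_dsm_cancel() would presumably
look something like this (a sketch assembled from Stefan's proposal
above, not code from this series, and with his caveat that the assertion
may need adjusting):

    static void nvme_dsm_cancel(BlockAIOCB *aiocb)
    {
        NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);

        if (iocb->aiocb) {
            blk_aio_cancel_async(iocb->aiocb);
            iocb->aiocb = NULL;
        } else {
            /*
             * We only get here if nvme_dsm_cancel() was already called
             * or nvme_dsm_bh() is about to run.
             */
            assert(iocb->curr.idx == iocb->nr);
        }

        /* break the loop in nvme_dsm_aio_cb() */
        iocb->curr.len = 0;
        iocb->curr.idx = iocb->nr;

        iocb->ret = -ECANCELED;
    }
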
Klaus Jensen March 9, 2021, 6:27 p.m. UTC | #4
On Mar  9 16:03, Stefan Hajnoczi wrote:
> On Mon, Mar 08, 2021 at 07:05:40PM +0100, Klaus Jensen wrote:
> > On Mar  8 16:37, Stefan Hajnoczi wrote:
> > > On Tue, Mar 02, 2021 at 12:10:37PM +0100, Klaus Jensen wrote:
> > > > +static void nvme_dsm_cancel(BlockAIOCB *aiocb)
> > > > +{
> > > > +    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
> > > > +
> > > > +    /* break loop */
> > > > +    iocb->curr.len = 0;
> > > > +    iocb->curr.idx = iocb->nr;
> > > > +
> > > > +    iocb->ret = -ECANCELED;
> > > > +
> > > > +    if (iocb->aiocb) {
> > > > +        blk_aio_cancel_async(iocb->aiocb);
> > > > +        iocb->aiocb = NULL;
> > > > +    }
> > > > +}
> > > 
> > > Is the case where iocb->aiocb == NULL just there in case nvme_dsm_cancel()
> > > is called after the last discard has completed but before the BH runs? I
> > > want to make sure there are no other cases, because otherwise nothing
> > > would call iocb->common.cb().
> > > 
> > 
> > Yes - that case *can* happen, right?
> > 
> > I modeled this after the approach in the IDE trim code (hw/ide/core.c).
> 
> Yes, nvme_dsm_bh() may run after other event loop activity. Therefore we
> have to take the iocb->aiocb == NULL case into account because some
> event loop activity could call nvme_dsm_cancel() before the BH runs.
> 
> Another (wild?) possibility is that nvme_dsm_cancel() is called twice.
> That's okay, nvme_dsm_cancel() supports that nicely.
> 
> But I wasn't sure if there are any other cases where iocb->aiocb can be
> NULL? It could be nice to include an assertion or comment to clarify
> this. For example:
> 
>   if (iocb->aiocb) {
>       blk_aio_cancel_async(iocb->aiocb);
>       iocb->aiocb = NULL;
>   } else {
>       /*
>        * We only get here if nvme_dsm_cancel() was already called or
>        * nvme_dsm_bh() is about to run.
>        */
>       assert(iocb->curr.idx == iocb->nr);
>   }
> 
>   /* break loop */
>   iocb->curr.len = 0;
>   iocb->curr.idx = iocb->nr;
> 
>   iocb->ret = -ECANCELED;
> 
> (I'm not sure if my assert is correct, but hopefully this explains what
> I mean.)
> 

Understood! I'll fix that up.

> The reason this assertion is important is that nvme_dsm_cancel()
> does not support other iocb->aiocb == NULL cases. The cancelled request
> could hang if nothing completes it. The assertion will complain loudly
> if this ever happens (maybe not now, but if someone changes the code in
> the future).
> 

Yeah, I understand that there is a risk of deadlock due to "weird"
scheduling if one is not careful.

Thanks Stefan, these kinds of comments are super helpful when trying to
wrap one's head around this!

I'll give it another spin and post a v2 taking your comments into
account :)
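
The other change queued for v2 is the ordering fix from comment #1:
assigning req->aiocb before the first call into the state machine, so
that nothing reached from nvme_dsm_aio_cb() can observe a stale
req->aiocb. In nvme_dsm() that would presumably read (sketch, not code
from this series):

        req->aiocb = &iocb->common;
        nvme_dsm_aio_cb(iocb, 0);

        return NVME_NO_COMPLETE;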

Patch

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 54c87c8f5fe3..8830d72b959f 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1494,23 +1494,16 @@  static void nvme_aio_flush_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
-static void nvme_aio_discard_cb(void *opaque, int ret)
+static void nvme_misc_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
-    uintptr_t *discards = (uintptr_t *)&req->opaque;
 
-    trace_pci_nvme_aio_discard_cb(nvme_cid(req));
+    trace_pci_nvme_misc_cb(nvme_cid(req));
 
     if (ret) {
         nvme_aio_err(req, ret);
     }
 
-    (*discards)--;
-
-    if (*discards) {
-        return;
-    }
-
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
@@ -1736,78 +1729,146 @@  out:
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+typedef struct NvmeDSMAIOCB {
+    BlockAIOCB common;
+    BlockAIOCB *aiocb;
+    NvmeRequest *req;
+    QEMUBH *bh;
+    int ret;
+
+    NvmeDsmRange *range;
+    int nr;
+    struct {
+        int64_t offset;
+        size_t  len;
+        int     idx;
+    } curr;
+} NvmeDSMAIOCB;
+
+static void nvme_dsm_cancel(BlockAIOCB *aiocb)
+{
+    NvmeDSMAIOCB *iocb = container_of(aiocb, NvmeDSMAIOCB, common);
+
+    /* break loop */
+    iocb->curr.len = 0;
+    iocb->curr.idx = iocb->nr;
+
+    iocb->ret = -ECANCELED;
+
+    if (iocb->aiocb) {
+        blk_aio_cancel_async(iocb->aiocb);
+        iocb->aiocb = NULL;
+    }
+}
+
+static const AIOCBInfo nvme_dsm_aiocb_info = {
+    .aiocb_size   = sizeof(NvmeDSMAIOCB),
+    .cancel_async = nvme_dsm_cancel,
+};
+
+static void nvme_dsm_bh(void *opaque)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+
+    iocb->common.cb(iocb->common.opaque, iocb->ret);
+
+    qemu_bh_delete(iocb->bh);
+    iocb->bh = NULL;
+    qemu_aio_unref(iocb);
+}
+
+static void nvme_dsm_aio_cb(void *opaque, int ret)
+{
+    NvmeDSMAIOCB *iocb = opaque;
+    NvmeRequest *req = iocb->req;
+    NvmeCtrl *n = nvme_ctrl(req);
+    NvmeNamespace *ns = req->ns;
+    NvmeDsmRange *range;
+    uint64_t slba;
+    uint32_t nlb;
+    size_t bytes;
+
+    if (ret < 0) {
+        iocb->ret = ret;
+        goto done;
+    }
+
+    if (iocb->curr.len == 0) {
+next:
+        if (iocb->curr.idx == iocb->nr) {
+            goto done;
+        }
+
+        range = &iocb->range[iocb->curr.idx++];
+        slba = le64_to_cpu(range->slba);
+        nlb = le32_to_cpu(range->nlb);
+
+        trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
+                                      nlb);
+
+        if (nlb > n->dmrsl) {
+            trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
+        }
+
+        if (nvme_check_bounds(ns, slba, nlb)) {
+            trace_pci_nvme_err_invalid_lba_range(slba, nlb,
+                                                 ns->id_ns.nsze);
+            goto next;
+        }
+
+        iocb->curr.offset = nvme_l2b(ns, slba);
+        iocb->curr.len = nvme_l2b(ns, nlb);
+    }
+
+    bytes = MIN(BDRV_REQUEST_MAX_BYTES, iocb->curr.len);
+
+    iocb->aiocb = blk_aio_pdiscard(ns->blkconf.blk, iocb->curr.offset, bytes,
+                                   nvme_dsm_aio_cb, iocb);
+
+    iocb->curr.offset += bytes;
+    iocb->curr.len -= bytes;
+
+    return;
+
+done:
+    iocb->aiocb = NULL;
+    if (iocb->bh) {
+        qemu_bh_schedule(iocb->bh);
+    }
+}
+
 static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeNamespace *ns = req->ns;
     NvmeDsmCmd *dsm = (NvmeDsmCmd *) &req->cmd;
-
     uint32_t attr = le32_to_cpu(dsm->attributes);
     uint32_t nr = (le32_to_cpu(dsm->nr) & 0xff) + 1;
-
     uint16_t status = NVME_SUCCESS;
 
     trace_pci_nvme_dsm(nvme_cid(req), nvme_nsid(ns), nr, attr);
 
     if (attr & NVME_DSMGMT_AD) {
-        int64_t offset;
-        size_t len;
-        NvmeDsmRange range[nr];
-        uintptr_t *discards = (uintptr_t *)&req->opaque;
+        NvmeDSMAIOCB *iocb = blk_aio_get(&nvme_dsm_aiocb_info, ns->blkconf.blk,
+                                         nvme_misc_cb, req);
 
-        status = nvme_dma(n, (uint8_t *)range, sizeof(range),
+        iocb->req = req;
+        iocb->bh = qemu_bh_new(nvme_dsm_bh, iocb);
+        iocb->ret = 0;
+        iocb->range = g_new(NvmeDsmRange, nr);
+        iocb->nr = nr;
+        iocb->curr.len = 0;
+        iocb->curr.idx = 0;
+
+        status = nvme_dma(n, (uint8_t *)iocb->range, sizeof(NvmeDsmRange) * nr,
                           DMA_DIRECTION_TO_DEVICE, req);
         if (status) {
             return status;
         }
 
-        /*
-         * AIO callbacks may be called immediately, so initialize discards to 1
-         * to make sure the the callback does not complete the request before
-         * all discards have been issued.
-         */
-        *discards = 1;
+        nvme_dsm_aio_cb(iocb, 0);
+        req->aiocb = &iocb->common;
 
-        for (int i = 0; i < nr; i++) {
-            uint64_t slba = le64_to_cpu(range[i].slba);
-            uint32_t nlb = le32_to_cpu(range[i].nlb);
-
-            if (nvme_check_bounds(ns, slba, nlb)) {
-                trace_pci_nvme_err_invalid_lba_range(slba, nlb,
-                                                     ns->id_ns.nsze);
-                continue;
-            }
-
-            trace_pci_nvme_dsm_deallocate(nvme_cid(req), nvme_nsid(ns), slba,
-                                          nlb);
-
-            if (nlb > n->dmrsl) {
-                trace_pci_nvme_dsm_single_range_limit_exceeded(nlb, n->dmrsl);
-            }
-
-            offset = nvme_l2b(ns, slba);
-            len = nvme_l2b(ns, nlb);
-
-            while (len) {
-                size_t bytes = MIN(BDRV_REQUEST_MAX_BYTES, len);
-
-                (*discards)++;
-
-                blk_aio_pdiscard(ns->blkconf.blk, offset, bytes,
-                                 nvme_aio_discard_cb, req);
-
-                offset += bytes;
-                len -= bytes;
-            }
-        }
-
-        /* account for the 1-initialization */
-        (*discards)--;
-
-        if (*discards) {
-            status = NVME_NO_COMPLETE;
-        } else {
-            status = req->status;
-        }
+        return NVME_NO_COMPLETE;
     }
 
     return status;
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 8deeacc8c35c..0e5bddbdd48b 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -54,7 +54,7 @@  pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb
 pci_nvme_dsm_single_range_limit_exceeded(uint32_t nlb, uint32_t dmrsl) "nlb %"PRIu32" dmrsl %"PRIu32""
 pci_nvme_compare(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba 0x%"PRIx64" nlb %"PRIu32""
 pci_nvme_compare_cb(uint16_t cid) "cid %"PRIu16""
-pci_nvme_aio_discard_cb(uint16_t cid) "cid %"PRIu16""
+pci_nvme_misc_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_copy_in_cb(uint16_t cid) "cid %"PRIu16""
 pci_nvme_aio_zone_reset_cb(uint16_t cid, uint64_t zslba) "cid %"PRIu16" zslba 0x%"PRIx64""
 pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"