Message ID | 20240606122444.2914576-2-luchangqi.123@bytedance.com |
---|---|
State | New |
Headers | show |
Series | Support persistent reservation operations | expand |
On Thu, Jun 06, 2024 at 08:24:35PM +0800, Changqi Lu wrote: > Add persistent reservation in/out operations > at the block level. The following operations > are included: > > - read_keys: retrieves the list of registered keys. > - read_reservation: retrieves the current reservation status. > - register: registers a new reservation key. > - reserve: initiates a reservation for a specific key. > - release: releases a reservation for a specific key. > - clear: clears all existing reservations. > - preempt: preempts a reservation held by another key. > > Signed-off-by: Changqi Lu <luchangqi.123@bytedance.com> > Signed-off-by: zhenwei pi <pizhenwei@bytedance.com> > --- > block/block-backend.c | 397 ++++++++++++++++++++++++++++++ > block/io.c | 163 ++++++++++++ > include/block/block-common.h | 40 +++ > include/block/block-io.h | 20 ++ > include/block/block_int-common.h | 84 +++++++ > include/sysemu/block-backend-io.h | 24 ++ > 6 files changed, 728 insertions(+) > > diff --git a/block/block-backend.c b/block/block-backend.c > index db6f9b92a3..6707d94df7 100644 > --- a/block/block-backend.c > +++ b/block/block-backend.c > @@ -1770,6 +1770,403 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, > return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); > } > > +typedef struct BlkPrInCo { > + BlockBackend *blk; > + uint32_t *generation; > + uint32_t num_keys; > + BlockPrType *type; > + uint64_t *keys; > + int ret; > +} BlkPrInCo; > + > +typedef struct BlkPrInCB { > + BlockAIOCB common; > + BlkPrInCo prco; > + bool has_returned; > +} BlkPrInCB; > + > +static const AIOCBInfo blk_pr_in_aiocb_info = { > + .aiocb_size = sizeof(BlkPrInCB), > +}; > + > +static void blk_pr_in_complete(BlkPrInCB *acb) > +{ > + if (acb->has_returned) { > + acb->common.cb(acb->common.opaque, acb->prco.ret); > + blk_dec_in_flight(acb->prco.blk); Did you receive my replies to v1 of this patch series? 
Please take a look at them and respond: https://lore.kernel.org/qemu-devel/20240508093629.441057-1-luchangqi.123@bytedance.com/ Thanks, Stefan > + qemu_aio_unref(acb); > + } > +} > + > +static void blk_pr_in_complete_bh(void *opaque) > +{ > + BlkPrInCB *acb = opaque; > + assert(acb->has_returned); > + blk_pr_in_complete(acb); > +} > + > +static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation, > + uint32_t num_keys, BlockPrType *type, > + uint64_t *keys, CoroutineEntry co_entry, > + BlockCompletionFunc *cb, void *opaque) > +{ > + BlkPrInCB *acb; > + Coroutine *co; > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque); > + acb->prco = (BlkPrInCo) { > + .blk = blk, > + .generation = generation, > + .num_keys = num_keys, > + .type = type, > + .ret = NOT_DONE, > + .keys = keys, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(co_entry, acb); > + aio_co_enter(qemu_get_current_aio_context(), co); > + > + acb->has_returned = true; > + if (acb->prco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), > + blk_pr_in_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation, > + uint32_t num_keys, uint64_t *keys) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys); > +} > + > +static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque) > +{ > + BlkPrInCB *acb = opaque; > + BlkPrInCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation, > + prco->num_keys, prco->keys); > + blk_pr_in_complete(acb); > +} > + > +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t 
*generation, > + uint32_t num_keys, uint64_t *keys, > + BlockCompletionFunc *cb, void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_in(blk, generation, num_keys, NULL, keys, > + blk_aio_pr_read_keys_entry, cb, opaque); > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation, > + uint64_t *key, BlockPrType *type) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type); > +} > + > +static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque) > +{ > + BlkPrInCB *acb = opaque; > + BlkPrInCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation, > + prco->keys, prco->type); > + blk_pr_in_complete(acb); > +} > + > +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation, > + uint64_t *key, BlockPrType *type, > + BlockCompletionFunc *cb, void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_in(blk, generation, 0, type, key, > + blk_aio_pr_read_reservation_entry, cb, opaque); > +} > + > +typedef struct BlkPrOutCo { > + BlockBackend *blk; > + uint64_t old_key; > + uint64_t new_key; > + bool ptpl; > + BlockPrType type; > + bool ignore_key; > + bool abort; > + int ret; > +} BlkPrOutCo; > + > +typedef struct BlkPrOutCB { > + BlockAIOCB common; > + BlkPrOutCo prco; > + bool has_returned; > +} BlkPrOutCB; > + > +static const AIOCBInfo blk_pr_out_aiocb_info = { > + .aiocb_size = sizeof(BlkPrOutCB), > +}; > + > +static void blk_pr_out_complete(BlkPrOutCB *acb) > +{ > + if (acb->has_returned) { > + acb->common.cb(acb->common.opaque, acb->prco.ret); > + blk_dec_in_flight(acb->prco.blk); > + qemu_aio_unref(acb); > + } > +} > + > +static void blk_pr_out_complete_bh(void *opaque) > +{ > + BlkPrOutCB 
*acb = opaque; > + assert(acb->has_returned); > + blk_pr_out_complete(acb); > +} > + > +static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key, > + uint64_t new_key, bool ptpl, > + BlockPrType type, bool ignore_key, > + bool abort, CoroutineEntry co_entry, > + BlockCompletionFunc *cb, void *opaque) > +{ > + BlkPrOutCB *acb; > + Coroutine *co; > + > + blk_inc_in_flight(blk); > + acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque); > + acb->prco = (BlkPrOutCo) { > + .blk = blk, > + .old_key = old_key, > + .new_key = new_key, > + .ptpl = ptpl, > + .type = type, > + .ignore_key = ignore_key, > + .abort = abort, > + .ret = NOT_DONE, > + }; > + acb->has_returned = false; > + > + co = qemu_coroutine_create(co_entry, acb); > + aio_co_enter(qemu_get_current_aio_context(), co); > + > + acb->has_returned = true; > + if (acb->prco.ret != NOT_DONE) { > + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), > + blk_pr_out_complete_bh, acb); > + } > + > + return &acb->common; > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key, > + uint64_t new_key, BlockPrType type, > + bool ptpl, bool ignore_key) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type, > + ptpl, ignore_key); > +} > + > +static void coroutine_fn blk_aio_pr_register_entry(void *opaque) > +{ > + BlkPrOutCB *acb = opaque; > + BlkPrOutCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, prco->new_key, > + prco->type, prco->ptpl, > + prco->ignore_key); > + blk_pr_out_complete(acb); > +} > + > +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key, > + uint64_t new_key, BlockPrType type, > + bool ptpl, bool ignore_key, > + BlockCompletionFunc *cb, > 
+ void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_out(blk, old_key, new_key, ptpl, type, ignore_key, false, > + blk_aio_pr_register_entry, cb, opaque); > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_reserve(blk_bs(blk), key, type); > +} > + > +static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque) > +{ > + BlkPrOutCB *acb = opaque; > + BlkPrOutCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key, > + prco->type); > + blk_pr_out_complete(acb); > +} > + > + > +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key, > + BlockPrType type, > + BlockCompletionFunc *cb, > + void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_out(blk, key, 0, false, type, false, false, > + blk_aio_pr_reserve_entry, cb, opaque); > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_release(blk_bs(blk), key, type); > +} > + > +static void coroutine_fn blk_aio_pr_release_entry(void *opaque) > +{ > + BlkPrOutCB *acb = opaque; > + BlkPrOutCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type); > + blk_pr_out_complete(acb); > +} > + > + > +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key, > + BlockPrType type, BlockCompletionFunc *cb, > + void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_out(blk, key, 0, false, type, false, false, > + 
blk_aio_pr_release_entry, cb, opaque); > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_clear(blk_bs(blk), key); > +} > + > +static void coroutine_fn blk_aio_pr_clear_entry(void *opaque) > +{ > + BlkPrOutCB *acb = opaque; > + BlkPrOutCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key); > + blk_pr_out_complete(acb); > +} > + > + > +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key, > + BlockCompletionFunc *cb, void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_out(blk, key, 0, false, 0, false, false, > + blk_aio_pr_clear_entry, cb, opaque); > +} > + > +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ > +static int coroutine_fn > +blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t cr_key, > + uint64_t pr_key, BlockPrType type, bool abort) > +{ > + IO_CODE(); > + > + blk_wait_while_drained(blk); > + GRAPH_RDLOCK_GUARD(); > + > + if (!blk_co_is_available(blk)) { > + return -ENOMEDIUM; > + } > + > + return bdrv_co_pr_preempt(blk_bs(blk), cr_key, pr_key, type, abort); > +} > + > +static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque) > +{ > + BlkPrOutCB *acb = opaque; > + BlkPrOutCo *prco = &acb->prco; > + > + prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key, > + prco->new_key, prco->type, > + prco->abort); > + blk_pr_out_complete(acb); > +} > + > + > +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key, > + uint64_t pr_key, BlockPrType type, > + bool abort, BlockCompletionFunc *cb, > + void *opaque) > +{ > + IO_CODE(); > + return blk_aio_pr_out(blk, cr_key, pr_key, false, type, false, abort, > + blk_aio_pr_preempt_entry, cb, opaque); > +} > + > /* To be called between 
exactly one pair of blk_inc/dec_in_flight() */ > static int coroutine_fn > blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes) > diff --git a/block/io.c b/block/io.c > index 7217cf811b..87a363c94f 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -3220,6 +3220,169 @@ out: > return co.ret; > } > > +int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs, > + uint32_t *generation, uint32_t num_keys, > + uint64_t *keys) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_read_keys) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs, > + uint32_t *generation, uint64_t *key, BlockPrType *type) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_read_reservation) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key, > + uint64_t new_key, BlockPrType type, bool ptpl, > + bool ignore_key) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_register) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type, > + ptpl, ignore_key); > +out: > + 
bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, > + BlockPrType type) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_reserve) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = drv->bdrv_co_pr_reserve(bs, key, type); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, > + BlockPrType type) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_release) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = drv->bdrv_co_pr_release(bs, key, type); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_clear) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = drv->bdrv_co_pr_clear(bs, key); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > +int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, > + uint64_t pr_key, BlockPrType type, bool abort) > +{ > + BlockDriver *drv = bs->drv; > + CoroutineIOCompletion co = { > + .coroutine = qemu_coroutine_self(), > + }; > + > + IO_CODE(); > + assert_bdrv_graph_readable(); > + > + bdrv_inc_in_flight(bs); > + if (!drv || !drv->bdrv_co_pr_preempt) { > + co.ret = -ENOTSUP; > + goto out; > + } > + > + co.ret = 
drv->bdrv_co_pr_preempt(bs, cr_key, pr_key, type, abort); > +out: > + bdrv_dec_in_flight(bs); > + return co.ret; > +} > + > int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, > unsigned int *nr_zones, > BlockZoneDescriptor *zones) > diff --git a/include/block/block-common.h b/include/block/block-common.h > index a846023a09..7ca4e2328f 100644 > --- a/include/block/block-common.h > +++ b/include/block/block-common.h > @@ -524,6 +524,46 @@ typedef enum { > BDRV_FIX_ERRORS = 2, > } BdrvCheckMode; > > +/** > + * According SCSI protocol(chapter 5.9 of SCSI Primary Commands - 4) > + * and NVMe protocol(chapter 7.2 of NVMe Base Specification 2.0), > + * the persistent reservation types and persistent capabilities of > + * the public layer block are abstracted. > + */ > +typedef enum { > + BLK_PR_WRITE_EXCLUSIVE = 0x1, > + BLK_PR_EXCLUSIVE_ACCESS = 0x2, > + BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY = 0x3, > + BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY = 0x4, > + BLK_PR_WRITE_EXCLUSIVE_ALL_REGS = 0x5, > + BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS = 0x6, > +} BlockPrType; > + > +typedef enum BLKPrCap { > + /* Persist Through Power Loss */ > + BLK_PR_CAP_PTPL = 1 << 0, > + /* Write Exclusive reservation type */ > + BLK_PR_CAP_WR_EX = 1 << 1, > + /* Exclusive Access reservation type */ > + BLK_PR_CAP_EX_AC = 1 << 2, > + /* Write Exclusive Registrants Only reservation type */ > + BLK_PR_CAP_WR_EX_RO = 1 << 3, > + /* Exclusive Access Registrants Only reservation type */ > + BLK_PR_CAP_EX_AC_RO = 1 << 4, > + /* Write Exclusive All Registrants reservation type */ > + BLK_PR_CAP_WR_EX_AR = 1 << 5, > + /* Exclusive Access All Registrants reservation type */ > + BLK_PR_CAP_EX_AC_AR = 1 << 6, > + > + BLK_PR_CAP_ALL = (BLK_PR_CAP_PTPL | > + BLK_PR_CAP_WR_EX | > + BLK_PR_CAP_EX_AC | > + BLK_PR_CAP_WR_EX_RO | > + BLK_PR_CAP_EX_AC_RO | > + BLK_PR_CAP_WR_EX_AR | > + BLK_PR_CAP_EX_AC_AR), > +} BLKPrCap; > + > typedef struct BlockSizes { > uint32_t phys; > uint32_t log; > diff --git 
a/include/block/block-io.h b/include/block/block-io.h > index b49e0537dd..908361862b 100644 > --- a/include/block/block-io.h > +++ b/include/block/block-io.h > @@ -106,6 +106,26 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb); > int coroutine_fn GRAPH_RDLOCK > bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); > > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation, > + uint32_t num_keys, uint64_t *keys); > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation, > + uint64_t *key, BlockPrType *type); > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key, > + uint64_t new_key, BlockPrType type, > + bool ptpl, bool ignore_key); > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type); > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type); > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key); > +int coroutine_fn GRAPH_RDLOCK > +bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, uint64_t pr_key, > + BlockPrType type, bool abort); > + > /* Ensure contents are flushed to disk. */ > int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs); > > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h > index 761276127e..6e628069e9 100644 > --- a/include/block/block_int-common.h > +++ b/include/block/block_int-common.h > @@ -766,6 +766,87 @@ struct BlockDriver { > int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)( > BlockDriverState *bs, unsigned long int req, void *buf); > > + /* > + * Persistent reservation series api. > + * Please refer to chapter 5.9 of SCSI Primary Commands - 4 or > + * chapter 7 of NVMe Base Specification 2.0. 
> + * > + * The block layer driver should implement all the following APIs > + * or none at all, including: bdrv_co_pr_read_keys, > + * bdrv_co_pr_read_reservation, bdrv_co_pr_register, > + * bdrv_co_pr_reserve, bdrv_co_pr_release, > + * bdrv_co_pr_clear and bdrv_co_pr_preempt. > + * > + * Read the registered keys and return them in the @keys. > + * @generation: The generation of the reservation key. > + * @num_keys: The maximum number of keys that can be transmitted. > + * @keys: Registered keys array. > + * > + * On success, store generation in @generation and store keys @keys > + * and return the number of @keys. > + * On failure return -errno. > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)( > + BlockDriverState *bs, uint32_t *generation, > + uint32_t num_keys, uint64_t *keys); > + /* > + * Read the reservation key and store it in the @key. > + * @generation: The generation of the reservation key. > + * @key: The reservation key. > + * @type: Type of the reservation key. > + * > + * On success, store generation in @generation, store the > + * reservation key in @key and return the number of @key > + * which used to determine whether the reservation key exists. > + * On failure return -errno. > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)( > + BlockDriverState *bs, uint32_t *generation, > + uint64_t *key, BlockPrType *type); > + /* > + * Register, unregister, or replace a reservation key. > + * @old_key: The current reservation key associated with the host. > + * @new_key: The new reservation Key. > + * @type: Type of the reservation key. > + * @ignore_key: Ignore or not @old_key. > + * @ptpl: Whether to support Persist Through Power Loss(PTPL). > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)( > + BlockDriverState *bs, uint64_t old_key, > + uint64_t new_key, BlockPrType type, > + bool ptpl, bool ignore_key); > + /* > + * Acquire a reservation on a host. 
> + * @key: The current reservation key associated with the host. > + * @type: Type of the reservation key. > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)( > + BlockDriverState *bs, uint64_t key, BlockPrType type); > + /* > + * Release a reservation on a host. > + * @key: The current reservation key associated with the host. > + * @type: Type of the reservation key. > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)( > + BlockDriverState *bs, uint64_t key, BlockPrType type); > + /** > + * Clear reservations on a host. > + * @key: The current reservation key associated with the host. > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)( > + BlockDriverState *bs, uint64_t key); > + /* > + * Preempt a reservation held on a host. > + * @cr_key: The current reservation key associated with the host. > + * @pr_key: The preempt reservation Key which to be > + * unregistered from the namespace. > + * @type: Type of the reservation key. > + * @abort: Whether to abort a reservation held on a host. > + */ > + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)( > + BlockDriverState *bs, uint64_t cr_key, > + uint64_t pr_key, BlockPrType type, bool abort); > + > /* > * Returns 0 for completed check, -errno for internal errors. > * The check results are stored in result. > @@ -899,6 +980,9 @@ typedef struct BlockLimits { > uint32_t max_active_zones; > > uint32_t write_granularity; > + > + /* Persistent reservation capacities. 
*/ > + uint8_t pr_cap; > } BlockLimits; > > typedef struct BdrvOpBlocker BdrvOpBlocker; > diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h > index d174275a5c..b3d49a3c6f 100644 > --- a/include/sysemu/block-backend-io.h > +++ b/include/sysemu/block-backend-io.h > @@ -62,6 +62,30 @@ void blk_aio_cancel_async(BlockAIOCB *acb); > BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, > BlockCompletionFunc *cb, void *opaque); > > +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation, > + uint32_t num_keys, uint64_t *keys, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation, > + uint64_t *key, BlockPrType *type, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key, > + uint64_t new_key, BlockPrType type, > + bool ptpl, bool ignore_key, > + BlockCompletionFunc *cb, > + void *opaque); > +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key, > + BlockPrType type, > + BlockCompletionFunc *cb, > + void *opaque); > +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key, > + BlockPrType type, BlockCompletionFunc *cb, > + void *opaque); > +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key, > + BlockCompletionFunc *cb, void *opaque); > +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key, > + uint64_t pr_key, BlockPrType type, bool abort, > + BlockCompletionFunc *cb, void *opaque); > + > void blk_inc_in_flight(BlockBackend *blk); > void blk_dec_in_flight(BlockBackend *blk); > > -- > 2.20.1 >
Hi, Thanks for your advice! I will add it. On 2024/6/11 01:26, Stefan Hajnoczi wrote: > On Thu, Jun 06, 2024 at 08:24:35PM +0800, Changqi Lu wrote: >> Add persistent reservation in/out operations >> at the block level. The following operations >> are included: >> >> - read_keys: retrieves the list of registered keys. >> - read_reservation: retrieves the current reservation status. >> - register: registers a new reservation key. >> - reserve: initiates a reservation for a specific key. >> - release: releases a reservation for a specific key. >> - clear: clears all existing reservations. >> - preempt: preempts a reservation held by another key. >> >> Signed-off-by: Changqi Lu >> Signed-off-by: zhenwei pi >> --- >> block/block-backend.c | 397 ++++++++++++++++++++++++++++++ >> block/io.c | 163 ++++++++++++ >> include/block/block-common.h | 40 +++ >> include/block/block-io.h | 20 ++ >> include/block/block_int-common.h | 84 +++++++ >> include/sysemu/block-backend-io.h | 24 ++ >> 6 files changed, 728 insertions(+) >> >> diff --git a/block/block-backend.c b/block/block-backend.c >> index db6f9b92a3..6707d94df7 100644 >> --- a/block/block-backend.c >> +++ b/block/block-backend.c >> @@ -1770,6 +1770,403 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, >> return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); >> } >> >> +typedef struct BlkPrInCo { >> + BlockBackend *blk; >> + uint32_t *generation; >> + uint32_t num_keys; >> + BlockPrType *type; >> + uint64_t *keys; >> + int ret; >> +} BlkPrInCo; >> + >> +typedef struct BlkPrInCB { >> + BlockAIOCB common; >> + BlkPrInCo prco; >> + bool has_returned; >> +} BlkPrInCB; >> + >> +static const AIOCBInfo blk_pr_in_aiocb_info = { >> + .aiocb_size = sizeof(BlkPrInCB), >> +}; >> + >> +static void blk_pr_in_complete(BlkPrInCB *acb) >> +{ >> + if (acb->has_returned) { >> + acb->common.cb(acb->common.opaque, acb->prco.ret); >> + blk_dec_in_flight(acb->prco.blk); > > Did you receive my 
replies to v1 of this patch series? > > Please take a look at them and respond: > https://lore.kernel.org/qemu-devel/20240508093629.441057-1-luchangqi.123@bytedance.com/ > > Thanks, > Stefan > >> + qemu_aio_unref(acb); >> + } >> +} >> + >> +static void blk_pr_in_complete_bh(void *opaque) >> +{ >> + BlkPrInCB *acb = opaque; >> + assert(acb->has_returned); >> + blk_pr_in_complete(acb); >> +} >> + >> +static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation, >> + uint32_t num_keys, BlockPrType *type, >> + uint64_t *keys, CoroutineEntry co_entry, >> + BlockCompletionFunc *cb, void *opaque) >> +{ >> + BlkPrInCB *acb; >> + Coroutine *co; >> + >> + blk_inc_in_flight(blk); >> + acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque); >> + acb->prco = (BlkPrInCo) { >> + .blk = blk, >> + .generation = generation, >> + .num_keys = num_keys, >> + .type = type, >> + .ret = NOT_DONE, >> + .keys = keys, >> + }; >> + acb->has_returned = false; >> + >> + co = qemu_coroutine_create(co_entry, acb); >> + aio_co_enter(qemu_get_current_aio_context(), co); >> + >> + acb->has_returned = true; >> + if (acb->prco.ret != NOT_DONE) { >> + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), >> + blk_pr_in_complete_bh, acb); >> + } >> + >> + return &acb->common; >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation, >> + uint32_t num_keys, uint64_t *keys) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys); >> +} >> + >> +static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque) >> +{ >> + BlkPrInCB *acb = opaque; >> + BlkPrInCo *prco = &acb->prco; >> + >> + prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation, >> + prco->num_keys, 
prco->keys); >> + blk_pr_in_complete(acb); >> +} >> + >> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation, >> + uint32_t num_keys, uint64_t *keys, >> + BlockCompletionFunc *cb, void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_in(blk, generation, num_keys, NULL, keys, >> + blk_aio_pr_read_keys_entry, cb, opaque); >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation, >> + uint64_t *key, BlockPrType *type) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type); >> +} >> + >> +static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque) >> +{ >> + BlkPrInCB *acb = opaque; >> + BlkPrInCo *prco = &acb->prco; >> + >> + prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation, >> + prco->keys, prco->type); >> + blk_pr_in_complete(acb); >> +} >> + >> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation, >> + uint64_t *key, BlockPrType *type, >> + BlockCompletionFunc *cb, void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_in(blk, generation, 0, type, key, >> + blk_aio_pr_read_reservation_entry, cb, opaque); >> +} >> + >> +typedef struct BlkPrOutCo { >> + BlockBackend *blk; >> + uint64_t old_key; >> + uint64_t new_key; >> + bool ptpl; >> + BlockPrType type; >> + bool ignore_key; >> + bool abort; >> + int ret; >> +} BlkPrOutCo; >> + >> +typedef struct BlkPrOutCB { >> + BlockAIOCB common; >> + BlkPrOutCo prco; >> + bool has_returned; >> +} BlkPrOutCB; >> + >> +static const AIOCBInfo blk_pr_out_aiocb_info = { >> + .aiocb_size = sizeof(BlkPrOutCB), >> +}; >> + >> +static void blk_pr_out_complete(BlkPrOutCB *acb) >> +{ >> + if (acb->has_returned) { >> + 
acb->common.cb(acb->common.opaque, acb->prco.ret); >> + blk_dec_in_flight(acb->prco.blk); >> + qemu_aio_unref(acb); >> + } >> +} >> + >> +static void blk_pr_out_complete_bh(void *opaque) >> +{ >> + BlkPrOutCB *acb = opaque; >> + assert(acb->has_returned); >> + blk_pr_out_complete(acb); >> +} >> + >> +static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key, >> + uint64_t new_key, bool ptpl, >> + BlockPrType type, bool ignore_key, >> + bool abort, CoroutineEntry co_entry, >> + BlockCompletionFunc *cb, void *opaque) >> +{ >> + BlkPrOutCB *acb; >> + Coroutine *co; >> + >> + blk_inc_in_flight(blk); >> + acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque); >> + acb->prco = (BlkPrOutCo) { >> + .blk = blk, >> + .old_key = old_key, >> + .new_key = new_key, >> + .ptpl = ptpl, >> + .type = type, >> + .ignore_key = ignore_key, >> + .abort = abort, >> + .ret = NOT_DONE, >> + }; >> + acb->has_returned = false; >> + >> + co = qemu_coroutine_create(co_entry, acb); >> + aio_co_enter(qemu_get_current_aio_context(), co); >> + >> + acb->has_returned = true; >> + if (acb->prco.ret != NOT_DONE) { >> + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), >> + blk_pr_out_complete_bh, acb); >> + } >> + >> + return &acb->common; >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key, >> + uint64_t new_key, BlockPrType type, >> + bool ptpl, bool ignore_key) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type, >> + ptpl, ignore_key); >> +} >> + >> +static void coroutine_fn blk_aio_pr_register_entry(void *opaque) >> +{ >> + BlkPrOutCB *acb = opaque; >> + BlkPrOutCo *prco = &acb->prco; >> + >> + prco->ret = blk_aio_pr_do_register(prco->blk, prco->old_key, 
prco->new_key, >> + prco->type, prco->ptpl, >> + prco->ignore_key); >> + blk_pr_out_complete(acb); >> +} >> + >> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key, >> + uint64_t new_key, BlockPrType type, >> + bool ptpl, bool ignore_key, >> + BlockCompletionFunc *cb, >> + void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_out(blk, old_key, new_key, ptpl, type, ignore_key, false, >> + blk_aio_pr_register_entry, cb, opaque); >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_reserve(blk_bs(blk), key, type); >> +} >> + >> +static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque) >> +{ >> + BlkPrOutCB *acb = opaque; >> + BlkPrOutCo *prco = &acb->prco; >> + >> + prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key, >> + prco->type); >> + blk_pr_out_complete(acb); >> +} >> + >> + >> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key, >> + BlockPrType type, >> + BlockCompletionFunc *cb, >> + void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_out(blk, key, 0, false, type, false, false, >> + blk_aio_pr_reserve_entry, cb, opaque); >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_release(blk_bs(blk), key, type); >> +} >> + >> +static void coroutine_fn blk_aio_pr_release_entry(void *opaque) >> +{ >> + BlkPrOutCB *acb = opaque; >> + BlkPrOutCo *prco = &acb->prco; >> + 
>> + prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type); >> + blk_pr_out_complete(acb); >> +} >> + >> + >> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key, >> + BlockPrType type, BlockCompletionFunc *cb, >> + void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_out(blk, key, 0, false, type, false, false, >> + blk_aio_pr_release_entry, cb, opaque); >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_clear(blk_bs(blk), key); >> +} >> + >> +static void coroutine_fn blk_aio_pr_clear_entry(void *opaque) >> +{ >> + BlkPrOutCB *acb = opaque; >> + BlkPrOutCo *prco = &acb->prco; >> + >> + prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key); >> + blk_pr_out_complete(acb); >> +} >> + >> + >> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key, >> + BlockCompletionFunc *cb, void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_out(blk, key, 0, false, 0, false, false, >> + blk_aio_pr_clear_entry, cb, opaque); >> +} >> + >> +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> +static int coroutine_fn >> +blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t cr_key, >> + uint64_t pr_key, BlockPrType type, bool abort) >> +{ >> + IO_CODE(); >> + >> + blk_wait_while_drained(blk); >> + GRAPH_RDLOCK_GUARD(); >> + >> + if (!blk_co_is_available(blk)) { >> + return -ENOMEDIUM; >> + } >> + >> + return bdrv_co_pr_preempt(blk_bs(blk), cr_key, pr_key, type, abort); >> +} >> + >> +static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque) >> +{ >> + BlkPrOutCB *acb = opaque; >> + BlkPrOutCo *prco = &acb->prco; >> + >> + prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key, >> + prco->new_key, 
prco->type, >> + prco->abort); >> + blk_pr_out_complete(acb); >> +} >> + >> + >> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key, >> + uint64_t pr_key, BlockPrType type, >> + bool abort, BlockCompletionFunc *cb, >> + void *opaque) >> +{ >> + IO_CODE(); >> + return blk_aio_pr_out(blk, cr_key, pr_key, false, type, false, abort, >> + blk_aio_pr_preempt_entry, cb, opaque); >> +} >> + >> /* To be called between exactly one pair of blk_inc/dec_in_flight() */ >> static int coroutine_fn >> blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes) >> diff --git a/block/io.c b/block/io.c >> index 7217cf811b..87a363c94f 100644 >> --- a/block/io.c >> +++ b/block/io.c >> @@ -3220,6 +3220,169 @@ out: >> return co.ret; >> } >> >> +int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs, >> + uint32_t *generation, uint32_t num_keys, >> + uint64_t *keys) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_read_keys) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> +int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs, >> + uint32_t *generation, uint64_t *key, BlockPrType *type) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_read_reservation) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> +int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t 
old_key, >> + uint64_t new_key, BlockPrType type, bool ptpl, >> + bool ignore_key) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_register) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type, >> + ptpl, ignore_key); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> +int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, >> + BlockPrType type) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_reserve) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + co.ret = drv->bdrv_co_pr_reserve(bs, key, type); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> +int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, >> + BlockPrType type) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_release) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + co.ret = drv->bdrv_co_pr_release(bs, key, type); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> +int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_clear) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + 
co.ret = drv->bdrv_co_pr_clear(bs, key); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> +int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, >> + uint64_t pr_key, BlockPrType type, bool abort) >> +{ >> + BlockDriver *drv = bs->drv; >> + CoroutineIOCompletion co = { >> + .coroutine = qemu_coroutine_self(), >> + }; >> + >> + IO_CODE(); >> + assert_bdrv_graph_readable(); >> + >> + bdrv_inc_in_flight(bs); >> + if (!drv || !drv->bdrv_co_pr_preempt) { >> + co.ret = -ENOTSUP; >> + goto out; >> + } >> + >> + co.ret = drv->bdrv_co_pr_preempt(bs, cr_key, pr_key, type, abort); >> +out: >> + bdrv_dec_in_flight(bs); >> + return co.ret; >> +} >> + >> int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, >> unsigned int *nr_zones, >> BlockZoneDescriptor *zones) >> diff --git a/include/block/block-common.h b/include/block/block-common.h >> index a846023a09..7ca4e2328f 100644 >> --- a/include/block/block-common.h >> +++ b/include/block/block-common.h >> @@ -524,6 +524,46 @@ typedef enum { >> BDRV_FIX_ERRORS = 2, >> } BdrvCheckMode; >> >> +/** >> + * According SCSI protocol(chapter 5.9 of SCSI Primary Commands - 4) >> + * and NVMe protocol(chapter 7.2 of NVMe Base Specification 2.0), >> + * the persistent reservation types and persistent capabilities of >> + * the public layer block are abstracted. 
>> + */ >> +typedef enum { >> + BLK_PR_WRITE_EXCLUSIVE = 0x1, >> + BLK_PR_EXCLUSIVE_ACCESS = 0x2, >> + BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY = 0x3, >> + BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY = 0x4, >> + BLK_PR_WRITE_EXCLUSIVE_ALL_REGS = 0x5, >> + BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS = 0x6, >> +} BlockPrType; >> + >> +typedef enum BLKPrCap { >> + /* Persist Through Power Loss */ >> + BLK_PR_CAP_PTPL = 1 << 0, >> + /* Write Exclusive reservation type */ >> + BLK_PR_CAP_WR_EX = 1 << 1, >> + /* Exclusive Access reservation type */ >> + BLK_PR_CAP_EX_AC = 1 << 2, >> + /* Write Exclusive Registrants Only reservation type */ >> + BLK_PR_CAP_WR_EX_RO = 1 << 3, >> + /* Exclusive Access Registrants Only reservation type */ >> + BLK_PR_CAP_EX_AC_RO = 1 << 4, >> + /* Write Exclusive All Registrants reservation type */ >> + BLK_PR_CAP_WR_EX_AR = 1 << 5, >> + /* Exclusive Access All Registrants reservation type */ >> + BLK_PR_CAP_EX_AC_AR = 1 << 6, >> + >> + BLK_PR_CAP_ALL = (BLK_PR_CAP_PTPL | >> + BLK_PR_CAP_WR_EX | >> + BLK_PR_CAP_EX_AC | >> + BLK_PR_CAP_WR_EX_RO | >> + BLK_PR_CAP_EX_AC_RO | >> + BLK_PR_CAP_WR_EX_AR | >> + BLK_PR_CAP_EX_AC_AR), >> +} BLKPrCap; >> + >> typedef struct BlockSizes { >> uint32_t phys; >> uint32_t log; >> diff --git a/include/block/block-io.h b/include/block/block-io.h >> index b49e0537dd..908361862b 100644 >> --- a/include/block/block-io.h >> +++ b/include/block/block-io.h >> @@ -106,6 +106,26 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb); >> int coroutine_fn GRAPH_RDLOCK >> bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); >> >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation, >> + uint32_t num_keys, uint64_t *keys); >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation, >> + uint64_t *key, BlockPrType *type); >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key, >> + uint64_t new_key, BlockPrType type, 
>> + bool ptpl, bool ignore_key); >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type); >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type); >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key); >> +int coroutine_fn GRAPH_RDLOCK >> +bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, uint64_t pr_key, >> + BlockPrType type, bool abort); >> + >> /* Ensure contents are flushed to disk. */ >> int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs); >> >> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h >> index 761276127e..6e628069e9 100644 >> --- a/include/block/block_int-common.h >> +++ b/include/block/block_int-common.h >> @@ -766,6 +766,87 @@ struct BlockDriver { >> int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)( >> BlockDriverState *bs, unsigned long int req, void *buf); >> >> + /* >> + * Persistent reservation series api. >> + * Please refer to chapter 5.9 of SCSI Primary Commands - 4 or >> + * chapter 7 of NVMe Base Specification 2.0. >> + * >> + * The block layer driver should implement all the following APIs >> + * or none at all, including: bdrv_co_pr_read_keys, >> + * bdrv_co_pr_read_reservation, bdrv_co_pr_register, >> + * bdrv_co_pr_reserve, bdrv_co_pr_release, >> + * bdrv_co_pr_clear and bdrv_co_pr_preempt. >> + * >> + * Read the registered keys and return them in the @keys. >> + * @generation: The generation of the reservation key. >> + * @num_keys: The maximum number of keys that can be transmitted. >> + * @keys: Registered keys array. >> + * >> + * On success, store generation in @generation and store keys @keys >> + * and return the number of @keys. >> + * On failure return -errno. 
>> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)( >> + BlockDriverState *bs, uint32_t *generation, >> + uint32_t num_keys, uint64_t *keys); >> + /* >> + * Read the reservation key and store it in the @key. >> + * @generation: The generation of the reservation key. >> + * @key: The reservation key. >> + * @type: Type of the reservation key. >> + * >> + * On success, store generation in @generation, store the >> + * reservation key in @key and return the number of @key >> + * which used to determine whether the reservation key exists. >> + * On failure return -errno. >> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)( >> + BlockDriverState *bs, uint32_t *generation, >> + uint64_t *key, BlockPrType *type); >> + /* >> + * Register, unregister, or replace a reservation key. >> + * @old_key: The current reservation key associated with the host. >> + * @new_key: The new reservation Key. >> + * @type: Type of the reservation key. >> + * @ignore_key: Ignore or not @old_key. >> + * @ptpl: Whether to support Persist Through Power Loss(PTPL). >> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)( >> + BlockDriverState *bs, uint64_t old_key, >> + uint64_t new_key, BlockPrType type, >> + bool ptpl, bool ignore_key); >> + /* >> + * Acquire a reservation on a host. >> + * @key: The current reservation key associated with the host. >> + * @type: Type of the reservation key. >> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)( >> + BlockDriverState *bs, uint64_t key, BlockPrType type); >> + /* >> + * Release a reservation on a host. >> + * @key: The current reservation key associated with the host. >> + * @type: Type of the reservation key. >> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)( >> + BlockDriverState *bs, uint64_t key, BlockPrType type); >> + /** >> + * Clear reservations on a host. >> + * @key: The current reservation key associated with the host. 
>> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)( >> + BlockDriverState *bs, uint64_t key); >> + /* >> + * Preempt a reservation held on a host. >> + * @cr_key: The current reservation key associated with the host. >> + * @pr_key: The preempt reservation Key which to be >> + * unregistered from the namespace. >> + * @type: Type of the reservation key. >> + * @abort: Whether to abort a reservation held on a host. >> + */ >> + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)( >> + BlockDriverState *bs, uint64_t cr_key, >> + uint64_t pr_key, BlockPrType type, bool abort); >> + >> /* >> * Returns 0 for completed check, -errno for internal errors. >> * The check results are stored in result. >> @@ -899,6 +980,9 @@ typedef struct BlockLimits { >> uint32_t max_active_zones; >> >> uint32_t write_granularity; >> + >> + /* Persistent reservation capacities. */ >> + uint8_t pr_cap; >> } BlockLimits; >> >> typedef struct BdrvOpBlocker BdrvOpBlocker; >> diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h >> index d174275a5c..b3d49a3c6f 100644 >> --- a/include/sysemu/block-backend-io.h >> +++ b/include/sysemu/block-backend-io.h >> @@ -62,6 +62,30 @@ void blk_aio_cancel_async(BlockAIOCB *acb); >> BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, >> BlockCompletionFunc *cb, void *opaque); >> >> +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation, >> + uint32_t num_keys, uint64_t *keys, >> + BlockCompletionFunc *cb, void *opaque); >> +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation, >> + uint64_t *key, BlockPrType *type, >> + BlockCompletionFunc *cb, void *opaque); >> +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key, >> + uint64_t new_key, BlockPrType type, >> + bool ptpl, bool ignore_key, >> + BlockCompletionFunc *cb, >> + void *opaque); >> +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key, >> + BlockPrType type, 
>> + BlockCompletionFunc *cb, >> + void *opaque); >> +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key, >> + BlockPrType type, BlockCompletionFunc *cb, >> + void *opaque); >> +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key, >> + BlockCompletionFunc *cb, void *opaque); >> +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key, >> + uint64_t pr_key, BlockPrType type, bool abort, >> + BlockCompletionFunc *cb, void *opaque); >> + >> void blk_inc_in_flight(BlockBackend *blk); >> void blk_dec_in_flight(BlockBackend *blk); >> >> -- >> 2.20.1 >>
diff --git a/block/block-backend.c b/block/block-backend.c index db6f9b92a3..6707d94df7 100644 --- a/block/block-backend.c +++ b/block/block-backend.c @@ -1770,6 +1770,403 @@ BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, return blk_aio_prwv(blk, req, 0, buf, blk_aio_ioctl_entry, 0, cb, opaque); } +typedef struct BlkPrInCo { + BlockBackend *blk; + uint32_t *generation; + uint32_t num_keys; + BlockPrType *type; + uint64_t *keys; + int ret; +} BlkPrInCo; + +typedef struct BlkPrInCB { + BlockAIOCB common; + BlkPrInCo prco; + bool has_returned; +} BlkPrInCB; + +static const AIOCBInfo blk_pr_in_aiocb_info = { + .aiocb_size = sizeof(BlkPrInCB), +}; + +static void blk_pr_in_complete(BlkPrInCB *acb) +{ + if (acb->has_returned) { + acb->common.cb(acb->common.opaque, acb->prco.ret); + blk_dec_in_flight(acb->prco.blk); + qemu_aio_unref(acb); + } +} + +static void blk_pr_in_complete_bh(void *opaque) +{ + BlkPrInCB *acb = opaque; + assert(acb->has_returned); + blk_pr_in_complete(acb); +} + +static BlockAIOCB *blk_aio_pr_in(BlockBackend *blk, uint32_t *generation, + uint32_t num_keys, BlockPrType *type, + uint64_t *keys, CoroutineEntry co_entry, + BlockCompletionFunc *cb, void *opaque) +{ + BlkPrInCB *acb; + Coroutine *co; + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_pr_in_aiocb_info, blk, cb, opaque); + acb->prco = (BlkPrInCo) { + .blk = blk, + .generation = generation, + .num_keys = num_keys, + .type = type, + .ret = NOT_DONE, + .keys = keys, + }; + acb->has_returned = false; + + co = qemu_coroutine_create(co_entry, acb); + aio_co_enter(qemu_get_current_aio_context(), co); + + acb->has_returned = true; + if (acb->prco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), + blk_pr_in_complete_bh, acb); + } + + return &acb->common; +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_read_keys(BlockBackend *blk, uint32_t *generation, + uint32_t 
num_keys, uint64_t *keys) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_read_keys(blk_bs(blk), generation, num_keys, keys); +} + +static void coroutine_fn blk_aio_pr_read_keys_entry(void *opaque) +{ + BlkPrInCB *acb = opaque; + BlkPrInCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_read_keys(prco->blk, prco->generation, + prco->num_keys, prco->keys); + blk_pr_in_complete(acb); +} + +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation, + uint32_t num_keys, uint64_t *keys, + BlockCompletionFunc *cb, void *opaque) +{ + IO_CODE(); + return blk_aio_pr_in(blk, generation, num_keys, NULL, keys, + blk_aio_pr_read_keys_entry, cb, opaque); +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_read_reservation(BlockBackend *blk, uint32_t *generation, + uint64_t *key, BlockPrType *type) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_read_reservation(blk_bs(blk), generation, key, type); +} + +static void coroutine_fn blk_aio_pr_read_reservation_entry(void *opaque) +{ + BlkPrInCB *acb = opaque; + BlkPrInCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_read_reservation(prco->blk, prco->generation, + prco->keys, prco->type); + blk_pr_in_complete(acb); +} + +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation, + uint64_t *key, BlockPrType *type, + BlockCompletionFunc *cb, void *opaque) +{ + IO_CODE(); + return blk_aio_pr_in(blk, generation, 0, type, key, + blk_aio_pr_read_reservation_entry, cb, opaque); +} + +typedef struct BlkPrOutCo { + BlockBackend *blk; + uint64_t old_key; + uint64_t new_key; + bool ptpl; + BlockPrType type; + bool ignore_key; + bool abort; + int ret; +} BlkPrOutCo; + +typedef struct BlkPrOutCB { + BlockAIOCB common; + 
BlkPrOutCo prco; + bool has_returned; +} BlkPrOutCB; + +static const AIOCBInfo blk_pr_out_aiocb_info = { + .aiocb_size = sizeof(BlkPrOutCB), +}; + +static void blk_pr_out_complete(BlkPrOutCB *acb) +{ + if (acb->has_returned) { + acb->common.cb(acb->common.opaque, acb->prco.ret); + blk_dec_in_flight(acb->prco.blk); + qemu_aio_unref(acb); + } +} + +static void blk_pr_out_complete_bh(void *opaque) +{ + BlkPrOutCB *acb = opaque; + assert(acb->has_returned); + blk_pr_out_complete(acb); +} + +static BlockAIOCB *blk_aio_pr_out(BlockBackend *blk, uint64_t old_key, + uint64_t new_key, bool ptpl, + BlockPrType type, bool ignore_key, + bool abort, CoroutineEntry co_entry, + BlockCompletionFunc *cb, void *opaque) +{ + BlkPrOutCB *acb; + Coroutine *co; + + blk_inc_in_flight(blk); + acb = blk_aio_get(&blk_pr_out_aiocb_info, blk, cb, opaque); + acb->prco = (BlkPrOutCo) { + .blk = blk, + .old_key = old_key, + .new_key = new_key, + .ptpl = ptpl, + .type = type, + .ignore_key = ignore_key, + .abort = abort, + .ret = NOT_DONE, + }; + acb->has_returned = false; + + co = qemu_coroutine_create(co_entry, acb); + aio_co_enter(qemu_get_current_aio_context(), co); + + acb->has_returned = true; + if (acb->prco.ret != NOT_DONE) { + replay_bh_schedule_oneshot_event(qemu_get_current_aio_context(), + blk_pr_out_complete_bh, acb); + } + + return &acb->common; +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_register(BlockBackend *blk, uint64_t old_key, + uint64_t new_key, BlockPrType type, + bool ptpl, bool ignore_key) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_register(blk_bs(blk), old_key, new_key, type, + ptpl, ignore_key); +} + +static void coroutine_fn blk_aio_pr_register_entry(void *opaque) +{ + BlkPrOutCB *acb = opaque; + BlkPrOutCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_register(prco->blk, 
prco->old_key, prco->new_key, + prco->type, prco->ptpl, + prco->ignore_key); + blk_pr_out_complete(acb); +} + +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key, + uint64_t new_key, BlockPrType type, + bool ptpl, bool ignore_key, + BlockCompletionFunc *cb, + void *opaque) +{ + IO_CODE(); + return blk_aio_pr_out(blk, old_key, new_key, ptpl, type, ignore_key, false, + blk_aio_pr_register_entry, cb, opaque); +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_reserve(BlockBackend *blk, uint64_t key, BlockPrType type) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_reserve(blk_bs(blk), key, type); +} + +static void coroutine_fn blk_aio_pr_reserve_entry(void *opaque) +{ + BlkPrOutCB *acb = opaque; + BlkPrOutCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_reserve(prco->blk, prco->old_key, + prco->type); + blk_pr_out_complete(acb); +} + + +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key, + BlockPrType type, + BlockCompletionFunc *cb, + void *opaque) +{ + IO_CODE(); + return blk_aio_pr_out(blk, key, 0, false, type, false, false, + blk_aio_pr_reserve_entry, cb, opaque); +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_release(BlockBackend *blk, uint64_t key, BlockPrType type) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_release(blk_bs(blk), key, type); +} + +static void coroutine_fn blk_aio_pr_release_entry(void *opaque) +{ + BlkPrOutCB *acb = opaque; + BlkPrOutCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_release(prco->blk, prco->old_key, prco->type); + blk_pr_out_complete(acb); +} + + +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key, + BlockPrType type, 
BlockCompletionFunc *cb, + void *opaque) +{ + IO_CODE(); + return blk_aio_pr_out(blk, key, 0, false, type, false, false, + blk_aio_pr_release_entry, cb, opaque); +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_clear(BlockBackend *blk, uint64_t key) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_clear(blk_bs(blk), key); +} + +static void coroutine_fn blk_aio_pr_clear_entry(void *opaque) +{ + BlkPrOutCB *acb = opaque; + BlkPrOutCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_clear(prco->blk, prco->old_key); + blk_pr_out_complete(acb); +} + + +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key, + BlockCompletionFunc *cb, void *opaque) +{ + IO_CODE(); + return blk_aio_pr_out(blk, key, 0, false, 0, false, false, + blk_aio_pr_clear_entry, cb, opaque); +} + +/* To be called between exactly one pair of blk_inc/dec_in_flight() */ +static int coroutine_fn +blk_aio_pr_do_preempt(BlockBackend *blk, uint64_t cr_key, + uint64_t pr_key, BlockPrType type, bool abort) +{ + IO_CODE(); + + blk_wait_while_drained(blk); + GRAPH_RDLOCK_GUARD(); + + if (!blk_co_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_pr_preempt(blk_bs(blk), cr_key, pr_key, type, abort); +} + +static void coroutine_fn blk_aio_pr_preempt_entry(void *opaque) +{ + BlkPrOutCB *acb = opaque; + BlkPrOutCo *prco = &acb->prco; + + prco->ret = blk_aio_pr_do_preempt(prco->blk, prco->old_key, + prco->new_key, prco->type, + prco->abort); + blk_pr_out_complete(acb); +} + + +BlockAIOCB *blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key, + uint64_t pr_key, BlockPrType type, + bool abort, BlockCompletionFunc *cb, + void *opaque) +{ + IO_CODE(); + return blk_aio_pr_out(blk, cr_key, pr_key, false, type, false, abort, + blk_aio_pr_preempt_entry, cb, opaque); +} + /* To be called between exactly one pair of 
blk_inc/dec_in_flight() */ static int coroutine_fn blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes) diff --git a/block/io.c b/block/io.c index 7217cf811b..87a363c94f 100644 --- a/block/io.c +++ b/block/io.c @@ -3220,6 +3220,169 @@ out: return co.ret; } +int coroutine_fn bdrv_co_pr_read_keys(BlockDriverState *bs, + uint32_t *generation, uint32_t num_keys, + uint64_t *keys) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_read_keys) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_read_keys(bs, generation, num_keys, keys); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_pr_read_reservation(BlockDriverState *bs, + uint32_t *generation, uint64_t *key, BlockPrType *type) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_read_reservation) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_read_reservation(bs, generation, key, type); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key, + uint64_t new_key, BlockPrType type, bool ptpl, + bool ignore_key) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_register) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_register(bs, old_key, new_key, type, + ptpl, ignore_key); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, + BlockPrType type) +{ + BlockDriver *drv = bs->drv; + 
CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_reserve) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_reserve(bs, key, type); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, + BlockPrType type) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_release) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_release(bs, key, type); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_clear) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_clear(bs, key); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + +int coroutine_fn bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, + uint64_t pr_key, BlockPrType type, bool abort) +{ + BlockDriver *drv = bs->drv; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + IO_CODE(); + assert_bdrv_graph_readable(); + + bdrv_inc_in_flight(bs); + if (!drv || !drv->bdrv_co_pr_preempt) { + co.ret = -ENOTSUP; + goto out; + } + + co.ret = drv->bdrv_co_pr_preempt(bs, cr_key, pr_key, type, abort); +out: + bdrv_dec_in_flight(bs); + return co.ret; +} + int coroutine_fn bdrv_co_zone_report(BlockDriverState *bs, int64_t offset, unsigned int *nr_zones, BlockZoneDescriptor *zones) diff --git a/include/block/block-common.h b/include/block/block-common.h index a846023a09..7ca4e2328f 100644 --- 
a/include/block/block-common.h +++ b/include/block/block-common.h @@ -524,6 +524,46 @@ typedef enum { BDRV_FIX_ERRORS = 2, } BdrvCheckMode; +/** + * According to the SCSI protocol (chapter 5.9 of SCSI Primary Commands - 4) + * and the NVMe protocol (chapter 7.2 of NVMe Base Specification 2.0), + * the persistent reservation types and persistent reservation capabilities + * of the common block layer are abstracted here. + */ +typedef enum { + BLK_PR_WRITE_EXCLUSIVE = 0x1, + BLK_PR_EXCLUSIVE_ACCESS = 0x2, + BLK_PR_WRITE_EXCLUSIVE_REGS_ONLY = 0x3, + BLK_PR_EXCLUSIVE_ACCESS_REGS_ONLY = 0x4, + BLK_PR_WRITE_EXCLUSIVE_ALL_REGS = 0x5, + BLK_PR_EXCLUSIVE_ACCESS_ALL_REGS = 0x6, +} BlockPrType; + +typedef enum BLKPrCap { + /* Persist Through Power Loss */ + BLK_PR_CAP_PTPL = 1 << 0, + /* Write Exclusive reservation type */ + BLK_PR_CAP_WR_EX = 1 << 1, + /* Exclusive Access reservation type */ + BLK_PR_CAP_EX_AC = 1 << 2, + /* Write Exclusive Registrants Only reservation type */ + BLK_PR_CAP_WR_EX_RO = 1 << 3, + /* Exclusive Access Registrants Only reservation type */ + BLK_PR_CAP_EX_AC_RO = 1 << 4, + /* Write Exclusive All Registrants reservation type */ + BLK_PR_CAP_WR_EX_AR = 1 << 5, + /* Exclusive Access All Registrants reservation type */ + BLK_PR_CAP_EX_AC_AR = 1 << 6, + + BLK_PR_CAP_ALL = (BLK_PR_CAP_PTPL | + BLK_PR_CAP_WR_EX | + BLK_PR_CAP_EX_AC | + BLK_PR_CAP_WR_EX_RO | + BLK_PR_CAP_EX_AC_RO | + BLK_PR_CAP_WR_EX_AR | + BLK_PR_CAP_EX_AC_AR), +} BLKPrCap; + typedef struct BlockSizes { uint32_t phys; uint32_t log; diff --git a/include/block/block-io.h b/include/block/block-io.h index b49e0537dd..908361862b 100644 --- a/include/block/block-io.h +++ b/include/block/block-io.h @@ -106,6 +106,26 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb); int coroutine_fn GRAPH_RDLOCK bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_pr_read_keys(BlockDriverState *bs, uint32_t *generation, + uint32_t num_keys, uint64_t *keys); +int coroutine_fn GRAPH_RDLOCK 
+bdrv_co_pr_read_reservation(BlockDriverState *bs, uint32_t *generation, + uint64_t *key, BlockPrType *type); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_pr_register(BlockDriverState *bs, uint64_t old_key, + uint64_t new_key, BlockPrType type, + bool ptpl, bool ignore_key); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_pr_reserve(BlockDriverState *bs, uint64_t key, BlockPrType type); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_pr_release(BlockDriverState *bs, uint64_t key, BlockPrType type); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_pr_clear(BlockDriverState *bs, uint64_t key); +int coroutine_fn GRAPH_RDLOCK +bdrv_co_pr_preempt(BlockDriverState *bs, uint64_t cr_key, uint64_t pr_key, + BlockPrType type, bool abort); + /* Ensure contents are flushed to disk. */ int coroutine_fn GRAPH_RDLOCK bdrv_co_flush(BlockDriverState *bs); diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h index 761276127e..6e628069e9 100644 --- a/include/block/block_int-common.h +++ b/include/block/block_int-common.h @@ -766,6 +766,87 @@ struct BlockDriver { int coroutine_fn GRAPH_RDLOCK_PTR (*bdrv_co_ioctl)( BlockDriverState *bs, unsigned long int req, void *buf); + /* + * Persistent reservation API series. + * Please refer to chapter 5.9 of SCSI Primary Commands - 4 or + * chapter 7 of NVMe Base Specification 2.0. + * + * The block layer driver should implement all the following APIs + * or none at all, including: bdrv_co_pr_read_keys, + * bdrv_co_pr_read_reservation, bdrv_co_pr_register, + * bdrv_co_pr_reserve, bdrv_co_pr_release, + * bdrv_co_pr_clear and bdrv_co_pr_preempt. + * + * Read the registered keys and return them in the @keys. + * @generation: The generation of the reservation key. + * @num_keys: The maximum number of keys that can be transmitted. + * @keys: Registered keys array. + * + * On success, store generation in @generation and store the keys in @keys + * and return the number of @keys. + * On failure return -errno. 
+ */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_keys)( + BlockDriverState *bs, uint32_t *generation, + uint32_t num_keys, uint64_t *keys); + /* + * Read the reservation key and store it in the @key. + * @generation: The generation of the reservation key. + * @key: The reservation key. + * @type: Type of the reservation key. + * + * On success, store generation in @generation, store the + * reservation key in @key and return the number of keys, + * which is used to determine whether the reservation key exists. + * On failure return -errno. + */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_read_reservation)( + BlockDriverState *bs, uint32_t *generation, + uint64_t *key, BlockPrType *type); + /* + * Register, unregister, or replace a reservation key. + * @old_key: The current reservation key associated with the host. + * @new_key: The new reservation key. + * @type: Type of the reservation key. + * @ignore_key: Whether to ignore @old_key. + * @ptpl: Whether to support Persist Through Power Loss (PTPL). + */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_register)( + BlockDriverState *bs, uint64_t old_key, + uint64_t new_key, BlockPrType type, + bool ptpl, bool ignore_key); + /* + * Acquire a reservation on a host. + * @key: The current reservation key associated with the host. + * @type: Type of the reservation key. + */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_reserve)( + BlockDriverState *bs, uint64_t key, BlockPrType type); + /* + * Release a reservation on a host. + * @key: The current reservation key associated with the host. + * @type: Type of the reservation key. + */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_release)( + BlockDriverState *bs, uint64_t key, BlockPrType type); + /** + * Clear reservations on a host. + * @key: The current reservation key associated with the host. + */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_clear)( + BlockDriverState *bs, uint64_t key); + /* + * Preempt a reservation held on a host. 
+ * @cr_key: The current reservation key associated with the host. + * @pr_key: The preempt reservation key which is to be + * unregistered from the namespace. + * @type: Type of the reservation key. + * @abort: Whether to abort a reservation held on a host. + */ + int coroutine_fn GRAPH_RDLOCK_PTR(*bdrv_co_pr_preempt)( + BlockDriverState *bs, uint64_t cr_key, + uint64_t pr_key, BlockPrType type, bool abort); + /* * Returns 0 for completed check, -errno for internal errors. * The check results are stored in result. @@ -899,6 +980,9 @@ typedef struct BlockLimits { uint32_t max_active_zones; uint32_t write_granularity; + + /* Persistent reservation capabilities. */ + uint8_t pr_cap; } BlockLimits; typedef struct BdrvOpBlocker BdrvOpBlocker; diff --git a/include/sysemu/block-backend-io.h b/include/sysemu/block-backend-io.h index d174275a5c..b3d49a3c6f 100644 --- a/include/sysemu/block-backend-io.h +++ b/include/sysemu/block-backend-io.h @@ -62,6 +62,30 @@ void blk_aio_cancel_async(BlockAIOCB *acb); BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pr_read_keys(BlockBackend *blk, uint32_t *generation, + uint32_t num_keys, uint64_t *keys, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pr_read_reservation(BlockBackend *blk, uint32_t *generation, + uint64_t *key, BlockPrType *type, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB *blk_aio_pr_register(BlockBackend *blk, uint64_t old_key, + uint64_t new_key, BlockPrType type, + bool ptpl, bool ignore_key, + BlockCompletionFunc *cb, + void *opaque); +BlockAIOCB *blk_aio_pr_reserve(BlockBackend *blk, uint64_t key, + BlockPrType type, + BlockCompletionFunc *cb, + void *opaque); +BlockAIOCB *blk_aio_pr_release(BlockBackend *blk, uint64_t key, + BlockPrType type, BlockCompletionFunc *cb, + void *opaque); +BlockAIOCB *blk_aio_pr_clear(BlockBackend *blk, uint64_t key, + BlockCompletionFunc *cb, void *opaque); +BlockAIOCB 
*blk_aio_pr_preempt(BlockBackend *blk, uint64_t cr_key, + uint64_t pr_key, BlockPrType type, bool abort, + BlockCompletionFunc *cb, void *opaque); + void blk_inc_in_flight(BlockBackend *blk); void blk_dec_in_flight(BlockBackend *blk);