diff mbox series

[RFC,v3,1/5] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.

Message ID 20220627001917.9417-2-faithilikerun@gmail.com
State New
Headers show
Series *** Add support for zoned device *** | expand

Commit Message

Sam Li June 27, 2022, 12:19 a.m. UTC
By adding zone management operations in BlockDriver, storage
controller emulation can use the new block layer APIs including
zone_report and zone_mgmt(open, close, finish, reset).
---
 block/block-backend.c            |  56 ++++++++
 block/coroutines.h               |   5 +
 block/file-posix.c               | 238 +++++++++++++++++++++++++++++++
 include/block/block-common.h     |  43 +++++-
 include/block/block_int-common.h |  20 +++
 5 files changed, 361 insertions(+), 1 deletion(-)

Comments

Hannes Reinecke June 27, 2022, 7:21 a.m. UTC | #1
On 6/27/22 02:19, Sam Li wrote:
> By adding zone management operations in BlockDriver, storage
> controller emulation can use the new block layer APIs including
> zone_report and zone_mgmt(open, close, finish, reset).
> ---
>   block/block-backend.c            |  56 ++++++++
>   block/coroutines.h               |   5 +
>   block/file-posix.c               | 238 +++++++++++++++++++++++++++++++
>   include/block/block-common.h     |  43 +++++-
>   include/block/block_int-common.h |  20 +++
>   5 files changed, 361 insertions(+), 1 deletion(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index e0e1aff4b1..786f964d02 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>       return ret;
>   }
>   
> +/*
> + * Return zone_report from BlockDriver. Offset can be any number within
> + * the zone size. No alignment for offset and len.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                       int64_t len, int64_t *nr_zones,
> +                       BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    BlockDriverState *bs;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    bs = blk_bs(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    bdrv_inc_in_flight(bs);
> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> +                              nr_zones, zones);
> +    bdrv_dec_in_flight(bs);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Return zone_mgmt from BlockDriver.
> + * Offset is the start of a zone and len is aligned to zones.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    BlockDriverState *bs;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    bs = blk_bs(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    bdrv_inc_in_flight(bs);
> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> +    bdrv_dec_in_flight(bs);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>   void blk_drain(BlockBackend *blk)
>   {
>       BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 830ecaa733..a114d7bc30 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -80,6 +80,11 @@ int coroutine_fn
>   blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>   
>   int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t len, int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len);
>   
>   
>   /*
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..1b8b0d351f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,7 @@
>   #include <sys/param.h>
>   #include <sys/syscall.h>
>   #include <sys/vfs.h>
> +#include <linux/blkzoned.h>
>   #include <linux/cdrom.h>
>   #include <linux/fd.h>
>   #include <linux/fs.h>
> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>               PreallocMode prealloc;
>               Error **errp;
>           } truncate;
> +        struct {
> +            int64_t *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        zone_op op;
>       };
>   } RawPosixAIOData;
>   
> @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
>   }
>   #endif
>   
> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +                              struct blk_zone *blkz) {
> +    zone->start = blkz->start;
> +    zone->length = blkz->len;
> +    zone->cap = blkz->capacity;
> +    zone->wp = blkz->wp - blkz->start;
> +    zone->type = blkz->type;
> +    zone->cond = blkz->cond;
> +}
> +
> +static int handle_aiocb_zone_report(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    if (len == -1) {
> +        return -errno;
> +    }
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> +    printf("start to report zone with offset: 0x%lx\n", offset);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;
> +
> +        ret = ioctl(fd, BLKREPORTZONE, rep);
> +        if (ret != 0) {
> +            ret = -errno;
> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> +                         fd, offset, errno);
> +            return ret;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            parse_zone(&zones[n], &blkz[i]);
> +            /* The next report should start after the last zone reported */
> +            offset = blkz[i].start + blkz[i].len;
> +        }

Where do you increase 'n' such that the loop can make forward progress?
Wouldn't it be better to use a for() loop here?

> +    }
> +
> +    *nr_zones = n;
> +    return 0;
> +}
> +
> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +    zone_op op = aiocb->op;
> +
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    unsigned long ioctl_op;
> +    int64_t zone_size;
> +    int64_t zone_size_mask;
> +    int ret;
> +

Shouldn't we add a check here if 'fd' points to a zoned device?
ioctl errors are not _that_ helpful here, as you might get a variety
of errors and it's not quite obvious which of those errors indicate
an unsupported feature.

> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> +    if (ret) {
> +        return -1;
> +    }
> +
> +    zone_size_mask = zone_size - 1;
> +    if (offset & zone_size_mask) {
> +        error_report("offset is not the start of a zone");
> +        return -1;
> +    }
> +
> +    if (len & zone_size_mask) {
> +        error_report("len is not aligned to zones");
> +        return -1;
> +    }
> +
> +    switch (op) {
> +    case zone_open:
> +        ioctl_name = "BLKOPENZONE";
> +        ioctl_op = BLKOPENZONE;
> +        break;
> +    case zone_close:
> +        ioctl_name = "BLKCLOSEZONE";
> +        ioctl_op = BLKCLOSEZONE;
> +        break;
> +    case zone_finish:
> +        ioctl_name = "BLKFINISHZONE";
> +        ioctl_op = BLKFINISHZONE;
> +        break;
> +    case zone_reset:
> +        ioctl_name = "BLKRESETZONE";
> +        ioctl_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        errno = -EINVAL;
> +        return -1;
> +    }
> +
> +    /* Execute the operation */
> +    range.sector = offset;
> +    range.nr_sectors = len;
> +    ret = ioctl(fd, ioctl_op, &range);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d",
> +                     ioctl_name, errno);
> +        return -1;
> +    }
> +
> +    return 0;
> +}
> +
>   static int handle_aiocb_copy_range(void *opaque)
>   {
>       RawPosixAIOData *aiocb = opaque;
> @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>       }
>   }
>   
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.
> + * @param len: (not sure yet.
> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +        int64_t len, int64_t *nr_zones,
> +        BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_IOCTL,
> +        .aio_offset = offset,
> +        .aio_nbytes = len,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +                },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> +        int64_t offset, int64_t len) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs             = bs,
> +        .aio_fildes     = s->fd,
> +        .aio_type       = QEMU_AIO_IOCTL,
> +        .aio_offset     = offset,
> +        .aio_nbytes     = len,
> +        .op             = op,
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +}
> +
>   static coroutine_fn int
>   raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                   bool blkdev)
> @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
>       .bdrv_abort_perm_update = raw_abort_perm_update,
>       .create_opts = &raw_create_opts,
>       .mutable_opts = mutable_opts,
> +
> +    .bdrv_co_zone_report = raw_co_zone_report,
> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>   };
>   
>   /***********************************************/
> @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
>   #endif
>   };
>   
> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",
> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_parse_filename = hdev_parse_filename,
> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};
> +
>   #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>   static void cdrom_parse_filename(const char *filename, QDict *options,
>                                    Error **errp)
> @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
>   #if defined(HAVE_HOST_BLOCK_DEVICE)
>       bdrv_register(&bdrv_host_device);
>   #ifdef __linux__
> +    bdrv_register(&bdrv_zoned_host_device);
>       bdrv_register(&bdrv_host_cdrom);
>   #endif
>   #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..78cddeeda5 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -23,7 +23,6 @@
>    */
>   #ifndef BLOCK_COMMON_H
>   #define BLOCK_COMMON_H
> -
>   #include "block/aio.h"
>   #include "block/aio-wait.h"
>   #include "qemu/iov.h"
> @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
>   typedef struct BdrvChild BdrvChild;
>   typedef struct BdrvChildClass BdrvChildClass;
>   
> +typedef enum zone_op {
> +    zone_open,
> +    zone_close,
> +    zone_finish,
> +    zone_reset,
> +} zone_op;
> +
> +typedef enum zone_model {
> +    BLK_Z_HM,
> +    BLK_Z_HA,
> +} zone_model;
> +
> +typedef enum BlkZoneCondition {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlkZoneCondition;
> +
> +typedef enum BlkZoneType {
> +    BLK_ZT_CONV = 0x1,
> +    BLK_ZT_SWR = 0x2,
> +    BLK_ZT_SWP = 0x3,
> +} BlkZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provide information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlkZoneType type;
> +    BlkZoneCondition cond;
> +} BlockZoneDescriptor;
> +
>   typedef struct BlockDriverInfo {
>       /* in bytes, 0 if irrelevant */
>       int cluster_size;
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 8947abab76..b9ea9db6dc 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
>       struct BdrvTrackedRequest *waiting_for;
>   } BdrvTrackedRequest;
>   
> +/**
> + * Zone device information data structure.
> + * Provide information on a device.
> + */
> +typedef struct zbd_dev {
> +    uint32_t zone_size;
> +    zone_model model;
> +    uint32_t block_size;
> +    uint32_t write_granularity;
> +    uint32_t nr_zones;
> +    struct BlockZoneDescriptor *zones; /* array of zones */
> +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> +    uint32_t max_nr_active_zones;
> +} zbd_dev;
>   
>   struct BlockDriver {
>       /*
> @@ -691,6 +705,12 @@ struct BlockDriver {
>                                             QEMUIOVector *qiov,
>                                             int64_t pos);
>   
> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> +            int64_t offset, int64_t len, int64_t *nr_zones,
> +            BlockZoneDescriptor *zones);
> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> +            int64_t offset, int64_t len);
> +
>       /* removable device specific */
>       bool (*bdrv_is_inserted)(BlockDriverState *bs);
>       void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);

Other than that: Well done!

Cheers,

Hannes
Sam Li June 27, 2022, 7:45 a.m. UTC | #2
Hi Hannes,

Hannes Reinecke <hare@suse.de> 于2022年6月27日周一 15:21写道:

>
> On 6/27/22 02:19, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> > ---
> >   block/block-backend.c            |  56 ++++++++
> >   block/coroutines.h               |   5 +
> >   block/file-posix.c               | 238 +++++++++++++++++++++++++++++++
> >   include/block/block-common.h     |  43 +++++-
> >   include/block/block_int-common.h |  20 +++
> >   5 files changed, 361 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index e0e1aff4b1..786f964d02 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >       return ret;
> >   }
> >
> > +/*
> > + * Return zone_report from BlockDriver. Offset can be any number within
> > + * the zone size. No alignment for offset and len.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                       int64_t len, int64_t *nr_zones,
> > +                       BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    BlockDriverState *bs;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    bs = blk_bs(blk);
> > +
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> > +                              nr_zones, zones);
> > +    bdrv_dec_in_flight(bs);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Return zone_mgmt from BlockDriver.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    BlockDriverState *bs;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    bs = blk_bs(blk);
> > +
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    bdrv_inc_in_flight(bs);
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +    bdrv_dec_in_flight(bs);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >   void blk_drain(BlockBackend *blk)
> >   {
> >       BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..a114d7bc30 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >   blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >   int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t len, int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len);
> >
> >
> >   /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..1b8b0d351f 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> >   #include <sys/param.h>
> >   #include <sys/syscall.h>
> >   #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
> >   #include <linux/cdrom.h>
> >   #include <linux/fd.h>
> >   #include <linux/fs.h>
> > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >               PreallocMode prealloc;
> >               Error **errp;
> >           } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        zone_op op;
> >       };
> >   } RawPosixAIOData;
> >
> > @@ -1801,6 +1807,135 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
> >   }
> >   #endif
> >
> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +                              struct blk_zone *blkz) {
> > +    zone->start = blkz->start;
> > +    zone->length = blkz->len;
> > +    zone->cap = blkz->capacity;
> > +    zone->wp = blkz->wp - blkz->start;
> > +    zone->type = blkz->type;
> > +    zone->cond = blkz->cond;
> > +}
> > +
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    if (len == -1) {
> > +        return -errno;
> > +    }
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> > +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
> > +
> > +        ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        if (ret != 0) {
> > +            ret = -errno;
> > +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > +                         fd, offset, errno);
> > +            return ret;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            parse_zone(&zones[n], &blkz[i]);
> > +            /* The next report should start after the last zone reported */
> > +            offset = blkz[i].start + blkz[i].len;
> > +        }
>
> Where do you increase 'n' such that the loop can make forward progress?
> Wouldn't it be better to use a for() loop here?
>
'n' increases in this for loop as 'i' increases. I think the for()
loop can serve the same purpose with some modifications.

> > +    }
> > +
> > +    *nr_zones = n;
> > +    return 0;
> > +}
> > +
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
>
> Shouldn't we add a check here if 'fd' points to a zoned device?
> ioctl errors are not _that_ helpful here, as you might get a variety
> of errors and it's not quite obvious which of those errors indicate
> an unsupported feature.
>
Yes, I'll add it in the next patch.

> > +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> > +    if (ret) {
> > +        return -1;
> > +    }
> > +
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
> > +        error_report("offset is not the start of a zone");
> > +        return -1;
> > +    }
> > +
> > +    if (len & zone_size_mask) {
> > +        error_report("len is not aligned to zones");
> > +        return -1;
> > +    }
> > +
> > +    switch (op) {
> > +    case zone_open:
> > +        ioctl_name = "BLKOPENZONE";
> > +        ioctl_op = BLKOPENZONE;
> > +        break;
> > +    case zone_close:
> > +        ioctl_name = "BLKCLOSEZONE";
> > +        ioctl_op = BLKCLOSEZONE;
> > +        break;
> > +    case zone_finish:
> > +        ioctl_name = "BLKFINISHZONE";
> > +        ioctl_op = BLKFINISHZONE;
> > +        break;
> > +    case zone_reset:
> > +        ioctl_name = "BLKRESETZONE";
> > +        ioctl_op = BLKRESETZONE;
> > +        break;
> > +    default:
> > +        error_report("Invalid zone operation 0x%x", op);
> > +        errno = -EINVAL;
> > +        return -1;
> > +    }
> > +
> > +    /* Execute the operation */
> > +    range.sector = offset;
> > +    range.nr_sectors = len;
> > +    ret = ioctl(fd, ioctl_op, &range);
> > +    if (ret != 0) {
> > +        error_report("ioctl %s failed %d",
> > +                     ioctl_name, errno);
> > +        return -1;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >   static int handle_aiocb_copy_range(void *opaque)
> >   {
> >       RawPosixAIOData *aiocb = opaque;
> > @@ -2973,6 +3108,58 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
> >       }
> >   }
> >
> > +/*
> > + * zone report - Get a zone block device's information in the form
> > + * of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: an array of zone descriptors to hold zone
> > + * information on reply
> > + * @param offset: offset can be any byte within the zone size.
> > + * @param len: (not sure yet.
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +        int64_t len, int64_t *nr_zones,
> > +        BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
> > +        .aio_offset = offset,
> > +        .aio_nbytes = len,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +                },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> > +}
> > +
> > +/*
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > +        int64_t offset, int64_t len) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs             = bs,
> > +        .aio_fildes     = s->fd,
> > +        .aio_type       = QEMU_AIO_IOCTL,
> > +        .aio_offset     = offset,
> > +        .aio_nbytes     = len,
> > +        .op             = op,
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> > +}
> > +
> >   static coroutine_fn int
> >   raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                   bool blkdev)
> > @@ -3324,6 +3511,9 @@ BlockDriver bdrv_file = {
> >       .bdrv_abort_perm_update = raw_abort_perm_update,
> >       .create_opts = &raw_create_opts,
> >       .mutable_opts = mutable_opts,
> > +
> > +    .bdrv_co_zone_report = raw_co_zone_report,
> > +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >   };
> >
> >   /***********************************************/
> > @@ -3703,6 +3893,53 @@ static BlockDriver bdrv_host_device = {
> >   #endif
> >   };
> >
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_parse_filename = hdev_parse_filename,
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
> > +
> >   #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> >   static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                    Error **errp)
> > @@ -3964,6 +4201,7 @@ static void bdrv_file_init(void)
> >   #if defined(HAVE_HOST_BLOCK_DEVICE)
> >       bdrv_register(&bdrv_host_device);
> >   #ifdef __linux__
> > +    bdrv_register(&bdrv_zoned_host_device);
> >       bdrv_register(&bdrv_host_cdrom);
> >   #endif
> >   #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..78cddeeda5 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,6 @@
> >    */
> >   #ifndef BLOCK_COMMON_H
> >   #define BLOCK_COMMON_H
> > -
> >   #include "block/aio.h"
> >   #include "block/aio-wait.h"
> >   #include "qemu/iov.h"
> > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
> >   typedef struct BdrvChild BdrvChild;
> >   typedef struct BdrvChildClass BdrvChildClass;
> >
> > +typedef enum zone_op {
> > +    zone_open,
> > +    zone_close,
> > +    zone_finish,
> > +    zone_reset,
> > +} zone_op;
> > +
> > +typedef enum zone_model {
> > +    BLK_Z_HM,
> > +    BLK_Z_HA,
> > +} zone_model;
> > +
> > +typedef enum BlkZoneCondition {
> > +    BLK_ZS_NOT_WP = 0x0,
> > +    BLK_ZS_EMPTY = 0x1,
> > +    BLK_ZS_IOPEN = 0x2,
> > +    BLK_ZS_EOPEN = 0x3,
> > +    BLK_ZS_CLOSED = 0x4,
> > +    BLK_ZS_RDONLY = 0xD,
> > +    BLK_ZS_FULL = 0xE,
> > +    BLK_ZS_OFFLINE = 0xF,
> > +} BlkZoneCondition;
> > +
> > +typedef enum BlkZoneType {
> > +    BLK_ZT_CONV = 0x1,
> > +    BLK_ZT_SWR = 0x2,
> > +    BLK_ZT_SWP = 0x3,
> > +} BlkZoneType;
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > +    uint64_t start;
> > +    uint64_t length;
> > +    uint64_t cap;
> > +    uint64_t wp;
> > +    BlkZoneType type;
> > +    BlkZoneCondition cond;
> > +} BlockZoneDescriptor;
> > +
> >   typedef struct BlockDriverInfo {
> >       /* in bytes, 0 if irrelevant */
> >       int cluster_size;
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 8947abab76..b9ea9db6dc 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
> >       struct BdrvTrackedRequest *waiting_for;
> >   } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > +    uint32_t zone_size;
> > +    zone_model model;
> > +    uint32_t block_size;
> > +    uint32_t write_granularity;
> > +    uint32_t nr_zones;
> > +    struct BlockZoneDescriptor *zones; /* array of zones */
> > +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> > +    uint32_t max_nr_active_zones;
> > +} zbd_dev;
> >
> >   struct BlockDriver {
> >       /*
> > @@ -691,6 +705,12 @@ struct BlockDriver {
> >                                             QEMUIOVector *qiov,
> >                                             int64_t pos);
> >
> > +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> > +            int64_t offset, int64_t len, int64_t *nr_zones,
> > +            BlockZoneDescriptor *zones);
> > +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> > +            int64_t offset, int64_t len);
> > +
> >       /* removable device specific */
> >       bool (*bdrv_is_inserted)(BlockDriverState *bs);
> >       void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>
> Other than that: Well done!
>
Thanks for reviewing!

Have a good day!
Sam
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke                        Kernel Storage Architect
> hare@suse.de                                      +49 911 74053 688
> SUSE Software Solutions Germany GmbH, Maxfeldstr. 5, 90409 Nürnberg
> HRB 36809 (AG Nürnberg), GF: Felix Imendörffer
Stefan Hajnoczi June 27, 2022, 2:15 p.m. UTC | #3
On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> diff --git a/block/block-backend.c b/block/block-backend.c
> index e0e1aff4b1..786f964d02 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>      return ret;
>  }
>  
> +/*
> + * Return zone_report from BlockDriver. Offset can be any number within
> + * the zone size. No alignment for offset and len.

What is the purpose of len? Is it the maximum number of zones to return
in nr_zones[]?

How does the caller know how many zones were returned?

> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                       int64_t len, int64_t *nr_zones,
> +                       BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    BlockDriverState *bs;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    bs = blk_bs(blk);
> +
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    bdrv_inc_in_flight(bs);

The bdrv_inc/dec_in_flight() call should be inside
bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.

> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> +                              nr_zones, zones);
> +    bdrv_dec_in_flight(bs);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Return zone_mgmt from BlockDriver.

Maybe this should be:

  Send a zone management command.

> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>              PreallocMode prealloc;
>              Error **errp;
>          } truncate;
> +        struct {
> +            int64_t *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        zone_op op;

It's cleaner to put op inside a struct zone_mgmt so its purpose is
self-explanatory:

  struct {
      zone_op op;
  } zone_mgmt;

> +static int handle_aiocb_zone_report(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    if (len == -1) {
> +        return -errno;

Where is errno set? Should this be an errno constant instead like
-EINVAL?

> +    }
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);

g_new() looks incorrect. There should be 1 struct blk_zone_report
followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
instead.

> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> +    printf("start to report zone with offset: 0x%lx\n", offset);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;

What prevents zones[] overflowing? nrz isn't being reduced in the loop,
so maybe the rep->nr_zones return value will eventually exceed the
number of elements still available in zones[n..]?

> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +    zone_op op = aiocb->op;
> +
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    unsigned long ioctl_op;
> +    int64_t zone_size;
> +    int64_t zone_size_mask;
> +    int ret;
> +
> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);

Can this value be stored in bs (maybe in BlockLimits) to avoid calling
ioctl(BLKGETZONESZ) each time?

> +    if (ret) {
> +        return -1;

The return value should be a negative errno. -1 is -EPERM but it's
probably not that error code you wanted. This should be:

  return -errno;

> +    }
> +
> +    zone_size_mask = zone_size - 1;
> +    if (offset & zone_size_mask) {
> +        error_report("offset is not the start of a zone");
> +        return -1;

return -EINVAL;

> +    }
> +
> +    if (len & zone_size_mask) {
> +        error_report("len is not aligned to zones");
> +        return -1;

return -EINVAL;

> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +        int64_t len, int64_t *nr_zones,
> +        BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_IOCTL,
> +        .aio_offset = offset,
> +        .aio_nbytes = len,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +                },

Indentation is off here. Please use 4-space indentation.
Sam Li June 28, 2022, 8:05 a.m. UTC | #4
Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
>
> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index e0e1aff4b1..786f964d02 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >      return ret;
> >  }
> >
> > +/*
> > + * Return zone_report from BlockDriver. Offset can be any number within
> > + * the zone size. No alignment for offset and len.
>
> What is the purpose of len? Is it the maximum number of zones to return
> in nr_zones[]?

len is actually not used in bdrv_co_zone_report. It is needed by
blk_check_byte_request.

> How does the caller know how many zones were returned?

nr_zones represents IN maximum and OUT actual. The caller will know by
nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
comments.

>
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                       int64_t len, int64_t *nr_zones,
> > +                       BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    BlockDriverState *bs;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    bs = blk_bs(blk);
> > +
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    bdrv_inc_in_flight(bs);
>
> The bdrv_inc/dec_in_flight() call should be inside
> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
>
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> > +                              nr_zones, zones);
> > +    bdrv_dec_in_flight(bs);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Return zone_mgmt from BlockDriver.
>
> Maybe this should be:
>
>   Send a zone management command.

Yes, it's more accurate.

>
> > @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >              PreallocMode prealloc;
> >              Error **errp;
> >          } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        zone_op op;
>
> It's cleaner to put op inside a struct zone_mgmt so its purpose is
> self-explanatory:
>
>   struct {
>       zone_op op;
>   } zone_mgmt;
>
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    if (len == -1) {
> > +        return -errno;
>
> Where is errno set? Should this be an errno constant instead like
> -EINVAL?

That's right! Noted.

>
> > +    }
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>
> g_new() looks incorrect. There should be 1 struct blk_zone_report
> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> instead.

Yes! However, it still has a memory leak error when using g_autofree
&& g_malloc.

>
> > +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
>
> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
> so maybe the rep->nr_zones return value will eventually exceed the
> number of elements still available in zones[n..]?

I suppose the number of zones[] is restricted in the subsequent for
loop where zones[] copy one zone at a time as n increases. Even if
rep->zones exceeds the available room in zones[], the extra zone will
not be copied.

>
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
> > +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
>
> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
> ioctl(BLKGETZONESZ) each time?

Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
I think through the configurations. In addition, it's a temporary
approach. It is substituted by get_sysfs_long_val now.

>
> > +    if (ret) {
> > +        return -1;
>
> The return value should be a negative errno. -1 is -EPERM but it's
> probably not that error code you wanted. This should be:
>
>   return -errno;
>
> > +    }
> > +
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
> > +        error_report("offset is not the start of a zone");
> > +        return -1;
>
> return -EINVAL;
>
> > +    }
> > +
> > +    if (len & zone_size_mask) {
> > +        error_report("len is not aligned to zones");
> > +        return -1;
>
> return -EINVAL;
>
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +        int64_t len, int64_t *nr_zones,
> > +        BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
> > +        .aio_offset = offset,
> > +        .aio_nbytes = len,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +                },
>
> Indentation is off here. Please use 4-space indentation.
Noted!

Thanks for reviewing!

Sam
Damien Le Moal June 28, 2022, 9:05 a.m. UTC | #5
On 6/28/22 17:05, Sam Li wrote:
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
>>
>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>> index e0e1aff4b1..786f964d02 100644
>>> --- a/block/block-backend.c
>>> +++ b/block/block-backend.c
>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>>>      return ret;
>>>  }
>>>
>>> +/*
>>> + * Return zone_report from BlockDriver. Offset can be any number within
>>> + * the zone size. No alignment for offset and len.
>>
>> What is the purpose of len? Is it the maximum number of zones to return
>> in nr_zones[]?
> 
> len is actually not used in bdrv_co_zone_report. It is needed by
> blk_check_byte_request.

There is no IO buffer being passed. Only an array of zone descriptors and
an in-out number of zones. len is definitely not needed. Not sure what
blk_check_byte_request() is intended to check though.

> 
>> How does the caller know how many zones were returned?
> 
> nr_zones represents IN maximum and OUT actual. The caller will know by
> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
> comments.
> 
>>
>>> + */
>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>> +                       int64_t len, int64_t *nr_zones,
>>> +                       BlockZoneDescriptor *zones)
>>> +{
>>> +    int ret;
>>> +    BlockDriverState *bs;
>>> +    IO_CODE();
>>> +
>>> +    blk_inc_in_flight(blk); /* increase before waiting */
>>> +    blk_wait_while_drained(blk);
>>> +    bs = blk_bs(blk);
>>> +
>>> +    ret = blk_check_byte_request(blk, offset, len);
>>> +    if (ret < 0) {
>>> +        return ret;
>>> +    }
>>> +
>>> +    bdrv_inc_in_flight(bs);
>>
>> The bdrv_inc/dec_in_flight() call should be inside
>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
>>
>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
>>> +                              nr_zones, zones);
>>> +    bdrv_dec_in_flight(bs);
>>> +    blk_dec_in_flight(blk);
>>> +    return ret;
>>> +}
>>> +
>>> +/*
>>> + * Return zone_mgmt from BlockDriver.
>>
>> Maybe this should be:
>>
>>   Send a zone management command.
> 
> Yes, it's more accurate.
> 
>>
>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>>>              PreallocMode prealloc;
>>>              Error **errp;
>>>          } truncate;
>>> +        struct {
>>> +            int64_t *nr_zones;
>>> +            BlockZoneDescriptor *zones;
>>> +        } zone_report;
>>> +        zone_op op;
>>
>> It's cleaner to put op inside a struct zone_mgmt so its purpose is
>> self-explanatory:
>>
>>   struct {
>>       zone_op op;
>>   } zone_mgmt;
>>
>>> +static int handle_aiocb_zone_report(void *opaque) {
>>> +    RawPosixAIOData *aiocb = opaque;
>>> +    int fd = aiocb->aio_fildes;
>>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>>> +    int64_t offset = aiocb->aio_offset;
>>> +    int64_t len = aiocb->aio_nbytes;

Since you have the zone array and number of zones (size of that array) I
really do not see why you need len.

>>> +
>>> +    struct blk_zone *blkz;
>>> +    int64_t rep_size, nrz;
>>> +    int ret, n = 0, i = 0;
>>> +
>>> +    nrz = *nr_zones;
>>> +    if (len == -1) {
>>> +        return -errno;
>>
>> Where is errno set? Should this be an errno constant instead like
>> -EINVAL?
> 
> That's right! Noted.
> 
>>
>>> +    }
>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>>
>> g_new() looks incorrect. There should be 1 struct blk_zone_report
>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
>> instead.
> 
> Yes! However, it still has a memory leak error when using g_autofree
> && g_malloc.

That may be because you are changing the value of the rep pointer while
parsing the report ?

> 
>>
>>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
>>> +    printf("start to report zone with offset: 0x%lx\n", offset);
>>> +
>>> +    blkz = (struct blk_zone *)(rep + 1);
>>> +    while (n < nrz) {
>>> +        memset(rep, 0, rep_size);
>>> +        rep->sector = offset;
>>> +        rep->nr_zones = nrz;
>>
>> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
>> so maybe the rep->nr_zones return value will eventually exceed the
>> number of elements still available in zones[n..]?
> 
> I suppose the number of zones[] is restricted in the subsequent for
> loop where zones[] copy one zone at a time as n increases. Even if
> rep->zones exceeds the available room in zones[], the extra zone will
> not be copied.
> 
>>
>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>>> +    RawPosixAIOData *aiocb = opaque;
>>> +    int fd = aiocb->aio_fildes;
>>> +    int64_t offset = aiocb->aio_offset;
>>> +    int64_t len = aiocb->aio_nbytes;
>>> +    zone_op op = aiocb->op;
>>> +
>>> +    struct blk_zone_range range;
>>> +    const char *ioctl_name;
>>> +    unsigned long ioctl_op;
>>> +    int64_t zone_size;
>>> +    int64_t zone_size_mask;
>>> +    int ret;
>>> +
>>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
>>
>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
>> ioctl(BLKGETZONESZ) each time?
> 
> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
> I think through the configurations. In addition, it's a temporary
> approach. It is substituted by get_sysfs_long_val now.
> 
>>
>>> +    if (ret) {
>>> +        return -1;
>>
>> The return value should be a negative errno. -1 is -EPERM but it's
>> probably not that error code you wanted. This should be:
>>
>>   return -errno;
>>
>>> +    }
>>> +
>>> +    zone_size_mask = zone_size - 1;
>>> +    if (offset & zone_size_mask) {
>>> +        error_report("offset is not the start of a zone");
>>> +        return -1;
>>
>> return -EINVAL;
>>
>>> +    }
>>> +
>>> +    if (len & zone_size_mask) {
>>> +        error_report("len is not aligned to zones");
>>> +        return -1;
>>
>> return -EINVAL;
>>
>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
>>> +        int64_t len, int64_t *nr_zones,
>>> +        BlockZoneDescriptor *zones) {
>>> +    BDRVRawState *s = bs->opaque;
>>> +    RawPosixAIOData acb;
>>> +
>>> +    acb = (RawPosixAIOData) {
>>> +        .bs         = bs,
>>> +        .aio_fildes = s->fd,
>>> +        .aio_type   = QEMU_AIO_IOCTL,
>>> +        .aio_offset = offset,
>>> +        .aio_nbytes = len,
>>> +        .zone_report    = {
>>> +                .nr_zones       = nr_zones,
>>> +                .zones          = zones,
>>> +                },
>>
>> Indentation is off here. Please use 4-space indentation.
> Noted!
> 
> Thanks for reviewing!
> 
> Sam
Sam Li June 28, 2022, 10:23 a.m. UTC | #6
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道:
>
> On 6/28/22 17:05, Sam Li wrote:
> > Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
> >>
> >> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> >>> diff --git a/block/block-backend.c b/block/block-backend.c
> >>> index e0e1aff4b1..786f964d02 100644
> >>> --- a/block/block-backend.c
> >>> +++ b/block/block-backend.c
> >>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >>>      return ret;
> >>>  }
> >>>
> >>> +/*
> >>> + * Return zone_report from BlockDriver. Offset can be any number within
> >>> + * the zone size. No alignment for offset and len.
> >>
> >> What is the purpose of len? Is it the maximum number of zones to return
> >> in nr_zones[]?
> >
> > len is actually not used in bdrv_co_zone_report. It is needed by
> > blk_check_byte_request.
>
> There is no IO buffer being passed. Only an array of zone descriptors and
> an in-out number of zones. len is definitely not needed. Not sure what
> blk_check_byte_request() is intended to check though.

Can I just remove len argument and blk_check_byte_request() if it's not used?

>
> >
> >> How does the caller know how many zones were returned?
> >
> > nr_zones represents IN maximum and OUT actual. The caller will know by
> > nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
> > comments.
> >
> >>
> >>> + */
> >>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> >>> +                       int64_t len, int64_t *nr_zones,
> >>> +                       BlockZoneDescriptor *zones)
> >>> +{
> >>> +    int ret;
> >>> +    BlockDriverState *bs;
> >>> +    IO_CODE();
> >>> +
> >>> +    blk_inc_in_flight(blk); /* increase before waiting */
> >>> +    blk_wait_while_drained(blk);
> >>> +    bs = blk_bs(blk);
> >>> +
> >>> +    ret = blk_check_byte_request(blk, offset, len);
> >>> +    if (ret < 0) {
> >>> +        return ret;
> >>> +    }
> >>> +
> >>> +    bdrv_inc_in_flight(bs);
> >>
> >> The bdrv_inc/dec_in_flight() call should be inside
> >> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
> >>
> >>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> >>> +                              nr_zones, zones);
> >>> +    bdrv_dec_in_flight(bs);
> >>> +    blk_dec_in_flight(blk);
> >>> +    return ret;
> >>> +}
> >>> +
> >>> +/*
> >>> + * Return zone_mgmt from BlockDriver.
> >>
> >> Maybe this should be:
> >>
> >>   Send a zone management command.
> >
> > Yes, it's more accurate.
> >
> >>
> >>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >>>              PreallocMode prealloc;
> >>>              Error **errp;
> >>>          } truncate;
> >>> +        struct {
> >>> +            int64_t *nr_zones;
> >>> +            BlockZoneDescriptor *zones;
> >>> +        } zone_report;
> >>> +        zone_op op;
> >>
> >> It's cleaner to put op inside a struct zone_mgmt so its purpose is
> >> self-explanatory:
> >>
> >>   struct {
> >>       zone_op op;
> >>   } zone_mgmt;
> >>
> >>> +static int handle_aiocb_zone_report(void *opaque) {
> >>> +    RawPosixAIOData *aiocb = opaque;
> >>> +    int fd = aiocb->aio_fildes;
> >>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> >>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> >>> +    int64_t offset = aiocb->aio_offset;
> >>> +    int64_t len = aiocb->aio_nbytes;
>
> Since you have the zone array and number of zones (size of that array) I
> really do not see why you need len.
>
> >>> +
> >>> +    struct blk_zone *blkz;
> >>> +    int64_t rep_size, nrz;
> >>> +    int ret, n = 0, i = 0;
> >>> +
> >>> +    nrz = *nr_zones;
> >>> +    if (len == -1) {
> >>> +        return -errno;
> >>
> >> Where is errno set? Should this be an errno constant instead like
> >> -EINVAL?
> >
> > That's right! Noted.
> >
> >>
> >>> +    }
> >>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> >>
> >> g_new() looks incorrect. There should be 1 struct blk_zone_report
> >> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> >> instead.
> >
> > Yes! However, it still has a memory leak error when using g_autofree
> > && g_malloc.
>
> That may be because you are changing the value of the rep pointer while
> parsing the report ?

I am not sure it is the case. Can you show me some way to find the problem?

Thanks for reviewing!


>
> >
> >>
> >>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> >>> +    printf("start to report zone with offset: 0x%lx\n", offset);
> >>> +
> >>> +    blkz = (struct blk_zone *)(rep + 1);
> >>> +    while (n < nrz) {
> >>> +        memset(rep, 0, rep_size);
> >>> +        rep->sector = offset;
> >>> +        rep->nr_zones = nrz;
> >>
> >> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
> >> so maybe the rep->nr_zones return value will eventually exceed the
> >> number of elements still available in zones[n..]?
> >
> > I suppose the number of zones[] is restricted in the subsequent for
> > loop where zones[] copy one zone at a time as n increases. Even if
> > rep->zones exceeds the available room in zones[], the extra zone will
> > not be copied.
> >
> >>
> >>> +static int handle_aiocb_zone_mgmt(void *opaque) {
> >>> +    RawPosixAIOData *aiocb = opaque;
> >>> +    int fd = aiocb->aio_fildes;
> >>> +    int64_t offset = aiocb->aio_offset;
> >>> +    int64_t len = aiocb->aio_nbytes;
> >>> +    zone_op op = aiocb->op;
> >>> +
> >>> +    struct blk_zone_range range;
> >>> +    const char *ioctl_name;
> >>> +    unsigned long ioctl_op;
> >>> +    int64_t zone_size;
> >>> +    int64_t zone_size_mask;
> >>> +    int ret;
> >>> +
> >>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> >>
> >> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
> >> ioctl(BLKGETZONESZ) each time?
> >
> > Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
> > I think through the configurations. In addition, it's a temporary
> > approach. It is substituted by get_sysfs_long_val now.
> >
> >>
> >>> +    if (ret) {
> >>> +        return -1;
> >>
> >> The return value should be a negative errno. -1 is -EPERM but it's
> >> probably not that error code you wanted. This should be:
> >>
> >>   return -errno;
> >>
> >>> +    }
> >>> +
> >>> +    zone_size_mask = zone_size - 1;
> >>> +    if (offset & zone_size_mask) {
> >>> +        error_report("offset is not the start of a zone");
> >>> +        return -1;
> >>
> >> return -EINVAL;
> >>
> >>> +    }
> >>> +
> >>> +    if (len & zone_size_mask) {
> >>> +        error_report("len is not aligned to zones");
> >>> +        return -1;
> >>
> >> return -EINVAL;
> >>
> >>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> >>> +        int64_t len, int64_t *nr_zones,
> >>> +        BlockZoneDescriptor *zones) {
> >>> +    BDRVRawState *s = bs->opaque;
> >>> +    RawPosixAIOData acb;
> >>> +
> >>> +    acb = (RawPosixAIOData) {
> >>> +        .bs         = bs,
> >>> +        .aio_fildes = s->fd,
> >>> +        .aio_type   = QEMU_AIO_IOCTL,
> >>> +        .aio_offset = offset,
> >>> +        .aio_nbytes = len,
> >>> +        .zone_report    = {
> >>> +                .nr_zones       = nr_zones,
> >>> +                .zones          = zones,
> >>> +                },
> >>
> >> Indentation is off here. Please use 4-space indentation.
> > Noted!
> >
> > Thanks for reviewing!
> >
> > Sam
>
>
> --
> Damien Le Moal
> Western Digital Research
Damien Le Moal June 29, 2022, 1:43 a.m. UTC | #7
On 6/28/22 19:23, Sam Li wrote:
> Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道:
>>
>> On 6/28/22 17:05, Sam Li wrote:
>>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
>>>>
>>>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
>>>>> diff --git a/block/block-backend.c b/block/block-backend.c
>>>>> index e0e1aff4b1..786f964d02 100644
>>>>> --- a/block/block-backend.c
>>>>> +++ b/block/block-backend.c
>>>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
>>>>>      return ret;
>>>>>  }
>>>>>
>>>>> +/*
>>>>> + * Return zone_report from BlockDriver. Offset can be any number within
>>>>> + * the zone size. No alignment for offset and len.
>>>>
>>>> What is the purpose of len? Is it the maximum number of zones to return
>>>> in nr_zones[]?
>>>
>>> len is actually not used in bdrv_co_zone_report. It is needed by
>>> blk_check_byte_request.
>>
>> There is no IO buffer being passed. Only an array of zone descriptors and
>> an in-out number of zones. len is definitely not needed. Not sure what
>> blk_check_byte_request() is intended to check though.
> 
> Can I just remove len argument and blk_check_byte_request() if it's not used?

If it is unused, yes, you must remove it.

> 
>>
>>>
>>>> How does the caller know how many zones were returned?
>>>
>>> nr_zones represents IN maximum and OUT actual. The caller will know by
>>> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
>>> comments.
>>>
>>>>
>>>>> + */
>>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>>>>> +                       int64_t len, int64_t *nr_zones,
>>>>> +                       BlockZoneDescriptor *zones)
>>>>> +{
>>>>> +    int ret;
>>>>> +    BlockDriverState *bs;
>>>>> +    IO_CODE();
>>>>> +
>>>>> +    blk_inc_in_flight(blk); /* increase before waiting */
>>>>> +    blk_wait_while_drained(blk);
>>>>> +    bs = blk_bs(blk);
>>>>> +
>>>>> +    ret = blk_check_byte_request(blk, offset, len);
>>>>> +    if (ret < 0) {
>>>>> +        return ret;
>>>>> +    }
>>>>> +
>>>>> +    bdrv_inc_in_flight(bs);
>>>>
>>>> The bdrv_inc/dec_in_flight() call should be inside
>>>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
>>>>
>>>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
>>>>> +                              nr_zones, zones);
>>>>> +    bdrv_dec_in_flight(bs);
>>>>> +    blk_dec_in_flight(blk);
>>>>> +    return ret;
>>>>> +}
>>>>> +
>>>>> +/*
>>>>> + * Return zone_mgmt from BlockDriver.
>>>>
>>>> Maybe this should be:
>>>>
>>>>   Send a zone management command.
>>>
>>> Yes, it's more accurate.
>>>
>>>>
>>>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
>>>>>              PreallocMode prealloc;
>>>>>              Error **errp;
>>>>>          } truncate;
>>>>> +        struct {
>>>>> +            int64_t *nr_zones;
>>>>> +            BlockZoneDescriptor *zones;
>>>>> +        } zone_report;
>>>>> +        zone_op op;
>>>>
>>>> It's cleaner to put op inside a struct zone_mgmt so its purpose is
>>>> self-explanatory:
>>>>
>>>>   struct {
>>>>       zone_op op;
>>>>   } zone_mgmt;
>>>>
>>>>> +static int handle_aiocb_zone_report(void *opaque) {
>>>>> +    RawPosixAIOData *aiocb = opaque;
>>>>> +    int fd = aiocb->aio_fildes;
>>>>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
>>>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>>>>> +    int64_t offset = aiocb->aio_offset;
>>>>> +    int64_t len = aiocb->aio_nbytes;
>>
>> Since you have the zone array and number of zones (size of that array) I
>> really do not see why you need len.
>>
>>>>> +
>>>>> +    struct blk_zone *blkz;
>>>>> +    int64_t rep_size, nrz;
>>>>> +    int ret, n = 0, i = 0;
>>>>> +
>>>>> +    nrz = *nr_zones;
>>>>> +    if (len == -1) {
>>>>> +        return -errno;
>>>>
>>>> Where is errno set? Should this be an errno constant instead like
>>>> -EINVAL?
>>>
>>> That's right! Noted.
>>>
>>>>
>>>>> +    }
>>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>>>>
>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
>>>> instead.
>>>
>>> Yes! However, it still has a memory leak error when using g_autofree
>>> && g_malloc.
>>
>> That may be because you are changing the value of the rep pointer while
>> parsing the report ?
> 
> I am not sure it is the case. Can you show me some way to find the problem?

Not sure. I never used this g_malloc()/g_autofree() before so not sure how
it works. It may be that g_autofree() work only with g_new() ?
Could you try separating the declaration and allocation ? e.g.

Declare at the beginning of the function:
g_autofree struct blk_zone_report *rep = NULL;

And then when needed do:

rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
rep = g_malloc(rep_size);

> 
> Thanks for reviewing!
> 
> 
>>
>>>
>>>>
>>>>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
>>>>> +    printf("start to report zone with offset: 0x%lx\n", offset);
>>>>> +
>>>>> +    blkz = (struct blk_zone *)(rep + 1);
>>>>> +    while (n < nrz) {
>>>>> +        memset(rep, 0, rep_size);
>>>>> +        rep->sector = offset;
>>>>> +        rep->nr_zones = nrz;
>>>>
>>>> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
>>>> so maybe the rep->nr_zones return value will eventually exceed the
>>>> number of elements still available in zones[n..]?
>>>
>>> I suppose the number of zones[] is restricted in the subsequent for
>>> loop where zones[] copy one zone at a time as n increases. Even if
>>> rep->zones exceeds the available room in zones[], the extra zone will
>>> not be copied.
>>>
>>>>
>>>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>>>>> +    RawPosixAIOData *aiocb = opaque;
>>>>> +    int fd = aiocb->aio_fildes;
>>>>> +    int64_t offset = aiocb->aio_offset;
>>>>> +    int64_t len = aiocb->aio_nbytes;
>>>>> +    zone_op op = aiocb->op;
>>>>> +
>>>>> +    struct blk_zone_range range;
>>>>> +    const char *ioctl_name;
>>>>> +    unsigned long ioctl_op;
>>>>> +    int64_t zone_size;
>>>>> +    int64_t zone_size_mask;
>>>>> +    int ret;
>>>>> +
>>>>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
>>>>
>>>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
>>>> ioctl(BLKGETZONESZ) each time?
>>>
>>> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
>>> I think through the configurations. In addition, it's a temporary
>>> approach. It is substituted by get_sysfs_long_val now.
>>>
>>>>
>>>>> +    if (ret) {
>>>>> +        return -1;
>>>>
>>>> The return value should be a negative errno. -1 is -EPERM but it's
>>>> probably not that error code you wanted. This should be:
>>>>
>>>>   return -errno;
>>>>
>>>>> +    }
>>>>> +
>>>>> +    zone_size_mask = zone_size - 1;
>>>>> +    if (offset & zone_size_mask) {
>>>>> +        error_report("offset is not the start of a zone");
>>>>> +        return -1;
>>>>
>>>> return -EINVAL;
>>>>
>>>>> +    }
>>>>> +
>>>>> +    if (len & zone_size_mask) {
>>>>> +        error_report("len is not aligned to zones");
>>>>> +        return -1;
>>>>
>>>> return -EINVAL;
>>>>
>>>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
>>>>> +        int64_t len, int64_t *nr_zones,
>>>>> +        BlockZoneDescriptor *zones) {
>>>>> +    BDRVRawState *s = bs->opaque;
>>>>> +    RawPosixAIOData acb;
>>>>> +
>>>>> +    acb = (RawPosixAIOData) {
>>>>> +        .bs         = bs,
>>>>> +        .aio_fildes = s->fd,
>>>>> +        .aio_type   = QEMU_AIO_IOCTL,
>>>>> +        .aio_offset = offset,
>>>>> +        .aio_nbytes = len,
>>>>> +        .zone_report    = {
>>>>> +                .nr_zones       = nr_zones,
>>>>> +                .zones          = zones,
>>>>> +                },
>>>>
>>>> Indentation is off here. Please use 4-space indentation.
>>> Noted!
>>>
>>> Thanks for reviewing!
>>>
>>> Sam
>>
>>
>> --
>> Damien Le Moal
>> Western Digital Research
Sam Li June 29, 2022, 1:50 a.m. UTC | #8
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月29日周三 09:43写道:
>
> On 6/28/22 19:23, Sam Li wrote:
> > Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月28日周二 17:05写道:
> >>
> >> On 6/28/22 17:05, Sam Li wrote:
> >>> Stefan Hajnoczi <stefanha@redhat.com> 于2022年6月28日周二 14:52写道:
> >>>>
> >>>> On Mon, Jun 27, 2022 at 08:19:13AM +0800, Sam Li wrote:
> >>>>> diff --git a/block/block-backend.c b/block/block-backend.c
> >>>>> index e0e1aff4b1..786f964d02 100644
> >>>>> --- a/block/block-backend.c
> >>>>> +++ b/block/block-backend.c
> >>>>> @@ -1810,6 +1810,62 @@ int blk_flush(BlockBackend *blk)
> >>>>>      return ret;
> >>>>>  }
> >>>>>
> >>>>> +/*
> >>>>> + * Return zone_report from BlockDriver. Offset can be any number within
> >>>>> + * the zone size. No alignment for offset and len.
> >>>>
> >>>> What is the purpose of len? Is it the maximum number of zones to return
> >>>> in nr_zones[]?
> >>>
> >>> len is actually not used in bdrv_co_zone_report. It is needed by
> >>> blk_check_byte_request.
> >>
> >> There is no IO buffer being passed. Only an array of zone descriptors and
> >> an in-out number of zones. len is definitely not needed. Not sure what
> >> blk_check_byte_request() is intended to check though.
> >
> > Can I just remove len argument and blk_check_byte_request() if it's not used?
>
> If it is unused, yes, you must remove it.
>
> >
> >>
> >>>
> >>>> How does the caller know how many zones were returned?
> >>>
> >>> nr_zones represents IN maximum and OUT actual. The caller will know by
> >>> nr_zones which is changed in bdrv_co_zone_report. I'll add it in the
> >>> comments.
> >>>
> >>>>
> >>>>> + */
> >>>>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> >>>>> +                       int64_t len, int64_t *nr_zones,
> >>>>> +                       BlockZoneDescriptor *zones)
> >>>>> +{
> >>>>> +    int ret;
> >>>>> +    BlockDriverState *bs;
> >>>>> +    IO_CODE();
> >>>>> +
> >>>>> +    blk_inc_in_flight(blk); /* increase before waiting */
> >>>>> +    blk_wait_while_drained(blk);
> >>>>> +    bs = blk_bs(blk);
> >>>>> +
> >>>>> +    ret = blk_check_byte_request(blk, offset, len);
> >>>>> +    if (ret < 0) {
> >>>>> +        return ret;
> >>>>> +    }
> >>>>> +
> >>>>> +    bdrv_inc_in_flight(bs);
> >>>>
> >>>> The bdrv_inc/dec_in_flight() call should be inside
> >>>> bdrv_co_zone_report(). See bdrv_co_ioctl() for an example.
> >>>>
> >>>>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
> >>>>> +                              nr_zones, zones);
> >>>>> +    bdrv_dec_in_flight(bs);
> >>>>> +    blk_dec_in_flight(blk);
> >>>>> +    return ret;
> >>>>> +}
> >>>>> +
> >>>>> +/*
> >>>>> + * Return zone_mgmt from BlockDriver.
> >>>>
> >>>> Maybe this should be:
> >>>>
> >>>>   Send a zone management command.
> >>>
> >>> Yes, it's more accurate.
> >>>
> >>>>
> >>>>> @@ -216,6 +217,11 @@ typedef struct RawPosixAIOData {
> >>>>>              PreallocMode prealloc;
> >>>>>              Error **errp;
> >>>>>          } truncate;
> >>>>> +        struct {
> >>>>> +            int64_t *nr_zones;
> >>>>> +            BlockZoneDescriptor *zones;
> >>>>> +        } zone_report;
> >>>>> +        zone_op op;
> >>>>
> >>>> It's cleaner to put op inside a struct zone_mgmt so its purpose is
> >>>> self-explanatory:
> >>>>
> >>>>   struct {
> >>>>       zone_op op;
> >>>>   } zone_mgmt;
> >>>>
> >>>>> +static int handle_aiocb_zone_report(void *opaque) {
> >>>>> +    RawPosixAIOData *aiocb = opaque;
> >>>>> +    int fd = aiocb->aio_fildes;
> >>>>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> >>>>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> >>>>> +    int64_t offset = aiocb->aio_offset;
> >>>>> +    int64_t len = aiocb->aio_nbytes;
> >>
> >> Since you have the zone array and number of zones (size of that array) I
> >> really do not see why you need len.
> >>
> >>>>> +
> >>>>> +    struct blk_zone *blkz;
> >>>>> +    int64_t rep_size, nrz;
> >>>>> +    int ret, n = 0, i = 0;
> >>>>> +
> >>>>> +    nrz = *nr_zones;
> >>>>> +    if (len == -1) {
> >>>>> +        return -errno;
> >>>>
> >>>> Where is errno set? Should this be an errno constant instead like
> >>>> -EINVAL?
> >>>
> >>> That's right! Noted.
> >>>
> >>>>
> >>>>> +    }
> >>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> >>>>
> >>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
> >>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> >>>> instead.
> >>>
> >>> Yes! However, it still has a memory leak error when using g_autofree
> >>> && g_malloc.
> >>
> >> That may be because you are changing the value of the rep pointer while
> >> parsing the report ?
> >
> > I am not sure it is the case. Can you show me some way to find the problem?
>
> Not sure. I never used this g_malloc()/g_autofree() before so not sure how
> it works. It may be that g_autofree() work only with g_new() ?
> Could you try separating the declaration and allocation ? e.g.
>
> Declare at the beginning of the function:
> g_autofree struct blk_zone_report *rep = NULL;
>
> And then when needed do:
>
> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> rep = g_malloc(rep_size);

Actually, the memory leak occurs in that way. When using zone_mgmt,
memory leak still occurs. Asan gives the error information not much so
I haven't tracked down the problem yet.

>
> >
> > Thanks for reviewing!
> >
> >
> >>
> >>>
> >>>>
> >>>>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> >>>>> +    printf("start to report zone with offset: 0x%lx\n", offset);
> >>>>> +
> >>>>> +    blkz = (struct blk_zone *)(rep + 1);
> >>>>> +    while (n < nrz) {
> >>>>> +        memset(rep, 0, rep_size);
> >>>>> +        rep->sector = offset;
> >>>>> +        rep->nr_zones = nrz;
> >>>>
> >>>> What prevents zones[] overflowing? nrz isn't being reduced in the loop,
> >>>> so maybe the rep->nr_zones return value will eventually exceed the
> >>>> number of elements still available in zones[n..]?
> >>>
> >>> I suppose the number of zones[] is restricted in the subsequent for
> >>> loop where zones[] copy one zone at a time as n increases. Even if
> >>> rep->zones exceeds the available room in zones[], the extra zone will
> >>> not be copied.
> >>>
> >>>>
> >>>>> +static int handle_aiocb_zone_mgmt(void *opaque) {
> >>>>> +    RawPosixAIOData *aiocb = opaque;
> >>>>> +    int fd = aiocb->aio_fildes;
> >>>>> +    int64_t offset = aiocb->aio_offset;
> >>>>> +    int64_t len = aiocb->aio_nbytes;
> >>>>> +    zone_op op = aiocb->op;
> >>>>> +
> >>>>> +    struct blk_zone_range range;
> >>>>> +    const char *ioctl_name;
> >>>>> +    unsigned long ioctl_op;
> >>>>> +    int64_t zone_size;
> >>>>> +    int64_t zone_size_mask;
> >>>>> +    int ret;
> >>>>> +
> >>>>> +    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
> >>>>
> >>>> Can this value be stored in bs (maybe in BlockLimits) to avoid calling
> >>>> ioctl(BLKGETZONESZ) each time?
> >>>
> >>> Yes, zone_size is in the zbd_dev field. I'll update BlockLimits after
> >>> I think through the configurations. In addition, it's a temporary
> >>> approach. It is substituted by get_sysfs_long_val now.
> >>>
> >>>>
> >>>>> +    if (ret) {
> >>>>> +        return -1;
> >>>>
> >>>> The return value should be a negative errno. -1 is -EPERM but it's
> >>>> probably not that error code you wanted. This should be:
> >>>>
> >>>>   return -errno;
> >>>>
> >>>>> +    }
> >>>>> +
> >>>>> +    zone_size_mask = zone_size - 1;
> >>>>> +    if (offset & zone_size_mask) {
> >>>>> +        error_report("offset is not the start of a zone");
> >>>>> +        return -1;
> >>>>
> >>>> return -EINVAL;
> >>>>
> >>>>> +    }
> >>>>> +
> >>>>> +    if (len & zone_size_mask) {
> >>>>> +        error_report("len is not aligned to zones");
> >>>>> +        return -1;
> >>>>
> >>>> return -EINVAL;
> >>>>
> >>>>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> >>>>> +        int64_t len, int64_t *nr_zones,
> >>>>> +        BlockZoneDescriptor *zones) {
> >>>>> +    BDRVRawState *s = bs->opaque;
> >>>>> +    RawPosixAIOData acb;
> >>>>> +
> >>>>> +    acb = (RawPosixAIOData) {
> >>>>> +        .bs         = bs,
> >>>>> +        .aio_fildes = s->fd,
> >>>>> +        .aio_type   = QEMU_AIO_IOCTL,
> >>>>> +        .aio_offset = offset,
> >>>>> +        .aio_nbytes = len,
> >>>>> +        .zone_report    = {
> >>>>> +                .nr_zones       = nr_zones,
> >>>>> +                .zones          = zones,
> >>>>> +                },
> >>>>
> >>>> Indentation is off here. Please use 4-space indentation.
> >>> Noted!
> >>>
> >>> Thanks for reviewing!
> >>>
> >>> Sam
> >>
> >>
> >> --
> >> Damien Le Moal
> >> Western Digital Research
>
>
> --
> Damien Le Moal
> Western Digital Research
Damien Le Moal June 29, 2022, 2:32 a.m. UTC | #9
On 6/29/22 10:50, Sam Li wrote:
>>>>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>>>>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
>>>>>>
>>>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
>>>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
>>>>>> instead.
>>>>>
>>>>> Yes! However, it still has a memory leak error when using g_autofree
>>>>> && g_malloc.
>>>>
>>>> That may be because you are changing the value of the rep pointer while
>>>> parsing the report ?
>>>
>>> I am not sure it is the case. Can you show me some way to find the problem?
>>
>> Not sure. I never used this g_malloc()/g_autofree() before so not sure how
>> it works. It may be that g_autofree() work only with g_new() ?
>> Could you try separating the declaration and allocation ? e.g.
>>
>> Declare at the beginning of the function:
>> g_autofree struct blk_zone_report *rep = NULL;
>>
>> And then when needed do:
>>
>> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>> rep = g_malloc(rep_size);
> 
> Actually, the memory leak occurs in that way. When using zone_mgmt,
> memory leak still occurs. Asan gives the error information not much so
> I haven't tracked down the problem yet.

See this:

https://blog.fishsoup.net/2015/11/05/attributecleanup-mixed-declarations-and-code-and-goto/

Maybe you can find some hints.
Sam Li June 29, 2022, 2:35 a.m. UTC | #10
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年6月29日周三 10:32写道:
>
> On 6/29/22 10:50, Sam Li wrote:
> >>>>>>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >>>>>>> +    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
> >>>>>>
> >>>>>> g_new() looks incorrect. There should be 1 struct blk_zone_report
> >>>>>> followed by nrz struct blk_zone structs. Please use g_malloc(rep_size)
> >>>>>> instead.
> >>>>>
> >>>>> Yes! However, it still has a memory leak error when using g_autofree
> >>>>> && g_malloc.
> >>>>
> >>>> That may be because you are changing the value of the rep pointer while
> >>>> parsing the report ?
> >>>
> >>> I am not sure it is the case. Can you show me some way to find the problem?
> >>
> >> Not sure. I never used this g_malloc()/g_autofree() before so not sure how
> >> it works. It may be that g_autofree() work only with g_new() ?
> >> Could you try separating the declaration and allocation ? e.g.
> >>
> >> Declare at the beginning of the function:
> >> g_autofree struct blk_zone_report *rep = NULL;
> >>
> >> And then when needed do:
> >>
> >> rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> >> rep = g_malloc(rep_size);
> >
> > Actually, the memory leak occurs in that way. When using zone_mgmt,
> > memory leak still occurs. Asan gives the error information not much so
> > I haven't tracked down the problem yet.
>
> See this:
>
> https://blog.fishsoup.net/2015/11/05/attributecleanup-mixed-declarations-and-code-and-goto/
>
> Maybe you can find some hints.

Thanks!

>
> --
> Damien Le Moal
> Western Digital Research
diff mbox series

Patch

diff --git a/block/block-backend.c b/block/block-backend.c
index e0e1aff4b1..786f964d02 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1810,6 +1810,62 @@  int blk_flush(BlockBackend *blk)
     return ret;
 }
 
+/*
+ * Return zone_report from BlockDriver. Offset can be any number within
+ * the zone size. No alignment for offset and len.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                       int64_t len, int64_t *nr_zones,
+                       BlockZoneDescriptor *zones)
+{
+    int ret;
+    BlockDriverState *bs;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    bs = blk_bs(blk);
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    bdrv_inc_in_flight(bs);
+    ret = bdrv_co_zone_report(blk->root->bs, offset, len,
+                              nr_zones, zones);
+    bdrv_dec_in_flight(bs);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Return zone_mgmt from BlockDriver.
+ * Offset is the start of a zone and len is aligned to zones.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    BlockDriverState *bs;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    bs = blk_bs(blk);
+
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    bdrv_inc_in_flight(bs);
+    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
+    bdrv_dec_in_flight(bs);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/coroutines.h b/block/coroutines.h
index 830ecaa733..a114d7bc30 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -80,6 +80,11 @@  int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
 
 int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    int64_t len, int64_t *nr_zones,
+                                    BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len);
 
 
 /*
diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..1b8b0d351f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,7 @@ 
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#include <linux/blkzoned.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -216,6 +217,11 @@  typedef struct RawPosixAIOData {
             PreallocMode prealloc;
             Error **errp;
         } truncate;
+        struct {
+            int64_t *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        zone_op op;
     };
 } RawPosixAIOData;
 
@@ -1801,6 +1807,135 @@  static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+                              struct blk_zone *blkz) {
+    zone->start = blkz->start;
+    zone->length = blkz->len;
+    zone->cap = blkz->capacity;
+    zone->wp = blkz->wp - blkz->start;
+    zone->type = blkz->type;
+    zone->cond = blkz->cond;
+}
+
+static int handle_aiocb_zone_report(void *opaque) {
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    int64_t offset = aiocb->aio_offset;
+    int64_t len = aiocb->aio_nbytes;
+
+    struct blk_zone *blkz;
+    int64_t rep_size, nrz;
+    int ret, n = 0, i = 0;
+
+    nrz = *nr_zones;
+    if (len == -1) {
+        return -errno;
+    }
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = g_new(struct blk_zone_report, nrz);
+    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
+    printf("start to report zone with offset: 0x%lx\n", offset);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = offset;
+        rep->nr_zones = nrz;
+
+        ret = ioctl(fd, BLKREPORTZONE, rep);
+        if (ret != 0) {
+            ret = -errno;
+            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
+                         fd, offset, errno);
+            return ret;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            parse_zone(&zones[n], &blkz[i]);
+            /* The next report should start after the last zone reported */
+            offset = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+}
+
+static int handle_aiocb_zone_mgmt(void *opaque) {
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t offset = aiocb->aio_offset;
+    int64_t len = aiocb->aio_nbytes;
+    zone_op op = aiocb->op;
+
+    struct blk_zone_range range;
+    const char *ioctl_name;
+    unsigned long ioctl_op;
+    int64_t zone_size;
+    int64_t zone_size_mask;
+    int ret;
+
+    ret = ioctl(fd, BLKGETZONESZ, &zone_size);
+    if (ret) {
+        return -1;
+    }
+
+    zone_size_mask = zone_size - 1;
+    if (offset & zone_size_mask) {
+        error_report("offset is not the start of a zone");
+        return -1;
+    }
+
+    if (len & zone_size_mask) {
+        error_report("len is not aligned to zones");
+        return -1;
+    }
+
+    switch (op) {
+    case zone_open:
+        ioctl_name = "BLKOPENZONE";
+        ioctl_op = BLKOPENZONE;
+        break;
+    case zone_close:
+        ioctl_name = "BLKCLOSEZONE";
+        ioctl_op = BLKCLOSEZONE;
+        break;
+    case zone_finish:
+        ioctl_name = "BLKFINISHZONE";
+        ioctl_op = BLKFINISHZONE;
+        break;
+    case zone_reset:
+        ioctl_name = "BLKRESETZONE";
+        ioctl_op = BLKRESETZONE;
+        break;
+    default:
+        error_report("Invalid zone operation 0x%x", op);
+        errno = -EINVAL;
+        return -1;
+    }
+
+    /* Execute the operation */
+    range.sector = offset;
+    range.nr_sectors = len;
+    ret = ioctl(fd, ioctl_op, &range);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d",
+                     ioctl_name, errno);
+        return -1;
+    }
+
+    return 0;
+}
+
 static int handle_aiocb_copy_range(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -2973,6 +3108,58 @@  static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
     }
 }
 
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: an array of zone descriptors to hold zone
+ * information on reply
+ * @param offset: offset can be any byte within the zone size.
+ * @param len: (not sure yet.
+ * @return 0 on success, -1 on failure
+ */
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+        int64_t len, int64_t *nr_zones,
+        BlockZoneDescriptor *zones) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs         = bs,
+        .aio_fildes = s->fd,
+        .aio_type   = QEMU_AIO_IOCTL,
+        .aio_offset = offset,
+        .aio_nbytes = len,
+        .zone_report    = {
+                .nr_zones       = nr_zones,
+                .zones          = zones,
+                },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
+}
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
+        int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_IOCTL,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .op             = op,
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
+}
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -3324,6 +3511,9 @@  BlockDriver bdrv_file = {
     .bdrv_abort_perm_update = raw_abort_perm_update,
     .create_opts = &raw_create_opts,
     .mutable_opts = mutable_opts,
+
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
 };
 
 /***********************************************/
@@ -3703,6 +3893,53 @@  static BlockDriver bdrv_host_device = {
 #endif
 };
 
+static BlockDriver bdrv_zoned_host_device = {
+        .format_name = "zoned_host_device",
+        .protocol_name = "zoned_host_device",
+        .instance_size = sizeof(BDRVRawState),
+        .bdrv_needs_filename = true,
+        .bdrv_probe_device  = hdev_probe_device,
+        .bdrv_parse_filename = hdev_parse_filename,
+        .bdrv_file_open     = hdev_open,
+        .bdrv_close         = raw_close,
+        .bdrv_reopen_prepare = raw_reopen_prepare,
+        .bdrv_reopen_commit  = raw_reopen_commit,
+        .bdrv_reopen_abort   = raw_reopen_abort,
+        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+        .create_opts         = &bdrv_create_opts_simple,
+        .mutable_opts        = mutable_opts,
+        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+        .bdrv_co_preadv         = raw_co_preadv,
+        .bdrv_co_pwritev        = raw_co_pwritev,
+        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
+        .bdrv_co_pdiscard       = hdev_co_pdiscard,
+        .bdrv_co_copy_range_from = raw_co_copy_range_from,
+        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
+        .bdrv_refresh_limits = raw_refresh_limits,
+        .bdrv_io_plug = raw_aio_plug,
+        .bdrv_io_unplug = raw_aio_unplug,
+        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+        .bdrv_co_truncate       = raw_co_truncate,
+        .bdrv_getlength = raw_getlength,
+        .bdrv_get_info = raw_get_info,
+        .bdrv_get_allocated_file_size
+                            = raw_get_allocated_file_size,
+        .bdrv_get_specific_stats = hdev_get_specific_stats,
+        .bdrv_check_perm = raw_check_perm,
+        .bdrv_set_perm   = raw_set_perm,
+        .bdrv_abort_perm_update = raw_abort_perm_update,
+        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+        .bdrv_probe_geometry = hdev_probe_geometry,
+        .bdrv_co_ioctl = hdev_co_ioctl,
+
+        /* zone management operations */
+        .bdrv_co_zone_report = raw_co_zone_report,
+        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                  Error **errp)
@@ -3964,6 +4201,7 @@  static void bdrv_file_init(void)
 #if defined(HAVE_HOST_BLOCK_DEVICE)
     bdrv_register(&bdrv_host_device);
 #ifdef __linux__
+    bdrv_register(&bdrv_zoned_host_device);
     bdrv_register(&bdrv_host_cdrom);
 #endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..78cddeeda5 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -23,7 +23,6 @@ 
  */
 #ifndef BLOCK_COMMON_H
 #define BLOCK_COMMON_H
-
 #include "block/aio.h"
 #include "block/aio-wait.h"
 #include "qemu/iov.h"
@@ -49,6 +48,48 @@  typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum zone_op {
+    zone_open,
+    zone_close,
+    zone_finish,
+    zone_reset,
+} zone_op;
+
+typedef enum zone_model {
+    BLK_Z_HM,
+    BLK_Z_HA,
+} zone_model;
+
+typedef enum BlkZoneCondition {
+    BLK_ZS_NOT_WP = 0x0,
+    BLK_ZS_EMPTY = 0x1,
+    BLK_ZS_IOPEN = 0x2,
+    BLK_ZS_EOPEN = 0x3,
+    BLK_ZS_CLOSED = 0x4,
+    BLK_ZS_RDONLY = 0xD,
+    BLK_ZS_FULL = 0xE,
+    BLK_ZS_OFFLINE = 0xF,
+} BlkZoneCondition;
+
+typedef enum BlkZoneType {
+    BLK_ZT_CONV = 0x1,
+    BLK_ZT_SWR = 0x2,
+    BLK_ZT_SWP = 0x3,
+} BlkZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provide information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+    uint64_t start;
+    uint64_t length;
+    uint64_t cap;
+    uint64_t wp;
+    BlkZoneType type;
+    BlkZoneCondition cond;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..b9ea9db6dc 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -94,6 +94,20 @@  typedef struct BdrvTrackedRequest {
     struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
+/**
+ * Zone device information data structure.
+ * Provide information on a device.
+ */
+typedef struct zbd_dev {
+    uint32_t zone_size;
+    zone_model model;
+    uint32_t block_size;
+    uint32_t write_granularity;
+    uint32_t nr_zones;
+    struct BlockZoneDescriptor *zones; /* array of zones */
+    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
+    uint32_t max_nr_active_zones;
+} zbd_dev;
 
 struct BlockDriver {
     /*
@@ -691,6 +705,12 @@  struct BlockDriver {
                                           QEMUIOVector *qiov,
                                           int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, int64_t len, int64_t *nr_zones,
+            BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
     void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);