diff mbox series

[RFC,v4,1/9] block: add block layer APIs resembling Linux ZonedBlockDevice ioctls.

Message ID 20220712021345.8530-2-faithilikerun@gmail.com
State New
Headers show
Series Add support for zoned device | expand

Commit Message

Sam Li July 12, 2022, 2:13 a.m. UTC
By adding zone management operations in BlockDriver, storage
controller emulation can use the new block layer APIs including
zone_report and zone_mgmt(open, close, finish, reset).

Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/block-backend.c            |  41 ++++++
 block/coroutines.h               |   5 +
 block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
 include/block/block-common.h     |  43 +++++-
 include/block/block_int-common.h |  20 +++
 5 files changed, 344 insertions(+), 1 deletion(-)

Comments

Hannes Reinecke July 12, 2022, 6:10 a.m. UTC | #1
On 7/12/22 04:13, Sam Li wrote:
> By adding zone management operations in BlockDriver, storage
> controller emulation can use the new block layer APIs including
> zone_report and zone_mgmt(open, close, finish, reset).
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>   block/block-backend.c            |  41 ++++++
>   block/coroutines.h               |   5 +
>   block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
>   include/block/block-common.h     |  43 +++++-
>   include/block/block_int-common.h |  20 +++
>   5 files changed, 344 insertions(+), 1 deletion(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f425b00793..0a05247ae4 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
>       return ret;
>   }
>   
> +/*
> + * Send a zone_report command.
> + * offset can be any number within the zone size. No alignment for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * Offset is the start of a zone and len is aligned to zones.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>   void blk_drain(BlockBackend *blk)
>   {
>       BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 830ecaa733..19aa96cc56 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -80,6 +80,11 @@ int coroutine_fn
>   blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>   
>   int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +                                  int64_t offset, int64_t len);
>   
>   
>   /*
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..e7523ae2ed 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,7 @@
>   #include <sys/param.h>
>   #include <sys/syscall.h>
>   #include <sys/vfs.h>
> +#include <linux/blkzoned.h>
>   #include <linux/cdrom.h>
>   #include <linux/fd.h>
>   #include <linux/fs.h>
> @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
>               PreallocMode prealloc;
>               Error **errp;
>           } truncate;
> +        struct {
> +            int64_t *nr_zones;

Why is this a pointer?
I'd rather use a number here, seeing that it's the number
of zones in the *zones array ...

But the remainder looks good.

Cheers,

Hannes
Sam Li July 12, 2022, 6:17 a.m. UTC | #2
Hannes Reinecke <hare@suse.de> 于2022年7月12日周二 14:10写道:
>
> On 7/12/22 04:13, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >   block/block-backend.c            |  41 ++++++
> >   block/coroutines.h               |   5 +
> >   block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
> >   include/block/block-common.h     |  43 +++++-
> >   include/block/block_int-common.h |  20 +++
> >   5 files changed, 344 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index f425b00793..0a05247ae4 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
> >       return ret;
> >   }
> >
> > +/*
> > + * Send a zone_report command.
> > + * offset can be any number within the zone size. No alignment for offset.
> > + * nr_zones represents IN maximum and OUT actual.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Send a zone_management command.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >   void blk_drain(BlockBackend *blk)
> >   {
> >       BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..19aa96cc56 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >   blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >   int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +                                  int64_t offset, int64_t len);
> >
> >
> >   /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..e7523ae2ed 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> >   #include <sys/param.h>
> >   #include <sys/syscall.h>
> >   #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
> >   #include <linux/cdrom.h>
> >   #include <linux/fd.h>
> >   #include <linux/fs.h>
> > @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
> >               PreallocMode prealloc;
> >               Error **errp;
> >           } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
>
> Why is this a pointer?
> I'd rather use a number here, seeing that it's the number
> of zones in the *zones array ...

I see. The pointer is a little redundant. Will change it.

> But the remainder looks good.

Thanks for reviewing!
Damien Le Moal July 12, 2022, 7:35 a.m. UTC | #3
On 7/12/22 11:13, Sam Li wrote:
> By adding zone management operations in BlockDriver, storage
> controller emulation can use the new block layer APIs including
> zone_report and zone_mgmt(open, close, finish, reset).
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c            |  41 ++++++
>  block/coroutines.h               |   5 +
>  block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
>  include/block/block-common.h     |  43 +++++-
>  include/block/block_int-common.h |  20 +++
>  5 files changed, 344 insertions(+), 1 deletion(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f425b00793..0a05247ae4 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
>      return ret;
>  }
>  
> +/*
> + * Send a zone_report command.
> + * offset can be any number within the zone size. No alignment for offset.
> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * Offset is the start of a zone and len is aligned to zones.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;

You missed adding "blk_dec_in_flight(blk);" before return here. But I
think you can move the call to blk_check_byte_request() before
blk_inc_in_flight() to avoid having to call blk_dec_in_flight().

> +    }
> +
> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 830ecaa733..19aa96cc56 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -80,6 +80,11 @@ int coroutine_fn
>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>  
>  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +                                  int64_t offset, int64_t len);
>  
>  
>  /*
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..e7523ae2ed 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,7 @@
>  #include <sys/param.h>
>  #include <sys/syscall.h>
>  #include <sys/vfs.h>
> +#include <linux/blkzoned.h>

You need to conditionally include this because not all kernels provide
this file. Old kernels will not have it. So you need something like:

#if defined(CONFIG_BLKZONED)
#include <linux/blkzoned.h>
#endif

And adding this to meson.build should do the trick:

diff --git a/meson.build b/meson.build
index 65a885ea69..31d8852a35 100644
--- a/meson.build
+++ b/meson.build
@@ -1869,6 +1869,7 @@ config_host_data.set('CONFIG_REPLICATION',
get_option('live_block_migration').al

 # has_header
 config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
+config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
 config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
 config_host_data.set('CONFIG_VALGRIND_H',
cc.has_header('valgrind/valgrind.h'))
 config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))

Then in build/config-host.h, you will see "#define CONFIG_BLKZONED". You
then can use "#if defined(CONFIG_BLKZONED)" to conditionally define the
code related to zoned devices.

To test all this, temporarily rename your host
/usr/include/linux/blkzoned.h file to some other name, configure qemu and
see if it compiles.

>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
>  #include <linux/fs.h>
> @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
>              PreallocMode prealloc;
>              Error **errp;
>          } truncate;
> +        struct {
> +            int64_t *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        struct {
> +            zone_op op;
> +        } zone_mgmt;
>      };
>  } RawPosixAIOData;
>  
> @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
>  }
>  #endif
>  
> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +                              struct blk_zone *blkz) {
> +    zone->start = blkz->start;
> +    zone->length = blkz->len;
> +    zone->cap = blkz->capacity;
> +    zone->wp = blkz->wp - blkz->start;
> +    zone->type = blkz->type;
> +    zone->cond = blkz->cond;
> +}
> +
> +static int handle_aiocb_zone_report(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    int64_t offset = aiocb->aio_offset;
> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +    rep = g_malloc(rep_size);
> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> +    printf("start to report zone with offset: 0x%lx\n", offset);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;
> +
> +        ret = ioctl(fd, BLKREPORTZONE, rep);
> +        if (ret != 0) {
> +            ret = -errno;
> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> +                         fd, offset, errno);
> +            return ret;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            parse_zone(&zones[n], &blkz[i]);
> +            /* The next report should start after the last zone reported */
> +            offset = blkz[i].start + blkz[i].len;
> +        }
> +    }
> +
> +    *nr_zones = n;
> +    return 0;
> +}
> +
> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +    zone_op op = aiocb->zone_mgmt.op;
> +
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    unsigned long ioctl_op;
> +    int64_t zone_size;
> +    int64_t zone_size_mask;
> +    int ret;
> +
> +    g_autofree struct stat *file = NULL;
> +    file = g_new(struct stat, 1);
> +    stat(s->filename, file);
> +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
> +    zone_size_mask = zone_size - 1;
> +    if (offset & zone_size_mask) {
> +        error_report("offset is not the start of a zone");
> +        return -EINVAL;
> +    }
> +
> +    if (len & zone_size_mask) {
> +        error_report("len is not aligned to zones");
> +        return -EINVAL;
> +    }
> +
> +    switch (op) {
> +    case zone_open:
> +        ioctl_name = "BLKOPENZONE";
> +        ioctl_op = BLKOPENZONE;
> +        break;
> +    case zone_close:
> +        ioctl_name = "BLKCLOSEZONE";
> +        ioctl_op = BLKCLOSEZONE;
> +        break;
> +    case zone_finish:
> +        ioctl_name = "BLKFINISHZONE";
> +        ioctl_op = BLKFINISHZONE;
> +        break;
> +    case zone_reset:
> +        ioctl_name = "BLKRESETZONE";
> +        ioctl_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        return -EINVAL;
> +    }
> +
> +    /* Execute the operation */
> +    range.sector = offset;
> +    range.nr_sectors = len;
> +    ret = ioctl(fd, ioctl_op, &range);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d",
> +                     ioctl_name, errno);
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
>  static int handle_aiocb_copy_range(void *opaque)
>  {
>      RawPosixAIOData *aiocb = opaque;
> @@ -2973,6 +3105,59 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>      }
>  }
>  
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.
> + * @param len: (not sure yet.
> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                           int64_t *nr_zones,
> +                                           BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_IOCTL,
> +        .aio_offset = offset,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> +        int64_t offset, int64_t len) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs             = bs,
> +        .aio_fildes     = s->fd,
> +        .aio_type       = QEMU_AIO_IOCTL,
> +        .aio_offset     = offset,
> +        .aio_nbytes     = len,
> +        .zone_mgmt  = {
> +                .op = op,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +}
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -3324,6 +3509,9 @@ BlockDriver bdrv_file = {
>      .bdrv_abort_perm_update = raw_abort_perm_update,
>      .create_opts = &raw_create_opts,
>      .mutable_opts = mutable_opts,
> +
> +    .bdrv_co_zone_report = raw_co_zone_report,
> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>  };
>  
>  /***********************************************/
> @@ -3703,6 +3891,53 @@ static BlockDriver bdrv_host_device = {
>  #endif
>  };
>  
> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",
> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_parse_filename = hdev_parse_filename,
> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};
> +
>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>  static void cdrom_parse_filename(const char *filename, QDict *options,
>                                   Error **errp)
> @@ -3964,6 +4199,7 @@ static void bdrv_file_init(void)
>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>      bdrv_register(&bdrv_host_device);
>  #ifdef __linux__
> +    bdrv_register(&bdrv_zoned_host_device);
>      bdrv_register(&bdrv_host_cdrom);
>  #endif
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..78cddeeda5 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -23,7 +23,6 @@
>   */
>  #ifndef BLOCK_COMMON_H
>  #define BLOCK_COMMON_H
> -
>  #include "block/aio.h"
>  #include "block/aio-wait.h"
>  #include "qemu/iov.h"
> @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
>  
> +typedef enum zone_op {
> +    zone_open,
> +    zone_close,
> +    zone_finish,
> +    zone_reset,
> +} zone_op;
> +
> +typedef enum zone_model {
> +    BLK_Z_HM,
> +    BLK_Z_HA,
> +} zone_model;
> +
> +typedef enum BlkZoneCondition {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlkZoneCondition;
> +
> +typedef enum BlkZoneType {
> +    BLK_ZT_CONV = 0x1,
> +    BLK_ZT_SWR = 0x2,
> +    BLK_ZT_SWP = 0x3,
> +} BlkZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provide information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlkZoneType type;
> +    BlkZoneCondition cond;
> +} BlockZoneDescriptor;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 8947abab76..6037871089 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
>      struct BdrvTrackedRequest *waiting_for;
>  } BdrvTrackedRequest;
>  
> +/**
> + * Zone device information data structure.
> + * Provide information on a device.
> + */
> +typedef struct zbd_dev {
> +    uint32_t zone_size;
> +    zone_model model;
> +    uint32_t block_size;
> +    uint32_t write_granularity;
> +    uint32_t nr_zones;
> +    struct BlockZoneDescriptor *zones; /* array of zones */
> +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> +    uint32_t max_nr_active_zones;
> +} zbd_dev;
>  
>  struct BlockDriver {
>      /*
> @@ -691,6 +705,12 @@ struct BlockDriver {
>                                            QEMUIOVector *qiov,
>                                            int64_t pos);
>  
> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> +            int64_t offset, int64_t *nr_zones,
> +            BlockZoneDescriptor *zones);
> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> +            int64_t offset, int64_t len);
> +
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
>      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
Stefan Hajnoczi July 12, 2022, 3:49 p.m. UTC | #4
On Tue, Jul 12, 2022 at 10:13:37AM +0800, Sam Li wrote:
> By adding zone management operations in BlockDriver, storage
> controller emulation can use the new block layer APIs including
> zone_report and zone_mgmt(open, close, finish, reset).
> 
> Signed-off-by: Sam Li <faithilikerun@gmail.com>
> ---
>  block/block-backend.c            |  41 ++++++
>  block/coroutines.h               |   5 +
>  block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
>  include/block/block-common.h     |  43 +++++-
>  include/block/block_int-common.h |  20 +++
>  5 files changed, 344 insertions(+), 1 deletion(-)
> 
> diff --git a/block/block-backend.c b/block/block-backend.c
> index f425b00793..0a05247ae4 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
>      return ret;
>  }
>  
> +/*
> + * Send a zone_report command.
> + * offset can be any number within the zone size. No alignment for offset.

I think offset is a byte offset from the start of the device and its
range is [0, total_sectors * BDRV_SECTOR_SIZE)?

"any number within the zone size" gives the impression that the value
must be [0, zone_size_in_bytes), which is not right.

I suggest changing the text to "offset can be any number of bytes from
the start of the device" or similar.

> + * nr_zones represents IN maximum and OUT actual.
> + */
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk); /* increase before waiting */
> +    blk_wait_while_drained(blk);
> +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);

The !blk_is_available(blk) case needs to return -ENOMEDIUM before we can
safely dereference blk->root->bs (which can also be written as
blk_bs(blk)).

> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
> +/*
> + * Send a zone_management command.
> + * Offset is the start of a zone and len is aligned to zones.
> + */
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,

Please define typedef enum { ... } BlockZoneOp instead of enum { ... }
zone_op and then use a BlockZoneOp op argument instead of enum zone_op.
QEMU coding style uses typedefs instead of struct foo or enum foo when
possible.

> +        int64_t offset, int64_t len)
> +{
> +    int ret;
> +    IO_CODE();
> +
> +    blk_inc_in_flight(blk);
> +    blk_wait_while_drained(blk);
> +    ret = blk_check_byte_request(blk, offset, len);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> +    blk_dec_in_flight(blk);
> +    return ret;
> +}
> +
>  void blk_drain(BlockBackend *blk)
>  {
>      BlockDriverState *bs = blk_bs(blk);
> diff --git a/block/coroutines.h b/block/coroutines.h
> index 830ecaa733..19aa96cc56 100644
> --- a/block/coroutines.h
> +++ b/block/coroutines.h
> @@ -80,6 +80,11 @@ int coroutine_fn
>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>  
>  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> +                                    int64_t *nr_zones,
> +                                    BlockZoneDescriptor *zones);
> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> +                                  int64_t offset, int64_t len);
>  
>  
>  /*
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 48cd096624..e7523ae2ed 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -67,6 +67,7 @@
>  #include <sys/param.h>
>  #include <sys/syscall.h>
>  #include <sys/vfs.h>
> +#include <linux/blkzoned.h>
>  #include <linux/cdrom.h>
>  #include <linux/fd.h>
>  #include <linux/fs.h>
> @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
>              PreallocMode prealloc;
>              Error **errp;
>          } truncate;
> +        struct {
> +            int64_t *nr_zones;
> +            BlockZoneDescriptor *zones;
> +        } zone_report;
> +        struct {
> +            zone_op op;
> +        } zone_mgmt;
>      };
>  } RawPosixAIOData;
>  
> @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
>  }
>  #endif
>  

Are the functions below within #ifdef __linux__?

> +/*
> + * parse_zone - Fill a zone descriptor
> + */
> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> +                              struct blk_zone *blkz) {
> +    zone->start = blkz->start;
> +    zone->length = blkz->len;
> +    zone->cap = blkz->capacity;
> +    zone->wp = blkz->wp - blkz->start;
> +    zone->type = blkz->type;
> +    zone->cond = blkz->cond;
> +}
> +
> +static int handle_aiocb_zone_report(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> +    int64_t offset = aiocb->aio_offset;

The code is easier to read if it's clear this value is in sector units:

  int64_t sector = aiocb->aio_offset / 512; /* BLKREPORTZONE uses 512B sectors */

Then there's no confusion about whether 'offset' is bytes or sectors.

> +
> +    struct blk_zone *blkz;
> +    int64_t rep_size, nrz;
> +    int ret, n = 0, i = 0;
> +
> +    nrz = *nr_zones;
> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> +    g_autofree struct blk_zone_report *rep = NULL;
> +    rep = g_malloc(rep_size);
> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> +    printf("start to report zone with offset: 0x%lx\n", offset);
> +
> +    blkz = (struct blk_zone *)(rep + 1);
> +    while (n < nrz) {
> +        memset(rep, 0, rep_size);
> +        rep->sector = offset;
> +        rep->nr_zones = nrz;

After the first time around the while loop zones[] no longer has space
for nrz elements. This must be taken into account to avoid overflowing
zones[]:

  rep->nr_zones = nrz - n;

> +
> +        ret = ioctl(fd, BLKREPORTZONE, rep);
> +        if (ret != 0) {

Does this need to retry when ret != 0 && errno == EINTR is encountered?
Damien/Hannes/Dmitry might know the answer. See handle_aiocb_rw_vector()
for an example of retrying when EINTR is returned from a blocking
syscall.

> +            ret = -errno;
> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> +                         fd, offset, errno);

Please use "... at %" PRId64 " failed ..." instead of %ld for int64_t
values. %ld is not portable because sizeof(long) varies on different
machines.

> +            return ret;
> +        }
> +
> +        if (!rep->nr_zones) {
> +            break;
> +        }
> +
> +        for (i = 0; i < rep->nr_zones; i++, n++) {
> +            parse_zone(&zones[n], &blkz[i]);
> +            /* The next report should start after the last zone reported */
> +            offset = blkz[i].start + blkz[i].len;
> +        }
> +    }
> +
> +    *nr_zones = n;
> +    return 0;
> +}
> +
> +static int handle_aiocb_zone_mgmt(void *opaque) {
> +    RawPosixAIOData *aiocb = opaque;
> +    int fd = aiocb->aio_fildes;
> +    int64_t offset = aiocb->aio_offset;
> +    int64_t len = aiocb->aio_nbytes;
> +    zone_op op = aiocb->zone_mgmt.op;
> +
> +    struct blk_zone_range range;
> +    const char *ioctl_name;
> +    unsigned long ioctl_op;
> +    int64_t zone_size;
> +    int64_t zone_size_mask;
> +    int ret;
> +
> +    g_autofree struct stat *file = NULL;
> +    file = g_new(struct stat, 1);
> +    stat(s->filename, file);

Heap allocation is not needed. It's easier to put the struct stat
variable on the stack:

  struct stat st;
  if (fstat(fd, &st) < 0) {
      return -errno;
  }

fstat(2) is preferred over stat(2) when a file descriptor is available
because stat(2) suffers from race conditions when file system paths have
changed.

> +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");

The name "zone_sectors" would be clearer since size doesn't indicate the
units (bytes vs sectors).

> +    zone_size_mask = zone_size - 1;
> +    if (offset & zone_size_mask) {

Bytes and sectors units are being mixed here:

  int64_t offset = aiocb->aio_offset; <-- bytes

I suggest changing it to:

  int64_t sector = aiocb->aio_offset / 512; /* BLK*ZONE ioctls use 512B sectors */

> +        error_report("offset is not the start of a zone");
> +        return -EINVAL;
> +    }
> +
> +    if (len & zone_size_mask) {

Bytes and sectors are mixed here too. I think len is in bytes:

  int64_t len = aiocb->aio_nbytes;

Maybe change it to:

  int64_t nr_sectors = aiocb->aio_nbytes / 512;

> +        error_report("len is not aligned to zones");
> +        return -EINVAL;
> +    }
> +
> +    switch (op) {
> +    case zone_open:
> +        ioctl_name = "BLKOPENZONE";
> +        ioctl_op = BLKOPENZONE;
> +        break;
> +    case zone_close:
> +        ioctl_name = "BLKCLOSEZONE";
> +        ioctl_op = BLKCLOSEZONE;
> +        break;
> +    case zone_finish:
> +        ioctl_name = "BLKFINISHZONE";
> +        ioctl_op = BLKFINISHZONE;
> +        break;
> +    case zone_reset:
> +        ioctl_name = "BLKRESETZONE";
> +        ioctl_op = BLKRESETZONE;
> +        break;
> +    default:
> +        error_report("Invalid zone operation 0x%x", op);
> +        return -EINVAL;
> +    }
> +
> +    /* Execute the operation */
> +    range.sector = offset;
> +    range.nr_sectors = len;
> +    ret = ioctl(fd, ioctl_op, &range);
> +    if (ret != 0) {
> +        error_report("ioctl %s failed %d",
> +                     ioctl_name, errno);
> +        return -errno;
> +    }
> +
> +    return 0;
> +}
> +
>  static int handle_aiocb_copy_range(void *opaque)
>  {
>      RawPosixAIOData *aiocb = opaque;
> @@ -2973,6 +3105,59 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>      }
>  }
>  
> +/*
> + * zone report - Get a zone block device's information in the form
> + * of an array of zone descriptors.
> + *
> + * @param bs: passing zone block device file descriptor
> + * @param zones: an array of zone descriptors to hold zone
> + * information on reply
> + * @param offset: offset can be any byte within the zone size.
> + * @param len: (not sure yet.
> + * @return 0 on success, -1 on failure
> + */
> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> +                                           int64_t *nr_zones,
> +                                           BlockZoneDescriptor *zones) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs         = bs,
> +        .aio_fildes = s->fd,
> +        .aio_type   = QEMU_AIO_IOCTL,

Please define QEMU_AIO_ZONE_REPORT, QEMU_AIO_ZONE_MGMT, etc values for
these new operations instead of reusing QEMU_AIO_IOCTL, which is meant
for generic passthrough ioctls.

> +        .aio_offset = offset,
> +        .zone_report    = {
> +                .nr_zones       = nr_zones,
> +                .zones          = zones,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> +}
> +
> +/*
> + * zone management operations - Execute an operation on a zone
> + */
> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> +        int64_t offset, int64_t len) {
> +    BDRVRawState *s = bs->opaque;
> +    RawPosixAIOData acb;
> +
> +    acb = (RawPosixAIOData) {
> +        .bs             = bs,
> +        .aio_fildes     = s->fd,
> +        .aio_type       = QEMU_AIO_IOCTL,
> +        .aio_offset     = offset,
> +        .aio_nbytes     = len,
> +        .zone_mgmt  = {
> +                .op = op,
> +        },
> +    };
> +
> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> +}
> +
>  static coroutine_fn int
>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>                  bool blkdev)
> @@ -3324,6 +3509,9 @@ BlockDriver bdrv_file = {
>      .bdrv_abort_perm_update = raw_abort_perm_update,
>      .create_opts = &raw_create_opts,
>      .mutable_opts = mutable_opts,
> +
> +    .bdrv_co_zone_report = raw_co_zone_report,
> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>  };
>  
>  /***********************************************/
> @@ -3703,6 +3891,53 @@ static BlockDriver bdrv_host_device = {
>  #endif
>  };
>  

#ifdef __linux__

> +static BlockDriver bdrv_zoned_host_device = {
> +        .format_name = "zoned_host_device",
> +        .protocol_name = "zoned_host_device",
> +        .instance_size = sizeof(BDRVRawState),
> +        .bdrv_needs_filename = true,
> +        .bdrv_probe_device  = hdev_probe_device,
> +        .bdrv_parse_filename = hdev_parse_filename,

This function hardcodes "host_device:". zoned_host_device needs a
separate function that uses "zoned_host_device:".

> +        .bdrv_file_open     = hdev_open,
> +        .bdrv_close         = raw_close,
> +        .bdrv_reopen_prepare = raw_reopen_prepare,
> +        .bdrv_reopen_commit  = raw_reopen_commit,
> +        .bdrv_reopen_abort   = raw_reopen_abort,
> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> +        .create_opts         = &bdrv_create_opts_simple,
> +        .mutable_opts        = mutable_opts,
> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> +
> +        .bdrv_co_preadv         = raw_co_preadv,
> +        .bdrv_co_pwritev        = raw_co_pwritev,
> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> +        .bdrv_refresh_limits = raw_refresh_limits,
> +        .bdrv_io_plug = raw_aio_plug,
> +        .bdrv_io_unplug = raw_aio_unplug,
> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> +
> +        .bdrv_co_truncate       = raw_co_truncate,
> +        .bdrv_getlength = raw_getlength,
> +        .bdrv_get_info = raw_get_info,
> +        .bdrv_get_allocated_file_size
> +                            = raw_get_allocated_file_size,
> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> +        .bdrv_check_perm = raw_check_perm,
> +        .bdrv_set_perm   = raw_set_perm,
> +        .bdrv_abort_perm_update = raw_abort_perm_update,
> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> +        .bdrv_probe_geometry = hdev_probe_geometry,
> +        .bdrv_co_ioctl = hdev_co_ioctl,
> +
> +        /* zone management operations */
> +        .bdrv_co_zone_report = raw_co_zone_report,
> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> +};

#endif /* __linux__ */

> +
>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>  static void cdrom_parse_filename(const char *filename, QDict *options,
>                                   Error **errp)
> @@ -3964,6 +4199,7 @@ static void bdrv_file_init(void)
>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>      bdrv_register(&bdrv_host_device);
>  #ifdef __linux__
> +    bdrv_register(&bdrv_zoned_host_device);
>      bdrv_register(&bdrv_host_cdrom);
>  #endif
>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> diff --git a/include/block/block-common.h b/include/block/block-common.h
> index fdb7306e78..78cddeeda5 100644
> --- a/include/block/block-common.h
> +++ b/include/block/block-common.h
> @@ -23,7 +23,6 @@
>   */
>  #ifndef BLOCK_COMMON_H
>  #define BLOCK_COMMON_H
> -
>  #include "block/aio.h"
>  #include "block/aio-wait.h"
>  #include "qemu/iov.h"

Please avoid making whitespace changes that are unrelated to the patch.

> @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildClass BdrvChildClass;
>  
> +typedef enum zone_op {
> +    zone_open,
> +    zone_close,
> +    zone_finish,
> +    zone_reset,
> +} zone_op;

QEMU coding style:

  typedef enum {
      BLK_ZO_OPEN,
      BLK_ZO_CLOSE,
      BLK_ZO_FINISH,
      BLK_ZO_RESET,
  } BlockZoneOp;

Please also reformat the enums below.

> +
> +typedef enum zone_model {
> +    BLK_Z_HM,
> +    BLK_Z_HA,
> +} zone_model;
> +
> +typedef enum BlkZoneCondition {
> +    BLK_ZS_NOT_WP = 0x0,
> +    BLK_ZS_EMPTY = 0x1,
> +    BLK_ZS_IOPEN = 0x2,
> +    BLK_ZS_EOPEN = 0x3,
> +    BLK_ZS_CLOSED = 0x4,
> +    BLK_ZS_RDONLY = 0xD,
> +    BLK_ZS_FULL = 0xE,
> +    BLK_ZS_OFFLINE = 0xF,
> +} BlkZoneCondition;
> +
> +typedef enum BlkZoneType {
> +    BLK_ZT_CONV = 0x1,
> +    BLK_ZT_SWR = 0x2,
> +    BLK_ZT_SWP = 0x3,
> +} BlkZoneType;
> +
> +/*
> + * Zone descriptor data structure.
> + * Provide information on a zone with all position and size values in bytes.
> + */
> +typedef struct BlockZoneDescriptor {
> +    uint64_t start;
> +    uint64_t length;
> +    uint64_t cap;
> +    uint64_t wp;
> +    BlkZoneType type;
> +    BlkZoneCondition cond;
> +} BlockZoneDescriptor;
> +
>  typedef struct BlockDriverInfo {
>      /* in bytes, 0 if irrelevant */
>      int cluster_size;
> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> index 8947abab76..6037871089 100644
> --- a/include/block/block_int-common.h
> +++ b/include/block/block_int-common.h
> @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
>      struct BdrvTrackedRequest *waiting_for;
>  } BdrvTrackedRequest;
>  
> +/**
> + * Zone device information data structure.
> + * Provide information on a device.
> + */
> +typedef struct zbd_dev {
> +    uint32_t zone_size;
> +    zone_model model;
> +    uint32_t block_size;
> +    uint32_t write_granularity;
> +    uint32_t nr_zones;
> +    struct BlockZoneDescriptor *zones; /* array of zones */
> +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> +    uint32_t max_nr_active_zones;
> +} zbd_dev;

This struct isn't use by this patch. Please move this change into the
patch that uses struct zbd_dev.

QEMU coding style naming would be BlockZoneDev instead of zbd_dev.

I think this struct contains the block limits fields that could be added
to QEMU's BlockLimits? A new struct may not be necessary.

>  
>  struct BlockDriver {
>      /*
> @@ -691,6 +705,12 @@ struct BlockDriver {
>                                            QEMUIOVector *qiov,
>                                            int64_t pos);
>  
> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> +            int64_t offset, int64_t *nr_zones,
> +            BlockZoneDescriptor *zones);
> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> +            int64_t offset, int64_t len);
> +
>      /* removable device specific */
>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
>      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
> -- 
> 2.36.1
>
Damien Le Moal July 12, 2022, 10:12 p.m. UTC | #5
On 7/13/22 00:49, Stefan Hajnoczi wrote:
> On Tue, Jul 12, 2022 at 10:13:37AM +0800, Sam Li wrote:
>> By adding zone management operations in BlockDriver, storage
>> controller emulation can use the new block layer APIs including
>> zone_report and zone_mgmt(open, close, finish, reset).
>>
>> Signed-off-by: Sam Li <faithilikerun@gmail.com>
>> ---
>>  block/block-backend.c            |  41 ++++++
>>  block/coroutines.h               |   5 +
>>  block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
>>  include/block/block-common.h     |  43 +++++-
>>  include/block/block_int-common.h |  20 +++
>>  5 files changed, 344 insertions(+), 1 deletion(-)
>>
>> diff --git a/block/block-backend.c b/block/block-backend.c
>> index f425b00793..0a05247ae4 100644
>> --- a/block/block-backend.c
>> +++ b/block/block-backend.c
>> @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
>>      return ret;
>>  }
>>  
>> +/*
>> + * Send a zone_report command.
>> + * offset can be any number within the zone size. No alignment for offset.
> 
> I think offset is a byte offset from the start of the device and its
> range is [0, total_sectors * BDRV_SECTOR_SIZE)?
> 
> "any number within the zone size" gives the impression that the value
> must be [0, zone_size_in_bytes), which is not right.
> 
> I suggest changing the text to "offset can be any number of bytes from
> the start of the device" or similar.
> 
>> + * nr_zones represents IN maximum and OUT actual.
>> + */
>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>> +                                    int64_t *nr_zones,
>> +                                    BlockZoneDescriptor *zones)
>> +{
>> +    int ret;
>> +    IO_CODE();
>> +
>> +    blk_inc_in_flight(blk); /* increase before waiting */
>> +    blk_wait_while_drained(blk);
>> +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
> 
> The !blk_is_available(blk) case needs to return -ENOMEDIUM before we can
> safely dereference blk->root->bs (which can also be written as
> blk_bs(blk)).
> 
>> +    blk_dec_in_flight(blk);
>> +    return ret;
>> +}
>> +
>> +/*
>> + * Send a zone_management command.
>> + * Offset is the start of a zone and len is aligned to zones.
>> + */
>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> 
> Please define typedef enum { ... } BlockZoneOp instead of enum { ... }
> zone_op and then use a BlockZoneOp op argument instead of enum zone_op.
> QEMU coding style uses typedefs instead of struct foo or enum foo when
> possible.
> 
>> +        int64_t offset, int64_t len)
>> +{
>> +    int ret;
>> +    IO_CODE();
>> +
>> +    blk_inc_in_flight(blk);
>> +    blk_wait_while_drained(blk);
>> +    ret = blk_check_byte_request(blk, offset, len);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
>> +    blk_dec_in_flight(blk);
>> +    return ret;
>> +}
>> +
>>  void blk_drain(BlockBackend *blk)
>>  {
>>      BlockDriverState *bs = blk_bs(blk);
>> diff --git a/block/coroutines.h b/block/coroutines.h
>> index 830ecaa733..19aa96cc56 100644
>> --- a/block/coroutines.h
>> +++ b/block/coroutines.h
>> @@ -80,6 +80,11 @@ int coroutine_fn
>>  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
>>  
>>  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
>> +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
>> +                                    int64_t *nr_zones,
>> +                                    BlockZoneDescriptor *zones);
>> +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
>> +                                  int64_t offset, int64_t len);
>>  
>>  
>>  /*
>> diff --git a/block/file-posix.c b/block/file-posix.c
>> index 48cd096624..e7523ae2ed 100644
>> --- a/block/file-posix.c
>> +++ b/block/file-posix.c
>> @@ -67,6 +67,7 @@
>>  #include <sys/param.h>
>>  #include <sys/syscall.h>
>>  #include <sys/vfs.h>
>> +#include <linux/blkzoned.h>
>>  #include <linux/cdrom.h>
>>  #include <linux/fd.h>
>>  #include <linux/fs.h>
>> @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
>>              PreallocMode prealloc;
>>              Error **errp;
>>          } truncate;
>> +        struct {
>> +            int64_t *nr_zones;
>> +            BlockZoneDescriptor *zones;
>> +        } zone_report;
>> +        struct {
>> +            zone_op op;
>> +        } zone_mgmt;
>>      };
>>  } RawPosixAIOData;
>>  
>> @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
>>  }
>>  #endif
>>  
> 
> Are the functions below within #ifdef __linux__?

We need more than that: linux AND blkzoned.h header present (meaning a
recent kernel). So the ifdef should be "#if defined(CONFIG_BLKZONED)" or
something like it, with CONFIG_BLKZONED defined for linux AND
/usr/include/linux/blkzoned.h present.

> 
>> +/*
>> + * parse_zone - Fill a zone descriptor
>> + */
>> +static inline void parse_zone(struct BlockZoneDescriptor *zone,
>> +                              struct blk_zone *blkz) {
>> +    zone->start = blkz->start;
>> +    zone->length = blkz->len;
>> +    zone->cap = blkz->capacity;
>> +    zone->wp = blkz->wp - blkz->start;
>> +    zone->type = blkz->type;
>> +    zone->cond = blkz->cond;
>> +}
>> +
>> +static int handle_aiocb_zone_report(void *opaque) {
>> +    RawPosixAIOData *aiocb = opaque;
>> +    int fd = aiocb->aio_fildes;
>> +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
>> +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
>> +    int64_t offset = aiocb->aio_offset;
> 
> The code is easier to read if it's clear this value is in sector units:
> 
>   int64_t sector = aiocb->aio_offset / 512; /* BLKREPORTZONE uses 512B sectors */
> 
> Then there's no confusion about whether 'offset' is bytes or sectors.
> 
>> +
>> +    struct blk_zone *blkz;
>> +    int64_t rep_size, nrz;
>> +    int ret, n = 0, i = 0;
>> +
>> +    nrz = *nr_zones;
>> +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
>> +    g_autofree struct blk_zone_report *rep = NULL;
>> +    rep = g_malloc(rep_size);
>> +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
>> +    printf("start to report zone with offset: 0x%lx\n", offset);
>> +
>> +    blkz = (struct blk_zone *)(rep + 1);
>> +    while (n < nrz) {
>> +        memset(rep, 0, rep_size);
>> +        rep->sector = offset;
>> +        rep->nr_zones = nrz;
> 
> After the first time around the while loop zones[] no longer has space
> for nrz elements. This must be taken into account to avoid overflowing
> zones[]:
> 
>   rep->nr_zones = nrz - n;
> 
>> +
>> +        ret = ioctl(fd, BLKREPORTZONE, rep);
>> +        if (ret != 0) {
> 
> Does this need to retry when ret != 0 && errno == EINTR is encountered?
> Damien/Hannes/Dmitry might know the answer. See handle_aiocb_rw_vector()
> for an example of retrying when EINTR is returned from a blocking
> syscall.
> 
>> +            ret = -errno;
>> +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
>> +                         fd, offset, errno);
> 
> Please use "... at %" PRId64 " failed ..." instead of %ld for int64_t
> values. %ld is not portable because sizeof(long) varies on different
> machines.
> 
>> +            return ret;
>> +        }
>> +
>> +        if (!rep->nr_zones) {
>> +            break;
>> +        }
>> +
>> +        for (i = 0; i < rep->nr_zones; i++, n++) {
>> +            parse_zone(&zones[n], &blkz[i]);
>> +            /* The next report should start after the last zone reported */
>> +            offset = blkz[i].start + blkz[i].len;
>> +        }
>> +    }
>> +
>> +    *nr_zones = n;
>> +    return 0;
>> +}
>> +
>> +static int handle_aiocb_zone_mgmt(void *opaque) {
>> +    RawPosixAIOData *aiocb = opaque;
>> +    int fd = aiocb->aio_fildes;
>> +    int64_t offset = aiocb->aio_offset;
>> +    int64_t len = aiocb->aio_nbytes;
>> +    zone_op op = aiocb->zone_mgmt.op;
>> +
>> +    struct blk_zone_range range;
>> +    const char *ioctl_name;
>> +    unsigned long ioctl_op;
>> +    int64_t zone_size;
>> +    int64_t zone_size_mask;
>> +    int ret;
>> +
>> +    g_autofree struct stat *file = NULL;
>> +    file = g_new(struct stat, 1);
>> +    stat(s->filename, file);
> 
> Heap allocation is not needed. It's easier to put the struct stat
> variable on the stack:
> 
>   struct stat st;
>   if (fstat(fd, &st) < 0) {
>       return -errno;
>   }
> 
> fstat(2) is preferred over stat(2) when a file descriptor is available
> because stat(2) suffers from race conditions when file system paths have
> changed.
> 
>> +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
> 
> The name "zone_sectors" would be clearer since size doesn't indicate the
> units (bytes vs sectors).
> 
>> +    zone_size_mask = zone_size - 1;
>> +    if (offset & zone_size_mask) {
> 
> Bytes and sectors units are being mixed here:
> 
>   int64_t offset = aiocb->aio_offset; <-- bytes
> 
> I suggest changing it to:
> 
>   int64_t sector = aiocb->aio_offset / 512; /* BLK*ZONE ioctls use 512B sectors */
> 
>> +        error_report("offset is not the start of a zone");
>> +        return -EINVAL;
>> +    }
>> +
>> +    if (len & zone_size_mask) {
> 
> Bytes and sectors are mixed here too. I think len is in bytes:
> 
>   int64_t len = aiocb->aio_nbytes;
> 
> Maybe change it to:
> 
>   int64_t nr_sectors = aiocb->aio_nbytes / 512;
> 
>> +        error_report("len is not aligned to zones");
>> +        return -EINVAL;
>> +    }
>> +
>> +    switch (op) {
>> +    case zone_open:
>> +        ioctl_name = "BLKOPENZONE";
>> +        ioctl_op = BLKOPENZONE;
>> +        break;
>> +    case zone_close:
>> +        ioctl_name = "BLKCLOSEZONE";
>> +        ioctl_op = BLKCLOSEZONE;
>> +        break;
>> +    case zone_finish:
>> +        ioctl_name = "BLKFINISHZONE";
>> +        ioctl_op = BLKFINISHZONE;
>> +        break;
>> +    case zone_reset:
>> +        ioctl_name = "BLKRESETZONE";
>> +        ioctl_op = BLKRESETZONE;
>> +        break;
>> +    default:
>> +        error_report("Invalid zone operation 0x%x", op);
>> +        return -EINVAL;
>> +    }
>> +
>> +    /* Execute the operation */
>> +    range.sector = offset;
>> +    range.nr_sectors = len;
>> +    ret = ioctl(fd, ioctl_op, &range);
>> +    if (ret != 0) {
>> +        error_report("ioctl %s failed %d",
>> +                     ioctl_name, errno);
>> +        return -errno;
>> +    }
>> +
>> +    return 0;
>> +}
>> +
>>  static int handle_aiocb_copy_range(void *opaque)
>>  {
>>      RawPosixAIOData *aiocb = opaque;
>> @@ -2973,6 +3105,59 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
>>      }
>>  }
>>  
>> +/*
>> + * zone report - Get a zone block device's information in the form
>> + * of an array of zone descriptors.
>> + *
>> + * @param bs: passing zone block device file descriptor
>> + * @param zones: an array of zone descriptors to hold zone
>> + * information on reply
>> + * @param offset: offset can be any byte within the zone size.
>> + * @param len: (not sure yet.
>> + * @return 0 on success, -1 on failure
>> + */
>> +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
>> +                                           int64_t *nr_zones,
>> +                                           BlockZoneDescriptor *zones) {
>> +    BDRVRawState *s = bs->opaque;
>> +    RawPosixAIOData acb;
>> +
>> +    acb = (RawPosixAIOData) {
>> +        .bs         = bs,
>> +        .aio_fildes = s->fd,
>> +        .aio_type   = QEMU_AIO_IOCTL,
> 
> Please define QEMU_AIO_ZONE_REPORT, QEMU_AIO_ZONE_MGMT, etc values for
> these new operations instead of reusing QEMU_AIO_IOCTL, which is meant
> for generic passthrough ioctls.
> 
>> +        .aio_offset = offset,
>> +        .zone_report    = {
>> +                .nr_zones       = nr_zones,
>> +                .zones          = zones,
>> +        },
>> +    };
>> +
>> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
>> +}
>> +
>> +/*
>> + * zone management operations - Execute an operation on a zone
>> + */
>> +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
>> +        int64_t offset, int64_t len) {
>> +    BDRVRawState *s = bs->opaque;
>> +    RawPosixAIOData acb;
>> +
>> +    acb = (RawPosixAIOData) {
>> +        .bs             = bs,
>> +        .aio_fildes     = s->fd,
>> +        .aio_type       = QEMU_AIO_IOCTL,
>> +        .aio_offset     = offset,
>> +        .aio_nbytes     = len,
>> +        .zone_mgmt  = {
>> +                .op = op,
>> +        },
>> +    };
>> +
>> +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
>> +}
>> +
>>  static coroutine_fn int
>>  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
>>                  bool blkdev)
>> @@ -3324,6 +3509,9 @@ BlockDriver bdrv_file = {
>>      .bdrv_abort_perm_update = raw_abort_perm_update,
>>      .create_opts = &raw_create_opts,
>>      .mutable_opts = mutable_opts,
>> +
>> +    .bdrv_co_zone_report = raw_co_zone_report,
>> +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>>  };
>>  
>>  /***********************************************/
>> @@ -3703,6 +3891,53 @@ static BlockDriver bdrv_host_device = {
>>  #endif
>>  };
>>  
> 
> #ifdef __linux__
> 
>> +static BlockDriver bdrv_zoned_host_device = {
>> +        .format_name = "zoned_host_device",
>> +        .protocol_name = "zoned_host_device",
>> +        .instance_size = sizeof(BDRVRawState),
>> +        .bdrv_needs_filename = true,
>> +        .bdrv_probe_device  = hdev_probe_device,
>> +        .bdrv_parse_filename = hdev_parse_filename,
> 
> This function hardcodes "host_device:". zoned_host_device needs a
> separate function that uses "zoned_host_device:".
> 
>> +        .bdrv_file_open     = hdev_open,
>> +        .bdrv_close         = raw_close,
>> +        .bdrv_reopen_prepare = raw_reopen_prepare,
>> +        .bdrv_reopen_commit  = raw_reopen_commit,
>> +        .bdrv_reopen_abort   = raw_reopen_abort,
>> +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
>> +        .create_opts         = &bdrv_create_opts_simple,
>> +        .mutable_opts        = mutable_opts,
>> +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
>> +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
>> +
>> +        .bdrv_co_preadv         = raw_co_preadv,
>> +        .bdrv_co_pwritev        = raw_co_pwritev,
>> +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
>> +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
>> +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
>> +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
>> +        .bdrv_refresh_limits = raw_refresh_limits,
>> +        .bdrv_io_plug = raw_aio_plug,
>> +        .bdrv_io_unplug = raw_aio_unplug,
>> +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>> +
>> +        .bdrv_co_truncate       = raw_co_truncate,
>> +        .bdrv_getlength = raw_getlength,
>> +        .bdrv_get_info = raw_get_info,
>> +        .bdrv_get_allocated_file_size
>> +                            = raw_get_allocated_file_size,
>> +        .bdrv_get_specific_stats = hdev_get_specific_stats,
>> +        .bdrv_check_perm = raw_check_perm,
>> +        .bdrv_set_perm   = raw_set_perm,
>> +        .bdrv_abort_perm_update = raw_abort_perm_update,
>> +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
>> +        .bdrv_probe_geometry = hdev_probe_geometry,
>> +        .bdrv_co_ioctl = hdev_co_ioctl,
>> +
>> +        /* zone management operations */
>> +        .bdrv_co_zone_report = raw_co_zone_report,
>> +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
>> +};
> 
> #endif /* __linux__ */
> 
>> +
>>  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>>  static void cdrom_parse_filename(const char *filename, QDict *options,
>>                                   Error **errp)
>> @@ -3964,6 +4199,7 @@ static void bdrv_file_init(void)
>>  #if defined(HAVE_HOST_BLOCK_DEVICE)
>>      bdrv_register(&bdrv_host_device);
>>  #ifdef __linux__
>> +    bdrv_register(&bdrv_zoned_host_device);
>>      bdrv_register(&bdrv_host_cdrom);
>>  #endif
>>  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
>> diff --git a/include/block/block-common.h b/include/block/block-common.h
>> index fdb7306e78..78cddeeda5 100644
>> --- a/include/block/block-common.h
>> +++ b/include/block/block-common.h
>> @@ -23,7 +23,6 @@
>>   */
>>  #ifndef BLOCK_COMMON_H
>>  #define BLOCK_COMMON_H
>> -
>>  #include "block/aio.h"
>>  #include "block/aio-wait.h"
>>  #include "qemu/iov.h"
> 
> Please avoid making whitespace changes that are unrelated to the patch.
> 
>> @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
>>  typedef struct BdrvChild BdrvChild;
>>  typedef struct BdrvChildClass BdrvChildClass;
>>  
>> +typedef enum zone_op {
>> +    zone_open,
>> +    zone_close,
>> +    zone_finish,
>> +    zone_reset,
>> +} zone_op;
> 
> QEMU coding style:
> 
>   typedef enum {
>       BLK_ZO_OPEN,
>       BLK_ZO_CLOSE,
>       BLK_ZO_FINISH,
>       BLK_ZO_RESET,
>   } BlockZoneOp;
> 
> Please also reformat the enums below.
> 
>> +
>> +typedef enum zone_model {
>> +    BLK_Z_HM,
>> +    BLK_Z_HA,
>> +} zone_model;
>> +
>> +typedef enum BlkZoneCondition {
>> +    BLK_ZS_NOT_WP = 0x0,
>> +    BLK_ZS_EMPTY = 0x1,
>> +    BLK_ZS_IOPEN = 0x2,
>> +    BLK_ZS_EOPEN = 0x3,
>> +    BLK_ZS_CLOSED = 0x4,
>> +    BLK_ZS_RDONLY = 0xD,
>> +    BLK_ZS_FULL = 0xE,
>> +    BLK_ZS_OFFLINE = 0xF,
>> +} BlkZoneCondition;
>> +
>> +typedef enum BlkZoneType {
>> +    BLK_ZT_CONV = 0x1,
>> +    BLK_ZT_SWR = 0x2,
>> +    BLK_ZT_SWP = 0x3,
>> +} BlkZoneType;
>> +
>> +/*
>> + * Zone descriptor data structure.
>> + * Provide information on a zone with all position and size values in bytes.
>> + */
>> +typedef struct BlockZoneDescriptor {
>> +    uint64_t start;
>> +    uint64_t length;
>> +    uint64_t cap;
>> +    uint64_t wp;
>> +    BlkZoneType type;
>> +    BlkZoneCondition cond;
>> +} BlockZoneDescriptor;
>> +
>>  typedef struct BlockDriverInfo {
>>      /* in bytes, 0 if irrelevant */
>>      int cluster_size;
>> diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
>> index 8947abab76..6037871089 100644
>> --- a/include/block/block_int-common.h
>> +++ b/include/block/block_int-common.h
>> @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
>>      struct BdrvTrackedRequest *waiting_for;
>>  } BdrvTrackedRequest;
>>  
>> +/**
>> + * Zone device information data structure.
>> + * Provide information on a device.
>> + */
>> +typedef struct zbd_dev {
>> +    uint32_t zone_size;
>> +    zone_model model;
>> +    uint32_t block_size;
>> +    uint32_t write_granularity;
>> +    uint32_t nr_zones;
>> +    struct BlockZoneDescriptor *zones; /* array of zones */
>> +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
>> +    uint32_t max_nr_active_zones;
>> +} zbd_dev;
> 
> This struct isn't use by this patch. Please move this change into the
> patch that uses struct zbd_dev.
> 
> QEMU coding style naming would be BlockZoneDev instead of zbd_dev.
> 
> I think this struct contains the block limits fields that could be added
> to QEMU's BlockLimits? A new struct may not be necessary.
> 
>>  
>>  struct BlockDriver {
>>      /*
>> @@ -691,6 +705,12 @@ struct BlockDriver {
>>                                            QEMUIOVector *qiov,
>>                                            int64_t pos);
>>  
>> +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
>> +            int64_t offset, int64_t *nr_zones,
>> +            BlockZoneDescriptor *zones);
>> +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
>> +            int64_t offset, int64_t len);
>> +
>>      /* removable device specific */
>>      bool (*bdrv_is_inserted)(BlockDriverState *bs);
>>      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>> -- 
>> 2.36.1
>>
Sam Li July 13, 2022, 12:51 a.m. UTC | #6
Stefan Hajnoczi <stefanha@redhat.com> 于2022年7月12日周二 23:49写道:
>
> On Tue, Jul 12, 2022 at 10:13:37AM +0800, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/block-backend.c            |  41 ++++++
> >  block/coroutines.h               |   5 +
> >  block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
> >  include/block/block-common.h     |  43 +++++-
> >  include/block/block_int-common.h |  20 +++
> >  5 files changed, 344 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index f425b00793..0a05247ae4 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
> >      return ret;
> >  }
> >
> > +/*
> > + * Send a zone_report command.
> > + * offset can be any number within the zone size. No alignment for offset.
>
> I think offset is a byte offset from the start of the device and its
> range is [0, total_sectors * BDRV_SECTOR_SIZE)?
>
> "any number within the zone size" gives the impression that the value
> must be [0, zone_size_in_bytes), which is not right.
>
> I suggest changing the text to "offset can be any number of bytes from
> the start of the device" or similar.
>
> > + * nr_zones represents IN maximum and OUT actual.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
>
> The !blk_is_available(blk) case needs to return -ENOMEDIUM before we can
> safely dereference blk->root->bs (which can also be written as
> blk_bs(blk)).
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Send a zone_management command.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
>
> Please define typedef enum { ... } BlockZoneOp instead of enum { ... }
> zone_op and then use a BlockZoneOp op argument instead of enum zone_op.
> QEMU coding style uses typedefs instead of struct foo or enum foo when
> possible.

Yes, it must be a type.

> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
> > +    }
> > +
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..19aa96cc56 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +                                  int64_t offset, int64_t len);
> >
> >
> >  /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..e7523ae2ed 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> >  #include <sys/param.h>
> >  #include <sys/syscall.h>
> >  #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
> >  #include <linux/cdrom.h>
> >  #include <linux/fd.h>
> >  #include <linux/fs.h>
> > @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
> >              PreallocMode prealloc;
> >              Error **errp;
> >          } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        struct {
> > +            zone_op op;
> > +        } zone_mgmt;
> >      };
> >  } RawPosixAIOData;
> >
> > @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
> >  }
> >  #endif
> >
>
> Are the functions below within #ifdef __linux__?

Maybe add them later?

> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +                              struct blk_zone *blkz) {
> > +    zone->start = blkz->start;
> > +    zone->length = blkz->len;
> > +    zone->cap = blkz->capacity;
> > +    zone->wp = blkz->wp - blkz->start;
> > +    zone->type = blkz->type;
> > +    zone->cond = blkz->cond;
> > +}
> > +
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
>
> The code is easier to read if it's clear this value is in sector units:
>
>   int64_t sector = aiocb->aio_offset / 512; /* BLKREPORTZONE uses 512B sectors */
>
> Then there's no confusion about whether 'offset' is bytes or sectors.

I see. It is more readable.

> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = NULL;
> > +    rep = g_malloc(rep_size);
> > +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
>
> After the first time around the while loop zones[] no longer has space
> for nrz elements. This must be taken into account to avoid overflowing
> zones[]:
>
>   rep->nr_zones = nrz - n;
>
> > +
> > +        ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        if (ret != 0) {
>
> Does this need to retry when ret != 0 && errno == EINTR is encountered?
> Damien/Hannes/Dmitry might know the answer. See handle_aiocb_rw_vector()
> for an example of retrying when EINTR is returned from a blocking
> syscall.
>
> > +            ret = -errno;
> > +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > +                         fd, offset, errno);
>
> Please use "... at %" PRId64 " failed ..." instead of %ld for int64_t
> values. %ld is not portable because sizeof(long) varies on different
> machines.

Right! I've missed other part than qemuio-cmds to use PRI64.

> > +            return ret;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            parse_zone(&zones[n], &blkz[i]);
> > +            /* The next report should start after the last zone reported */
> > +            offset = blkz[i].start + blkz[i].len;
> > +        }
> > +    }
> > +
> > +    *nr_zones = n;
> > +    return 0;
> > +}
> > +
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->zone_mgmt.op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
> > +    g_autofree struct stat *file = NULL;
> > +    file = g_new(struct stat, 1);
> > +    stat(s->filename, file);
>
> Heap allocation is not needed. It's easier to put the struct stat
> variable on the stack:
>
>   struct stat st;
>   if (fstat(fd, &st) < 0) {
>       return -errno;
>   }
>
> fstat(2) is preferred over stat(2) when a file descriptor is available
> because stat(2) suffers from race conditions when file system paths have
> changed.

Yes, it was mentioned before. I have included this change in patch 3.
I'll move them into this patch for better reviewing later.

> > +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
>
> The name "zone_sectors" would be clearer since size doesn't indicate the
> units (bytes vs sectors).
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
>
> Bytes and sectors units are being mixed here:
>
>   int64_t offset = aiocb->aio_offset; <-- bytes
>
> I suggest changing it to:
>
>   int64_t sector = aiocb->aio_offset / 512; /* BLK*ZONE ioctls use 512B sectors */

I see. I missed considering the difference between sector and byte.

> > +        error_report("offset is not the start of a zone");
> > +        return -EINVAL;
> > +    }
> > +
> > +    if (len & zone_size_mask) {
>
> Bytes and sectors are mixed here too. I think len is in bytes:
>
>   int64_t len = aiocb->aio_nbytes;
>
> Maybe change it to:
>
>   int64_t nr_sectors = aiocb->aio_nbytes / 512;
> > +        error_report("len is not aligned to zones");
> > +        return -EINVAL;
> > +    }
> > +
> > +    switch (op) {
> > +    case zone_open:
> > +        ioctl_name = "BLKOPENZONE";
> > +        ioctl_op = BLKOPENZONE;
> > +        break;
> > +    case zone_close:
> > +        ioctl_name = "BLKCLOSEZONE";
> > +        ioctl_op = BLKCLOSEZONE;
> > +        break;
> > +    case zone_finish:
> > +        ioctl_name = "BLKFINISHZONE";
> > +        ioctl_op = BLKFINISHZONE;
> > +        break;
> > +    case zone_reset:
> > +        ioctl_name = "BLKRESETZONE";
> > +        ioctl_op = BLKRESETZONE;
> > +        break;
> > +    default:
> > +        error_report("Invalid zone operation 0x%x", op);
> > +        return -EINVAL;
> > +    }
> > +
> > +    /* Execute the operation */
> > +    range.sector = offset;
> > +    range.nr_sectors = len;
> > +    ret = ioctl(fd, ioctl_op, &range);
> > +    if (ret != 0) {
> > +        error_report("ioctl %s failed %d",
> > +                     ioctl_name, errno);
> > +        return -errno;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int handle_aiocb_copy_range(void *opaque)
> >  {
> >      RawPosixAIOData *aiocb = opaque;
> > @@ -2973,6 +3105,59 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
> >      }
> >  }
> >
> > +/*
> > + * zone report - Get a zone block device's information in the form
> > + * of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: an array of zone descriptors to hold zone
> > + * information on reply
> > + * @param offset: offset can be any byte within the zone size.
> > + * @param len: (not sure yet.
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +                                           int64_t *nr_zones,
> > +                                           BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
>
> Please define QEMU_AIO_ZONE_REPORT, QEMU_AIO_ZONE_MGMT, etc values for
> these new operations instead of reusing QEMU_AIO_IOCTL, which is meant
> for generic passthrough ioctls.
> > +        .aio_offset = offset,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> > +}
> > +
> > +/*
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > +        int64_t offset, int64_t len) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs             = bs,
> > +        .aio_fildes     = s->fd,
> > +        .aio_type       = QEMU_AIO_IOCTL,
> > +        .aio_offset     = offset,
> > +        .aio_nbytes     = len,
> > +        .zone_mgmt  = {
> > +                .op = op,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> > +}
> > +
> >  static coroutine_fn int
> >  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                  bool blkdev)
> > @@ -3324,6 +3509,9 @@ BlockDriver bdrv_file = {
> >      .bdrv_abort_perm_update = raw_abort_perm_update,
> >      .create_opts = &raw_create_opts,
> >      .mutable_opts = mutable_opts,
> > +
> > +    .bdrv_co_zone_report = raw_co_zone_report,
> > +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >  };
> >
> >  /***********************************************/
> > @@ -3703,6 +3891,53 @@ static BlockDriver bdrv_host_device = {
> >  #endif
> >  };
> >
>
> #ifdef __linux__
>
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_parse_filename = hdev_parse_filename,
>
> This function hardcodes "host_device:". zoned_host_device needs a
> separate function that uses "zoned_host_device:".
>
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
>
> #endif /* __linux__ */
>
> > +
> >  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> >  static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                   Error **errp)
> > @@ -3964,6 +4199,7 @@ static void bdrv_file_init(void)
> >  #if defined(HAVE_HOST_BLOCK_DEVICE)
> >      bdrv_register(&bdrv_host_device);
> >  #ifdef __linux__
> > +    bdrv_register(&bdrv_zoned_host_device);
> >      bdrv_register(&bdrv_host_cdrom);
> >  #endif
> >  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..78cddeeda5 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,6 @@
> >   */
> >  #ifndef BLOCK_COMMON_H
> >  #define BLOCK_COMMON_H
> > -
> >  #include "block/aio.h"
> >  #include "block/aio-wait.h"
> >  #include "qemu/iov.h"
>
> Please avoid making whitespace changes that are unrelated to the patch.

Sorry, I will be more careful.

> > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
> >  typedef struct BdrvChild BdrvChild;
> >  typedef struct BdrvChildClass BdrvChildClass;
> >
> > +typedef enum zone_op {
> > +    zone_open,
> > +    zone_close,
> > +    zone_finish,
> > +    zone_reset,
> > +} zone_op;
>
> QEMU coding style:
>
>   typedef enum {
>       BLK_ZO_OPEN,
>       BLK_ZO_CLOSE,
>       BLK_ZO_FINISH,
>       BLK_ZO_RESET,
>   } BlockZoneOp;
>
> Please also reformat the enums below.
>
> > +
> > +typedef enum zone_model {
> > +    BLK_Z_HM,
> > +    BLK_Z_HA,
> > +} zone_model;
> > +
> > +typedef enum BlkZoneCondition {
> > +    BLK_ZS_NOT_WP = 0x0,
> > +    BLK_ZS_EMPTY = 0x1,
> > +    BLK_ZS_IOPEN = 0x2,
> > +    BLK_ZS_EOPEN = 0x3,
> > +    BLK_ZS_CLOSED = 0x4,
> > +    BLK_ZS_RDONLY = 0xD,
> > +    BLK_ZS_FULL = 0xE,
> > +    BLK_ZS_OFFLINE = 0xF,
> > +} BlkZoneCondition;
> > +
> > +typedef enum BlkZoneType {
> > +    BLK_ZT_CONV = 0x1,
> > +    BLK_ZT_SWR = 0x2,
> > +    BLK_ZT_SWP = 0x3,
> > +} BlkZoneType;
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > +    uint64_t start;
> > +    uint64_t length;
> > +    uint64_t cap;
> > +    uint64_t wp;
> > +    BlkZoneType type;
> > +    BlkZoneCondition cond;
> > +} BlockZoneDescriptor;
> > +
> >  typedef struct BlockDriverInfo {
> >      /* in bytes, 0 if irrelevant */
> >      int cluster_size;
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 8947abab76..6037871089 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
> >      struct BdrvTrackedRequest *waiting_for;
> >  } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > +    uint32_t zone_size;
> > +    zone_model model;
> > +    uint32_t block_size;
> > +    uint32_t write_granularity;
> > +    uint32_t nr_zones;
> > +    struct BlockZoneDescriptor *zones; /* array of zones */
> > +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> > +    uint32_t max_nr_active_zones;
> > +} zbd_dev;
>
> This struct isn't use by this patch. Please move this change into the
> patch that uses struct zbd_dev.
>
> QEMU coding style naming would be BlockZoneDev instead of zbd_dev.
>
> I think this struct contains the block limits fields that could be added
> to QEMU's BlockLimits? A new struct may not be necessary.

Yes! It will in the next patch series.

> >
> >  struct BlockDriver {
> >      /*
> > @@ -691,6 +705,12 @@ struct BlockDriver {
> >                                            QEMUIOVector *qiov,
> >                                            int64_t pos);
> >
> > +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> > +            int64_t offset, int64_t *nr_zones,
> > +            BlockZoneDescriptor *zones);
> > +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> > +            int64_t offset, int64_t len);
> > +
> >      /* removable device specific */
> >      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> >      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
> > --
> > 2.36.1
> >
Sam Li July 13, 2022, 12:54 a.m. UTC | #7
Damien Le Moal <damien.lemoal@opensource.wdc.com> 于2022年7月12日周二 15:35写道:
>
> On 7/12/22 11:13, Sam Li wrote:
> > By adding zone management operations in BlockDriver, storage
> > controller emulation can use the new block layer APIs including
> > zone_report and zone_mgmt(open, close, finish, reset).
> >
> > Signed-off-by: Sam Li <faithilikerun@gmail.com>
> > ---
> >  block/block-backend.c            |  41 ++++++
> >  block/coroutines.h               |   5 +
> >  block/file-posix.c               | 236 +++++++++++++++++++++++++++++++
> >  include/block/block-common.h     |  43 +++++-
> >  include/block/block_int-common.h |  20 +++
> >  5 files changed, 344 insertions(+), 1 deletion(-)
> >
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index f425b00793..0a05247ae4 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -1806,6 +1806,47 @@ int blk_flush(BlockBackend *blk)
> >      return ret;
> >  }
> >
> > +/*
> > + * Send a zone_report command.
> > + * offset can be any number within the zone size. No alignment for offset.
> > + * nr_zones represents IN maximum and OUT actual.
> > + */
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk); /* increase before waiting */
> > +    blk_wait_while_drained(blk);
> > +    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> > +/*
> > + * Send a zone_management command.
> > + * Offset is the start of a zone and len is aligned to zones.
> > + */
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +        int64_t offset, int64_t len)
> > +{
> > +    int ret;
> > +    IO_CODE();
> > +
> > +    blk_inc_in_flight(blk);
> > +    blk_wait_while_drained(blk);
> > +    ret = blk_check_byte_request(blk, offset, len);
> > +    if (ret < 0) {
> > +        return ret;
>
> You missed adding "blk_dec_in_flight(blk);" before return here. But I
> think you can move the call to blk_check_byte_request() before
> blk_inc_in_flight() to avoid having to call blk_dec_in_flight().

Yes, I'll just remove them.

> > +    }
> > +
> > +    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
> > +    blk_dec_in_flight(blk);
> > +    return ret;
> > +}
> > +
> >  void blk_drain(BlockBackend *blk)
> >  {
> >      BlockDriverState *bs = blk_bs(blk);
> > diff --git a/block/coroutines.h b/block/coroutines.h
> > index 830ecaa733..19aa96cc56 100644
> > --- a/block/coroutines.h
> > +++ b/block/coroutines.h
> > @@ -80,6 +80,11 @@ int coroutine_fn
> >  blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
> >
> >  int coroutine_fn blk_co_do_flush(BlockBackend *blk);
> > +int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
> > +                                    int64_t *nr_zones,
> > +                                    BlockZoneDescriptor *zones);
> > +int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
> > +                                  int64_t offset, int64_t len);
> >
> >
> >  /*
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 48cd096624..e7523ae2ed 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -67,6 +67,7 @@
> >  #include <sys/param.h>
> >  #include <sys/syscall.h>
> >  #include <sys/vfs.h>
> > +#include <linux/blkzoned.h>
>
> You need to conditionally include this because not all kernels provide
> this file. Old kernels will not have it. So you need something like:
>
> #if defined(CONFIG_BLKZONED)
> #include <linux/blkzoned.h>
> #endif
>
> And adding this to meson.build should do the trick:
>
> diff --git a/meson.build b/meson.build
> index 65a885ea69..31d8852a35 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -1869,6 +1869,7 @@ config_host_data.set('CONFIG_REPLICATION',
> get_option('live_block_migration').al
>
>  # has_header
>  config_host_data.set('CONFIG_EPOLL', cc.has_header('sys/epoll.h'))
> +config_host_data.set('CONFIG_BLKZONED', cc.has_header('linux/blkzoned.h'))
>  config_host_data.set('CONFIG_LINUX_MAGIC_H', cc.has_header('linux/magic.h'))
>  config_host_data.set('CONFIG_VALGRIND_H',
> cc.has_header('valgrind/valgrind.h'))
>  config_host_data.set('HAVE_BTRFS_H', cc.has_header('linux/btrfs.h'))
>
> Then in build/config-host.h, you will see "#define CONFIG_BLKZONED". You
> then can use "#if defined(CONFIG_BLKZONED)" to conditionally define the
> code related to zoned devices.
>
> To test all this, temporarily rename your host
> /usr/include/linux/blkzoned.h file to some other name, configure qemu and
> see if it compiles.

Thanks!

> >  #include <linux/cdrom.h>
> >  #include <linux/fd.h>
> >  #include <linux/fs.h>
> > @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
> >              PreallocMode prealloc;
> >              Error **errp;
> >          } truncate;
> > +        struct {
> > +            int64_t *nr_zones;
> > +            BlockZoneDescriptor *zones;
> > +        } zone_report;
> > +        struct {
> > +            zone_op op;
> > +        } zone_mgmt;
> >      };
> >  } RawPosixAIOData;
> >
> > @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
> >  }
> >  #endif
> >
> > +/*
> > + * parse_zone - Fill a zone descriptor
> > + */
> > +static inline void parse_zone(struct BlockZoneDescriptor *zone,
> > +                              struct blk_zone *blkz) {
> > +    zone->start = blkz->start;
> > +    zone->length = blkz->len;
> > +    zone->cap = blkz->capacity;
> > +    zone->wp = blkz->wp - blkz->start;
> > +    zone->type = blkz->type;
> > +    zone->cond = blkz->cond;
> > +}
> > +
> > +static int handle_aiocb_zone_report(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t *nr_zones = aiocb->zone_report.nr_zones;
> > +    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
> > +    int64_t offset = aiocb->aio_offset;
> > +
> > +    struct blk_zone *blkz;
> > +    int64_t rep_size, nrz;
> > +    int ret, n = 0, i = 0;
> > +
> > +    nrz = *nr_zones;
> > +    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
> > +    g_autofree struct blk_zone_report *rep = NULL;
> > +    rep = g_malloc(rep_size);
> > +    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
> > +    printf("start to report zone with offset: 0x%lx\n", offset);
> > +
> > +    blkz = (struct blk_zone *)(rep + 1);
> > +    while (n < nrz) {
> > +        memset(rep, 0, rep_size);
> > +        rep->sector = offset;
> > +        rep->nr_zones = nrz;
> > +
> > +        ret = ioctl(fd, BLKREPORTZONE, rep);
> > +        if (ret != 0) {
> > +            ret = -errno;
> > +            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
> > +                         fd, offset, errno);
> > +            return ret;
> > +        }
> > +
> > +        if (!rep->nr_zones) {
> > +            break;
> > +        }
> > +
> > +        for (i = 0; i < rep->nr_zones; i++, n++) {
> > +            parse_zone(&zones[n], &blkz[i]);
> > +            /* The next report should start after the last zone reported */
> > +            offset = blkz[i].start + blkz[i].len;
> > +        }
> > +    }
> > +
> > +    *nr_zones = n;
> > +    return 0;
> > +}
> > +
> > +static int handle_aiocb_zone_mgmt(void *opaque) {
> > +    RawPosixAIOData *aiocb = opaque;
> > +    int fd = aiocb->aio_fildes;
> > +    int64_t offset = aiocb->aio_offset;
> > +    int64_t len = aiocb->aio_nbytes;
> > +    zone_op op = aiocb->zone_mgmt.op;
> > +
> > +    struct blk_zone_range range;
> > +    const char *ioctl_name;
> > +    unsigned long ioctl_op;
> > +    int64_t zone_size;
> > +    int64_t zone_size_mask;
> > +    int ret;
> > +
> > +    g_autofree struct stat *file = NULL;
> > +    file = g_new(struct stat, 1);
> > +    stat(s->filename, file);
> > +    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
> > +    zone_size_mask = zone_size - 1;
> > +    if (offset & zone_size_mask) {
> > +        error_report("offset is not the start of a zone");
> > +        return -EINVAL;
> > +    }
> > +
> > +    if (len & zone_size_mask) {
> > +        error_report("len is not aligned to zones");
> > +        return -EINVAL;
> > +    }
> > +
> > +    switch (op) {
> > +    case zone_open:
> > +        ioctl_name = "BLKOPENZONE";
> > +        ioctl_op = BLKOPENZONE;
> > +        break;
> > +    case zone_close:
> > +        ioctl_name = "BLKCLOSEZONE";
> > +        ioctl_op = BLKCLOSEZONE;
> > +        break;
> > +    case zone_finish:
> > +        ioctl_name = "BLKFINISHZONE";
> > +        ioctl_op = BLKFINISHZONE;
> > +        break;
> > +    case zone_reset:
> > +        ioctl_name = "BLKRESETZONE";
> > +        ioctl_op = BLKRESETZONE;
> > +        break;
> > +    default:
> > +        error_report("Invalid zone operation 0x%x", op);
> > +        return -EINVAL;
> > +    }
> > +
> > +    /* Execute the operation */
> > +    range.sector = offset;
> > +    range.nr_sectors = len;
> > +    ret = ioctl(fd, ioctl_op, &range);
> > +    if (ret != 0) {
> > +        error_report("ioctl %s failed %d",
> > +                     ioctl_name, errno);
> > +        return -errno;
> > +    }
> > +
> > +    return 0;
> > +}
> > +
> >  static int handle_aiocb_copy_range(void *opaque)
> >  {
> >      RawPosixAIOData *aiocb = opaque;
> > @@ -2973,6 +3105,59 @@ static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
> >      }
> >  }
> >
> > +/*
> > + * zone report - Get a zone block device's information in the form
> > + * of an array of zone descriptors.
> > + *
> > + * @param bs: passing zone block device file descriptor
> > + * @param zones: an array of zone descriptors to hold zone
> > + * information on reply
> > + * @param offset: offset can be any byte within the zone size.
> > + * @param len: (not sure yet.
> > + * @return 0 on success, -1 on failure
> > + */
> > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > +                                           int64_t *nr_zones,
> > +                                           BlockZoneDescriptor *zones) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs         = bs,
> > +        .aio_fildes = s->fd,
> > +        .aio_type   = QEMU_AIO_IOCTL,
> > +        .aio_offset = offset,
> > +        .zone_report    = {
> > +                .nr_zones       = nr_zones,
> > +                .zones          = zones,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
> > +}
> > +
> > +/*
> > + * zone management operations - Execute an operation on a zone
> > + */
> > +static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
> > +        int64_t offset, int64_t len) {
> > +    BDRVRawState *s = bs->opaque;
> > +    RawPosixAIOData acb;
> > +
> > +    acb = (RawPosixAIOData) {
> > +        .bs             = bs,
> > +        .aio_fildes     = s->fd,
> > +        .aio_type       = QEMU_AIO_IOCTL,
> > +        .aio_offset     = offset,
> > +        .aio_nbytes     = len,
> > +        .zone_mgmt  = {
> > +                .op = op,
> > +        },
> > +    };
> > +
> > +    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
> > +}
> > +
> >  static coroutine_fn int
> >  raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
> >                  bool blkdev)
> > @@ -3324,6 +3509,9 @@ BlockDriver bdrv_file = {
> >      .bdrv_abort_perm_update = raw_abort_perm_update,
> >      .create_opts = &raw_create_opts,
> >      .mutable_opts = mutable_opts,
> > +
> > +    .bdrv_co_zone_report = raw_co_zone_report,
> > +    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> >  };
> >
> >  /***********************************************/
> > @@ -3703,6 +3891,53 @@ static BlockDriver bdrv_host_device = {
> >  #endif
> >  };
> >
> > +static BlockDriver bdrv_zoned_host_device = {
> > +        .format_name = "zoned_host_device",
> > +        .protocol_name = "zoned_host_device",
> > +        .instance_size = sizeof(BDRVRawState),
> > +        .bdrv_needs_filename = true,
> > +        .bdrv_probe_device  = hdev_probe_device,
> > +        .bdrv_parse_filename = hdev_parse_filename,
> > +        .bdrv_file_open     = hdev_open,
> > +        .bdrv_close         = raw_close,
> > +        .bdrv_reopen_prepare = raw_reopen_prepare,
> > +        .bdrv_reopen_commit  = raw_reopen_commit,
> > +        .bdrv_reopen_abort   = raw_reopen_abort,
> > +        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
> > +        .create_opts         = &bdrv_create_opts_simple,
> > +        .mutable_opts        = mutable_opts,
> > +        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
> > +        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
> > +
> > +        .bdrv_co_preadv         = raw_co_preadv,
> > +        .bdrv_co_pwritev        = raw_co_pwritev,
> > +        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
> > +        .bdrv_co_pdiscard       = hdev_co_pdiscard,
> > +        .bdrv_co_copy_range_from = raw_co_copy_range_from,
> > +        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
> > +        .bdrv_refresh_limits = raw_refresh_limits,
> > +        .bdrv_io_plug = raw_aio_plug,
> > +        .bdrv_io_unplug = raw_aio_unplug,
> > +        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
> > +
> > +        .bdrv_co_truncate       = raw_co_truncate,
> > +        .bdrv_getlength = raw_getlength,
> > +        .bdrv_get_info = raw_get_info,
> > +        .bdrv_get_allocated_file_size
> > +                            = raw_get_allocated_file_size,
> > +        .bdrv_get_specific_stats = hdev_get_specific_stats,
> > +        .bdrv_check_perm = raw_check_perm,
> > +        .bdrv_set_perm   = raw_set_perm,
> > +        .bdrv_abort_perm_update = raw_abort_perm_update,
> > +        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
> > +        .bdrv_probe_geometry = hdev_probe_geometry,
> > +        .bdrv_co_ioctl = hdev_co_ioctl,
> > +
> > +        /* zone management operations */
> > +        .bdrv_co_zone_report = raw_co_zone_report,
> > +        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
> > +};
> > +
> >  #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> >  static void cdrom_parse_filename(const char *filename, QDict *options,
> >                                   Error **errp)
> > @@ -3964,6 +4199,7 @@ static void bdrv_file_init(void)
> >  #if defined(HAVE_HOST_BLOCK_DEVICE)
> >      bdrv_register(&bdrv_host_device);
> >  #ifdef __linux__
> > +    bdrv_register(&bdrv_zoned_host_device);
> >      bdrv_register(&bdrv_host_cdrom);
> >  #endif
> >  #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
> > diff --git a/include/block/block-common.h b/include/block/block-common.h
> > index fdb7306e78..78cddeeda5 100644
> > --- a/include/block/block-common.h
> > +++ b/include/block/block-common.h
> > @@ -23,7 +23,6 @@
> >   */
> >  #ifndef BLOCK_COMMON_H
> >  #define BLOCK_COMMON_H
> > -
> >  #include "block/aio.h"
> >  #include "block/aio-wait.h"
> >  #include "qemu/iov.h"
> > @@ -49,6 +48,48 @@ typedef struct BlockDriver BlockDriver;
> >  typedef struct BdrvChild BdrvChild;
> >  typedef struct BdrvChildClass BdrvChildClass;
> >
> > +typedef enum zone_op {
> > +    zone_open,
> > +    zone_close,
> > +    zone_finish,
> > +    zone_reset,
> > +} zone_op;
> > +
> > +typedef enum zone_model {
> > +    BLK_Z_HM,
> > +    BLK_Z_HA,
> > +} zone_model;
> > +
> > +typedef enum BlkZoneCondition {
> > +    BLK_ZS_NOT_WP = 0x0,
> > +    BLK_ZS_EMPTY = 0x1,
> > +    BLK_ZS_IOPEN = 0x2,
> > +    BLK_ZS_EOPEN = 0x3,
> > +    BLK_ZS_CLOSED = 0x4,
> > +    BLK_ZS_RDONLY = 0xD,
> > +    BLK_ZS_FULL = 0xE,
> > +    BLK_ZS_OFFLINE = 0xF,
> > +} BlkZoneCondition;
> > +
> > +typedef enum BlkZoneType {
> > +    BLK_ZT_CONV = 0x1,
> > +    BLK_ZT_SWR = 0x2,
> > +    BLK_ZT_SWP = 0x3,
> > +} BlkZoneType;
> > +
> > +/*
> > + * Zone descriptor data structure.
> > + * Provide information on a zone with all position and size values in bytes.
> > + */
> > +typedef struct BlockZoneDescriptor {
> > +    uint64_t start;
> > +    uint64_t length;
> > +    uint64_t cap;
> > +    uint64_t wp;
> > +    BlkZoneType type;
> > +    BlkZoneCondition cond;
> > +} BlockZoneDescriptor;
> > +
> >  typedef struct BlockDriverInfo {
> >      /* in bytes, 0 if irrelevant */
> >      int cluster_size;
> > diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
> > index 8947abab76..6037871089 100644
> > --- a/include/block/block_int-common.h
> > +++ b/include/block/block_int-common.h
> > @@ -94,6 +94,20 @@ typedef struct BdrvTrackedRequest {
> >      struct BdrvTrackedRequest *waiting_for;
> >  } BdrvTrackedRequest;
> >
> > +/**
> > + * Zone device information data structure.
> > + * Provide information on a device.
> > + */
> > +typedef struct zbd_dev {
> > +    uint32_t zone_size;
> > +    zone_model model;
> > +    uint32_t block_size;
> > +    uint32_t write_granularity;
> > +    uint32_t nr_zones;
> > +    struct BlockZoneDescriptor *zones; /* array of zones */
> > +    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
> > +    uint32_t max_nr_active_zones;
> > +} zbd_dev;
> >
> >  struct BlockDriver {
> >      /*
> > @@ -691,6 +705,12 @@ struct BlockDriver {
> >                                            QEMUIOVector *qiov,
> >                                            int64_t pos);
> >
> > +    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
> > +            int64_t offset, int64_t *nr_zones,
> > +            BlockZoneDescriptor *zones);
> > +    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
> > +            int64_t offset, int64_t len);
> > +
> >      /* removable device specific */
> >      bool (*bdrv_is_inserted)(BlockDriverState *bs);
> >      void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);
>
>
> --
> Damien Le Moal
> Western Digital Research
Stefan Hajnoczi July 13, 2022, 6:19 a.m. UTC | #8
On Wed, Jul 13, 2022 at 08:51:45AM +0800, Sam Li wrote:
> Stefan Hajnoczi <stefanha@redhat.com> 于2022年7月12日周二 23:49写道:
> >
> > On Tue, Jul 12, 2022 at 10:13:37AM +0800, Sam Li wrote:
> > > diff --git a/block/file-posix.c b/block/file-posix.c
> > > index 48cd096624..e7523ae2ed 100644
> > > --- a/block/file-posix.c
> > > +++ b/block/file-posix.c
> > > @@ -67,6 +67,7 @@
> > >  #include <sys/param.h>
> > >  #include <sys/syscall.h>
> > >  #include <sys/vfs.h>
> > > +#include <linux/blkzoned.h>
> > >  #include <linux/cdrom.h>
> > >  #include <linux/fd.h>
> > >  #include <linux/fs.h>
> > > @@ -216,6 +217,13 @@ typedef struct RawPosixAIOData {
> > >              PreallocMode prealloc;
> > >              Error **errp;
> > >          } truncate;
> > > +        struct {
> > > +            int64_t *nr_zones;
> > > +            BlockZoneDescriptor *zones;
> > > +        } zone_report;
> > > +        struct {
> > > +            zone_op op;
> > > +        } zone_mgmt;
> > >      };
> > >  } RawPosixAIOData;
> > >
> > > @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
> > >  }
> > >  #endif
> > >
> >
> > Are the functions below within #ifdef __linux__?
> 
> Maybe add them later?

When using #ifdefs you usually need to apply them consistently to avoid
compiler errors or warnings.

For example:

  static void foo(void) { ... }

  #ifdef __linux__
  static BlockDriver ... = {
      .foo = foo,
  };
  #endif /* __linux__ */

On non-Linux hosts the compiler only sees foo() but it's unused, so it a
warning about an unused function will be displayed.

And the other way around results in a compiler error:

  #ifdef __linux__
  static void foo(void) { ... }
  #endif /* __linux__ */

  static BlockDriver ... = {
      .foo = foo,
  };

On non-Linux hosts the compiler doesn't see the definition of foo() that
the BlockDriver struct requires and compilation fails with an error.

There will be no errors on your machine if you avoid #ifdefs everywhere,
but reviewers will be worried about it and Continuous Integration tests
will fail to build on non-Linux hosts.

I would put the #ifdefs in place from the start.

> > > +static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
> > > +                                           int64_t *nr_zones,
> > > +                                           BlockZoneDescriptor *zones) {
> > > +    BDRVRawState *s = bs->opaque;
> > > +    RawPosixAIOData acb;
> > > +
> > > +    acb = (RawPosixAIOData) {
> > > +        .bs         = bs,
> > > +        .aio_fildes = s->fd,
> > > +        .aio_type   = QEMU_AIO_IOCTL,
> >
> > Please define QEMU_AIO_ZONE_REPORT, QEMU_AIO_ZONE_MGMT, etc values for
> > these new operations instead of reusing QEMU_AIO_IOCTL, which is meant
> > for generic passthrough ioctls.

After looking again I think .aio_type isn't used in this code path. You
can skip initializing this field if you want and no new enum constants
are needed.

Stefan
Stefan Hajnoczi July 13, 2022, 6:22 a.m. UTC | #9
On Wed, Jul 13, 2022 at 07:12:17AM +0900, Damien Le Moal wrote:
> On 7/13/22 00:49, Stefan Hajnoczi wrote:
> > On Tue, Jul 12, 2022 at 10:13:37AM +0800, Sam Li wrote:
> >> @@ -1801,6 +1809,130 @@ static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
> >>  }
> >>  #endif
> >>  
> > 
> > Are the functions below within #ifdef __linux__?
> 
> We need more than that: linux AND blkzoned.h header present (meaning a
> recent kernel). So the ifdef should be "#if defined(CONFIG_BLKZONED)" or
> something like it, with CONFIG_BLKZONED defined for linux AND
> /usr/include/linux/blkzoned.h present.

Okay. Try adding the following to meson.build:

  config_host_data.set('HAVE_LINUX_BLKZONED_H', cc.has_header('linux/blkzoned.h'))

Then:

  #ifdef HAVE_LINUX_BLKZONED_H
  ...
  #endif /* HAVE_LINUX_BLKZONED_H */

Stefan
diff mbox series

Patch

diff --git a/block/block-backend.c b/block/block-backend.c
index f425b00793..0a05247ae4 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1806,6 +1806,47 @@  int blk_flush(BlockBackend *blk)
     return ret;
 }
 
+/*
+ * Send a zone_report command.
+ * offset can be any number within the zone size. No alignment for offset.
+ * nr_zones represents IN maximum and OUT actual.
+ */
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    int64_t *nr_zones,
+                                    BlockZoneDescriptor *zones)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk); /* increase before waiting */
+    blk_wait_while_drained(blk);
+    ret = bdrv_co_zone_report(blk->root->bs, offset, nr_zones, zones);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
+/*
+ * Send a zone_management command.
+ * Offset is the start of a zone and len is aligned to zones.
+ */
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+        int64_t offset, int64_t len)
+{
+    int ret;
+    IO_CODE();
+
+    blk_inc_in_flight(blk);
+    blk_wait_while_drained(blk);
+    ret = blk_check_byte_request(blk, offset, len);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_co_zone_mgmt(blk->root->bs, op, offset, len);
+    blk_dec_in_flight(blk);
+    return ret;
+}
+
 void blk_drain(BlockBackend *blk)
 {
     BlockDriverState *bs = blk_bs(blk);
diff --git a/block/coroutines.h b/block/coroutines.h
index 830ecaa733..19aa96cc56 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -80,6 +80,11 @@  int coroutine_fn
 blk_co_do_pdiscard(BlockBackend *blk, int64_t offset, int64_t bytes);
 
 int coroutine_fn blk_co_do_flush(BlockBackend *blk);
+int coroutine_fn blk_co_zone_report(BlockBackend *blk, int64_t offset,
+                                    int64_t *nr_zones,
+                                    BlockZoneDescriptor *zones);
+int coroutine_fn blk_co_zone_mgmt(BlockBackend *blk, enum zone_op op,
+                                  int64_t offset, int64_t len);
 
 
 /*
diff --git a/block/file-posix.c b/block/file-posix.c
index 48cd096624..e7523ae2ed 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -67,6 +67,7 @@ 
 #include <sys/param.h>
 #include <sys/syscall.h>
 #include <sys/vfs.h>
+#include <linux/blkzoned.h>
 #include <linux/cdrom.h>
 #include <linux/fd.h>
 #include <linux/fs.h>
@@ -216,6 +217,13 @@  typedef struct RawPosixAIOData {
             PreallocMode prealloc;
             Error **errp;
         } truncate;
+        struct {
+            int64_t *nr_zones;
+            BlockZoneDescriptor *zones;
+        } zone_report;
+        struct {
+            zone_op op;
+        } zone_mgmt;
     };
 } RawPosixAIOData;
 
@@ -1801,6 +1809,130 @@  static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
 }
 #endif
 
+/*
+ * parse_zone - Fill a zone descriptor
+ */
+static inline void parse_zone(struct BlockZoneDescriptor *zone,
+                              struct blk_zone *blkz) {
+    zone->start = blkz->start;
+    zone->length = blkz->len;
+    zone->cap = blkz->capacity;
+    zone->wp = blkz->wp - blkz->start;
+    zone->type = blkz->type;
+    zone->cond = blkz->cond;
+}
+
+static int handle_aiocb_zone_report(void *opaque) {
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t *nr_zones = aiocb->zone_report.nr_zones;
+    BlockZoneDescriptor *zones = aiocb->zone_report.zones;
+    int64_t offset = aiocb->aio_offset;
+
+    struct blk_zone *blkz;
+    int64_t rep_size, nrz;
+    int ret, n = 0, i = 0;
+
+    nrz = *nr_zones;
+    rep_size = sizeof(struct blk_zone_report) + nrz * sizeof(struct blk_zone);
+    g_autofree struct blk_zone_report *rep = NULL;
+    rep = g_malloc(rep_size);
+    offset = offset / 512; /* get the unit of the start sector: sector size is 512 bytes. */
+    printf("start to report zone with offset: 0x%lx\n", offset);
+
+    blkz = (struct blk_zone *)(rep + 1);
+    while (n < nrz) {
+        memset(rep, 0, rep_size);
+        rep->sector = offset;
+        rep->nr_zones = nrz;
+
+        ret = ioctl(fd, BLKREPORTZONE, rep);
+        if (ret != 0) {
+            ret = -errno;
+            error_report("%d: ioctl BLKREPORTZONE at %ld failed %d",
+                         fd, offset, errno);
+            return ret;
+        }
+
+        if (!rep->nr_zones) {
+            break;
+        }
+
+        for (i = 0; i < rep->nr_zones; i++, n++) {
+            parse_zone(&zones[n], &blkz[i]);
+            /* The next report should start after the last zone reported */
+            offset = blkz[i].start + blkz[i].len;
+        }
+    }
+
+    *nr_zones = n;
+    return 0;
+}
+
+static int handle_aiocb_zone_mgmt(void *opaque) {
+    RawPosixAIOData *aiocb = opaque;
+    int fd = aiocb->aio_fildes;
+    int64_t offset = aiocb->aio_offset;
+    int64_t len = aiocb->aio_nbytes;
+    zone_op op = aiocb->zone_mgmt.op;
+
+    struct blk_zone_range range;
+    const char *ioctl_name;
+    unsigned long ioctl_op;
+    int64_t zone_size;
+    int64_t zone_size_mask;
+    int ret;
+
+    g_autofree struct stat *file = NULL;
+    file = g_new(struct stat, 1);
+    stat(s->filename, file);
+    zone_size = get_sysfs_long_val(fd, file, "chunk_sectors");
+    zone_size_mask = zone_size - 1;
+    if (offset & zone_size_mask) {
+        error_report("offset is not the start of a zone");
+        return -EINVAL;
+    }
+
+    if (len & zone_size_mask) {
+        error_report("len is not aligned to zones");
+        return -EINVAL;
+    }
+
+    switch (op) {
+    case zone_open:
+        ioctl_name = "BLKOPENZONE";
+        ioctl_op = BLKOPENZONE;
+        break;
+    case zone_close:
+        ioctl_name = "BLKCLOSEZONE";
+        ioctl_op = BLKCLOSEZONE;
+        break;
+    case zone_finish:
+        ioctl_name = "BLKFINISHZONE";
+        ioctl_op = BLKFINISHZONE;
+        break;
+    case zone_reset:
+        ioctl_name = "BLKRESETZONE";
+        ioctl_op = BLKRESETZONE;
+        break;
+    default:
+        error_report("Invalid zone operation 0x%x", op);
+        return -EINVAL;
+    }
+
+    /* Execute the operation */
+    range.sector = offset;
+    range.nr_sectors = len;
+    ret = ioctl(fd, ioctl_op, &range);
+    if (ret != 0) {
+        error_report("ioctl %s failed %d",
+                     ioctl_name, errno);
+        return -errno;
+    }
+
+    return 0;
+}
+
 static int handle_aiocb_copy_range(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
@@ -2973,6 +3105,59 @@  static void raw_account_discard(BDRVRawState *s, uint64_t nbytes, int ret)
     }
 }
 
+/*
+ * zone report - Get a zone block device's information in the form
+ * of an array of zone descriptors.
+ *
+ * @param bs: passing zone block device file descriptor
+ * @param zones: an array of zone descriptors to hold zone
+ * information on reply
+ * @param offset: offset can be any byte within the zone size.
+ * @param len: (not sure yet.
+ * @return 0 on success, -1 on failure
+ */
+static int coroutine_fn raw_co_zone_report(BlockDriverState *bs, int64_t offset,
+                                           int64_t *nr_zones,
+                                           BlockZoneDescriptor *zones) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs         = bs,
+        .aio_fildes = s->fd,
+        .aio_type   = QEMU_AIO_IOCTL,
+        .aio_offset = offset,
+        .zone_report    = {
+                .nr_zones       = nr_zones,
+                .zones          = zones,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_report, &acb);
+}
+
+/*
+ * zone management operations - Execute an operation on a zone
+ */
+static int coroutine_fn raw_co_zone_mgmt(BlockDriverState *bs, zone_op op,
+        int64_t offset, int64_t len) {
+    BDRVRawState *s = bs->opaque;
+    RawPosixAIOData acb;
+
+    acb = (RawPosixAIOData) {
+        .bs             = bs,
+        .aio_fildes     = s->fd,
+        .aio_type       = QEMU_AIO_IOCTL,
+        .aio_offset     = offset,
+        .aio_nbytes     = len,
+        .zone_mgmt  = {
+                .op = op,
+        },
+    };
+
+    return raw_thread_pool_submit(bs, handle_aiocb_zone_mgmt, &acb);
+}
+
 static coroutine_fn int
 raw_do_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes,
                 bool blkdev)
@@ -3324,6 +3509,9 @@  BlockDriver bdrv_file = {
     .bdrv_abort_perm_update = raw_abort_perm_update,
     .create_opts = &raw_create_opts,
     .mutable_opts = mutable_opts,
+
+    .bdrv_co_zone_report = raw_co_zone_report,
+    .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
 };
 
 /***********************************************/
@@ -3703,6 +3891,53 @@  static BlockDriver bdrv_host_device = {
 #endif
 };
 
+static BlockDriver bdrv_zoned_host_device = {
+        .format_name = "zoned_host_device",
+        .protocol_name = "zoned_host_device",
+        .instance_size = sizeof(BDRVRawState),
+        .bdrv_needs_filename = true,
+        .bdrv_probe_device  = hdev_probe_device,
+        .bdrv_parse_filename = hdev_parse_filename,
+        .bdrv_file_open     = hdev_open,
+        .bdrv_close         = raw_close,
+        .bdrv_reopen_prepare = raw_reopen_prepare,
+        .bdrv_reopen_commit  = raw_reopen_commit,
+        .bdrv_reopen_abort   = raw_reopen_abort,
+        .bdrv_co_create_opts = bdrv_co_create_opts_simple,
+        .create_opts         = &bdrv_create_opts_simple,
+        .mutable_opts        = mutable_opts,
+        .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
+        .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
+
+        .bdrv_co_preadv         = raw_co_preadv,
+        .bdrv_co_pwritev        = raw_co_pwritev,
+        .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
+        .bdrv_co_pdiscard       = hdev_co_pdiscard,
+        .bdrv_co_copy_range_from = raw_co_copy_range_from,
+        .bdrv_co_copy_range_to  = raw_co_copy_range_to,
+        .bdrv_refresh_limits = raw_refresh_limits,
+        .bdrv_io_plug = raw_aio_plug,
+        .bdrv_io_unplug = raw_aio_unplug,
+        .bdrv_attach_aio_context = raw_aio_attach_aio_context,
+
+        .bdrv_co_truncate       = raw_co_truncate,
+        .bdrv_getlength = raw_getlength,
+        .bdrv_get_info = raw_get_info,
+        .bdrv_get_allocated_file_size
+                            = raw_get_allocated_file_size,
+        .bdrv_get_specific_stats = hdev_get_specific_stats,
+        .bdrv_check_perm = raw_check_perm,
+        .bdrv_set_perm   = raw_set_perm,
+        .bdrv_abort_perm_update = raw_abort_perm_update,
+        .bdrv_probe_blocksizes = hdev_probe_blocksizes,
+        .bdrv_probe_geometry = hdev_probe_geometry,
+        .bdrv_co_ioctl = hdev_co_ioctl,
+
+        /* zone management operations */
+        .bdrv_co_zone_report = raw_co_zone_report,
+        .bdrv_co_zone_mgmt = raw_co_zone_mgmt,
+};
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
 static void cdrom_parse_filename(const char *filename, QDict *options,
                                  Error **errp)
@@ -3964,6 +4199,7 @@  static void bdrv_file_init(void)
 #if defined(HAVE_HOST_BLOCK_DEVICE)
     bdrv_register(&bdrv_host_device);
 #ifdef __linux__
+    bdrv_register(&bdrv_zoned_host_device);
     bdrv_register(&bdrv_host_cdrom);
 #endif
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
diff --git a/include/block/block-common.h b/include/block/block-common.h
index fdb7306e78..78cddeeda5 100644
--- a/include/block/block-common.h
+++ b/include/block/block-common.h
@@ -23,7 +23,6 @@ 
  */
 #ifndef BLOCK_COMMON_H
 #define BLOCK_COMMON_H
-
 #include "block/aio.h"
 #include "block/aio-wait.h"
 #include "qemu/iov.h"
@@ -49,6 +48,48 @@  typedef struct BlockDriver BlockDriver;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildClass BdrvChildClass;
 
+typedef enum zone_op {
+    zone_open,
+    zone_close,
+    zone_finish,
+    zone_reset,
+} zone_op;
+
+typedef enum zone_model {
+    BLK_Z_HM,
+    BLK_Z_HA,
+} zone_model;
+
+typedef enum BlkZoneCondition {
+    BLK_ZS_NOT_WP = 0x0,
+    BLK_ZS_EMPTY = 0x1,
+    BLK_ZS_IOPEN = 0x2,
+    BLK_ZS_EOPEN = 0x3,
+    BLK_ZS_CLOSED = 0x4,
+    BLK_ZS_RDONLY = 0xD,
+    BLK_ZS_FULL = 0xE,
+    BLK_ZS_OFFLINE = 0xF,
+} BlkZoneCondition;
+
+typedef enum BlkZoneType {
+    BLK_ZT_CONV = 0x1,
+    BLK_ZT_SWR = 0x2,
+    BLK_ZT_SWP = 0x3,
+} BlkZoneType;
+
+/*
+ * Zone descriptor data structure.
+ * Provide information on a zone with all position and size values in bytes.
+ */
+typedef struct BlockZoneDescriptor {
+    uint64_t start;
+    uint64_t length;
+    uint64_t cap;
+    uint64_t wp;
+    BlkZoneType type;
+    BlkZoneCondition cond;
+} BlockZoneDescriptor;
+
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
     int cluster_size;
diff --git a/include/block/block_int-common.h b/include/block/block_int-common.h
index 8947abab76..6037871089 100644
--- a/include/block/block_int-common.h
+++ b/include/block/block_int-common.h
@@ -94,6 +94,20 @@  typedef struct BdrvTrackedRequest {
     struct BdrvTrackedRequest *waiting_for;
 } BdrvTrackedRequest;
 
+/**
+ * Zone device information data structure.
+ * Provide information on a device.
+ */
+typedef struct zbd_dev {
+    uint32_t zone_size;
+    zone_model model;
+    uint32_t block_size;
+    uint32_t write_granularity;
+    uint32_t nr_zones;
+    struct BlockZoneDescriptor *zones; /* array of zones */
+    uint32_t max_nr_open_zones; /* maximum number of explicitly open zones */
+    uint32_t max_nr_active_zones;
+} zbd_dev;
 
 struct BlockDriver {
     /*
@@ -691,6 +705,12 @@  struct BlockDriver {
                                           QEMUIOVector *qiov,
                                           int64_t pos);
 
+    int coroutine_fn (*bdrv_co_zone_report)(BlockDriverState *bs,
+            int64_t offset, int64_t *nr_zones,
+            BlockZoneDescriptor *zones);
+    int coroutine_fn (*bdrv_co_zone_mgmt)(BlockDriverState *bs, enum zone_op op,
+            int64_t offset, int64_t len);
+
     /* removable device specific */
     bool (*bdrv_is_inserted)(BlockDriverState *bs);
     void (*bdrv_eject)(BlockDriverState *bs, bool eject_flag);