diff mbox series

[RFC,v3,15/30] io: Add a pwritev/preadv version that takes a discontiguous iovec

Message ID 20231127202612.23012-16-farosas@suse.de
State New
Headers show
Series migration: File based migration with multifd and fixed-ram | expand

Commit Message

Fabiano Rosas Nov. 27, 2023, 8:25 p.m. UTC
For the upcoming support for fixed-ram migration with multifd, we need
to be able to accept an iovec array with non-contiguous data.

Add a pwritev and preadv version that splits the array into contiguous
segments before writing. With that we can have the ram code continue
to add pages in any order and the multifd code continue to send large
arrays for reading and writing.

Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
- split the API that was merged into a single function
- use uintptr_t for compatibility with 32-bit
---
 include/io/channel.h | 26 ++++++++++++++++
 io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)

Comments

Peter Xu Jan. 16, 2024, 6:58 a.m. UTC | #1
On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
> For the upcoming support to fixed-ram migration with multifd, we need
> to be able to accept an iovec array with non-contiguous data.
> 
> Add a pwritev and preadv version that splits the array into contiguous
> segments before writing. With that we can have the ram code continue
> to add pages in any order and the multifd code continue to send large
> arrays for reading and writing.
> 
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
> - split the API that was merged into a single function
> - use uintptr_t for compatibility with 32-bit
> ---
>  include/io/channel.h | 26 ++++++++++++++++
>  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 96 insertions(+)
> 
> diff --git a/include/io/channel.h b/include/io/channel.h
> index 7986c49c71..25383db5aa 100644
> --- a/include/io/channel.h
> +++ b/include/io/channel.h
> @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
>  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>                              size_t niov, off_t offset, Error **errp);
>  
> +/**
> + * qio_channel_pwritev_all:
> + * @ioc: the channel object
> + * @iov: the array of memory regions to write data from
> + * @niov: the length of the @iov array
> + * @offset: the iovec offset in the file where to write the data
> + * @errp: pointer to a NULL-initialized error object
> + *
> + * Returns: 0 if all bytes were written, or -1 on error
> + */
> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> +                            size_t niov, off_t offset, Error **errp);
> +
>  /**
>   * qio_channel_pwrite
>   * @ioc: the channel object
> @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>                             size_t niov, off_t offset, Error **errp);
>  
> +/**
> + * qio_channel_preadv_all:
> + * @ioc: the channel object
> + * @iov: the array of memory regions to read data to
> + * @niov: the length of the @iov array
> + * @offset: the iovec offset in the file from where to read the data
> + * @errp: pointer to a NULL-initialized error object
> + *
> + * Returns: 0 if all bytes were read, or -1 on error
> + */
> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> +                           size_t niov, off_t offset, Error **errp);
> +
>  /**
>   * qio_channel_pread
>   * @ioc: the channel object
> diff --git a/io/channel.c b/io/channel.c
> index a1f12f8e90..2f1745d052 100644
> --- a/io/channel.c
> +++ b/io/channel.c
> @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>      return klass->io_pwritev(ioc, iov, niov, offset, errp);
>  }
>  
> +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
> +                                                 const struct iovec *iov,
> +                                                 size_t niov, off_t offset,
> +                                                 bool is_write, Error **errp)
> +{
> +    ssize_t ret = -1;
> +    int i, slice_idx, slice_num;
> +    uintptr_t base, next, file_offset;
> +    size_t len;
> +
> +    slice_idx = 0;
> +    slice_num = 1;
> +
> +    /*
> +     * If the iov array doesn't have contiguous elements, we need to
> +     * split it in slices because we only have one (file) 'offset' for
> +     * the whole iov. Do this here so callers don't need to break the
> +     * iov array themselves.
> +     */
> +    for (i = 0; i < niov; i++, slice_num++) {
> +        base = (uintptr_t) iov[i].iov_base;
> +
> +        if (i != niov - 1) {
> +            len = iov[i].iov_len;
> +            next = (uintptr_t) iov[i + 1].iov_base;
> +
> +            if (base + len == next) {
> +                continue;
> +            }
> +        }
> +
> +        /*
> +         * Use the offset of the first element of the segment that
> +         * we're sending.
> +         */
> +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
> +
> +        if (is_write) {
> +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
> +                                      file_offset, errp);
> +        } else {
> +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
> +                                     file_offset, errp);
> +        }
> +
> +        if (ret < 0) {
> +            break;
> +        }
> +
> +        slice_idx += slice_num;
> +        slice_num = 0;
> +    }
> +
> +    return (ret < 0) ? -1 : 0;
> +}
> +
> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> +                            size_t niov, off_t offset, Error **errp)
> +{
> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> +                                                 offset, true, errp);
> +}

I'm not sure how Dan thinks about this, but I don't think this is pretty..

With this implementation, the iochannel preadv/pwritev semantics are no
longer compatible with those of most OSes, afaiu.

The definition of offset in preadv/pwritev of current iochannel is hard to
understand.. if I read it right it'll later be set to:
      
                /*
                 * If we subtract the host page now, we don't need to
                 * pass it into qio_channel_pwritev_all() below.
                 */
                write_base = p->pages->block->pages_offset -
                    (uintptr_t)p->pages->block->host;

Which I cannot easily tell what it is.. besides being an unsigned int.

IIUC it's also based on the assumption that the host address of each iov
entry maps linearly to its offset in the file, but that may not hold for
future iochannel users of an interface named pwritev/preadv.  So it is
error prone.

Would it be possible we keep using the offset array (p->pages->offset[x])?
We have it already anyway, right?  Wouldn't that be clearer?

It doesn't need to be called pwritev/preadv; it could take two arrays: the
host address array and an array of offsets into the file.  It can still do
the range merge, plus a sanity check to make sure the offsets are also
contiguous (which should be true in our case).

> +
>  ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>                             off_t offset, Error **errp)
>  {
> @@ -501,6 +564,13 @@ ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>      return klass->io_preadv(ioc, iov, niov, offset, errp);
>  }
>  
> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> +                           size_t niov, off_t offset, Error **errp)
> +{
> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> +                                                 offset, false, errp);
> +}
> +
>  ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
>                            off_t offset, Error **errp)
>  {
> -- 
> 2.35.3
>
Fabiano Rosas Jan. 16, 2024, 6:15 p.m. UTC | #2
Peter Xu <peterx@redhat.com> writes:

> On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
>> For the upcoming support to fixed-ram migration with multifd, we need
>> to be able to accept an iovec array with non-contiguous data.
>> 
>> Add a pwritev and preadv version that splits the array into contiguous
>> segments before writing. With that we can have the ram code continue
>> to add pages in any order and the multifd code continue to send large
>> arrays for reading and writing.
>> 
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>> - split the API that was merged into a single function
>> - use uintptr_t for compatibility with 32-bit
>> ---
>>  include/io/channel.h | 26 ++++++++++++++++
>>  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
>>  2 files changed, 96 insertions(+)
>> 
>> diff --git a/include/io/channel.h b/include/io/channel.h
>> index 7986c49c71..25383db5aa 100644
>> --- a/include/io/channel.h
>> +++ b/include/io/channel.h
>> @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
>>  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>>                              size_t niov, off_t offset, Error **errp);
>>  
>> +/**
>> + * qio_channel_pwritev_all:
>> + * @ioc: the channel object
>> + * @iov: the array of memory regions to write data from
>> + * @niov: the length of the @iov array
>> + * @offset: the iovec offset in the file where to write the data
>> + * @errp: pointer to a NULL-initialized error object
>> + *
>> + * Returns: 0 if all bytes were written, or -1 on error
>> + */
>> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
>> +                            size_t niov, off_t offset, Error **errp);
>> +
>>  /**
>>   * qio_channel_pwrite
>>   * @ioc: the channel object
>> @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>>  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>>                             size_t niov, off_t offset, Error **errp);
>>  
>> +/**
>> + * qio_channel_preadv_all:
>> + * @ioc: the channel object
>> + * @iov: the array of memory regions to read data to
>> + * @niov: the length of the @iov array
>> + * @offset: the iovec offset in the file from where to read the data
>> + * @errp: pointer to a NULL-initialized error object
>> + *
>> + * Returns: 0 if all bytes were read, or -1 on error
>> + */
>> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
>> +                           size_t niov, off_t offset, Error **errp);
>> +
>>  /**
>>   * qio_channel_pread
>>   * @ioc: the channel object
>> diff --git a/io/channel.c b/io/channel.c
>> index a1f12f8e90..2f1745d052 100644
>> --- a/io/channel.c
>> +++ b/io/channel.c
>> @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>>      return klass->io_pwritev(ioc, iov, niov, offset, errp);
>>  }
>>  
>> +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
>> +                                                 const struct iovec *iov,
>> +                                                 size_t niov, off_t offset,
>> +                                                 bool is_write, Error **errp)
>> +{
>> +    ssize_t ret = -1;
>> +    int i, slice_idx, slice_num;
>> +    uintptr_t base, next, file_offset;
>> +    size_t len;
>> +
>> +    slice_idx = 0;
>> +    slice_num = 1;
>> +
>> +    /*
>> +     * If the iov array doesn't have contiguous elements, we need to
>> +     * split it in slices because we only have one (file) 'offset' for
>> +     * the whole iov. Do this here so callers don't need to break the
>> +     * iov array themselves.
>> +     */
>> +    for (i = 0; i < niov; i++, slice_num++) {
>> +        base = (uintptr_t) iov[i].iov_base;
>> +
>> +        if (i != niov - 1) {
>> +            len = iov[i].iov_len;
>> +            next = (uintptr_t) iov[i + 1].iov_base;
>> +
>> +            if (base + len == next) {
>> +                continue;
>> +            }
>> +        }
>> +
>> +        /*
>> +         * Use the offset of the first element of the segment that
>> +         * we're sending.
>> +         */
>> +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
>> +
>> +        if (is_write) {
>> +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
>> +                                      file_offset, errp);
>> +        } else {
>> +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
>> +                                     file_offset, errp);
>> +        }
>> +
>> +        if (ret < 0) {
>> +            break;
>> +        }
>> +
>> +        slice_idx += slice_num;
>> +        slice_num = 0;
>> +    }
>> +
>> +    return (ret < 0) ? -1 : 0;
>> +}
>> +
>> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
>> +                            size_t niov, off_t offset, Error **errp)
>> +{
>> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
>> +                                                 offset, true, errp);
>> +}
>
> I'm not sure how Dan thinks about this, but I don't think this is pretty..
>
> With this implementation, iochannels' preadv/pwritev is completely not
> compatible with most OSes now, afaiu.

This is internal QEMU code. I hope no one is expecting qio_channel_foo()
to behave like some OS's foo() system call. We cannot guarantee that
compatibility save for the simplest of wrappers.

>
> The definition of offset in preadv/pwritev of current iochannel is hard to
> understand.. if I read it right it'll later be set to:
>       
>                 /*
>                  * If we subtract the host page now, we don't need to
>                  * pass it into qio_channel_pwritev_all() below.
>                  */
>                 write_base = p->pages->block->pages_offset -
>                     (uintptr_t)p->pages->block->host;
>
> Which I cannot easily tell what it is.. besides being an unsigned int.

This description was unfortunately dropped along the way:

"Since iovs can be non contiguous, we'd need a separate array on the
side to carry an extra file offset for each of them, so I'm relying on
the fact that iovs are all within a same host page and passing in an
encoded offset that takes the host page into account."

> IIUC it's also based on the assumption that the host address of each iov
> entry is linear to its offset in the file, but it may not be true for
> future iochannel users of such interface called as pwritev/preadv.  So
> error prone.

Yes, but it's also our choice whether to make this a generic API. We may
have good reasons to consider a migration-specific function here.

> Would it be possible we keep using the offset array (p->pages->offset[x])?
> We have it already anyway, right?  Wouldn't that be clearer?
>

We'd have to make a copy of the array because p->pages is expected to
change while the IO happens. And while we already have a copy in
p->normal, my intention for multifd was to eliminate p->normal in the
future, so it would be nice if we could avoid it.

Also, we cannot use p->pages->offset alone because we still need the
pages_offset, i.e. the file offset where that ramblock's pages begin.
So that means also adding that to each element of the new array.

It would probably be overall clearer and less wasteful to pass in the
host page address instead of an array of offsets. I don't see an issue
with restricting the iovs to the same host page. The migration code is
the only user for this code and AFAIK we don't have plans to change that
invariant.

> It doesn't need to be called pwritev/preadv, but taking two arrays: the
> host address array and another offset array on that file.  It can still do
> the range merge, do another sanity check on the offsets to make sure the
> offsets are also continuous (and should be true in our case).
>
>> +
>>  ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>>                             off_t offset, Error **errp)
>>  {
>> @@ -501,6 +564,13 @@ ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>>      return klass->io_preadv(ioc, iov, niov, offset, errp);
>>  }
>>  
>> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
>> +                           size_t niov, off_t offset, Error **errp)
>> +{
>> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
>> +                                                 offset, false, errp);
>> +}
>> +
>>  ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
>>                            off_t offset, Error **errp)
>>  {
>> -- 
>> 2.35.3
>>
Peter Xu Jan. 17, 2024, 9:48 a.m. UTC | #3
On Tue, Jan 16, 2024 at 03:15:50PM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
> >> For the upcoming support to fixed-ram migration with multifd, we need
> >> to be able to accept an iovec array with non-contiguous data.
> >> 
> >> Add a pwritev and preadv version that splits the array into contiguous
> >> segments before writing. With that we can have the ram code continue
> >> to add pages in any order and the multifd code continue to send large
> >> arrays for reading and writing.
> >> 
> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> >> ---
> >> - split the API that was merged into a single function
> >> - use uintptr_t for compatibility with 32-bit
> >> ---
> >>  include/io/channel.h | 26 ++++++++++++++++
> >>  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
> >>  2 files changed, 96 insertions(+)
> >> 
> >> diff --git a/include/io/channel.h b/include/io/channel.h
> >> index 7986c49c71..25383db5aa 100644
> >> --- a/include/io/channel.h
> >> +++ b/include/io/channel.h
> >> @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
> >>  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
> >>                              size_t niov, off_t offset, Error **errp);
> >>  
> >> +/**
> >> + * qio_channel_pwritev_all:
> >> + * @ioc: the channel object
> >> + * @iov: the array of memory regions to write data from
> >> + * @niov: the length of the @iov array
> >> + * @offset: the iovec offset in the file where to write the data
> >> + * @errp: pointer to a NULL-initialized error object
> >> + *
> >> + * Returns: 0 if all bytes were written, or -1 on error
> >> + */
> >> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> >> +                            size_t niov, off_t offset, Error **errp);
> >> +
> >>  /**
> >>   * qio_channel_pwrite
> >>   * @ioc: the channel object
> >> @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
> >>  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
> >>                             size_t niov, off_t offset, Error **errp);
> >>  
> >> +/**
> >> + * qio_channel_preadv_all:
> >> + * @ioc: the channel object
> >> + * @iov: the array of memory regions to read data to
> >> + * @niov: the length of the @iov array
> >> + * @offset: the iovec offset in the file from where to read the data
> >> + * @errp: pointer to a NULL-initialized error object
> >> + *
> >> + * Returns: 0 if all bytes were read, or -1 on error
> >> + */
> >> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> >> +                           size_t niov, off_t offset, Error **errp);
> >> +
> >>  /**
> >>   * qio_channel_pread
> >>   * @ioc: the channel object
> >> diff --git a/io/channel.c b/io/channel.c
> >> index a1f12f8e90..2f1745d052 100644
> >> --- a/io/channel.c
> >> +++ b/io/channel.c
> >> @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
> >>      return klass->io_pwritev(ioc, iov, niov, offset, errp);
> >>  }
> >>  
> >> +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
> >> +                                                 const struct iovec *iov,
> >> +                                                 size_t niov, off_t offset,
> >> +                                                 bool is_write, Error **errp)
> >> +{
> >> +    ssize_t ret = -1;
> >> +    int i, slice_idx, slice_num;
> >> +    uintptr_t base, next, file_offset;
> >> +    size_t len;
> >> +
> >> +    slice_idx = 0;
> >> +    slice_num = 1;
> >> +
> >> +    /*
> >> +     * If the iov array doesn't have contiguous elements, we need to
> >> +     * split it in slices because we only have one (file) 'offset' for
> >> +     * the whole iov. Do this here so callers don't need to break the
> >> +     * iov array themselves.
> >> +     */
> >> +    for (i = 0; i < niov; i++, slice_num++) {
> >> +        base = (uintptr_t) iov[i].iov_base;
> >> +
> >> +        if (i != niov - 1) {
> >> +            len = iov[i].iov_len;
> >> +            next = (uintptr_t) iov[i + 1].iov_base;
> >> +
> >> +            if (base + len == next) {
> >> +                continue;
> >> +            }
> >> +        }
> >> +
> >> +        /*
> >> +         * Use the offset of the first element of the segment that
> >> +         * we're sending.
> >> +         */
> >> +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
> >> +
> >> +        if (is_write) {
> >> +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
> >> +                                      file_offset, errp);
> >> +        } else {
> >> +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
> >> +                                     file_offset, errp);
> >> +        }
> >> +
> >> +        if (ret < 0) {
> >> +            break;
> >> +        }
> >> +
> >> +        slice_idx += slice_num;
> >> +        slice_num = 0;
> >> +    }
> >> +
> >> +    return (ret < 0) ? -1 : 0;
> >> +}
> >> +
> >> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> >> +                            size_t niov, off_t offset, Error **errp)
> >> +{
> >> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> >> +                                                 offset, true, errp);
> >> +}
> >
> > I'm not sure how Dan thinks about this, but I don't think this is pretty..
> >
> > With this implementation, iochannels' preadv/pwritev is completely not
> > compatible with most OSes now, afaiu.
> 
> This is internal QEMU code. I hope no one is expecting qio_channel_foo()
> to behave like some OS's foo() system call. We cannot guarantee that
> compatibility save for the simplest of wrappers.

I was expecting that when I started to read. :)

https://man.freebsd.org/cgi/man.cgi?query=pwritev
https://linux.die.net/man/2/pwritev

It's not "some OSes", it's nearly all of them.  I can understand that you
prefer such an approach, but even so, shall we still try to avoid using
pwritev/preadv as the names?

> 
> >
> > The definition of offset in preadv/pwritev of current iochannel is hard to
> > understand.. if I read it right it'll later be set to:
> >       
> >                 /*
> >                  * If we subtract the host page now, we don't need to
> >                  * pass it into qio_channel_pwritev_all() below.
> >                  */
> >                 write_base = p->pages->block->pages_offset -
> >                     (uintptr_t)p->pages->block->host;
> >
> > Which I cannot easily tell what it is.. besides being an unsigned int.
> 
> This description was unfortunately dropped along the way:
> 
> "Since iovs can be non contiguous, we'd need a separate array on the
> side to carry an extra file offset for each of them, so I'm relying on
> the fact that iovs are all within a same host page and passing in an
> encoded offset that takes the host page into account."
> 
> > IIUC it's also based on the assumption that the host address of each iov
> > entry is linear to its offset in the file, but it may not be true for
> > future iochannel users of such interface called as pwritev/preadv.  So
> > error prone.
> 
> Yes, but it's also our choice whether to make this a generic API. We may
> have good reasons to consider a migration-specific function here.
> 
> > Would it be possible we keep using the offset array (p->pages->offset[x])?
> > We have it already anyway, right?  Wouldn't that be clearer?
> >
> 
> We'd have to make a copy of the array because p->pages is expected to
> change while the IO happens.

Hmm, I don't see why p->pages can change. IIUC p->pages will stay stable
at least until all IO syscalls are completed, then the next call to, e.g.,
multifd_send_pages() will swap that with multifd_send_state->pages.  But I
think I get your point, with below.

> And while we already have a copy in
> p->normal, my intention for multifd was to eliminate p->normal in the
> future, so it would be nice if we could avoid it.
> 
> Also, we cannot use p->pages->offset alone because we still need the
> pages_offset, i.e. the file offset where that ramblocks's pages begin.
> So that means also adding that to each element of the new array.
> 
> It would probably be overall clearer and less wasteful to pass in the
> host page address instead of an array of offsets. I don't see an issue
> with restricting the iovs to the same host page. The migration code is
> the only user for this code and AFAIK we don't have plans to change that
> invariant.

So I think I get your point now, the only concern (besides naming..) is,
I still want to avoid an interface that contains a field that is hard to
understand like write_base.

How about this?

  /**
   * multifd_write_ramblock_iov: Write IO vector (of ramblock) to channel
   *
   * @ioc: The iochannel to write to. The IOC must implement the
   *       pwritev/preadv interface.
   * @iov: The IO vector to write.  All addresses must be within the
   *       ramblock host address range.
   * @iov_len: The IO vector size
   * @ramblock: The ramblock that covers all buffers in this IO vector
   */
  int multifd_write_ramblock_iov(ioc, iov, iov_len, ramblock);
Daniel P. Berrangé Jan. 17, 2024, 12:39 p.m. UTC | #4
On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
> For the upcoming support to fixed-ram migration with multifd, we need
> to be able to accept an iovec array with non-contiguous data.
> 
> Add a pwritev and preadv version that splits the array into contiguous
> segments before writing. With that we can have the ram code continue
> to add pages in any order and the multifd code continue to send large
> arrays for reading and writing.
> 
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
> - split the API that was merged into a single function
> - use uintptr_t for compatibility with 32-bit
> ---
>  include/io/channel.h | 26 ++++++++++++++++
>  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 96 insertions(+)
> 
> diff --git a/include/io/channel.h b/include/io/channel.h
> index 7986c49c71..25383db5aa 100644
> --- a/include/io/channel.h
> +++ b/include/io/channel.h
> @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
>  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>                              size_t niov, off_t offset, Error **errp);
>  
> +/**
> + * qio_channel_pwritev_all:
> + * @ioc: the channel object
> + * @iov: the array of memory regions to write data from
> + * @niov: the length of the @iov array
> + * @offset: the iovec offset in the file where to write the data
> + * @errp: pointer to a NULL-initialized error object
> + *
> + * Returns: 0 if all bytes were written, or -1 on error
> + */
> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> +                            size_t niov, off_t offset, Error **errp);
> +
>  /**
>   * qio_channel_pwrite
>   * @ioc: the channel object
> @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>                             size_t niov, off_t offset, Error **errp);
>  
> +/**
> + * qio_channel_preadv_all:
> + * @ioc: the channel object
> + * @iov: the array of memory regions to read data to
> + * @niov: the length of the @iov array
> + * @offset: the iovec offset in the file from where to read the data
> + * @errp: pointer to a NULL-initialized error object
> + *
> + * Returns: 0 if all bytes were read, or -1 on error
> + */
> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> +                           size_t niov, off_t offset, Error **errp);
> +
>  /**
>   * qio_channel_pread
>   * @ioc: the channel object
> diff --git a/io/channel.c b/io/channel.c
> index a1f12f8e90..2f1745d052 100644
> --- a/io/channel.c
> +++ b/io/channel.c
> @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>      return klass->io_pwritev(ioc, iov, niov, offset, errp);
>  }
>  
> +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
> +                                                 const struct iovec *iov,
> +                                                 size_t niov, off_t offset,
> +                                                 bool is_write, Error **errp)
> +{
> +    ssize_t ret = -1;
> +    int i, slice_idx, slice_num;
> +    uintptr_t base, next, file_offset;
> +    size_t len;
> +
> +    slice_idx = 0;
> +    slice_num = 1;
> +
> +    /*
> +     * If the iov array doesn't have contiguous elements, we need to
> +     * split it in slices because we only have one (file) 'offset' for
> +     * the whole iov. Do this here so callers don't need to break the
> +     * iov array themselves.
> +     */
> +    for (i = 0; i < niov; i++, slice_num++) {
> +        base = (uintptr_t) iov[i].iov_base;
> +
> +        if (i != niov - 1) {
> +            len = iov[i].iov_len;
> +            next = (uintptr_t) iov[i + 1].iov_base;
> +
> +            if (base + len == next) {
> +                continue;
> +            }
> +        }
> +
> +        /*
> +         * Use the offset of the first element of the segment that
> +         * we're sending.
> +         */
> +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
> +
> +        if (is_write) {
> +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
> +                                      file_offset, errp);
> +        } else {
> +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
> +                                     file_offset, errp);
> +        }

iov_base is a pointer to a buffer in RAM, so it could
potentially be any 64-bit value.

We're setting file_offset to this pointer address plus a
user-supplied offset, and then using it as an offset on disk.
First this could result in 64-bit overflow when 'offset' is
added to 'iov_base', and second this could result in a file
that's 16 Exabytes in size (with holes of course).

I don't get how this is supposed to work, or be used ?

> +
> +        if (ret < 0) {
> +            break;
> +        }
> +
> +        slice_idx += slice_num;
> +        slice_num = 0;
> +    }
> +
> +    return (ret < 0) ? -1 : 0;
> +}
> +
> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> +                            size_t niov, off_t offset, Error **errp)
> +{
> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> +                                                 offset, true, errp);
> +}
> +
>  ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>                             off_t offset, Error **errp)
>  {
> @@ -501,6 +564,13 @@ ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>      return klass->io_preadv(ioc, iov, niov, offset, errp);
>  }
>  
> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> +                           size_t niov, off_t offset, Error **errp)
> +{
> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> +                                                 offset, false, errp);
> +}
> +
>  ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
>                            off_t offset, Error **errp)
>  {
> -- 
> 2.35.3
> 

With regards,
Daniel
Daniel P. Berrangé Jan. 17, 2024, 2:27 p.m. UTC | #5
On Wed, Jan 17, 2024 at 12:39:26PM +0000, Daniel P. Berrangé wrote:
> On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
> > For the upcoming support to fixed-ram migration with multifd, we need
> > to be able to accept an iovec array with non-contiguous data.
> > 
> > Add a pwritev and preadv version that splits the array into contiguous
> > segments before writing. With that we can have the ram code continue
> > to add pages in any order and the multifd code continue to send large
> > arrays for reading and writing.
> > 
> > Signed-off-by: Fabiano Rosas <farosas@suse.de>
> > ---
> > - split the API that was merged into a single function
> > - use uintptr_t for compatibility with 32-bit
> > ---
> >  include/io/channel.h | 26 ++++++++++++++++
> >  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
> >  2 files changed, 96 insertions(+)
> > 
> > diff --git a/include/io/channel.h b/include/io/channel.h
> > index 7986c49c71..25383db5aa 100644
> > --- a/include/io/channel.h
> > +++ b/include/io/channel.h
> > @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
> >  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
> >                              size_t niov, off_t offset, Error **errp);
> >  
> > +/**
> > + * qio_channel_pwritev_all:
> > + * @ioc: the channel object
> > + * @iov: the array of memory regions to write data from
> > + * @niov: the length of the @iov array
> > + * @offset: the iovec offset in the file where to write the data
> > + * @errp: pointer to a NULL-initialized error object
> > + *
> > + * Returns: 0 if all bytes were written, or -1 on error
> > + */
> > +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> > +                            size_t niov, off_t offset, Error **errp);
> > +
> >  /**
> >   * qio_channel_pwrite
> >   * @ioc: the channel object
> > @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
> >  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
> >                             size_t niov, off_t offset, Error **errp);
> >  
> > +/**
> > + * qio_channel_preadv_all:
> > + * @ioc: the channel object
> > + * @iov: the array of memory regions to read data to
> > + * @niov: the length of the @iov array
> > + * @offset: the iovec offset in the file from where to read the data
> > + * @errp: pointer to a NULL-initialized error object
> > + *
> > + * Returns: 0 if all bytes were read, or -1 on error
> > + */
> > +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> > +                           size_t niov, off_t offset, Error **errp);
> > +
> >  /**
> >   * qio_channel_pread
> >   * @ioc: the channel object
> > diff --git a/io/channel.c b/io/channel.c
> > index a1f12f8e90..2f1745d052 100644
> > --- a/io/channel.c
> > +++ b/io/channel.c
> > @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
> >      return klass->io_pwritev(ioc, iov, niov, offset, errp);
> >  }
> >  
> > +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
> > +                                                 const struct iovec *iov,
> > +                                                 size_t niov, off_t offset,
> > +                                                 bool is_write, Error **errp)
> > +{
> > +    ssize_t ret = -1;
> > +    int i, slice_idx, slice_num;
> > +    uintptr_t base, next, file_offset;
> > +    size_t len;
> > +
> > +    slice_idx = 0;
> > +    slice_num = 1;
> > +
> > +    /*
> > +     * If the iov array doesn't have contiguous elements, we need to
> > +     * split it in slices because we only have one (file) 'offset' for
> > +     * the whole iov. Do this here so callers don't need to break the
> > +     * iov array themselves.
> > +     */
> > +    for (i = 0; i < niov; i++, slice_num++) {
> > +        base = (uintptr_t) iov[i].iov_base;
> > +
> > +        if (i != niov - 1) {
> > +            len = iov[i].iov_len;
> > +            next = (uintptr_t) iov[i + 1].iov_base;
> > +
> > +            if (base + len == next) {
> > +                continue;
> > +            }
> > +        }
> > +
> > +        /*
> > +         * Use the offset of the first element of the segment that
> > +         * we're sending.
> > +         */
> > +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
> > +
> > +        if (is_write) {
> > +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
> > +                                      file_offset, errp);
> > +        } else {
> > +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
> > +                                     file_offset, errp);
> > +        }
> 
> iov_base is the address of a pointer in RAM, so could be
> potentially any 64-bit value.
> 
> We're setting file_offset to this pointer address plus a
> user-supplied offset, and then using it as an offset on disk.
> First this could result in 64-bit overflow when 'offset' is
> added to 'iov_base', and second this could result in a file
> that's 16 Exabytes in size (with holes of course).
> 
> I don't get how this is supposed to work, or be used ?

I feel like this whole method might become clearer if we separated
out the logic for merging memory adjacent iovecs.

How about adding a 'iov_collapse' method in iov.h / iov.c to do
the merging and then let the actual I/O code be simpler ?

> 
> > +
> > +        if (ret < 0) {
> > +            break;
> > +        }
> > +
> > +        slice_idx += slice_num;
> > +        slice_num = 0;
> > +    }
> > +
> > +    return (ret < 0) ? -1 : 0;
> > +}



> > +
> > +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
> > +                            size_t niov, off_t offset, Error **errp)
> > +{
> > +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> > +                                                 offset, true, errp);
> > +}
> > +
> >  ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
> >                             off_t offset, Error **errp)
> >  {
> > @@ -501,6 +564,13 @@ ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
> >      return klass->io_preadv(ioc, iov, niov, offset, errp);
> >  }
> >  
> > +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
> > +                           size_t niov, off_t offset, Error **errp)
> > +{
> > +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
> > +                                                 offset, false, errp);
> > +}
> > +
> >  ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
> >                            off_t offset, Error **errp)
> >  {
> > -- 
> > 2.35.3
> > 
> 
> With regards,
> Daniel
> -- 
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
> 
> 

With regards,
Daniel
Fabiano Rosas Jan. 17, 2024, 6:06 p.m. UTC | #6
Peter Xu <peterx@redhat.com> writes:

> On Tue, Jan 16, 2024 at 03:15:50PM -0300, Fabiano Rosas wrote:
>> Peter Xu <peterx@redhat.com> writes:
>> 
>> > On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
>> >> For the upcoming support to fixed-ram migration with multifd, we need
>> >> to be able to accept an iovec array with non-contiguous data.
>> >> 
>> >> Add a pwritev and preadv version that splits the array into contiguous
>> >> segments before writing. With that we can have the ram code continue
>> >> to add pages in any order and the multifd code continue to send large
>> >> arrays for reading and writing.
>> >> 
>> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> >> ---
>> >> - split the API that was merged into a single function
>> >> - use uintptr_t for compatibility with 32-bit
>> >> ---
>> >>  include/io/channel.h | 26 ++++++++++++++++
>> >>  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
>> >>  2 files changed, 96 insertions(+)
>> >> 
>> >> diff --git a/include/io/channel.h b/include/io/channel.h
>> >> index 7986c49c71..25383db5aa 100644
>> >> --- a/include/io/channel.h
>> >> +++ b/include/io/channel.h
>> >> @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
>> >>  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>> >>                              size_t niov, off_t offset, Error **errp);
>> >>  
>> >> +/**
>> >> + * qio_channel_pwritev_all:
>> >> + * @ioc: the channel object
>> >> + * @iov: the array of memory regions to write data from
>> >> + * @niov: the length of the @iov array
>> >> + * @offset: the iovec offset in the file where to write the data
>> >> + * @errp: pointer to a NULL-initialized error object
>> >> + *
>> >> + * Returns: 0 if all bytes were written, or -1 on error
>> >> + */
>> >> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
>> >> +                            size_t niov, off_t offset, Error **errp);
>> >> +
>> >>  /**
>> >>   * qio_channel_pwrite
>> >>   * @ioc: the channel object
>> >> @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>> >>  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>> >>                             size_t niov, off_t offset, Error **errp);
>> >>  
>> >> +/**
>> >> + * qio_channel_preadv_all:
>> >> + * @ioc: the channel object
>> >> + * @iov: the array of memory regions to read data to
>> >> + * @niov: the length of the @iov array
>> >> + * @offset: the iovec offset in the file from where to read the data
>> >> + * @errp: pointer to a NULL-initialized error object
>> >> + *
>> >> + * Returns: 0 if all bytes were read, or -1 on error
>> >> + */
>> >> +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
>> >> +                           size_t niov, off_t offset, Error **errp);
>> >> +
>> >>  /**
>> >>   * qio_channel_pread
>> >>   * @ioc: the channel object
>> >> diff --git a/io/channel.c b/io/channel.c
>> >> index a1f12f8e90..2f1745d052 100644
>> >> --- a/io/channel.c
>> >> +++ b/io/channel.c
>> >> @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>> >>      return klass->io_pwritev(ioc, iov, niov, offset, errp);
>> >>  }
>> >>  
>> >> +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
>> >> +                                                 const struct iovec *iov,
>> >> +                                                 size_t niov, off_t offset,
>> >> +                                                 bool is_write, Error **errp)
>> >> +{
>> >> +    ssize_t ret = -1;
>> >> +    int i, slice_idx, slice_num;
>> >> +    uintptr_t base, next, file_offset;
>> >> +    size_t len;
>> >> +
>> >> +    slice_idx = 0;
>> >> +    slice_num = 1;
>> >> +
>> >> +    /*
>> >> +     * If the iov array doesn't have contiguous elements, we need to
>> >> +     * split it in slices because we only have one (file) 'offset' for
>> >> +     * the whole iov. Do this here so callers don't need to break the
>> >> +     * iov array themselves.
>> >> +     */
>> >> +    for (i = 0; i < niov; i++, slice_num++) {
>> >> +        base = (uintptr_t) iov[i].iov_base;
>> >> +
>> >> +        if (i != niov - 1) {
>> >> +            len = iov[i].iov_len;
>> >> +            next = (uintptr_t) iov[i + 1].iov_base;
>> >> +
>> >> +            if (base + len == next) {
>> >> +                continue;
>> >> +            }
>> >> +        }
>> >> +
>> >> +        /*
>> >> +         * Use the offset of the first element of the segment that
>> >> +         * we're sending.
>> >> +         */
>> >> +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
>> >> +
>> >> +        if (is_write) {
>> >> +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
>> >> +                                      file_offset, errp);
>> >> +        } else {
>> >> +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
>> >> +                                     file_offset, errp);
>> >> +        }
>> >> +
>> >> +        if (ret < 0) {
>> >> +            break;
>> >> +        }
>> >> +
>> >> +        slice_idx += slice_num;
>> >> +        slice_num = 0;
>> >> +    }
>> >> +
>> >> +    return (ret < 0) ? -1 : 0;
>> >> +}
>> >> +
>> >> +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
>> >> +                            size_t niov, off_t offset, Error **errp)
>> >> +{
>> >> +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
>> >> +                                                 offset, true, errp);
>> >> +}
>> >
>> > I'm not sure how Dan thinks about this, but I don't think this is pretty..
>> >
>> > With this implementation, iochannels' preadv/pwritev is completely not
>> > compatible with most OSes now, afaiu.
>> 
>> This is internal QEMU code. I hope no one is expecting qio_channel_foo()
>> to behave like some OS's foo() system call. We cannot guarantee that
>> compatibility save for the simplest of wrappers.
>
> I was expecting that when I started to read. :)
>
> https://man.freebsd.org/cgi/man.cgi?query=pwritev
> https://linux.die.net/man/2/pwritev
>
> It's not "some OSes", it's mostly all.

What I mean is no one would ever replace a call to pwritev() with
qio_channel_pwritev() and expect the same behavior. We're not writing a
libc.

> I can understand you prefer such
> approach, but even if so, shall we still try to avoid using pwritev/preadv
> as the names?
>

Yes, it's probably better to avoid those if we're going to be doing any
extra operations.

>> 
>> >
>> > The definition of offset in preadv/pwritev of current iochannel is hard to
>> > understand.. if I read it right it'll later be set to:
>> >       
>> >                 /*
>> >                  * If we subtract the host page now, we don't need to
>> >                  * pass it into qio_channel_pwritev_all() below.
>> >                  */
>> >                 write_base = p->pages->block->pages_offset -
>> >                     (uintptr_t)p->pages->block->host;
>> >
>> > Which I cannot easily tell what it is.. besides being an unsigned int.
>> 
>> This description was unfortunately dropped along the way:
>> 
>> "Since iovs can be non contiguous, we'd need a separate array on the
>> side to carry an extra file offset for each of them, so I'm relying on
>> the fact that iovs are all within a same host page and passing in an
>> encoded offset that takes the host page into account."
>> 
>> > IIUC it's also based on the assumption that the host address of each iov
>> > entry is linear to its offset in the file, but it may not be true for
>> > future iochannel users of such interface called as pwritev/preadv.  So
>> > error prone.
>> 
>> Yes, but it's also our choice whether to make this a generic API. We may
>> have good reasons to consider a migration-specific function here.
>> 
>> > Would it be possible we keep using the offset array (p->pages->offset[x])?
>> > We have it already anyway, right?  Wouldn't that be clearer?
>> >
>> 
>> We'd have to make a copy of the array because p->pages is expected to
>> change while the IO happens.
>
> Hmm, I don't see why p->pages can change. IIUC p->pages will be there solid
> at least until all IO syscalls are completed, then the next call to, e.g.,
> multifd_send_pages() will swap that with multifd_send_state->pages.  But I
> think I get your point, with below.

Oh no, you're right. Because of p->pending_job. And thinking about
p->pending_job, wouldn't a trylock do the same job while being more
explicit?

    next_channel %= migrate_multifd_channels();
    for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
        p = &multifd_send_state->params[i];

        if(qemu_mutex_trylock(&p->mutex)) {
            if (p->quit) {
                error_report("%s: channel %d has already quit!", __func__, i);
                qemu_mutex_unlock(&p->mutex);
                return -1;
            }
            next_channel = (i + 1) % migrate_multifd_channels();
            break;
        } else {
            /* channel still busy, try the next one */
        }
    }
    multifd_send_state->pages = p->pages;
    p->pages = pages;
    qemu_mutex_unlock(&p->mutex);

>> And while we already have a copy in
>> p->normal, my intention for multifd was to eliminate p->normal in the
>> future, so it would be nice if we could avoid it.
>> 
>> Also, we cannot use p->pages->offset alone because we still need the
>> pages_offset, i.e. the file offset where that ramblocks's pages begin.
>> So that means also adding that to each element of the new array.
>> 
>> It would probably be overall clearer and less wasteful to pass in the
>> host page address instead of an array of offsets. I don't see an issue
>> with restricting the iovs to the same host page. The migration code is
>> the only user for this code and AFAIK we don't have plans to change that
>> invariant.
>
> So I think I get your point now, the only concern (besides naming..) is,
> I still want to avoid an interface that contains a field that is hard to
> understand like write_base.
>
> How about this?
>
>   /**
>    * multifd_write_ramblock_iov: Write IO vector (of ramblock) to channel
>    *
>    * @ioc: The iochannel to write to. The IOC must implement the
>    *       pwritev/preadv interface.
>    * @iov: The IO vector to write.  All addresses must be within the
>    *       ramblock host address range.
>    * @iov_len: The IO vector size
>    * @ramblock: The ramblock that covers all buffers in this IO vector
>    */
>   int multifd_write_ramblock_iov(ioc, iov, iov_len, ramblock);

Ok, then I can take block->pages_offset and block->host from the
ramblock. I think I prefer something like this, that way we can be
explicit about the migration assumptions.

Thanks!
Fabiano Rosas Jan. 17, 2024, 6:09 p.m. UTC | #7
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Wed, Jan 17, 2024 at 12:39:26PM +0000, Daniel P. Berrangé wrote:
>> On Mon, Nov 27, 2023 at 05:25:57PM -0300, Fabiano Rosas wrote:
>> > For the upcoming support to fixed-ram migration with multifd, we need
>> > to be able to accept an iovec array with non-contiguous data.
>> > 
>> > Add a pwritev and preadv version that splits the array into contiguous
>> > segments before writing. With that we can have the ram code continue
>> > to add pages in any order and the multifd code continue to send large
>> > arrays for reading and writing.
>> > 
>> > Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> > ---
>> > - split the API that was merged into a single function
>> > - use uintptr_t for compatibility with 32-bit
>> > ---
>> >  include/io/channel.h | 26 ++++++++++++++++
>> >  io/channel.c         | 70 ++++++++++++++++++++++++++++++++++++++++++++
>> >  2 files changed, 96 insertions(+)
>> > 
>> > diff --git a/include/io/channel.h b/include/io/channel.h
>> > index 7986c49c71..25383db5aa 100644
>> > --- a/include/io/channel.h
>> > +++ b/include/io/channel.h
>> > @@ -559,6 +559,19 @@ int qio_channel_close(QIOChannel *ioc,
>> >  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>> >                              size_t niov, off_t offset, Error **errp);
>> >  
>> > +/**
>> > + * qio_channel_pwritev_all:
>> > + * @ioc: the channel object
>> > + * @iov: the array of memory regions to write data from
>> > + * @niov: the length of the @iov array
>> > + * @offset: the iovec offset in the file where to write the data
>> > + * @errp: pointer to a NULL-initialized error object
>> > + *
>> > + * Returns: 0 if all bytes were written, or -1 on error
>> > + */
>> > +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
>> > +                            size_t niov, off_t offset, Error **errp);
>> > +
>> >  /**
>> >   * qio_channel_pwrite
>> >   * @ioc: the channel object
>> > @@ -595,6 +608,19 @@ ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>> >  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>> >                             size_t niov, off_t offset, Error **errp);
>> >  
>> > +/**
>> > + * qio_channel_preadv_all:
>> > + * @ioc: the channel object
>> > + * @iov: the array of memory regions to read data to
>> > + * @niov: the length of the @iov array
>> > + * @offset: the iovec offset in the file from where to read the data
>> > + * @errp: pointer to a NULL-initialized error object
>> > + *
>> > + * Returns: 0 if all bytes were read, or -1 on error
>> > + */
>> > +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
>> > +                           size_t niov, off_t offset, Error **errp);
>> > +
>> >  /**
>> >   * qio_channel_pread
>> >   * @ioc: the channel object
>> > diff --git a/io/channel.c b/io/channel.c
>> > index a1f12f8e90..2f1745d052 100644
>> > --- a/io/channel.c
>> > +++ b/io/channel.c
>> > @@ -472,6 +472,69 @@ ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
>> >      return klass->io_pwritev(ioc, iov, niov, offset, errp);
>> >  }
>> >  
>> > +static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
>> > +                                                 const struct iovec *iov,
>> > +                                                 size_t niov, off_t offset,
>> > +                                                 bool is_write, Error **errp)
>> > +{
>> > +    ssize_t ret = -1;
>> > +    int i, slice_idx, slice_num;
>> > +    uintptr_t base, next, file_offset;
>> > +    size_t len;
>> > +
>> > +    slice_idx = 0;
>> > +    slice_num = 1;
>> > +
>> > +    /*
>> > +     * If the iov array doesn't have contiguous elements, we need to
>> > +     * split it in slices because we only have one (file) 'offset' for
>> > +     * the whole iov. Do this here so callers don't need to break the
>> > +     * iov array themselves.
>> > +     */
>> > +    for (i = 0; i < niov; i++, slice_num++) {
>> > +        base = (uintptr_t) iov[i].iov_base;
>> > +
>> > +        if (i != niov - 1) {
>> > +            len = iov[i].iov_len;
>> > +            next = (uintptr_t) iov[i + 1].iov_base;
>> > +
>> > +            if (base + len == next) {
>> > +                continue;
>> > +            }
>> > +        }
>> > +
>> > +        /*
>> > +         * Use the offset of the first element of the segment that
>> > +         * we're sending.
>> > +         */
>> > +        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
>> > +
>> > +        if (is_write) {
>> > +            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
>> > +                                      file_offset, errp);
>> > +        } else {
>> > +            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
>> > +                                     file_offset, errp);
>> > +        }
>> 
>> iov_base is the address of a pointer in RAM, so could be
>> potentially any 64-bit value.
>> 
>> We're setting file_offset to this pointer address plus a
>> user-supplied offset, and then using it as an offset on disk.
>> First this could result in 64-bit overflow when 'offset' is
>> added to 'iov_base', and second this could result in a file
>> that's 16 Exabytes in size (with holes of course).
>> 
>> I don't get how this is supposed to work, or be used ?
>
> I feel like this whole method might become clearer if we separated
> out the logic for merging memory adjacent iovecs.
>
> How about adding a 'iov_collapse' method in iov.h / iov.c to do
> the merging and then let the actual I/O code be simpler ?

I think if we add a migration-specific wrapper like we're discussing
with Peter earlier in the thread (on this same message), that would be
enough to keep the migration assumptions contained and not have to
pollute the IO code with any of this logic.

>> 
>> > +
>> > +        if (ret < 0) {
>> > +            break;
>> > +        }
>> > +
>> > +        slice_idx += slice_num;
>> > +        slice_num = 0;
>> > +    }
>> > +
>> > +    return (ret < 0) ? -1 : 0;
>> > +}
>
>
>
>> > +
>> > +int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
>> > +                            size_t niov, off_t offset, Error **errp)
>> > +{
>> > +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
>> > +                                                 offset, true, errp);
>> > +}
>> > +
>> >  ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
>> >                             off_t offset, Error **errp)
>> >  {
>> > @@ -501,6 +564,13 @@ ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
>> >      return klass->io_preadv(ioc, iov, niov, offset, errp);
>> >  }
>> >  
>> > +int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
>> > +                           size_t niov, off_t offset, Error **errp)
>> > +{
>> > +    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov,
>> > +                                                 offset, false, errp);
>> > +}
>> > +
>> >  ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
>> >                            off_t offset, Error **errp)
>> >  {
>> > -- 
>> > 2.35.3
>> > 
>> 
>> With regards,
>> Daniel
>> -- 
>> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
>> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
>> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>> 
>> 
>
> With regards,
> Daniel
Peter Xu Jan. 18, 2024, 7:44 a.m. UTC | #8
On Wed, Jan 17, 2024 at 03:06:15PM -0300, Fabiano Rosas wrote:
> Oh no, you're right. Because of p->pending_job. And thinking about
> p->pending_job, wouldn't a trylock do the same job while being more
> explicit?
> 
>     next_channel %= migrate_multifd_channels();
>     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
>         p = &multifd_send_state->params[i];
> 
>         if(qemu_mutex_trylock(&p->mutex)) {
>             if (p->quit) {
>                 error_report("%s: channel %d has already quit!", __func__, i);
>                 qemu_mutex_unlock(&p->mutex);
>                 return -1;
>             }
>             next_channel = (i + 1) % migrate_multifd_channels();
>             break;
>         } else {
>             /* channel still busy, try the next one */
>         }
>     }
>     multifd_send_state->pages = p->pages;
>     p->pages = pages;
>     qemu_mutex_unlock(&p->mutex);

We probably can't for now; multifd_send_thread() will unlock the mutex
before the iochannel write()s, while the write()s will need those fields.

> Ok, then I can take block->pages_offset and block->host from the
> ramblock. I think I prefer something like this, that way we can be
> explicit about the migration assumptions.

I'm glad we reached an initial consensus.  Yes let's put that in
migration/; I don't expect this code to be used by other iochannel users.
Fabiano Rosas Jan. 18, 2024, 12:47 p.m. UTC | #9
Peter Xu <peterx@redhat.com> writes:

> On Wed, Jan 17, 2024 at 03:06:15PM -0300, Fabiano Rosas wrote:
>> Oh no, you're right. Because of p->pending_job. And thinking about
>> p->pending_job, wouldn't a trylock do the same job while being more
>> explicit?
>> 
>>     next_channel %= migrate_multifd_channels();
>>     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
>>         p = &multifd_send_state->params[i];
>> 
>>         if(qemu_mutex_trylock(&p->mutex)) {
>>             if (p->quit) {
>>                 error_report("%s: channel %d has already quit!", __func__, i);
>>                 qemu_mutex_unlock(&p->mutex);
>>                 return -1;
>>             }
>>             next_channel = (i + 1) % migrate_multifd_channels();
>>             break;
>>         } else {
>>             /* channel still busy, try the next one */
>>         }
>>     }
>>     multifd_send_state->pages = p->pages;
>>     p->pages = pages;
>>     qemu_mutex_unlock(&p->mutex);
>
> We probably can't for now; multifd_send_thread() will unlock the mutex
> before the iochannel write()s, while the write()s will need those fields.

Right, but we'd change that code to do the IO with the lock held. If no
one is blocking, it should be ok to hold the lock. Anyway, food for
thought.
Peter Xu Jan. 19, 2024, 12:22 a.m. UTC | #10
On Thu, Jan 18, 2024 at 09:47:18AM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Wed, Jan 17, 2024 at 03:06:15PM -0300, Fabiano Rosas wrote:
> >> Oh no, you're right. Because of p->pending_job. And thinking about
> >> p->pending_job, wouldn't a trylock do the same job while being more
> >> explicit?
> >> 
> >>     next_channel %= migrate_multifd_channels();
> >>     for (i = next_channel;; i = (i + 1) % migrate_multifd_channels()) {
> >>         p = &multifd_send_state->params[i];
> >> 
> >>         if(qemu_mutex_trylock(&p->mutex)) {
> >>             if (p->quit) {
> >>                 error_report("%s: channel %d has already quit!", __func__, i);
> >>                 qemu_mutex_unlock(&p->mutex);
> >>                 return -1;
> >>             }
> >>             next_channel = (i + 1) % migrate_multifd_channels();
> >>             break;
> >>         } else {
> >>             /* channel still busy, try the next one */
> >>         }
> >>     }
> >>     multifd_send_state->pages = p->pages;
> >>     p->pages = pages;
> >>     qemu_mutex_unlock(&p->mutex);
> >
> > We probably can't for now; multifd_send_thread() will unlock the mutex
> > before the iochannel write()s, while the write()s will need those fields.
> 
> Right, but we'd change that code to do the IO with the lock held. If no
> one is blocking, it should be ok to hold the lock. Anyway, food for
> thought.

I see what you meant.  Sounds possible.
diff mbox series

Patch

diff --git a/include/io/channel.h b/include/io/channel.h
index 7986c49c71..25383db5aa 100644
--- a/include/io/channel.h
+++ b/include/io/channel.h
@@ -559,6 +559,19 @@  int qio_channel_close(QIOChannel *ioc,
 ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
                             size_t niov, off_t offset, Error **errp);
 
+/**
+ * qio_channel_pwritev_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to write data from
+ * @niov: the length of the @iov array
+ * @offset: the iovec offset in the file where to write the data
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns: 0 if all bytes were written, or -1 on error
+ */
+int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
+                            size_t niov, off_t offset, Error **errp);
+
 /**
  * qio_channel_pwrite
  * @ioc: the channel object
@@ -595,6 +608,19 @@  ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
 ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
                            size_t niov, off_t offset, Error **errp);
 
+/**
+ * qio_channel_preadv_all:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to read data to
+ * @niov: the length of the @iov array
+ * @offset: the iovec offset in the file from where to read the data
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Returns: 0 if all bytes were read, or -1 on error
+ */
+int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
+                           size_t niov, off_t offset, Error **errp);
+
 /**
  * qio_channel_pread
  * @ioc: the channel object
diff --git a/io/channel.c b/io/channel.c
index a1f12f8e90..2f1745d052 100644
--- a/io/channel.c
+++ b/io/channel.c
@@ -472,6 +472,69 @@  ssize_t qio_channel_pwritev(QIOChannel *ioc, const struct iovec *iov,
     return klass->io_pwritev(ioc, iov, niov, offset, errp);
 }
 
+/*
+ * qio_channel_preadv_pwritev_contiguous:
+ * @ioc: the channel object
+ * @iov: the array of memory regions to transfer
+ * @niov: the length of the @iov array
+ * @offset: value added to the memory address of the first buffer of
+ *          each slice to obtain the file offset of that slice
+ * @is_write: true to issue qio_channel_pwritev(), false to issue
+ *            qio_channel_preadv()
+ * @errp: pointer to a NULL-initialized error object
+ *
+ * Groups consecutive @iov elements whose buffers are adjacent in
+ * memory into slices and issues one positioned I/O call per slice.
+ *
+ * Returns: 0 if every call succeeded (also when @niov is 0), or -1 as
+ *          soon as one call fails
+ */
+static int qio_channel_preadv_pwritev_contiguous(QIOChannel *ioc,
+                                                 const struct iovec *iov,
+                                                 size_t niov, off_t offset,
+                                                 bool is_write, Error **errp)
+{
+    size_t i;
+    size_t slice_idx = 0;
+    size_t slice_num = 1;
+
+    /*
+     * If the iov array doesn't have contiguous elements, we need to
+     * split it in slices because we only have one (file) 'offset' for
+     * the whole iov. Do this here so callers don't need to break the
+     * iov array themselves.
+     *
+     * slice_num is bumped by the loop increment on every iteration, so
+     * resetting it to 0 after a flush makes the next slice start at 1.
+     */
+    for (i = 0; i < niov; i++, slice_num++) {
+        uintptr_t base = (uintptr_t) iov[i].iov_base;
+        off_t file_offset;
+        ssize_t ret;
+
+        if (i + 1 < niov) {
+            uintptr_t next = (uintptr_t) iov[i + 1].iov_base;
+
+            /* The next buffer extends the current slice; keep going. */
+            if (base + iov[i].iov_len == next) {
+                continue;
+            }
+        }
+
+        /*
+         * Use the offset of the first element of the segment that
+         * we're sending. Keep the result in an off_t: a uintptr_t
+         * would truncate the file offset on hosts where pointers are
+         * narrower than off_t.
+         */
+        file_offset = offset + (uintptr_t) iov[slice_idx].iov_base;
+
+        if (is_write) {
+            ret = qio_channel_pwritev(ioc, &iov[slice_idx], slice_num,
+                                      file_offset, errp);
+        } else {
+            ret = qio_channel_preadv(ioc, &iov[slice_idx], slice_num,
+                                     file_offset, errp);
+        }
+
+        /*
+         * NOTE(review): only a negative return is treated as failure;
+         * a short transfer is neither retried nor reported. Confirm
+         * that the channels used with this helper never return short
+         * counts.
+         */
+        if (ret < 0) {
+            return -1;
+        }
+
+        slice_idx += slice_num;
+        slice_num = 0;
+    }
+
+    return 0;
+}
+
+/*
+ * Write-side entry point: forwards its arguments unchanged to
+ * qio_channel_preadv_pwritev_contiguous() with the write flag set and
+ * returns that helper's result.
+ */
+int qio_channel_pwritev_all(QIOChannel *ioc, const struct iovec *iov,
+                            size_t niov, off_t offset, Error **errp)
+{
+    const bool is_write = true;
+
+    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov, offset,
+                                                 is_write, errp);
+}
+
 ssize_t qio_channel_pwrite(QIOChannel *ioc, char *buf, size_t buflen,
                            off_t offset, Error **errp)
 {
@@ -501,6 +564,13 @@  ssize_t qio_channel_preadv(QIOChannel *ioc, const struct iovec *iov,
     return klass->io_preadv(ioc, iov, niov, offset, errp);
 }
 
+/*
+ * Read-side entry point: forwards its arguments unchanged to
+ * qio_channel_preadv_pwritev_contiguous() with the write flag cleared
+ * and returns that helper's result.
+ */
+int qio_channel_preadv_all(QIOChannel *ioc, const struct iovec *iov,
+                           size_t niov, off_t offset, Error **errp)
+{
+    const bool is_write = false;
+
+    return qio_channel_preadv_pwritev_contiguous(ioc, iov, niov, offset,
+                                                 is_write, errp);
+}
+
 ssize_t qio_channel_pread(QIOChannel *ioc, char *buf, size_t buflen,
                           off_t offset, Error **errp)
 {