[v2,16/29] migration/ram: Add support for 'fixed-ram' migration restore

Message ID 20231023203608.26370-17-farosas@suse.de
State New
Series migration: File based migration with multifd and fixed-ram

Commit Message

Fabiano Rosas Oct. 23, 2023, 8:35 p.m. UTC
From: Nikolay Borisov <nborisov@suse.com>

Add the necessary code to parse the format changes for the 'fixed-ram'
capability.

One of the more notable changes in behavior is that in the 'fixed-ram'
case ram pages are restored in one go rather than constantly looping
through the migration stream.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
(farosas) reused more of the common code by making the fixed-ram
function take only one ramblock and calling it from inside
parse_ramblock.
---
 migration/ram.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
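
For reference, the FixedRamHeader structure parsed below is defined in an
earlier patch of this series. A sketch of its assumed layout, reconstructed
from the byte-swaps in fixed_ram_read_header() (the field order here is an
inference, not the authoritative definition):

    /* Assumed layout; all fields are stored big-endian in the file,
     * matching the rest of the migration stream. */
    struct FixedRamHeader {
        uint32_t version;       /* must be <= FIXED_RAM_HDR_VERSION */
        uint64_t page_size;     /* target page size used by the source */
        uint64_t bitmap_offset; /* file offset of the dirty bitmap */
        uint64_t pages_offset;  /* file offset of the pages region */
    };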

Comments

Daniel P. Berrangé Oct. 25, 2023, 9:43 a.m. UTC | #1
On Mon, Oct 23, 2023 at 05:35:55PM -0300, Fabiano Rosas wrote:
> From: Nikolay Borisov <nborisov@suse.com>
> 
> Add the necessary code to parse the format changes for the 'fixed-ram'
> capability.
> 
> One of the more notable changes in behavior is that in the 'fixed-ram'
> case ram pages are restored in one go rather than constantly looping
> through the migration stream.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
> (farosas) reused more of the common code by making the fixed-ram
> function take only one ramblock and calling it from inside
> parse_ramblock.
> ---
>  migration/ram.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 93 insertions(+)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 152a03604f..cea6971ab2 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -3032,6 +3032,32 @@ static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
>      qemu_put_buffer(file, (uint8_t *) header, header_size);
>  }
>  
> +static int fixed_ram_read_header(QEMUFile *file, struct FixedRamHeader *header)
> +{
> +    size_t ret, header_size = sizeof(struct FixedRamHeader);
> +
> +    ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
> +    if (ret != header_size) {
> +        return -1;
> +    }
> +
> +    /* migration stream is big-endian */
> +    be32_to_cpus(&header->version);
> +
> +    if (header->version > FIXED_RAM_HDR_VERSION) {
> +        error_report("Migration fixed-ram capability version mismatch (expected %d, got %d)",
> +                     FIXED_RAM_HDR_VERSION, header->version);
> +        return -1;
> +    }
> +
> +    be64_to_cpus(&header->page_size);
> +    be64_to_cpus(&header->bitmap_offset);
> +    be64_to_cpus(&header->pages_offset);
> +
> +
> +    return 0;
> +}
> +
>  /*
>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
>   * long-running RCU critical section.  When rcu-reclaims in the code
> @@ -3932,6 +3958,68 @@ void colo_flush_ram_cache(void)
>      trace_colo_flush_ram_cache_end();
>  }
>  
> +static void read_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block,
> +                                    long num_pages, unsigned long *bitmap)
> +{
> +    unsigned long set_bit_idx, clear_bit_idx;
> +    unsigned long len;
> +    ram_addr_t offset;
> +    void *host;
> +    size_t read, completed, read_len;
> +
> +    for (set_bit_idx = find_first_bit(bitmap, num_pages);
> +         set_bit_idx < num_pages;
> +         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
> +
> +        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
> +
> +        len = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
> +        offset = set_bit_idx << TARGET_PAGE_BITS;
> +
> +        for (read = 0, completed = 0; completed < len; offset += read) {
> +            host = host_from_ram_block_offset(block, offset);
> +            read_len = MIN(len, TARGET_PAGE_SIZE);
> +
> +            read = qemu_get_buffer_at(f, host, read_len,
> +                                      block->pages_offset + offset);
> +            completed += read;
> +        }
> +    }
> +}
> +
> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> +{
> +    g_autofree unsigned long *bitmap = NULL;
> +    struct FixedRamHeader header;
> +    size_t bitmap_size;
> +    long num_pages;
> +    int ret = 0;
> +
> +    ret = fixed_ram_read_header(f, &header);
> +    if (ret < 0) {
> +        error_report("Error reading fixed-ram header");
> +        return -EINVAL;
> +    }
> +
> +    block->pages_offset = header.pages_offset;

Do you think it is worth sanity checking that 'pages_offset' is aligned
in some way?

It is nice that we have flexibility to change the alignment in future
if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
check there. Perhaps we could at least sanity check for alignment at
TARGET_PAGE_SIZE, to detect a gross data corruption problem?
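
A minimal sketch of such a check, assuming it sits right after the header
is read in parse_ramblock_fixed_ram(); the message wording and error path
are placeholders:

    if (!QEMU_IS_ALIGNED(header.pages_offset, TARGET_PAGE_SIZE)) {
        error_report("fixed-ram: pages_offset 0x%" PRIx64 " is not "
                     "aligned to the target page size",
                     header.pages_offset);
        return -EINVAL;
    }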

> +    num_pages = length / header.page_size;
> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> +
> +    bitmap = g_malloc0(bitmap_size);
> +    if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
> +                           header.bitmap_offset) != bitmap_size) {
> +        error_report("Error parsing dirty bitmap");

s/parsing/reading/ since we're not actually parsing any semantic
info here.

> +        return -EINVAL;
> +    }
> +
> +    read_ramblock_fixed_ram(f, block, num_pages, bitmap);
> +
> +    /* Skip pages array */
> +    qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
> +
> +    return ret;
> +}
> +
>  static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
>  {
>      int ret = 0;
> @@ -3940,6 +4028,10 @@ static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
>  
>      assert(block);
>  
> +    if (migrate_fixed_ram()) {
> +        return parse_ramblock_fixed_ram(f, block, length);
> +    }
> +
>      if (!qemu_ram_is_migratable(block)) {
>          error_report("block %s should not be migrated !", block->idstr);
>          return -EINVAL;
> @@ -4142,6 +4234,7 @@ static int ram_load_precopy(QEMUFile *f)
>                  migrate_multifd_flush_after_each_section()) {
>                  multifd_recv_sync_main();
>              }
> +
>              break;

Spurious whitespace


>          case RAM_SAVE_FLAG_HOOK:
>              ret = rdma_registration_handle(f);
> -- 
> 2.35.3
> 

With regards,
Daniel
Fabiano Rosas Oct. 25, 2023, 2:07 p.m. UTC | #2
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Mon, Oct 23, 2023 at 05:35:55PM -0300, Fabiano Rosas wrote:
>> From: Nikolay Borisov <nborisov@suse.com>
>> 
>> Add the necessary code to parse the format changes for the 'fixed-ram'
>> capability.
>> 
>> One of the more notable changes in behavior is that in the 'fixed-ram'
>> case ram pages are restored in one go rather than constantly looping
>> through the migration stream.
>> 
>> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>> (farosas) reused more of the common code by making the fixed-ram
>> function take only one ramblock and calling it from inside
>> parse_ramblock.
>> ---
>>  migration/ram.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 93 insertions(+)
>> 
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 152a03604f..cea6971ab2 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -3032,6 +3032,32 @@ static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
>>      qemu_put_buffer(file, (uint8_t *) header, header_size);
>>  }
>>  
>> +static int fixed_ram_read_header(QEMUFile *file, struct FixedRamHeader *header)
>> +{
>> +    size_t ret, header_size = sizeof(struct FixedRamHeader);
>> +
>> +    ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
>> +    if (ret != header_size) {
>> +        return -1;
>> +    }
>> +
>> +    /* migration stream is big-endian */
>> +    be32_to_cpus(&header->version);
>> +
>> +    if (header->version > FIXED_RAM_HDR_VERSION) {
>> +        error_report("Migration fixed-ram capability version mismatch (expected %d, got %d)",
>> +                     FIXED_RAM_HDR_VERSION, header->version);
>> +        return -1;
>> +    }
>> +
>> +    be64_to_cpus(&header->page_size);
>> +    be64_to_cpus(&header->bitmap_offset);
>> +    be64_to_cpus(&header->pages_offset);
>> +
>> +
>> +    return 0;
>> +}
>> +
>>  /*
>>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
>>   * long-running RCU critical section.  When rcu-reclaims in the code
>> @@ -3932,6 +3958,68 @@ void colo_flush_ram_cache(void)
>>      trace_colo_flush_ram_cache_end();
>>  }
>>  
>> +static void read_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block,
>> +                                    long num_pages, unsigned long *bitmap)
>> +{
>> +    unsigned long set_bit_idx, clear_bit_idx;
>> +    unsigned long len;
>> +    ram_addr_t offset;
>> +    void *host;
>> +    size_t read, completed, read_len;
>> +
>> +    for (set_bit_idx = find_first_bit(bitmap, num_pages);
>> +         set_bit_idx < num_pages;
>> +         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
>> +
>> +        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
>> +
>> +        len = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
>> +        offset = set_bit_idx << TARGET_PAGE_BITS;
>> +
>> +        for (read = 0, completed = 0; completed < len; offset += read) {
>> +            host = host_from_ram_block_offset(block, offset);
>> +            read_len = MIN(len, TARGET_PAGE_SIZE);
>> +
>> +            read = qemu_get_buffer_at(f, host, read_len,
>> +                                      block->pages_offset + offset);
>> +            completed += read;
>> +        }
>> +    }
>> +}
>> +
>> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
>> +{
>> +    g_autofree unsigned long *bitmap = NULL;
>> +    struct FixedRamHeader header;
>> +    size_t bitmap_size;
>> +    long num_pages;
>> +    int ret = 0;
>> +
>> +    ret = fixed_ram_read_header(f, &header);
>> +    if (ret < 0) {
>> +        error_report("Error reading fixed-ram header");
>> +        return -EINVAL;
>> +    }
>> +
>> +    block->pages_offset = header.pages_offset;
>
> Do you think it is worth sanity checking that 'pages_offset' is aligned
> in some way?
>
> It is nice that we have flexibility to change the alignment in future
> if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> check there. Perhaps we could at least sanity check for alignment at
> TARGET_PAGE_SIZE, to detect a gross data corruption problem?
>

I don't see why not. I'll add it.
Peter Xu Oct. 31, 2023, 7:03 p.m. UTC | #3
On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> >> +{
> >> +    g_autofree unsigned long *bitmap = NULL;
> >> +    struct FixedRamHeader header;
> >> +    size_t bitmap_size;
> >> +    long num_pages;
> >> +    int ret = 0;
> >> +
> >> +    ret = fixed_ram_read_header(f, &header);
> >> +    if (ret < 0) {
> >> +        error_report("Error reading fixed-ram header");
> >> +        return -EINVAL;
> >> +    }
> >> +
> >> +    block->pages_offset = header.pages_offset;
> >
> > Do you think it is worth sanity checking that 'pages_offset' is aligned
> > in some way?
> >
> > It is nice that we have flexibility to change the alignment in future
> > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> > check there. Perhaps we could at least sanity check for alignment at
> > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> >
> 
> I don't see why not. I'll add it.

Is there any explanation of why that 1MB offset exists, and how the number
was chosen?  Thanks,
Peter Xu Oct. 31, 2023, 7:09 p.m. UTC | #4
On Mon, Oct 23, 2023 at 05:35:55PM -0300, Fabiano Rosas wrote:
> From: Nikolay Borisov <nborisov@suse.com>
> 
> Add the necessary code to parse the format changes for the 'fixed-ram'
> capability.
> 
> One of the more notable changes in behavior is that in the 'fixed-ram'
> case ram pages are restored in one go rather than constantly looping
> through the migration stream.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
> (farosas) reused more of the common code by making the fixed-ram
> function take only one ramblock and calling it from inside
> parse_ramblock.
> ---
>  migration/ram.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 93 insertions(+)
> 
> diff --git a/migration/ram.c b/migration/ram.c
> index 152a03604f..cea6971ab2 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -3032,6 +3032,32 @@ static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
>      qemu_put_buffer(file, (uint8_t *) header, header_size);
>  }
>  
> +static int fixed_ram_read_header(QEMUFile *file, struct FixedRamHeader *header)
> +{
> +    size_t ret, header_size = sizeof(struct FixedRamHeader);
> +
> +    ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
> +    if (ret != header_size) {
> +        return -1;
> +    }
> +
> +    /* migration stream is big-endian */
> +    be32_to_cpus(&header->version);
> +
> +    if (header->version > FIXED_RAM_HDR_VERSION) {
> +        error_report("Migration fixed-ram capability version mismatch (expected %d, got %d)",
> +                     FIXED_RAM_HDR_VERSION, header->version);

I know it doesn't matter a lot for now, but it'll be good to start using
Error** in new code?
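
As a sketch, the version check converted to that style would set an error
on a caller-provided errp instead of printing directly; the added parameter
is illustrative, not the final signature:

    /* assumes fixed_ram_read_header() grows an Error **errp parameter */
    if (header->version > FIXED_RAM_HDR_VERSION) {
        error_setg(errp, "Migration fixed-ram capability version mismatch "
                   "(expected %d, got %d)",
                   FIXED_RAM_HDR_VERSION, header->version);
        return -1;
    }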

> +        return -1;
> +    }
> +
> +    be64_to_cpus(&header->page_size);
> +    be64_to_cpus(&header->bitmap_offset);
> +    be64_to_cpus(&header->pages_offset);
> +
> +
> +    return 0;
> +}
> +
>  /*
>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
>   * long-running RCU critical section.  When rcu-reclaims in the code
> @@ -3932,6 +3958,68 @@ void colo_flush_ram_cache(void)
>      trace_colo_flush_ram_cache_end();
>  }
>  
> +static void read_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block,
> +                                    long num_pages, unsigned long *bitmap)
> +{
> +    unsigned long set_bit_idx, clear_bit_idx;
> +    unsigned long len;
> +    ram_addr_t offset;
> +    void *host;
> +    size_t read, completed, read_len;
> +
> +    for (set_bit_idx = find_first_bit(bitmap, num_pages);
> +         set_bit_idx < num_pages;
> +         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
> +
> +        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
> +
> +        len = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
> +        offset = set_bit_idx << TARGET_PAGE_BITS;
> +
> +        for (read = 0, completed = 0; completed < len; offset += read) {
> +            host = host_from_ram_block_offset(block, offset);
> +            read_len = MIN(len, TARGET_PAGE_SIZE);

Why MIN()?  I didn't read qemu_get_buffer_at() yet, but shouldn't len
always be a multiple of the target page size, or zero?

> +
> +            read = qemu_get_buffer_at(f, host, read_len,
> +                                      block->pages_offset + offset);
> +            completed += read;
> +        }
> +    }
> +}
> +
> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> +{
> +    g_autofree unsigned long *bitmap = NULL;
> +    struct FixedRamHeader header;
> +    size_t bitmap_size;
> +    long num_pages;
> +    int ret = 0;
> +
> +    ret = fixed_ram_read_header(f, &header);
> +    if (ret < 0) {
> +        error_report("Error reading fixed-ram header");

Same here on error handling; I suggest using Error** from the start.

> +        return -EINVAL;
> +    }
> +
> +    block->pages_offset = header.pages_offset;
> +    num_pages = length / header.page_size;
> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> +
> +    bitmap = g_malloc0(bitmap_size);
> +    if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
> +                           header.bitmap_offset) != bitmap_size) {
> +        error_report("Error parsing dirty bitmap");
> +        return -EINVAL;
> +    }
> +
> +    read_ramblock_fixed_ram(f, block, num_pages, bitmap);
> +
> +    /* Skip pages array */
> +    qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
> +
> +    return ret;
> +}
> +
>  static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
>  {
>      int ret = 0;
> @@ -3940,6 +4028,10 @@ static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
>  
>      assert(block);
>  
> +    if (migrate_fixed_ram()) {
> +        return parse_ramblock_fixed_ram(f, block, length);
> +    }
> +
>      if (!qemu_ram_is_migratable(block)) {
>          error_report("block %s should not be migrated !", block->idstr);
>          return -EINVAL;
> @@ -4142,6 +4234,7 @@ static int ram_load_precopy(QEMUFile *f)
>                  migrate_multifd_flush_after_each_section()) {
>                  multifd_recv_sync_main();
>              }
> +
>              break;
>          case RAM_SAVE_FLAG_HOOK:
>              ret = rdma_registration_handle(f);
> -- 
> 2.35.3
>
Fabiano Rosas Oct. 31, 2023, 8 p.m. UTC | #5
Peter Xu <peterx@redhat.com> writes:

> On Mon, Oct 23, 2023 at 05:35:55PM -0300, Fabiano Rosas wrote:
>> From: Nikolay Borisov <nborisov@suse.com>
>> 
>> Add the necessary code to parse the format changes for the 'fixed-ram'
>> capability.
>> 
>> One of the more notable changes in behavior is that in the 'fixed-ram'
>> case ram pages are restored in one go rather than constantly looping
>> through the migration stream.
>> 
>> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>> (farosas) reused more of the common code by making the fixed-ram
>> function take only one ramblock and calling it from inside
>> parse_ramblock.
>> ---
>>  migration/ram.c | 93 +++++++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 93 insertions(+)
>> 
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 152a03604f..cea6971ab2 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -3032,6 +3032,32 @@ static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
>>      qemu_put_buffer(file, (uint8_t *) header, header_size);
>>  }
>>  
>> +static int fixed_ram_read_header(QEMUFile *file, struct FixedRamHeader *header)
>> +{
>> +    size_t ret, header_size = sizeof(struct FixedRamHeader);
>> +
>> +    ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
>> +    if (ret != header_size) {
>> +        return -1;
>> +    }
>> +
>> +    /* migration stream is big-endian */
>> +    be32_to_cpus(&header->version);
>> +
>> +    if (header->version > FIXED_RAM_HDR_VERSION) {
>> +        error_report("Migration fixed-ram capability version mismatch (expected %d, got %d)",
>> +                     FIXED_RAM_HDR_VERSION, header->version);
>
> I know it doesn't matter a lot for now, but it'll be good to start using
> Error** in new code?

This whole series was written before the many discussions we had about
error handling. Thanks for pointing that out, I'll revise and change
where appropriate.

>> +        return -1;
>> +    }
>> +
>> +    be64_to_cpus(&header->page_size);
>> +    be64_to_cpus(&header->bitmap_offset);
>> +    be64_to_cpus(&header->pages_offset);
>> +
>> +
>> +    return 0;
>> +}
>> +
>>  /*
>>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
>>   * long-running RCU critical section.  When rcu-reclaims in the code
>> @@ -3932,6 +3958,68 @@ void colo_flush_ram_cache(void)
>>      trace_colo_flush_ram_cache_end();
>>  }
>>  
>> +static void read_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block,
>> +                                    long num_pages, unsigned long *bitmap)
>> +{
>> +    unsigned long set_bit_idx, clear_bit_idx;
>> +    unsigned long len;
>> +    ram_addr_t offset;
>> +    void *host;
>> +    size_t read, completed, read_len;
>> +
>> +    for (set_bit_idx = find_first_bit(bitmap, num_pages);
>> +         set_bit_idx < num_pages;
>> +         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
>> +
>> +        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
>> +
>> +        len = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
>> +        offset = set_bit_idx << TARGET_PAGE_BITS;
>> +
>> +        for (read = 0, completed = 0; completed < len; offset += read) {
>> +            host = host_from_ram_block_offset(block, offset);
>> +            read_len = MIN(len, TARGET_PAGE_SIZE);
>
> Why MIN()?  I didn't read qemu_get_buffer_at() yet, but shouldn't len
> always be a multiple of the target page size, or zero?
>

Hmm, this is not my code. The original code had MIN(len, BUFSIZE) with
BUFSIZE defined as 4M. I think the idea might have been to cap the
number of pages read.

So it seems I have made a mistake here and this could be reading more
pages at a time.
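
For illustration, the inner loop with that suspected fix applied, capping
each read at the remaining length of the contiguous run rather than at one
page; MAX_READ_LEN is a placeholder for whatever cap (e.g. the old 4M
BUFSIZE) gets chosen:

    for (read = 0, completed = 0; completed < len; offset += read) {
        host = host_from_ram_block_offset(block, offset);
        read_len = MIN(len - completed, MAX_READ_LEN);

        read = qemu_get_buffer_at(f, host, read_len,
                                  block->pages_offset + offset);
        completed += read;
    }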
Daniel P. Berrangé Nov. 1, 2023, 9:26 a.m. UTC | #6
On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
> On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> > >> +{
> > >> +    g_autofree unsigned long *bitmap = NULL;
> > >> +    struct FixedRamHeader header;
> > >> +    size_t bitmap_size;
> > >> +    long num_pages;
> > >> +    int ret = 0;
> > >> +
> > >> +    ret = fixed_ram_read_header(f, &header);
> > >> +    if (ret < 0) {
> > >> +        error_report("Error reading fixed-ram header");
> > >> +        return -EINVAL;
> > >> +    }
> > >> +
> > >> +    block->pages_offset = header.pages_offset;
> > >
> > > Do you think it is worth sanity checking that 'pages_offset' is aligned
> > > in some way?
> > >
> > > It is nice that we have flexibility to change the alignment in future
> > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> > > check there. Perhaps we could at least sanity check for alignment at
> > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> > >
> > 
> > I don't see why not. I'll add it.
> 
> > Is there any explanation of why that 1MB offset exists, and how the number
> > was chosen?  Thanks,

The fixed-ram format is anticipating the use of O_DIRECT.

With O_DIRECT both the buffers in memory, and the file handle offset
have alignment requirements. The buffer alignments are usually page
sized, and QEMU RAM blocks will trivially satisfy those.

The file handle offset alignment varies per filesystem. While you can
query the alignment for the FS holding the file with statx(), that is
not appropriate to do. If a user saves/restores QEMU state to file, we
must assume there is a chance the user will copy the saved state to a
different filesystem.

IOW, we want alignment to satisfy the likely worst case.

Picking 1 MB is a nice round number that is large enough that it is
almost certainly going to satisfy any filesystem alignment. In fact
it is likely massive overkill. None the less 1 MB is also still tiny
in the context of guest RAM sizes, so no one is going to notice the
padding holes in the file from this.

IOW, the 1 MB choice is an arbitrary, but somewhat informed choice.
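
As a standalone illustration of those constraints (not QEMU code), a
pwrite() on an O_DIRECT file descriptor needs both an aligned buffer and an
aligned file offset, or it fails with EINVAL on most filesystems; the file
name and sizes below are arbitrary:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("state.img", O_WRONLY | O_CREAT | O_DIRECT, 0600);
        void *buf;

        /* page-aligned buffer, as a QEMU RAM block would be */
        if (fd < 0 || posix_memalign(&buf, 4096, 4096)) {
            return 1;
        }

        /* aligned offset is accepted; an unaligned one such as
         * (1 << 20) + 13 would typically fail with EINVAL */
        if (pwrite(fd, buf, 4096, 1 << 20) < 0) {
            return 1;
        }

        free(buf);
        close(fd);
        return 0;
    }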

With regards,
Daniel
Peter Xu Nov. 1, 2023, 2:21 p.m. UTC | #7
On Wed, Nov 01, 2023 at 09:26:46AM +0000, Daniel P. Berrangé wrote:
> On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
> > On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> > > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> > > >> +{
> > > >> +    g_autofree unsigned long *bitmap = NULL;
> > > >> +    struct FixedRamHeader header;
> > > >> +    size_t bitmap_size;
> > > >> +    long num_pages;
> > > >> +    int ret = 0;
> > > >> +
> > > >> +    ret = fixed_ram_read_header(f, &header);
> > > >> +    if (ret < 0) {
> > > >> +        error_report("Error reading fixed-ram header");
> > > >> +        return -EINVAL;
> > > >> +    }
> > > >> +
> > > >> +    block->pages_offset = header.pages_offset;
> > > >
> > > > Do you think it is worth sanity checking that 'pages_offset' is aligned
> > > > in some way?
> > > >
> > > > It is nice that we have flexibility to change the alignment in future
> > > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> > > > check there. Perhaps we could at least sanity check for alignment at
> > > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> > > >
> > > 
> > > I don't see why not. I'll add it.
> > 
> > > Is there any explanation of why that 1MB offset exists, and how the number
> > > was chosen?  Thanks,
> 
> The fixed-ram format is anticipating the use of O_DIRECT.
> 
> With O_DIRECT both the buffers in memory, and the file handle offset
> have alignment requirements. The buffer alignments are usually page
> sized, and QEMU RAM blocks will trivially satisfy those.
> 
> The file handle offset alignment varies per filesystem. While you can
> query the alignment for the FS holding the file with statx(), that is
> not appropriate to do. If a user saves/restores QEMU state to file, we
> must assume there is a chance the user will copy the saved state to a
> different filesystem.
> 
> IOW, we want alignment to satisfy the likely worst case.
> 
> Picking 1 MB is a nice round number that is large enough that it is
> almost certainly going to satisfy any filesystem alignment. In fact
> it is likely massive overkill. None the less 1 MB is also still tiny

Is that calculated by something like the max of possible host (small) page
sizes?  I've no idea what it is for all archs; the max small page size I'm
aware of is 64K, but I don't know a lot of archs.

> in the context of guest RAM sizes, so no one is going to notice the
> padding holes in the file from this.
> 
> IOW, the 1 MB choice is an arbitrary, but somewhat informed choice.

I see, thanks.  Shall we document it clearly?  Then if there's a need to
adjust that value we will know what to reference.
Daniel P. Berrangé Nov. 1, 2023, 2:28 p.m. UTC | #8
On Wed, Nov 01, 2023 at 10:21:07AM -0400, Peter Xu wrote:
> On Wed, Nov 01, 2023 at 09:26:46AM +0000, Daniel P. Berrangé wrote:
> > On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
> > > On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> > > > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> > > > >> +{
> > > > >> +    g_autofree unsigned long *bitmap = NULL;
> > > > >> +    struct FixedRamHeader header;
> > > > >> +    size_t bitmap_size;
> > > > >> +    long num_pages;
> > > > >> +    int ret = 0;
> > > > >> +
> > > > >> +    ret = fixed_ram_read_header(f, &header);
> > > > >> +    if (ret < 0) {
> > > > >> +        error_report("Error reading fixed-ram header");
> > > > >> +        return -EINVAL;
> > > > >> +    }
> > > > >> +
> > > > >> +    block->pages_offset = header.pages_offset;
> > > > >
> > > > > Do you think it is worth sanity checking that 'pages_offset' is aligned
> > > > > in some way?
> > > > >
> > > > > It is nice that we have flexibility to change the alignment in future
> > > > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> > > > > check there. Perhaps we could at least sanity check for alignment at
> > > > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> > > > >
> > > > 
> > > > I don't see why not. I'll add it.
> > > 
> > > > Is there any explanation of why that 1MB offset exists, and how the number
> > > > was chosen?  Thanks,
> > 
> > The fixed-ram format is anticipating the use of O_DIRECT.
> > 
> > With O_DIRECT both the buffers in memory, and the file handle offset
> > have alignment requirements. The buffer alignments are usually page
> > sized, and QEMU RAM blocks will trivially satisfy those.
> > 
> > The file handle offset alignment varies per filesystem. While you can
> > query the alignment for the FS holding the file with statx(), that is
> > not appropriate to do. If a user saves/restores QEMU state to file, we
> > must assume there is a chance the user will copy the saved state to a
> > different filesystem.
> > 
> > IOW, we want alignment to satisfy the likely worst case.
> > 
> > Picking 1 MB is a nice round number that is large enough that it is
> > almost certainly going to satisfy any filesystem alignment. In fact
> > it is likely massive overkill. None the less 1 MB is also still tiny
> 
> Is that calculated by something like the max of possible host (small) page
> sizes?  I've no idea what it is for all archs; the max small page size I'm
> aware of is 64K, but I don't know a lot of archs.

It wasn't anything as precise as that. It is literally just that "1MB" looks
large enough that we don't need to spend time to investigate per arch
page sizes.

Having said that I'm now having slight self-doubt wrt huge pages, though
I swear we investigated it last year when first discussing this feature.
The guest memory will of course already be suitably aligned, but I'm
wondering if the filesystem I/O places any offset alignment constraints
related to non-default page size.


With regards,
Daniel
Peter Xu Nov. 1, 2023, 3 p.m. UTC | #9
On Wed, Nov 01, 2023 at 02:28:24PM +0000, Daniel P. Berrangé wrote:
> On Wed, Nov 01, 2023 at 10:21:07AM -0400, Peter Xu wrote:
> > On Wed, Nov 01, 2023 at 09:26:46AM +0000, Daniel P. Berrangé wrote:
> > > On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
> > > > On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> > > > > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> > > > > >> +{
> > > > > >> +    g_autofree unsigned long *bitmap = NULL;
> > > > > >> +    struct FixedRamHeader header;
> > > > > >> +    size_t bitmap_size;
> > > > > >> +    long num_pages;
> > > > > >> +    int ret = 0;
> > > > > >> +
> > > > > >> +    ret = fixed_ram_read_header(f, &header);
> > > > > >> +    if (ret < 0) {
> > > > > >> +        error_report("Error reading fixed-ram header");
> > > > > >> +        return -EINVAL;
> > > > > >> +    }
> > > > > >> +
> > > > > >> +    block->pages_offset = header.pages_offset;
> > > > > >
> > > > > > Do you think it is worth sanity checking that 'pages_offset' is aligned
> > > > > > in some way?
> > > > > >
> > > > > > It is nice that we have flexibility to change the alignment in future
> > > > > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> > > > > > check there. Perhaps we could at least sanity check for alignment at
> > > > > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> > > > > >
> > > > > 
> > > > > I don't see why not. I'll add it.
> > > > 
> > > > > Is there any explanation of why that 1MB offset exists, and how the number
> > > > > was chosen?  Thanks,
> > > 
> > > The fixed-ram format is anticipating the use of O_DIRECT.
> > > 
> > > With O_DIRECT both the buffers in memory, and the file handle offset
> > > have alignment requirements. The buffer alignments are usually page
> > > sized, and QEMU RAM blocks will trivially satisfy those.
> > > 
> > > The file handle offset alignment varies per filesystem. While you can
> > > query the alignment for the FS holding the file with statx(), that is
> > > not appropriate to do. If a user saves/restores QEMU state to file, we
> > > must assume there is a chance the user will copy the saved state to a
> > > different filesystem.
> > > 
> > > IOW, we want alignment to satisfy the likely worst case.
> > > 
> > > Picking 1 MB is a nice round number that is large enough that it is
> > > almost certainly going to satisfy any filesystem alignment. In fact
> > > it is likely massive overkill. None the less 1 MB is also still tiny
> > 
> > Is that calculated by something like the max of possible host (small) page
> > sizes?  I've no idea what it is for all archs; the max small page size I'm
> > aware of is 64K, but I don't know a lot of archs.
> 
> It wasn't anything as precise as that. It is literally just that "1MB" looks
> large enough that we don't need to spend time to investigate per arch
> page sizes.

IMHO we need that precision in the reasoning, and to document it, even if
not in the exact number we prefer, which can be prone to change later.
Otherwise that value will be pure magic after a few years or even less, and
it'll be more of a challenge later to figure things out.

> 
> Having said that I'm now having slight self-doubt wrt huge pages, though
> I swear we investigated it last year when first discussing this feature.
> The guest memory will of course already be suitably aligned, but I'm
> wondering if the filesystem I/O places any offset alignment constraints
> related to non-default page size.

AFAIU direct IO is about pinning the IO buffers, playing the role of the fs
cache instead.  If my understanding is correct, huge pages shouldn't be a
problem for such pinning, because it's legal to pin part of a huge page.

After the partial huge pages are pinned, they should be treated as normal fs
buffers when doing block IO.  And then the file offset should, per my
understanding, no longer be relevant to the type of backend of the user
buffer that triggers the read()/write().

But maybe I missed something; if so, that will need to be part of the
documentation of the 1MB magic value, IMHO.  We may want to double check
that by doing fixed-ram migration on e.g. a 1GB hugetlb memory-backend-file
with a 1MB file offset per-ramblock.

Thanks,
Fabiano Rosas Nov. 6, 2023, 1:18 p.m. UTC | #10
Peter Xu <peterx@redhat.com> writes:

> On Wed, Nov 01, 2023 at 02:28:24PM +0000, Daniel P. Berrangé wrote:
>> On Wed, Nov 01, 2023 at 10:21:07AM -0400, Peter Xu wrote:
>> > On Wed, Nov 01, 2023 at 09:26:46AM +0000, Daniel P. Berrangé wrote:
>> > > On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
>> > > > On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
>> > > > > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
>> > > > > >> +{
>> > > > > >> +    g_autofree unsigned long *bitmap = NULL;
>> > > > > >> +    struct FixedRamHeader header;
>> > > > > >> +    size_t bitmap_size;
>> > > > > >> +    long num_pages;
>> > > > > >> +    int ret = 0;
>> > > > > >> +
>> > > > > >> +    ret = fixed_ram_read_header(f, &header);
>> > > > > >> +    if (ret < 0) {
>> > > > > >> +        error_report("Error reading fixed-ram header");
>> > > > > >> +        return -EINVAL;
>> > > > > >> +    }
>> > > > > >> +
>> > > > > >> +    block->pages_offset = header.pages_offset;
>> > > > > >
>> > > > > > Do you think it is worth sanity checking that 'pages_offset' is aligned
>> > > > > > in some way?
>> > > > > >
>> > > > > > It is nice that we have flexibility to change the alignment in future
>> > > > > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
>> > > > > > check there. Perhaps we could at least sanity check for alignment at
>> > > > > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
>> > > > > >
>> > > > > 
>> > > > > I don't see why not. I'll add it.
>> > > > 
>> > > > Is there any explanation of why that 1MB offset exists, and how the number
>> > > > was chosen?  Thanks,
>> > > 
>> > > The fixed-ram format is anticipating the use of O_DIRECT.
>> > > 
>> > > With O_DIRECT both the buffers in memory, and the file handle offset
>> > > have alignment requirements. The buffer alignments are usually page
>> > > sized, and QEMU RAM blocks will trivially satisfy those.
>> > > 
>> > > The file handle offset alignment varies per filesystem. While you can
>> > > query the alignment for the FS holding the file with statx(), that is
>> > > not appropriate to do. If a user saves/restores QEMU state to file, we
>> > > must assume there is a chance the user will copy the saved state to a
>> > > different filesystem.
>> > > 
>> > > IOW, we want alignment to satisfy the likely worst case.
>> > > 
>> > > Picking 1 MB is a nice round number that is large enough that it is
>> > > almost certainly going to satisfy any filesystem alignment. In fact
>> > > it is likely massive overkill. None the less 1 MB is also still tiny
>> > 
>> > Is that calculated by something like the max of possible host (small) page
>> > sizes?  I've no idea what it is for all archs; the max small page size I'm
>> > aware of is 64K, but I don't know a lot of archs.
>> 
>> It wasn't anything as precise as that. It is literally just that "1MB" looks
>> large enough that we don't need to spend time to investigate per arch
>> page sizes.
>
> IMHO we need that precision in the reasoning, and to document it, even if
> not in the exact number we prefer, which can be prone to change later.
> Otherwise that value will be pure magic after a few years or even less, and
> it'll be more of a challenge later to figure things out.
>
>> 
>> Having said that I'm now having slight self-doubt wrt huge pages, though
>> I swear we investigated it last year when first discussing this feature.
>> The guest memory will of course already be suitably aligned, but I'm
>> wondering if the filesystem I/O places any offset alignment constraints
>> related to non-default page size.
>
> AFAIU direct IO is about pinning the IO buffers, playing the role of the fs
> cache instead.  If my understanding is correct, huge pages shouldn't be a
> problem for such pinning, because it's legal to pin part of a huge page.
>
> After the partial huge pages are pinned, they should be treated as normal fs
> buffers when doing block IO.  And then the file offset should, per my
> understanding, no longer be relevant to the type of backend of the user
> buffer that triggers the read()/write().
>
> But maybe I missed something; if so, that will need to be part of the
> documentation of the 1MB magic value, IMHO.  We may want to double check
> that by doing fixed-ram migration on e.g. a 1GB hugetlb memory-backend-file
> with a 1MB file offset per-ramblock.

Does anyone have any indication that we need to relate the alignment to
the page size? All I find online points to the device block size being the
limiting factor for filesystems. There's also raw_probe_alignment() at
file-posix.c, which seems to check up to 4k and recommends disabling
O_DIRECT if an alignment is not found.

Note that we shouldn't have any problems changing the alignment we
choose since we have a pointer to the start of the aligned region which
goes along with the fixed-ram header. We could even do some probing like
the block layer does if we wanted.
Peter Xu Nov. 6, 2023, 9 p.m. UTC | #11
On Mon, Nov 06, 2023 at 10:18:03AM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Wed, Nov 01, 2023 at 02:28:24PM +0000, Daniel P. Berrangé wrote:
> >> On Wed, Nov 01, 2023 at 10:21:07AM -0400, Peter Xu wrote:
> >> > On Wed, Nov 01, 2023 at 09:26:46AM +0000, Daniel P. Berrangé wrote:
> >> > > On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
> >> > > > On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> >> > > > > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> >> > > > > >> +{
> >> > > > > >> +    g_autofree unsigned long *bitmap = NULL;
> >> > > > > >> +    struct FixedRamHeader header;
> >> > > > > >> +    size_t bitmap_size;
> >> > > > > >> +    long num_pages;
> >> > > > > >> +    int ret = 0;
> >> > > > > >> +
> >> > > > > >> +    ret = fixed_ram_read_header(f, &header);
> >> > > > > >> +    if (ret < 0) {
> >> > > > > >> +        error_report("Error reading fixed-ram header");
> >> > > > > >> +        return -EINVAL;
> >> > > > > >> +    }
> >> > > > > >> +
> >> > > > > >> +    block->pages_offset = header.pages_offset;
> >> > > > > >
> >> > > > > > Do you think it is worth sanity checking that 'pages_offset' is aligned
> >> > > > > > in some way?
> >> > > > > >
> >> > > > > > It is nice that we have flexibility to change the alignment in future
> >> > > > > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> >> > > > > > check there. Perhaps we could at least sanity check for alignment at
> >> > > > > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> >> > > > > >
> >> > > > > 
> >> > > > > I don't see why not. I'll add it.
> >> > > > 
> >> > > > Is there any explanation of why that 1MB offset exists, and how the number
> >> > > > was chosen?  Thanks,
> >> > > 
> >> > > The fixed-ram format is anticipating the use of O_DIRECT.
> >> > > 
> >> > > With O_DIRECT both the buffers in memory, and the file handle offset
> >> > > have alignment requirements. The buffer alignments are usually page
> >> > > sized, and QEMU RAM blocks will trivially satisfy those.
> >> > > 
> >> > > The file handle offset alignment varies per filesystem. While you can
> >> > > query the alignment for the FS holding the file with statx(), that is
> >> > > not appropriate to do. If a user saves/restores QEMU state to file, we
> >> > > must assume there is a chance the user will copy the saved state to a
> >> > > different filesystem.
> >> > > 
> >> > > IOW, we want alignment to satisfy the likely worst case.
> >> > > 
> >> > > Picking 1 MB is a nice round number that is large enough that it is
> >> > > almost certainly going to satisfy any filesystem alignment. In fact
> >> > > it is likely massive overkill. None the less 1 MB is also still tiny
> >> > 
> >> > Is that calculated by something like the max of possible host (small) page
> >> > sizes?  I've no idea what it is for all archs; the max small page size I'm
> >> > aware of is 64K, but I don't know a lot of archs.
> >> 
> >> It wasn't anything as precise as that. It is literally just that "1MB" looks
> >> large enough that we don't need to spend time to investigate per arch
> >> page sizes.
> >
> > IMHO we need that precision in the reasoning, and to document it, even if
> > not in the exact number we prefer, which can be prone to change later.
> > Otherwise that value will be pure magic after a few years or even less, and
> > it'll be more of a challenge later to figure things out.
> >
> >> 
> >> Having said that I'm now having slight self-doubt wrt huge pages, though
> >> I swear we investigated it last year when first discussing this feature.
> >> The guest memory will of course already be suitably aligned, but I'm
> >> wondering if the filesystem I/O places any offset alignment constraints
> >> related to non-default page size.
> >
> > AFAIU direct IO is about pinning the IO buffers, playing the role of the fs
> > cache instead.  If my understanding is correct, huge pages shouldn't be a
> > problem for such pinning, because it's legal to pin part of a huge page.
> >
> > After the partial huge pages are pinned, they should be treated as normal fs
> > buffers when doing block IO.  And then the file offset should, per my
> > understanding, no longer be relevant to the type of backend of the user
> > buffer that triggers the read()/write().
> >
> > But maybe I missed something; if so, that will need to be part of the
> > documentation of the 1MB magic value, IMHO.  We may want to double check
> > that by doing fixed-ram migration on e.g. a 1GB hugetlb memory-backend-file
> > with a 1MB file offset per-ramblock.
> 
> Does anyone have any indication that we need to relate the alignment to
> the page size? All I find online points to the device block size being the
> limiting factor for filesystems. There's also raw_probe_alignment() at
> file-posix.c, which seems to check up to 4k and recommends disabling
> O_DIRECT if an alignment is not found.

Right, it should be more relevant to block size.

> 
> Note that we shouldn't have any problems changing the alignment we
> choose since we have a pointer to the start of the aligned region which
> goes along with the fixed-ram header. We could even do some probing like
> the block layer does if we wanted.

Having a 1MB offset is fine, especially since, as you said, the recv side
always fetches it from the headers.

Perhaps we make that 1MB a macro, and add a comment explaining whatever
understanding we have of where that 1MB comes from?  I think we can also
reference raw_probe_alignment() in the comment.
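
Something along these lines, perhaps; the macro name and comment wording
here are made up:

    /*
     * fixed-ram writes each ramblock's pages region at an offset aligned
     * for O_DIRECT. The required alignment is filesystem dependent (see
     * raw_probe_alignment() in block/file-posix.c for how the block layer
     * probes it); 1MB is an arbitrary but conservative upper bound that
     * should satisfy any filesystem in practice.
     */
    #define FIXED_RAM_PAGE_OFFSET_ALIGNMENT (1 << 20)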

Meanwhile now I'm wondering whether that 1MB is too conservative.  It's only
a problem if there can be tons of ramblocks (e.g. 1000 ramblocks means
1MB*1000=1G for the headers).  I can't think of such a case, though.  We can
definitely leave that for later.

Thanks,
Daniel P. Berrangé Nov. 7, 2023, 9:02 a.m. UTC | #12
On Mon, Nov 06, 2023 at 04:00:33PM -0500, Peter Xu wrote:
> On Mon, Nov 06, 2023 at 10:18:03AM -0300, Fabiano Rosas wrote:
> > Peter Xu <peterx@redhat.com> writes:
> > 
> > > On Wed, Nov 01, 2023 at 02:28:24PM +0000, Daniel P. Berrangé wrote:
> > >> On Wed, Nov 01, 2023 at 10:21:07AM -0400, Peter Xu wrote:
> > >> > On Wed, Nov 01, 2023 at 09:26:46AM +0000, Daniel P. Berrangé wrote:
> > >> > > On Tue, Oct 31, 2023 at 03:03:50PM -0400, Peter Xu wrote:
> > >> > > > On Wed, Oct 25, 2023 at 11:07:33AM -0300, Fabiano Rosas wrote:
> > >> > > > > >> +static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
> > >> > > > > >> +{
> > >> > > > > >> +    g_autofree unsigned long *bitmap = NULL;
> > >> > > > > >> +    struct FixedRamHeader header;
> > >> > > > > >> +    size_t bitmap_size;
> > >> > > > > >> +    long num_pages;
> > >> > > > > >> +    int ret = 0;
> > >> > > > > >> +
> > >> > > > > >> +    ret = fixed_ram_read_header(f, &header);
> > >> > > > > >> +    if (ret < 0) {
> > >> > > > > >> +        error_report("Error reading fixed-ram header");
> > >> > > > > >> +        return -EINVAL;
> > >> > > > > >> +    }
> > >> > > > > >> +
> > >> > > > > >> +    block->pages_offset = header.pages_offset;
> > >> > > > > >
> > >> > > > > > Do you think it is worth sanity checking that 'pages_offset' is aligned
> > >> > > > > > in some way?
> > >> > > > > >
> > >> > > > > > It is nice that we have flexibility to change the alignment in future
> > >> > > > > > if we find 1 MB is not optimal, so I wouldn't want to force a 1MB align
> > >> > > > > > check there. Perhaps we could at least sanity check for alignment at
> > >> > > > > > TARGET_PAGE_SIZE, to detect a gross data corruption problem?
> > >> > > > > >
> > >> > > > > 
> > >> > > > > I don't see why not. I'll add it.
> > >> > > > 
> > >> > > > Is there any explanation of why that 1MB offset exists, and how the number
> > >> > > > was chosen?  Thanks,
> > >> > > 
> > >> > > The fixed-ram format is anticipating the use of O_DIRECT.
> > >> > > 
> > >> > > With O_DIRECT both the buffers in memory, and the file handle offset
> > >> > > have alignment requirements. The buffer alignments are usually page
> > >> > > sized, and QEMU RAM blocks will trivially satisfy those.
> > >> > > 
> > >> > > The file handle offset alignment varies per filesystem. While you can
> > >> > > query the alignment for the FS holding the file with statx(), that is
> > >> > > not appropriate to do. If a user saves/restores QEMU state to file, we
> > >> > > must assume there is a chance the user will copy the saved state to a
> > >> > > different filesystem.
> > >> > > 
> > >> > > IOW, we want alignment to satisfy the likely worst case.
> > >> > > 
> > >> > > Picking 1 MB is a nice round number that is large enough that it is
> > >> > > almost certainly going to satisfy any filesystem alignment. In fact
> > >> > > it is likely massive overkill. None the less 1 MB is also still tiny
> > >> > 
> > >> > Is that calculated by something like the max of possible host (small) page
> > >> > sizes?  I've no idea what it is for all archs; the max small page size I'm
> > >> > aware of is 64K, but I don't know a lot of archs.
> > >> 
> > >> It wasn't anything as precise as that. It is literally just that "1MB" looks
> > >> large enough that we don't need to spend time to investigate per arch
> > >> page sizes.
> > >
> > > IMHO we need that precision in the reasoning, and to document it, even if
> > > not in the exact number we prefer, which can be prone to change later.
> > > Otherwise that value will be pure magic after a few years or even less, and
> > > it'll be more of a challenge later to figure things out.
> > >
> > >> 
> > >> Having said that I'm now having slight self-doubt wrt huge pages, though
> > >> I swear we investigated it last year when first discussing this feature.
> > >> The guest memory will of course already be suitably aligned, but I'm
> > >> wondering if the filesystem I/O places any offset alignment constraints
> > >> related to non-default page size.
> > >
> > > AFAIU direct IO is about pinning the IO buffers, playing the role of the fs
> > > cache instead.  If my understanding is correct, huge pages shouldn't be a
> > > problem for such pinning, because it's legal to pin part of a huge page.
> > >
> > > After the partial huge pages are pinned, they should be treated as normal fs
> > > buffers when doing block IO.  And then the file offset should, per my
> > > understanding, no longer be relevant to the type of backend of the user
> > > buffer that triggers the read()/write().
> > >
> > > But maybe I missed something; if so, that will need to be part of the
> > > documentation of the 1MB magic value, IMHO.  We may want to double check
> > > that by doing fixed-ram migration on e.g. a 1GB hugetlb memory-backend-file
> > > with a 1MB file offset per-ramblock.
> > 
> > Does anyone have any indication that we need to relate the alignment to
> > the page size? All I find online points to the device block size being the
> > limiting factor for filesystems. There's also raw_probe_alignment() at
> > file-posix.c, which seems to check up to 4k and recommends disabling
> > O_DIRECT if an alignment is not found.
> 
> Right, it should be more relevant to block size.
> 
> > 
> > Note that we shouldn't have any problems changing the alignment we
> > choose since we have a pointer to the start of the aligned region which
> > goes along with the fixed-ram header. We could even do some probing like
> > the block layer does if we wanted.
> 
> Having a 1MB offset is fine, especially since, as you said, the recv side
> always fetches it from the headers.
> 
> Perhaps we make that 1MB a macro, and add a comment explaining whatever
> understanding we have of where that 1MB comes from?  I think we can also
> reference raw_probe_alignment() in the comment.
> 
> Meanwhile now I'm wondering whether that 1MB is too conservative.  It's only
> a problem if there can be tons of ramblocks (e.g. 1000 ramblocks means
> 1MB*1000=1G for the headers).  I can't think of such a case, though.  We can
> definitely leave that for later.

NB, the space won't actually be consumed on disk. By seeking to the
ramblock offset, the file simply gets an allocation hole on disk. IOW, with
1000 ramblocks we will have 1 GB of holes.

Apps just have to be aware that they need to preserve sparseness if they
copy the file around. This is probably a good docs point.
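
A tiny standalone demonstration of the effect (not QEMU code): seeking past
the end and writing leaves the skipped range unallocated, so st_blocks stays
far smaller than st_size. Naive copies that read and rewrite every byte will
expand those holes:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("sparse.img", O_WRONLY | O_CREAT | O_TRUNC, 0600);
        struct stat st;

        /* skip 1 GiB, then write a single byte */
        lseek(fd, 1024L * 1024 * 1024, SEEK_SET);
        write(fd, "x", 1);

        fstat(fd, &st);
        printf("size=%lld allocated=%lld\n", (long long)st.st_size,
               (long long)st.st_blocks * 512);
        close(fd);
        return 0;
    }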

With regards,
Daniel

Patch

diff --git a/migration/ram.c b/migration/ram.c
index 152a03604f..cea6971ab2 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -3032,6 +3032,32 @@  static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
     qemu_put_buffer(file, (uint8_t *) header, header_size);
 }
 
+static int fixed_ram_read_header(QEMUFile *file, struct FixedRamHeader *header)
+{
+    size_t ret, header_size = sizeof(struct FixedRamHeader);
+
+    ret = qemu_get_buffer(file, (uint8_t *)header, header_size);
+    if (ret != header_size) {
+        return -1;
+    }
+
+    /* migration stream is big-endian */
+    be32_to_cpus(&header->version);
+
+    if (header->version > FIXED_RAM_HDR_VERSION) {
+        error_report("Migration fixed-ram capability version mismatch (expected %d, got %d)",
+                     FIXED_RAM_HDR_VERSION, header->version);
+        return -1;
+    }
+
+    be64_to_cpus(&header->page_size);
+    be64_to_cpus(&header->bitmap_offset);
+    be64_to_cpus(&header->pages_offset);
+
+
+    return 0;
+}
+
 /*
  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.  When rcu-reclaims in the code
@@ -3932,6 +3958,68 @@  void colo_flush_ram_cache(void)
     trace_colo_flush_ram_cache_end();
 }
 
+static void read_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block,
+                                    long num_pages, unsigned long *bitmap)
+{
+    unsigned long set_bit_idx, clear_bit_idx;
+    unsigned long len;
+    ram_addr_t offset;
+    void *host;
+    size_t read, completed, read_len;
+
+    for (set_bit_idx = find_first_bit(bitmap, num_pages);
+         set_bit_idx < num_pages;
+         set_bit_idx = find_next_bit(bitmap, num_pages, clear_bit_idx + 1)) {
+
+        clear_bit_idx = find_next_zero_bit(bitmap, num_pages, set_bit_idx + 1);
+
+        len = TARGET_PAGE_SIZE * (clear_bit_idx - set_bit_idx);
+        offset = set_bit_idx << TARGET_PAGE_BITS;
+
+        for (read = 0, completed = 0; completed < len; offset += read) {
+            host = host_from_ram_block_offset(block, offset);
+            read_len = MIN(len, TARGET_PAGE_SIZE);
+
+            read = qemu_get_buffer_at(f, host, read_len,
+                                      block->pages_offset + offset);
+            completed += read;
+        }
+    }
+}
+
+static int parse_ramblock_fixed_ram(QEMUFile *f, RAMBlock *block, ram_addr_t length)
+{
+    g_autofree unsigned long *bitmap = NULL;
+    struct FixedRamHeader header;
+    size_t bitmap_size;
+    long num_pages;
+    int ret = 0;
+
+    ret = fixed_ram_read_header(f, &header);
+    if (ret < 0) {
+        error_report("Error reading fixed-ram header");
+        return -EINVAL;
+    }
+
+    block->pages_offset = header.pages_offset;
+    num_pages = length / header.page_size;
+    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
+
+    bitmap = g_malloc0(bitmap_size);
+    if (qemu_get_buffer_at(f, (uint8_t *)bitmap, bitmap_size,
+                           header.bitmap_offset) != bitmap_size) {
+        error_report("Error parsing dirty bitmap");
+        return -EINVAL;
+    }
+
+    read_ramblock_fixed_ram(f, block, num_pages, bitmap);
+
+    /* Skip pages array */
+    qemu_set_offset(f, block->pages_offset + length, SEEK_SET);
+
+    return ret;
+}
+
 static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
 {
     int ret = 0;
@@ -3940,6 +4028,10 @@  static int parse_ramblock(QEMUFile *f, RAMBlock *block, ram_addr_t length)
 
     assert(block);
 
+    if (migrate_fixed_ram()) {
+        return parse_ramblock_fixed_ram(f, block, length);
+    }
+
     if (!qemu_ram_is_migratable(block)) {
         error_report("block %s should not be migrated !", block->idstr);
         return -EINVAL;
@@ -4142,6 +4234,7 @@  static int ram_load_precopy(QEMUFile *f)
                 migrate_multifd_flush_after_each_section()) {
                 multifd_recv_sync_main();
             }
+
             break;
         case RAM_SAVE_FLAG_HOOK:
             ret = rdma_registration_handle(f);