
[v2,15/29] migration/ram: Add support for 'fixed-ram' outgoing migration

Message ID 20231023203608.26370-16-farosas@suse.de
State New
Series migration: File based migration with multifd and fixed-ram

Commit Message

Fabiano Rosas Oct. 23, 2023, 8:35 p.m. UTC
From: Nikolay Borisov <nborisov@suse.com>

Implement the outgoing migration side for the 'fixed-ram' capability.

A bitmap is introduced to track which pages have been written in the
migration file. Pages are written at a fixed location for every
ramblock. Zero pages are ignored as they'd be zero in the destination
migration as well.

The migration stream is altered to put the dirty pages for a ramblock
after its header instead of having a sequential stream of pages that
follow the ramblock headers. Since all pages have a fixed location,
RAM_SAVE_FLAG_EOS is no longer generated on every migration iteration.

Without fixed-ram (current):

ramblock 1 header|ramblock 2 header|...|RAM_SAVE_FLAG_EOS|stream of
 pages (iter 1)|RAM_SAVE_FLAG_EOS|stream of pages (iter 2)|...

With fixed-ram (new):

ramblock 1 header|ramblock 1 fixed-ram header|ramblock 1 pages (fixed
 offsets)|ramblock 2 header|ramblock 2 fixed-ram header|ramblock 2
 pages (fixed offsets)|...|RAM_SAVE_FLAG_EOS

where:
 - ramblock header: the generic information for a ramblock, such as
   idstr, used_len, etc.

 - ramblock fixed-ram header: the new information added by this
   feature: bitmap of pages written, bitmap size and offset of pages
   in the migration file.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Fabiano Rosas <farosas@suse.de>
---
 include/exec/ramblock.h |  8 ++++
 migration/options.c     |  3 --
 migration/ram.c         | 98 ++++++++++++++++++++++++++++++++++++-----
 3 files changed, 96 insertions(+), 13 deletions(-)

Comments

Daniel P. Berrangé Oct. 25, 2023, 9:39 a.m. UTC | #1
On Mon, Oct 23, 2023 at 05:35:54PM -0300, Fabiano Rosas wrote:
> From: Nikolay Borisov <nborisov@suse.com>
> 
> Implement the outgoing migration side for the 'fixed-ram' capability.
> 
> A bitmap is introduced to track which pages have been written in the
> migration file. Pages are written at a fixed location for every
> ramblock. Zero pages are ignored as they'd be zero in the destination
> migration as well.
> 
> The migration stream is altered to put the dirty pages for a ramblock
> after its header instead of having a sequential stream of pages that
> follow the ramblock headers. Since all pages have a fixed location,
> RAM_SAVE_FLAG_EOS is no longer generated on every migration iteration.
> 
> Without fixed-ram (current):
> 
> ramblock 1 header|ramblock 2 header|...|RAM_SAVE_FLAG_EOS|stream of
>  pages (iter 1)|RAM_SAVE_FLAG_EOS|stream of pages (iter 2)|...

Clearer to diagram this vertically I think

 ---------------------
 | ramblock 1 header |
 ---------------------
 | ramblock 2 header |
 ---------------------
 | ...               |
 ---------------------
 | ramblock n header |
 ---------------------
 | RAM_SAVE_FLAG_EOS |
 ---------------------
 | stream of pages   |
 | (iter 1)          |
 | ...               |
 ---------------------
 | RAM_SAVE_FLAG_EOS |
 ---------------------
 | stream of pages   |
 | (iter 2)          |
 | ...               |
 ---------------------
 | ...               |
 ---------------------
 

> 
> With fixed-ram (new):
> 
> ramblock 1 header|ramblock 1 fixed-ram header|ramblock 1 pages (fixed
>  offsets)|ramblock 2 header|ramblock 2 fixed-ram header|ramblock 2
>  pages (fixed offsets)|...|RAM_SAVE_FLAG_EOS

If I'm reading the code correctly the new format has some padding
such that each "ramblock pages" region starts on a 1 MB boundary.

eg so we get:

 --------------------------------
 | ramblock 1 header            |
 --------------------------------
 | ramblock 1 fixed-ram header  |
 --------------------------------
 | padding to next 1MB boundary |
 | ...                          |
 --------------------------------
 | ramblock 1 pages             |
 | ...                          |
 --------------------------------
 | ramblock 2 header            |
 --------------------------------
 | ramblock 2 fixed-ram header  |
 --------------------------------
 | padding to next 1MB boundary |
 | ...                          |
 --------------------------------
 | ramblock 2 pages             |
 | ...                          |
 --------------------------------
 | ...                          |
 --------------------------------
 | RAM_SAVE_FLAG_EOS            |
 --------------------------------
 | ...                          |
 -------------------------------
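
To make the alignment concrete, the offset maths works out roughly like
this for one block (illustrative numbers only, assuming 4 KiB target
pages and the 28-byte packed fixed-ram header):

  /*
   * Say the ramblock header of a 1 GiB block ends at file offset 0x2000.
   * 262144 pages -> 32 KiB (0x8000) bitmap, so:
   *
   *   bitmap_offset = 0x2000 + sizeof(struct FixedRamHeader)  = 0x201c
   *   pages_offset  = ROUND_UP(0x201c + 0x8000, 0x100000)     = 0x100000
   *
   * and the page at byte offset N within the block is written at
   * pages_offset + N, i.e.
   * qemu_put_buffer_at(f, buf, TARGET_PAGE_SIZE, pages_offset + N).
   */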

> 
> where:
>  - ramblock header: the generic information for a ramblock, such as
>    idstr, used_len, etc.
> 
>  - ramblock fixed-ram header: the new information added by this
>    feature: bitmap of pages written, bitmap size and offset of pages
>    in the migration file.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
>  include/exec/ramblock.h |  8 ++++
>  migration/options.c     |  3 --
>  migration/ram.c         | 98 ++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 96 insertions(+), 13 deletions(-)
> 
> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> index 69c6a53902..e0e3f16852 100644
> --- a/include/exec/ramblock.h
> +++ b/include/exec/ramblock.h
> @@ -44,6 +44,14 @@ struct RAMBlock {
>      size_t page_size;
>      /* dirty bitmap used during migration */
>      unsigned long *bmap;
> +    /* shadow dirty bitmap used when migrating to a file */
> +    unsigned long *shadow_bmap;
> +    /*
> +     * offset in the file pages belonging to this ramblock are saved,
> +     * used only during migration to a file.
> +     */
> +    off_t bitmap_offset;
> +    uint64_t pages_offset;
>      /* bitmap of already received pages in postcopy */
>      unsigned long *receivedmap;
>  
> diff --git a/migration/options.c b/migration/options.c
> index 2622d8c483..9f693d909f 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -271,12 +271,9 @@ bool migrate_events(void)
>  
>  bool migrate_fixed_ram(void)
>  {
> -/*
>      MigrationState *s = migrate_get_current();
>  
>      return s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
> -*/
> -    return false;
>  }
>  
>  bool migrate_ignore_shared(void)
> diff --git a/migration/ram.c b/migration/ram.c
> index 92769902bb..152a03604f 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1157,12 +1157,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
>          return 0;
>      }
>  
> +    stat64_add(&mig_stats.zero_pages, 1);
> +
> +    if (migrate_fixed_ram()) {
> +        /* zero pages are not transferred with fixed-ram */
> +        clear_bit(offset >> TARGET_PAGE_BITS, pss->block->shadow_bmap);
> +        return 1;
> +    }
> +
>      len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
>      qemu_put_byte(file, 0);
>      len += 1;
>      ram_release_page(pss->block->idstr, offset);
> -
> -    stat64_add(&mig_stats.zero_pages, 1);
>      ram_transferred_add(len);
>  
>      /*
> @@ -1220,14 +1226,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
>  {
>      QEMUFile *file = pss->pss_channel;
>  
> -    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
> -                                         offset | RAM_SAVE_FLAG_PAGE));
> -    if (async) {
> -        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
> -                              migrate_release_ram() &&
> -                              migration_in_postcopy());
> +    if (migrate_fixed_ram()) {
> +        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
> +                           block->pages_offset + offset);
> +        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
>      } else {
> -        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
> +        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
> +                                             offset | RAM_SAVE_FLAG_PAGE));
> +        if (async) {
> +            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
> +                                  migrate_release_ram() &&
> +                                  migration_in_postcopy());
> +        } else {
> +            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
> +        }
>      }
>      ram_transferred_add(TARGET_PAGE_SIZE);
>      stat64_add(&mig_stats.normal_pages, 1);
> @@ -2475,6 +2487,8 @@ static void ram_save_cleanup(void *opaque)
>          block->clear_bmap = NULL;
>          g_free(block->bmap);
>          block->bmap = NULL;
> +        g_free(block->shadow_bmap);
> +        block->shadow_bmap = NULL;
>      }
>  
>      xbzrle_cleanup();
> @@ -2842,6 +2856,7 @@ static void ram_list_init_bitmaps(void)
>               */
>              block->bmap = bitmap_new(pages);
>              bitmap_set(block->bmap, 0, pages);
> +            block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);
>              block->clear_bmap_shift = shift;
>              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>          }
> @@ -2979,6 +2994,44 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
>      }
>  }
>  
> +#define FIXED_RAM_HDR_VERSION 1
> +struct FixedRamHeader {
> +    uint32_t version;
> +    uint64_t page_size;
> +    uint64_t bitmap_offset;
> +    uint64_t pages_offset;
> +    /* end of v1 */
> +} QEMU_PACKED;
> +
> +static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
> +{
> +    g_autofree struct FixedRamHeader *header;
> +    size_t header_size, bitmap_size;
> +    long num_pages;
> +
> +    header = g_new0(struct FixedRamHeader, 1);
> +    header_size = sizeof(struct FixedRamHeader);
> +
> +    num_pages = block->used_length >> TARGET_PAGE_BITS;
> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> +
> +    /*
> +     * Save the file offsets of where the bitmap and the pages should
> +     * go as they are written at the end of migration and during the
> +     * iterative phase, respectively.
> +     */
> +    block->bitmap_offset = qemu_get_offset(file) + header_size;
> +    block->pages_offset = ROUND_UP(block->bitmap_offset +
> +                                   bitmap_size, 0x100000);

The 0x100000 gives us our 1 MB alignment.

This is quite an important thing, so can we put a nice clear
constant at the top of the file:

  /* Align fixed-ram pages data to the next 1 MB boundary */
  #define FIXED_RAM_PAGE_OFFSET_ALIGNMENT 0x100000
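
and the call site in fixed_ram_insert_header() would then read:

  block->pages_offset = ROUND_UP(block->bitmap_offset + bitmap_size,
                                 FIXED_RAM_PAGE_OFFSET_ALIGNMENT);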

> @@ -3179,7 +3252,6 @@ out:
>          qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>          qemu_fflush(f);
>          ram_transferred_add(8);
> -
>          ret = qemu_file_get_error(f);
>      }
>      if (ret < 0) {

Spurious whitespace change, possibly better in whatever patch
introduced it, or dropped?


With regards,
Daniel
Fabiano Rosas Oct. 25, 2023, 2:03 p.m. UTC | #2
Daniel P. Berrangé <berrange@redhat.com> writes:

> On Mon, Oct 23, 2023 at 05:35:54PM -0300, Fabiano Rosas wrote:
>> From: Nikolay Borisov <nborisov@suse.com>
>> 
>> Implement the outgoing migration side for the 'fixed-ram' capability.
>> 
>> A bitmap is introduced to track which pages have been written in the
>> migration file. Pages are written at a fixed location for every
>> ramblock. Zero pages are ignored as they'd be zero in the destination
>> migration as well.
>> 
>> The migration stream is altered to put the dirty pages for a ramblock
>> after its header instead of having a sequential stream of pages that
>> follow the ramblock headers. Since all pages have a fixed location,
>> RAM_SAVE_FLAG_EOS is no longer generated on every migration iteration.
>> 
>> Without fixed-ram (current):
>> 
>> ramblock 1 header|ramblock 2 header|...|RAM_SAVE_FLAG_EOS|stream of
>>  pages (iter 1)|RAM_SAVE_FLAG_EOS|stream of pages (iter 2)|...
>
> Clearer to diagram this vertically I think
>
>  ---------------------
>  | ramblock 1 header |
>  ---------------------
>  | ramblock 2 header |
>  ---------------------
>  | ...               |
>  ---------------------
>  | ramblock n header |
>  ---------------------
>  | RAM_SAVE_FLAG_EOS |
>  ---------------------
>  | stream of pages   |
>  | (iter 1)          |
>  | ...               |
>  ---------------------
>  | RAM_SAVE_FLAG_EOS |
>  ---------------------
>  | stream of pages   |
>  | (iter 2)          |
>  | ...               |
>  ---------------------
>  | ...               |
>  ---------------------
>  
>
>> 
>> With fixed-ram (new):
>> 
>> ramblock 1 header|ramblock 1 fixed-ram header|ramblock 1 pages (fixed
>>  offsets)|ramblock 2 header|ramblock 2 fixed-ram header|ramblock 2
>>  pages (fixed offsets)|...|RAM_SAVE_FLAG_EOS
>
> If I'm reading the code correctly the new format has some padding
> such that each "ramblock pages" region starts on a 1 MB boundary.
>
> eg so we get:
>
>  --------------------------------
>  | ramblock 1 header            |
>  --------------------------------
>  | ramblock 1 fixed-ram header  |
>  --------------------------------
>  | padding to next 1MB boundary |
>  | ...                          |
>  --------------------------------
>  | ramblock 1 pages             |
>  | ...                          |
>  --------------------------------
>  | ramblock 2 header            |
>  --------------------------------
>  | ramblock 2 fixed-ram header  |
>  --------------------------------
>  | padding to next 1MB boundary |
>  | ...                          |
>  --------------------------------
>  | ramblock 2 pages             |
>  | ...                          |
>  --------------------------------
>  | ...                          |
>  --------------------------------
>  | RAM_SAVE_FLAG_EOS            |
>  --------------------------------
>  | ...                          |
>  -------------------------------
>
>> 
>> where:
>>  - ramblock header: the generic information for a ramblock, such as
>>    idstr, used_len, etc.
>> 
>>  - ramblock fixed-ram header: the new information added by this
>>    feature: bitmap of pages written, bitmap size and offset of pages
>>    in the migration file.
>> 
>> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>>  include/exec/ramblock.h |  8 ++++
>>  migration/options.c     |  3 --
>>  migration/ram.c         | 98 ++++++++++++++++++++++++++++++++++++-----
>>  3 files changed, 96 insertions(+), 13 deletions(-)
>> 
>> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
>> index 69c6a53902..e0e3f16852 100644
>> --- a/include/exec/ramblock.h
>> +++ b/include/exec/ramblock.h
>> @@ -44,6 +44,14 @@ struct RAMBlock {
>>      size_t page_size;
>>      /* dirty bitmap used during migration */
>>      unsigned long *bmap;
>> +    /* shadow dirty bitmap used when migrating to a file */
>> +    unsigned long *shadow_bmap;
>> +    /*
>> +     * offset in the file pages belonging to this ramblock are saved,
>> +     * used only during migration to a file.
>> +     */
>> +    off_t bitmap_offset;
>> +    uint64_t pages_offset;
>>      /* bitmap of already received pages in postcopy */
>>      unsigned long *receivedmap;
>>  
>> diff --git a/migration/options.c b/migration/options.c
>> index 2622d8c483..9f693d909f 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -271,12 +271,9 @@ bool migrate_events(void)
>>  
>>  bool migrate_fixed_ram(void)
>>  {
>> -/*
>>      MigrationState *s = migrate_get_current();
>>  
>>      return s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
>> -*/
>> -    return false;
>>  }
>>  
>>  bool migrate_ignore_shared(void)
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 92769902bb..152a03604f 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -1157,12 +1157,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
>>          return 0;
>>      }
>>  
>> +    stat64_add(&mig_stats.zero_pages, 1);
>> +
>> +    if (migrate_fixed_ram()) {
>> +        /* zero pages are not transferred with fixed-ram */
>> +        clear_bit(offset >> TARGET_PAGE_BITS, pss->block->shadow_bmap);
>> +        return 1;
>> +    }
>> +
>>      len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
>>      qemu_put_byte(file, 0);
>>      len += 1;
>>      ram_release_page(pss->block->idstr, offset);
>> -
>> -    stat64_add(&mig_stats.zero_pages, 1);
>>      ram_transferred_add(len);
>>  
>>      /*
>> @@ -1220,14 +1226,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
>>  {
>>      QEMUFile *file = pss->pss_channel;
>>  
>> -    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
>> -                                         offset | RAM_SAVE_FLAG_PAGE));
>> -    if (async) {
>> -        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
>> -                              migrate_release_ram() &&
>> -                              migration_in_postcopy());
>> +    if (migrate_fixed_ram()) {
>> +        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
>> +                           block->pages_offset + offset);
>> +        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
>>      } else {
>> -        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
>> +        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
>> +                                             offset | RAM_SAVE_FLAG_PAGE));
>> +        if (async) {
>> +            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
>> +                                  migrate_release_ram() &&
>> +                                  migration_in_postcopy());
>> +        } else {
>> +            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
>> +        }
>>      }
>>      ram_transferred_add(TARGET_PAGE_SIZE);
>>      stat64_add(&mig_stats.normal_pages, 1);
>> @@ -2475,6 +2487,8 @@ static void ram_save_cleanup(void *opaque)
>>          block->clear_bmap = NULL;
>>          g_free(block->bmap);
>>          block->bmap = NULL;
>> +        g_free(block->shadow_bmap);
>> +        block->shadow_bmap = NULL;
>>      }
>>  
>>      xbzrle_cleanup();
>> @@ -2842,6 +2856,7 @@ static void ram_list_init_bitmaps(void)
>>               */
>>              block->bmap = bitmap_new(pages);
>>              bitmap_set(block->bmap, 0, pages);
>> +            block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);
>>              block->clear_bmap_shift = shift;
>>              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>>          }
>> @@ -2979,6 +2994,44 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
>>      }
>>  }
>>  
>> +#define FIXED_RAM_HDR_VERSION 1
>> +struct FixedRamHeader {
>> +    uint32_t version;
>> +    uint64_t page_size;
>> +    uint64_t bitmap_offset;
>> +    uint64_t pages_offset;
>> +    /* end of v1 */
>> +} QEMU_PACKED;
>> +
>> +static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
>> +{
>> +    g_autofree struct FixedRamHeader *header;
>> +    size_t header_size, bitmap_size;
>> +    long num_pages;
>> +
>> +    header = g_new0(struct FixedRamHeader, 1);
>> +    header_size = sizeof(struct FixedRamHeader);
>> +
>> +    num_pages = block->used_length >> TARGET_PAGE_BITS;
>> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
>> +
>> +    /*
>> +     * Save the file offsets of where the bitmap and the pages should
>> +     * go as they are written at the end of migration and during the
>> +     * iterative phase, respectively.
>> +     */
>> +    block->bitmap_offset = qemu_get_offset(file) + header_size;
>> +    block->pages_offset = ROUND_UP(block->bitmap_offset +
>> +                                   bitmap_size, 0x100000);
>
> The 0x100000 gives us our 1 MB alignment.
>
> This is quite an important thing, so can we put a nice clear
> constant at the top of the file:
>
>   /* Align fixed-ram pages data to the next 1 MB boundary */
>   #define FIXED_RAM_PAGE_OFFSET_ALIGNMENT 0x100000
>
>> @@ -3179,7 +3252,6 @@ out:
>>          qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>>          qemu_fflush(f);
>>          ram_transferred_add(8);
>> -
>>          ret = qemu_file_get_error(f);
>>      }
>>      if (ret < 0) {
>
> Supurious whitspace change, possibly better in whatever patch
> introduced it, or dropped ?
>
>
> With regards,
> Daniel

I'll apply all the suggestions. Thanks!
Peter Xu Oct. 31, 2023, 4:52 p.m. UTC | #3
On Mon, Oct 23, 2023 at 05:35:54PM -0300, Fabiano Rosas wrote:
> From: Nikolay Borisov <nborisov@suse.com>
> 
> Implement the outgoing migration side for the 'fixed-ram' capability.
> 
> A bitmap is introduced to track which pages have been written in the
> migration file. Pages are written at a fixed location for every
> ramblock. Zero pages are ignored as they'd be zero in the destination
> migration as well.
> 
> The migration stream is altered to put the dirty pages for a ramblock
> after its header instead of having a sequential stream of pages that
> follow the ramblock headers. Since all pages have a fixed location,
> RAM_SAVE_FLAG_EOS is no longer generated on every migration iteration.
> 
> Without fixed-ram (current):
> 
> ramblock 1 header|ramblock 2 header|...|RAM_SAVE_FLAG_EOS|stream of
>  pages (iter 1)|RAM_SAVE_FLAG_EOS|stream of pages (iter 2)|...
> 
> With fixed-ram (new):
> 
> ramblock 1 header|ramblock 1 fixed-ram header|ramblock 1 pages (fixed
>  offsets)|ramblock 2 header|ramblock 2 fixed-ram header|ramblock 2
>  pages (fixed offsets)|...|RAM_SAVE_FLAG_EOS
> 
> where:
>  - ramblock header: the generic information for a ramblock, such as
>    idstr, used_len, etc.
> 
>  - ramblock fixed-ram header: the new information added by this
>    feature: bitmap of pages written, bitmap size and offset of pages
>    in the migration file.
> 
> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> ---
>  include/exec/ramblock.h |  8 ++++
>  migration/options.c     |  3 --
>  migration/ram.c         | 98 ++++++++++++++++++++++++++++++++++++-----
>  3 files changed, 96 insertions(+), 13 deletions(-)
> 
> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> index 69c6a53902..e0e3f16852 100644
> --- a/include/exec/ramblock.h
> +++ b/include/exec/ramblock.h
> @@ -44,6 +44,14 @@ struct RAMBlock {
>      size_t page_size;
>      /* dirty bitmap used during migration */
>      unsigned long *bmap;
> +    /* shadow dirty bitmap used when migrating to a file */
> +    unsigned long *shadow_bmap;
> +    /*
> +     * offset in the file pages belonging to this ramblock are saved,
> +     * used only during migration to a file.
> +     */
> +    off_t bitmap_offset;
> +    uint64_t pages_offset;
>      /* bitmap of already received pages in postcopy */
>      unsigned long *receivedmap;
>  
> diff --git a/migration/options.c b/migration/options.c
> index 2622d8c483..9f693d909f 100644
> --- a/migration/options.c
> +++ b/migration/options.c
> @@ -271,12 +271,9 @@ bool migrate_events(void)
>  
>  bool migrate_fixed_ram(void)
>  {
> -/*
>      MigrationState *s = migrate_get_current();
>  
>      return s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
> -*/
> -    return false;
>  }
>  
>  bool migrate_ignore_shared(void)
> diff --git a/migration/ram.c b/migration/ram.c
> index 92769902bb..152a03604f 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -1157,12 +1157,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
>          return 0;
>      }
>  
> +    stat64_add(&mig_stats.zero_pages, 1);

Here we keep zero page accounting, but..

> +
> +    if (migrate_fixed_ram()) {
> +        /* zero pages are not transferred with fixed-ram */
> +        clear_bit(offset >> TARGET_PAGE_BITS, pss->block->shadow_bmap);
> +        return 1;
> +    }
> +
>      len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
>      qemu_put_byte(file, 0);
>      len += 1;
>      ram_release_page(pss->block->idstr, offset);
> -
> -    stat64_add(&mig_stats.zero_pages, 1);
>      ram_transferred_add(len);
>  
>      /*
> @@ -1220,14 +1226,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
>  {
>      QEMUFile *file = pss->pss_channel;
>  
> -    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
> -                                         offset | RAM_SAVE_FLAG_PAGE));
> -    if (async) {
> -        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
> -                              migrate_release_ram() &&
> -                              migration_in_postcopy());
> +    if (migrate_fixed_ram()) {
> +        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
> +                           block->pages_offset + offset);
> +        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
>      } else {
> -        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
> +        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
> +                                             offset | RAM_SAVE_FLAG_PAGE));

.. here we ignored normal page accounting.

I think we should have the same behavior on both, perhaps keep them always?

> +        if (async) {
> +            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
> +                                  migrate_release_ram() &&
> +                                  migration_in_postcopy());
> +        } else {
> +            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
> +        }
>      }
>      ram_transferred_add(TARGET_PAGE_SIZE);
>      stat64_add(&mig_stats.normal_pages, 1);
> @@ -2475,6 +2487,8 @@ static void ram_save_cleanup(void *opaque)
>          block->clear_bmap = NULL;
>          g_free(block->bmap);
>          block->bmap = NULL;
> +        g_free(block->shadow_bmap);
> +        block->shadow_bmap = NULL;
>      }
>  
>      xbzrle_cleanup();
> @@ -2842,6 +2856,7 @@ static void ram_list_init_bitmaps(void)
>               */
>              block->bmap = bitmap_new(pages);
>              bitmap_set(block->bmap, 0, pages);
> +            block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);

AFAICT bmap should also use used_length.  How about adding one more patch
to change that, then you can use "pages" here?

See ram_mig_ram_block_resized() where we call migration_cancel() if resized.
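
i.e. roughly (sketch of the suggested follow-up patch, not in this series):

    pages = block->used_length >> TARGET_PAGE_BITS;
    block->bmap = bitmap_new(pages);
    bitmap_set(block->bmap, 0, pages);
    block->shadow_bmap = bitmap_new(pages);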

>              block->clear_bmap_shift = shift;
>              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>          }
> @@ -2979,6 +2994,44 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
>      }
>  }
>  
> +#define FIXED_RAM_HDR_VERSION 1
> +struct FixedRamHeader {
> +    uint32_t version;
> +    uint64_t page_size;
> +    uint64_t bitmap_offset;
> +    uint64_t pages_offset;
> +    /* end of v1 */
> +} QEMU_PACKED;
> +
> +static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
> +{
> +    g_autofree struct FixedRamHeader *header;
> +    size_t header_size, bitmap_size;
> +    long num_pages;
> +
> +    header = g_new0(struct FixedRamHeader, 1);
> +    header_size = sizeof(struct FixedRamHeader);
> +
> +    num_pages = block->used_length >> TARGET_PAGE_BITS;
> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> +
> +    /*
> +     * Save the file offsets of where the bitmap and the pages should
> +     * go as they are written at the end of migration and during the
> +     * iterative phase, respectively.
> +     */
> +    block->bitmap_offset = qemu_get_offset(file) + header_size;
> +    block->pages_offset = ROUND_UP(block->bitmap_offset +
> +                                   bitmap_size, 0x100000);
> +
> +    header->version = cpu_to_be32(FIXED_RAM_HDR_VERSION);
> +    header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);

This is the "page size" for the shadow bitmap, right?  Shall we state it
somewhere (e.g. explaining why it's not block->page_size)?

It's unfortunate that we already have things like:

            if (migrate_postcopy_ram() && block->page_size !=
                                          qemu_host_page_size) {
                qemu_put_be64(f, block->page_size);
            }

But indeed we can't merge them because they seem to serve different
purposes.

> +    header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
> +    header->pages_offset = cpu_to_be64(block->pages_offset);
> +
> +    qemu_put_buffer(file, (uint8_t *) header, header_size);
> +}
> +
>  /*
>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
>   * long-running RCU critical section.  When rcu-reclaims in the code
> @@ -3028,6 +3081,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>              if (migrate_ignore_shared()) {
>                  qemu_put_be64(f, block->mr->addr);
>              }
> +
> +            if (migrate_fixed_ram()) {
> +                fixed_ram_insert_header(f, block);
> +                /* prepare offset for next ramblock */
> +                qemu_set_offset(f, block->pages_offset + block->used_length, SEEK_SET);
> +            }
>          }
>      }
>  
> @@ -3061,6 +3120,20 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>      return 0;
>  }
>  
> +static void ram_save_shadow_bmap(QEMUFile *f)
> +{
> +    RAMBlock *block;
> +
> +    RAMBLOCK_FOREACH_MIGRATABLE(block) {
> +        long num_pages = block->used_length >> TARGET_PAGE_BITS;
> +        long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> +        qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size,
> +                           block->bitmap_offset);
> +        /* to catch any thread late sending pages */
> +        block->shadow_bmap = NULL;

What is this for?  Wouldn't this leak the buffer already?

> +    }
> +}
> +
>  /**
>   * ram_save_iterate: iterative stage for migration
>   *
> @@ -3179,7 +3252,6 @@ out:
>          qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
>          qemu_fflush(f);
>          ram_transferred_add(8);
> -
>          ret = qemu_file_get_error(f);
>      }
>      if (ret < 0) {
> @@ -3256,7 +3328,13 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
>      if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) {
>          qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
>      }
> +
> +    if (migrate_fixed_ram()) {
> +        ram_save_shadow_bmap(f);
> +    }
> +
>      qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
> +
>      qemu_fflush(f);
>  
>      return 0;
> -- 
> 2.35.3
>
Fabiano Rosas Oct. 31, 2023, 5:33 p.m. UTC | #4
Peter Xu <peterx@redhat.com> writes:

> On Mon, Oct 23, 2023 at 05:35:54PM -0300, Fabiano Rosas wrote:
>> From: Nikolay Borisov <nborisov@suse.com>
>> 
>> Implement the outgoing migration side for the 'fixed-ram' capability.
>> 
>> A bitmap is introduced to track which pages have been written in the
>> migration file. Pages are written at a fixed location for every
>> ramblock. Zero pages are ignored as they'd be zero in the destination
>> migration as well.
>> 
>> The migration stream is altered to put the dirty pages for a ramblock
>> after its header instead of having a sequential stream of pages that
>> follow the ramblock headers. Since all pages have a fixed location,
>> RAM_SAVE_FLAG_EOS is no longer generated on every migration iteration.
>> 
>> Without fixed-ram (current):
>> 
>> ramblock 1 header|ramblock 2 header|...|RAM_SAVE_FLAG_EOS|stream of
>>  pages (iter 1)|RAM_SAVE_FLAG_EOS|stream of pages (iter 2)|...
>> 
>> With fixed-ram (new):
>> 
>> ramblock 1 header|ramblock 1 fixed-ram header|ramblock 1 pages (fixed
>>  offsets)|ramblock 2 header|ramblock 2 fixed-ram header|ramblock 2
>>  pages (fixed offsets)|...|RAM_SAVE_FLAG_EOS
>> 
>> where:
>>  - ramblock header: the generic information for a ramblock, such as
>>    idstr, used_len, etc.
>> 
>>  - ramblock fixed-ram header: the new information added by this
>>    feature: bitmap of pages written, bitmap size and offset of pages
>>    in the migration file.
>> 
>> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
>> Signed-off-by: Fabiano Rosas <farosas@suse.de>
>> ---
>>  include/exec/ramblock.h |  8 ++++
>>  migration/options.c     |  3 --
>>  migration/ram.c         | 98 ++++++++++++++++++++++++++++++++++++-----
>>  3 files changed, 96 insertions(+), 13 deletions(-)
>> 
>> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
>> index 69c6a53902..e0e3f16852 100644
>> --- a/include/exec/ramblock.h
>> +++ b/include/exec/ramblock.h
>> @@ -44,6 +44,14 @@ struct RAMBlock {
>>      size_t page_size;
>>      /* dirty bitmap used during migration */
>>      unsigned long *bmap;
>> +    /* shadow dirty bitmap used when migrating to a file */
>> +    unsigned long *shadow_bmap;
>> +    /*
>> +     * offset in the file pages belonging to this ramblock are saved,
>> +     * used only during migration to a file.
>> +     */
>> +    off_t bitmap_offset;
>> +    uint64_t pages_offset;
>>      /* bitmap of already received pages in postcopy */
>>      unsigned long *receivedmap;
>>  
>> diff --git a/migration/options.c b/migration/options.c
>> index 2622d8c483..9f693d909f 100644
>> --- a/migration/options.c
>> +++ b/migration/options.c
>> @@ -271,12 +271,9 @@ bool migrate_events(void)
>>  
>>  bool migrate_fixed_ram(void)
>>  {
>> -/*
>>      MigrationState *s = migrate_get_current();
>>  
>>      return s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
>> -*/
>> -    return false;
>>  }
>>  
>>  bool migrate_ignore_shared(void)
>> diff --git a/migration/ram.c b/migration/ram.c
>> index 92769902bb..152a03604f 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -1157,12 +1157,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
>>          return 0;
>>      }
>>  
>> +    stat64_add(&mig_stats.zero_pages, 1);
>
> Here we keep zero page accounting, but..
>
>> +
>> +    if (migrate_fixed_ram()) {
>> +        /* zero pages are not transferred with fixed-ram */
>> +        clear_bit(offset >> TARGET_PAGE_BITS, pss->block->shadow_bmap);
>> +        return 1;
>> +    }
>> +
>>      len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
>>      qemu_put_byte(file, 0);
>>      len += 1;
>>      ram_release_page(pss->block->idstr, offset);
>> -
>> -    stat64_add(&mig_stats.zero_pages, 1);
>>      ram_transferred_add(len);
>>  
>>      /*
>> @@ -1220,14 +1226,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
>>  {
>>      QEMUFile *file = pss->pss_channel;
>>  
>> -    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
>> -                                         offset | RAM_SAVE_FLAG_PAGE));
>> -    if (async) {
>> -        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
>> -                              migrate_release_ram() &&
>> -                              migration_in_postcopy());
>> +    if (migrate_fixed_ram()) {
>> +        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
>> +                           block->pages_offset + offset);
>> +        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
>>      } else {
>> -        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
>> +        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
>> +                                             offset | RAM_SAVE_FLAG_PAGE));
>
> .. here we ignored normal page accounting.
>
> I think we should have the same behavior on both, perhaps keep them always?
>

This is the accounting for the header only, if I'm not mistaken.

>> +        if (async) {
>> +            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
>> +                                  migrate_release_ram() &&
>> +                                  migration_in_postcopy());
>> +        } else {
>> +            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
>> +        }
>>      }
>>      ram_transferred_add(TARGET_PAGE_SIZE);
>>      stat64_add(&mig_stats.normal_pages, 1);

Here's the page accounting.

>> @@ -2475,6 +2487,8 @@ static void ram_save_cleanup(void *opaque)
>>          block->clear_bmap = NULL;
>>          g_free(block->bmap);
>>          block->bmap = NULL;
>> +        g_free(block->shadow_bmap);
>> +        block->shadow_bmap = NULL;
>>      }
>>  
>>      xbzrle_cleanup();
>> @@ -2842,6 +2856,7 @@ static void ram_list_init_bitmaps(void)
>>               */
>>              block->bmap = bitmap_new(pages);
>>              bitmap_set(block->bmap, 0, pages);
>> +            block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);
>
> AFAICT bmap should also use used_length.  How about adding one more patch
> to change that, then you can use "pages" here?

It uses max_length. I don't know what the effects of that change
would be. I'll look into it.

> See ram_mig_ram_block_resized() where we call migration_cancel() if resized.
>
>>              block->clear_bmap_shift = shift;
>>              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
>>          }
>> @@ -2979,6 +2994,44 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
>>      }
>>  }
>>  
>> +#define FIXED_RAM_HDR_VERSION 1
>> +struct FixedRamHeader {
>> +    uint32_t version;
>> +    uint64_t page_size;
>> +    uint64_t bitmap_offset;
>> +    uint64_t pages_offset;
>> +    /* end of v1 */
>> +} QEMU_PACKED;
>> +
>> +static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
>> +{
>> +    g_autofree struct FixedRamHeader *header;
>> +    size_t header_size, bitmap_size;
>> +    long num_pages;
>> +
>> +    header = g_new0(struct FixedRamHeader, 1);
>> +    header_size = sizeof(struct FixedRamHeader);
>> +
>> +    num_pages = block->used_length >> TARGET_PAGE_BITS;
>> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
>> +
>> +    /*
>> +     * Save the file offsets of where the bitmap and the pages should
>> +     * go as they are written at the end of migration and during the
>> +     * iterative phase, respectively.
>> +     */
>> +    block->bitmap_offset = qemu_get_offset(file) + header_size;
>> +    block->pages_offset = ROUND_UP(block->bitmap_offset +
>> +                                   bitmap_size, 0x100000);
>> +
>> +    header->version = cpu_to_be32(FIXED_RAM_HDR_VERSION);
>> +    header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);
>
> This is the "page size" for the shadow bitmap, right?  Shall we state it
> somewhere (e.g. explaining why it's not block->page_size)?

Ok.

> It's unfortunate that we already have things like:
>
>             if (migrate_postcopy_ram() && block->page_size !=
>                                           qemu_host_page_size) {
>                 qemu_put_be64(f, block->page_size);
>             }
>
> But indeed we can't merge them because they seem to service different
> purpose.
>
>> +    header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
>> +    header->pages_offset = cpu_to_be64(block->pages_offset);
>> +
>> +    qemu_put_buffer(file, (uint8_t *) header, header_size);
>> +}
>> +
>>  /*
>>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
>>   * long-running RCU critical section.  When rcu-reclaims in the code
>> @@ -3028,6 +3081,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>              if (migrate_ignore_shared()) {
>>                  qemu_put_be64(f, block->mr->addr);
>>              }
>> +
>> +            if (migrate_fixed_ram()) {
>> +                fixed_ram_insert_header(f, block);
>> +                /* prepare offset for next ramblock */
>> +                qemu_set_offset(f, block->pages_offset + block->used_length, SEEK_SET);
>> +            }
>>          }
>>      }
>>  
>> @@ -3061,6 +3120,20 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
>>      return 0;
>>  }
>>  
>> +static void ram_save_shadow_bmap(QEMUFile *f)
>> +{
>> +    RAMBlock *block;
>> +
>> +    RAMBLOCK_FOREACH_MIGRATABLE(block) {
>> +        long num_pages = block->used_length >> TARGET_PAGE_BITS;
>> +        long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
>> +        qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size,
>> +                           block->bitmap_offset);
>> +        /* to catch any thread late sending pages */
>> +        block->shadow_bmap = NULL;
>
> What is this for?  Wouldn't this leak the buffer already?
>

Ah this is debug code. It's because of multifd. In this series I don't
use sem_sync because there's no packets. But doing so causes
multifd_send_sync_main() to return before the multifd channel has sent
all its pages. This is here so the channel crashes when writing the
bitmap.

I think it's worth keeping, but I'd have to move it to the multifd
patch and free the bitmap properly.
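
Something roughly like this, I suppose (sketch only, the real version
belongs in the multifd patch):

    RAMBLOCK_FOREACH_MIGRATABLE(block) {
        long num_pages = block->used_length >> TARGET_PAGE_BITS;
        long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);

        qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size,
                           block->bitmap_offset);
        /* done with the shadow bitmap, release it rather than just
           dropping the pointer */
        g_free(block->shadow_bmap);
        block->shadow_bmap = NULL;
    }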

Thanks!
Peter Xu Oct. 31, 2023, 5:59 p.m. UTC | #5
On Tue, Oct 31, 2023 at 02:33:04PM -0300, Fabiano Rosas wrote:
> Peter Xu <peterx@redhat.com> writes:
> 
> > On Mon, Oct 23, 2023 at 05:35:54PM -0300, Fabiano Rosas wrote:
> >> From: Nikolay Borisov <nborisov@suse.com>
> >> 
> >> Implement the outgoing migration side for the 'fixed-ram' capability.
> >> 
> >> A bitmap is introduced to track which pages have been written in the
> >> migration file. Pages are written at a fixed location for every
> >> ramblock. Zero pages are ignored as they'd be zero in the destination
> >> migration as well.
> >> 
> >> The migration stream is altered to put the dirty pages for a ramblock
> >> after its header instead of having a sequential stream of pages that
> >> follow the ramblock headers. Since all pages have a fixed location,
> >> RAM_SAVE_FLAG_EOS is no longer generated on every migration iteration.
> >> 
> >> Without fixed-ram (current):
> >> 
> >> ramblock 1 header|ramblock 2 header|...|RAM_SAVE_FLAG_EOS|stream of
> >>  pages (iter 1)|RAM_SAVE_FLAG_EOS|stream of pages (iter 2)|...
> >> 
> >> With fixed-ram (new):
> >> 
> >> ramblock 1 header|ramblock 1 fixed-ram header|ramblock 1 pages (fixed
> >>  offsets)|ramblock 2 header|ramblock 2 fixed-ram header|ramblock 2
> >>  pages (fixed offsets)|...|RAM_SAVE_FLAG_EOS
> >> 
> >> where:
> >>  - ramblock header: the generic information for a ramblock, such as
> >>    idstr, used_len, etc.
> >> 
> >>  - ramblock fixed-ram header: the new information added by this
> >>    feature: bitmap of pages written, bitmap size and offset of pages
> >>    in the migration file.
> >> 
> >> Signed-off-by: Nikolay Borisov <nborisov@suse.com>
> >> Signed-off-by: Fabiano Rosas <farosas@suse.de>
> >> ---
> >>  include/exec/ramblock.h |  8 ++++
> >>  migration/options.c     |  3 --
> >>  migration/ram.c         | 98 ++++++++++++++++++++++++++++++++++++-----
> >>  3 files changed, 96 insertions(+), 13 deletions(-)
> >> 
> >> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> >> index 69c6a53902..e0e3f16852 100644
> >> --- a/include/exec/ramblock.h
> >> +++ b/include/exec/ramblock.h
> >> @@ -44,6 +44,14 @@ struct RAMBlock {
> >>      size_t page_size;
> >>      /* dirty bitmap used during migration */
> >>      unsigned long *bmap;
> >> +    /* shadow dirty bitmap used when migrating to a file */
> >> +    unsigned long *shadow_bmap;
> >> +    /*
> >> +     * offset in the file pages belonging to this ramblock are saved,
> >> +     * used only during migration to a file.
> >> +     */
> >> +    off_t bitmap_offset;
> >> +    uint64_t pages_offset;
> >>      /* bitmap of already received pages in postcopy */
> >>      unsigned long *receivedmap;
> >>  
> >> diff --git a/migration/options.c b/migration/options.c
> >> index 2622d8c483..9f693d909f 100644
> >> --- a/migration/options.c
> >> +++ b/migration/options.c
> >> @@ -271,12 +271,9 @@ bool migrate_events(void)
> >>  
> >>  bool migrate_fixed_ram(void)
> >>  {
> >> -/*
> >>      MigrationState *s = migrate_get_current();
> >>  
> >>      return s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
> >> -*/
> >> -    return false;

One more thing: maybe we can avoid this and just assume nobody will
apply only the previous patch and cause trouble.

> >>  }
> >>  
> >>  bool migrate_ignore_shared(void)
> >> diff --git a/migration/ram.c b/migration/ram.c
> >> index 92769902bb..152a03604f 100644
> >> --- a/migration/ram.c
> >> +++ b/migration/ram.c
> >> @@ -1157,12 +1157,18 @@ static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
> >>          return 0;
> >>      }
> >>  
> >> +    stat64_add(&mig_stats.zero_pages, 1);
> >
> > Here we keep zero page accounting, but..
> >
> >> +
> >> +    if (migrate_fixed_ram()) {
> >> +        /* zero pages are not transferred with fixed-ram */
> >> +        clear_bit(offset >> TARGET_PAGE_BITS, pss->block->shadow_bmap);
> >> +        return 1;
> >> +    }
> >> +
> >>      len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
> >>      qemu_put_byte(file, 0);
> >>      len += 1;
> >>      ram_release_page(pss->block->idstr, offset);
> >> -
> >> -    stat64_add(&mig_stats.zero_pages, 1);
> >>      ram_transferred_add(len);
> >>  
> >>      /*
> >> @@ -1220,14 +1226,20 @@ static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
> >>  {
> >>      QEMUFile *file = pss->pss_channel;
> >>  
> >> -    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
> >> -                                         offset | RAM_SAVE_FLAG_PAGE));
> >> -    if (async) {
> >> -        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
> >> -                              migrate_release_ram() &&
> >> -                              migration_in_postcopy());
> >> +    if (migrate_fixed_ram()) {
> >> +        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
> >> +                           block->pages_offset + offset);
> >> +        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
> >>      } else {
> >> -        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
> >> +        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
> >> +                                             offset | RAM_SAVE_FLAG_PAGE));
> >
> > .. here we ignored normal page accounting.
> >
> > I think we should have the same behavior on both, perhaps keep them always?
> >
> 
> This is the accounting for the header only if I'm not mistaken.
> 
> >> +        if (async) {
> >> +            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
> >> +                                  migrate_release_ram() &&
> >> +                                  migration_in_postcopy());
> >> +        } else {
> >> +            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
> >> +        }
> >>      }
> >>      ram_transferred_add(TARGET_PAGE_SIZE);
> >>      stat64_add(&mig_stats.normal_pages, 1);
> 
> Here's the page accounting.

Oh, that's okay then.

> 
> >> @@ -2475,6 +2487,8 @@ static void ram_save_cleanup(void *opaque)
> >>          block->clear_bmap = NULL;
> >>          g_free(block->bmap);
> >>          block->bmap = NULL;
> >> +        g_free(block->shadow_bmap);
> >> +        block->shadow_bmap = NULL;
> >>      }
> >>  
> >>      xbzrle_cleanup();
> >> @@ -2842,6 +2856,7 @@ static void ram_list_init_bitmaps(void)
> >>               */
> >>              block->bmap = bitmap_new(pages);
> >>              bitmap_set(block->bmap, 0, pages);
> >> +            block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);
> >
> > AFAICT bmap should also use used_length.  How about adding one more patch
> > to change that, then you can use "pages" here?
> 
> It uses max_length. I don't know what are the effects of that
> change. I'll look into it.
> 
> > See ram_mig_ram_block_resized() where we call migration_cancel() if resized.
> >
> >>              block->clear_bmap_shift = shift;
> >>              block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
> >>          }
> >> @@ -2979,6 +2994,44 @@ void qemu_guest_free_page_hint(void *addr, size_t len)
> >>      }
> >>  }
> >>  
> >> +#define FIXED_RAM_HDR_VERSION 1
> >> +struct FixedRamHeader {
> >> +    uint32_t version;
> >> +    uint64_t page_size;
> >> +    uint64_t bitmap_offset;
> >> +    uint64_t pages_offset;
> >> +    /* end of v1 */
> >> +} QEMU_PACKED;
> >> +
> >> +static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
> >> +{
> >> +    g_autofree struct FixedRamHeader *header;
> >> +    size_t header_size, bitmap_size;
> >> +    long num_pages;
> >> +
> >> +    header = g_new0(struct FixedRamHeader, 1);
> >> +    header_size = sizeof(struct FixedRamHeader);
> >> +
> >> +    num_pages = block->used_length >> TARGET_PAGE_BITS;
> >> +    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> >> +
> >> +    /*
> >> +     * Save the file offsets of where the bitmap and the pages should
> >> +     * go as they are written at the end of migration and during the
> >> +     * iterative phase, respectively.
> >> +     */
> >> +    block->bitmap_offset = qemu_get_offset(file) + header_size;
> >> +    block->pages_offset = ROUND_UP(block->bitmap_offset +
> >> +                                   bitmap_size, 0x100000);
> >> +
> >> +    header->version = cpu_to_be32(FIXED_RAM_HDR_VERSION);
> >> +    header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);
> >
> > This is the "page size" for the shadow bitmap, right?  Shall we state it
> > somewhere (e.g. explaining why it's not block->page_size)?
> 
> Ok.
> 
> > It's unfortunate that we already have things like:
> >
> >             if (migrate_postcopy_ram() && block->page_size !=
> >                                           qemu_host_page_size) {
> >                 qemu_put_be64(f, block->page_size);
> >             }
> >
> > But indeed we can't merge them because they seem to service different
> > purpose.
> >
> >> +    header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
> >> +    header->pages_offset = cpu_to_be64(block->pages_offset);
> >> +
> >> +    qemu_put_buffer(file, (uint8_t *) header, header_size);
> >> +}
> >> +
> >>  /*
> >>   * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
> >>   * long-running RCU critical section.  When rcu-reclaims in the code
> >> @@ -3028,6 +3081,12 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> >>              if (migrate_ignore_shared()) {
> >>                  qemu_put_be64(f, block->mr->addr);
> >>              }
> >> +
> >> +            if (migrate_fixed_ram()) {
> >> +                fixed_ram_insert_header(f, block);
> >> +                /* prepare offset for next ramblock */
> >> +                qemu_set_offset(f, block->pages_offset + block->used_length, SEEK_SET);
> >> +            }
> >>          }
> >>      }
> >>  
> >> @@ -3061,6 +3120,20 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
> >>      return 0;
> >>  }
> >>  
> >> +static void ram_save_shadow_bmap(QEMUFile *f)
> >> +{
> >> +    RAMBlock *block;
> >> +
> >> +    RAMBLOCK_FOREACH_MIGRATABLE(block) {
> >> +        long num_pages = block->used_length >> TARGET_PAGE_BITS;
> >> +        long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
> >> +        qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size,
> >> +                           block->bitmap_offset);
> >> +        /* to catch any thread late sending pages */
> >> +        block->shadow_bmap = NULL;
> >
> > What is this for?  Wouldn't this leak the buffer already?
> >
> 
> Ah this is debug code. It's because of multifd. In this series I don't
> use sem_sync because there's no packets. But doing so causes
> multifd_send_sync_main() to return before the multifd channel has sent
> all its pages. This is here so the channel crashes when writing the
> bitmap.
> 
> I think it's worth it to keep it but I'd have to move it to the multifd
> patch and free the bitmap properly.

Ok, I'll keep reading.
Peter Xu Nov. 1, 2023, 3:23 p.m. UTC | #6
On Wed, Oct 25, 2023 at 10:39:58AM +0100, Daniel P. Berrangé wrote:
> If I'm reading the code correctly the new format has some padding
> such that each "ramblock pages" region starts on a 1 MB boundary.
> 
> eg so we get:
> 
>  --------------------------------
>  | ramblock 1 header            |
>  --------------------------------
>  | ramblock 1 fixed-ram header  |
>  --------------------------------
>  | padding to next 1MB boundary |
>  | ...                          |
>  --------------------------------
>  | ramblock 1 pages             |
>  | ...                          |
>  --------------------------------
>  | ramblock 2 header            |
>  --------------------------------
>  | ramblock 2 fixed-ram header  |
>  --------------------------------
>  | padding to next 1MB boundary |
>  | ...                          |
>  --------------------------------
>  | ramblock 2 pages             |
>  | ...                          |
>  --------------------------------
>  | ...                          |
>  --------------------------------
>  | RAM_SAVE_FLAG_EOS            |
>  --------------------------------
>  | ...                          |
>  -------------------------------

When reading the series, I was wondering about one more thing: whether
fixed-ram would want to leverage compression in the future.

To be exact, not really fixed-ram as a feature, but non-live snapshot as
the real use case.  More below.

I just noticed that compression can be a great feature to have for such a
use case, where the image size can be further shrunk noticeably.  In this
case, speed of savevm may not matter as much as image size (as compression
can take some more cpu overhead): VM will be stopped anyway.

With current fixed-ram layout, we probably can't have compression due to
two reasons:

  - We offset each page with page alignment in the final image, and that's
    where fixed-ram as the term comes from; more fundamentally,

  - We allow src VM to run (dropping auto-pause as the plan, even if we
    plan to guarantee it not run; QEMU still can't take that as
    guaranteed), then we need page granularity when storing pages, and
    then it's hard to know the size of each page after compression.

With the guarantee that the VM is stopped, I think compression should be
easy to get.  Because right after dropping the page-granule requirement, we
can compress in chunks, storing the compressed data in the image, with each
page written only once.
We may lose O_DIRECT but we can consider the hardware accelerators on
[de]compress if necessary.
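
Just to illustrate the idea (a very rough sketch with plain zlib, not a
proposal for this series): a run of pages could be compressed into one
variable-sized record and written sequentially, e.g.:

    /* sketch: uses <zlib.h> plus the existing QEMUFile helpers */
    static ssize_t put_compressed_chunk(QEMUFile *f, const uint8_t *pages,
                                        size_t chunk_len)
    {
        uLongf out_len = compressBound(chunk_len);
        g_autofree Bytef *out = g_malloc(out_len);

        if (compress2(out, &out_len, pages, chunk_len, Z_BEST_SPEED) != Z_OK) {
            return -1;
        }
        /* record length first, compressed payload after; no fixed offsets */
        qemu_put_be64(f, out_len);
        qemu_put_buffer(f, out, out_len);
        return out_len;
    }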

I'm just raising this for discussion to make sure we're on the same page.
Again, maybe that's not a concern to anyone, but I want to double check
with all of us, because it will affect the whole design including the image
layout.  I want to make sure we don't introduce another totally different
image layout in the near future just to support compression.

Thanks,
Daniel P. Berrangé Nov. 1, 2023, 3:52 p.m. UTC | #7
On Wed, Nov 01, 2023 at 11:23:37AM -0400, Peter Xu wrote:
> On Wed, Oct 25, 2023 at 10:39:58AM +0100, Daniel P. Berrangé wrote:
> > If I'm reading the code correctly the new format has some padding
> > such that each "ramblock pages" region starts on a 1 MB boundary.
> > 
> > eg so we get:
> > 
> >  --------------------------------
> >  | ramblock 1 header            |
> >  --------------------------------
> >  | ramblock 1 fixed-ram header  |
> >  --------------------------------
> >  | padding to next 1MB boundary |
> >  | ...                          |
> >  --------------------------------
> >  | ramblock 1 pages             |
> >  | ...                          |
> >  --------------------------------
> >  | ramblock 2 header            |
> >  --------------------------------
> >  | ramblock 2 fixed-ram header  |
> >  --------------------------------
> >  | padding to next 1MB boundary |
> >  | ...                          |
> >  --------------------------------
> >  | ramblock 2 pages             |
> >  | ...                          |
> >  --------------------------------
> >  | ...                          |
> >  --------------------------------
> >  | RAM_SAVE_FLAG_EOS            |
> >  --------------------------------
> >  | ...                          |
> >  -------------------------------
> 
> When reading the series, I was thinking one more thing on whether fixed-ram
> would like to leverage compression in the future?

Libvirt currently supports compression of saved state images, so yes,
I think compression is a desirable feature.

Due to libvirt's architecture it does compression on the stream, and
the final step in the sequence bounce-buffers the data into suitably
aligned memory as required for O_DIRECT.

> To be exact, not really fixed-ram as a feature, but non-live snapshot as
> the real use case.  More below.
> 
> I just noticed that compression can be a great feature to have for such use
> case, where the image size can be further shrinked noticeably.  In this
> case, speed of savevm may not matter as much as image size (as compression
> can take some more cpu overhead): VM will be stopped anyway.
> 
> With current fixed-ram layout, we probably can't have compression due to
> two reasons:
> 
>   - We offset each page with page alignment in the final image, and that's
>     where fixed-ram as the term comes from; more fundamentally,
> 
>   - We allow src VM to run (dropping auto-pause as the plan, even if we
>     plan to guarantee it not run; QEMU still can't take that as
>     guaranteed), then we need page granule on storing pages, and then it's
>     hard to know the size of each page after compressed.
> 
> If with the guarantee that VM is stopped, I think compression should be
> easy to get?  Because right after dropping the page-granule requirement, we
> can compress in chunks, storing binary in the image, one page written once.
> We may lose O_DIRECT but we can consider the hardware accelerators on
> [de]compress if necessary.

We can keep O_DIRECT if we buffer in QEMU between compressor output
and disk I/O, which is what libvirt does. QEMU would still be saving
at least one extra copy compared to libvirt.


The fixed RAM layout was primarily intended to allow easy parallel
I/O without needing any synchronization between threads. In theory the
fixed RAM layout even allows you to do something fun like:

   maped_addr = mmap(save-stat-fd, offset, ramblocksize);
   memcpy(ramblock, maped_addr, ramblocksize)
   munmap(maped_addr)

which would still be buffered I/O without O_DIRECT, but might be better
than many writes() as you avoid 1000's of syscalls.
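
(Spelled out a bit more, as an illustrative sketch only; names and error
handling are assumptions. It relies on pages_offset being padded to a
1MB boundary, which also satisfies mmap's page-alignment requirement,
and it ignores the bitmap of written pages for brevity.)

  #include <string.h>
  #include <sys/mman.h>
  #include <sys/types.h>

  /* Read one ramblock's pages straight out of the save file by mapping
   * the fixed-offset region and copying it into guest memory. */
  static int restore_block_mmap(int save_fd, void *host_addr,
                                size_t block_size, off_t pages_offset)
  {
      void *mapped = mmap(NULL, block_size, PROT_READ, MAP_PRIVATE,
                          save_fd, pages_offset);

      if (mapped == MAP_FAILED) {
          return -1;
      }
      memcpy(host_addr, mapped, block_size);
      munmap(mapped, block_size);
      return 0;
  }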

Anyway, back to compression: I think if you wanted to allow for parallel
I/O, then it would require a different "fixed ram" approach, where each
multifd thread requested use of a 64 MB region, compressed until that
was full, then asked for another 64 MB region, repeating until done.

The reason we didn't want to break up the file format into regions like
this is because we wanted to allow for flexibility in configuration on
save / restore. e.g. you might save using 7 threads, but restore using
3 threads. We didn't want the on-disk layout to have any structural
artifact that was related to the number of threads saving data, as that
would make restore less efficient. e.g. 2 threads would process 2 chunks
each and 1 thread would process 3 chunks, which is unbalanced.

With regards,
Daniel
Peter Xu Nov. 1, 2023, 4:24 p.m. UTC | #8
On Wed, Nov 01, 2023 at 03:52:18PM +0000, Daniel P. Berrangé wrote:
> On Wed, Nov 01, 2023 at 11:23:37AM -0400, Peter Xu wrote:
> > On Wed, Oct 25, 2023 at 10:39:58AM +0100, Daniel P. Berrangé wrote:
> > > If I'm reading the code correctly the new format has some padding
> > > such that each "ramblock pages" region starts on a 1 MB boundary.
> > > 
> > > eg so we get:
> > > 
> > >  --------------------------------
> > >  | ramblock 1 header            |
> > >  --------------------------------
> > >  | ramblock 1 fixed-ram header  |
> > >  --------------------------------
> > >  | padding to next 1MB boundary |
> > >  | ...                          |
> > >  --------------------------------
> > >  | ramblock 1 pages             |
> > >  | ...                          |
> > >  --------------------------------
> > >  | ramblock 2 header            |
> > >  --------------------------------
> > >  | ramblock 2 fixed-ram header  |
> > >  --------------------------------
> > >  | padding to next 1MB boundary |
> > >  | ...                          |
> > >  --------------------------------
> > >  | ramblock 2 pages             |
> > >  | ...                          |
> > >  --------------------------------
> > >  | ...                          |
> > >  --------------------------------
> > >  | RAM_SAVE_FLAG_EOS            |
> > >  --------------------------------
> > >  | ...                          |
> > >  -------------------------------
> > 
> > When reading the series, I was thinking one more thing on whether fixed-ram
> > would like to leverage compression in the future?
> 
> Libvirt currently supports compression of saved state images, so yes,
> I think compression is a desirable feature.

Ah, yeah, this will work too; one more copy, as you mentioned below, but
I assume that's not a major concern so far (or... will it?).

> 
> Due to libvirt's architecture it does compression on the stream and
> the final step in the sequence bounc buffers into suitably aligned
> memory required for O_DIRECT.
> 
> > To be exact, not really fixed-ram as a feature, but non-live snapshot as
> > the real use case.  More below.
> > 
> > I just noticed that compression can be a great feature to have for such use
> > case, where the image size can be further shrinked noticeably.  In this
> > case, speed of savevm may not matter as much as image size (as compression
> > can take some more cpu overhead): VM will be stopped anyway.
> > 
> > With current fixed-ram layout, we probably can't have compression due to
> > two reasons:
> > 
> >   - We offset each page with page alignment in the final image, and that's
> >     where fixed-ram as the term comes from; more fundamentally,
> > 
> >   - We allow src VM to run (dropping auto-pause as the plan, even if we
> >     plan to guarantee it not run; QEMU still can't take that as
> >     guaranteed), then we need page granule on storing pages, and then it's
> >     hard to know the size of each page after compressed.
> > 
> > If with the guarantee that VM is stopped, I think compression should be
> > easy to get?  Because right after dropping the page-granule requirement, we
> > can compress in chunks, storing binary in the image, one page written once.
> > We may lose O_DIRECT but we can consider the hardware accelerators on
> > [de]compress if necessary.
> 
> We can keep O_DIRECT if we buffer in QEMU between compressor output
> and disk I/O, which is what libvirt does. QEMU would still be saving
> at least one extra copy compared to libvirt
> 
> 
> The fixed RAM layout was primarily intended to allow easy parallel
> I/O without needing any synchronization between threads. In theory
> fixed RAM layout even allows you todo something fun like
> 
>    maped_addr = mmap(save-stat-fd, offset, ramblocksize);
>    memcpy(ramblock, maped_addr, ramblocksize)
>    munmap(maped_addr)
> 
> which would still be buffered I/O without O_DIRECT, but might be better
> than many writes() as you avoid 1000's of syscalls.
> 
> Anyway back to compression, I think if you wanted to allow for parallel
> I/O, then it would require a different "fixed ram" approach, where each
> multifd  thread requested use of a 64 MB region, compressed until that
> was full, then asked for another 64 MB region, repeat until done.

Right, we need a constant buffer per-thread if so.

> 
> The reason we didn't want to break up the file format into regions like
> this is because we wanted to allow for flexbility into configuration on
> save / restore. eg  you might save using 7 threads, but restore using
> 3 threads. We didn't want the on-disk layout to have any structural
> artifact that was related to the number of threads saving data, as that
> would make restore less efficient. eg 2 threads would process 2 chunks
> each and  and 1 thread would process 3 chunks, which is unbalanced.

I didn't follow on why the image needs to contain thread number
information.

Can the sub-header for each compressed chunk be described as (assuming
it sits under a specific ramblock header, so the ramblock is known):

  - size of compressed data
  - (start_offset, end_offset) of pages this chunk of data represents

Then, when saving, we assign 64M to each thread no matter how many there
are; each thread first compresses its 64M into binary, and knowing the
size, requests a writeback to the image, with the chunk header and the
binary data flushed.

Then the final image will be a sequence of chunks for each ramblock.

Assuming decompress can do the same by assigning different chunks to each
decompress thread, no matter how many there are.

Would that work?
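
(To make that concrete, a purely hypothetical sketch of such a per-chunk
sub-header; the struct name, field names and widths are made up:)

  #include <stdint.h>

  /* One of these would precede each compressed chunk in the image,
   * under the ramblock it belongs to. */
  struct FixedRamCompressedChunk {
      uint64_t compressed_size;  /* bytes of compressed data following */
      uint64_t start_offset;     /* first page offset covered, in the ramblock */
      uint64_t end_offset;       /* end of the page range covered */
  } __attribute__((packed));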

To go back to the original topic: I think it's fine if libvirt does the
compression; that is indeed more flexible, doing it per-file with whatever
compression algorithm the user wants, and it even covers non-RAM data.

I think such considerations / thoughts on the compression solution would
also be nice to document in docs/ under this feature.

Thanks,
Daniel P. Berrangé Nov. 1, 2023, 4:37 p.m. UTC | #9
On Wed, Nov 01, 2023 at 12:24:22PM -0400, Peter Xu wrote:
> On Wed, Nov 01, 2023 at 03:52:18PM +0000, Daniel P. Berrangé wrote:
> > On Wed, Nov 01, 2023 at 11:23:37AM -0400, Peter Xu wrote:
> > > On Wed, Oct 25, 2023 at 10:39:58AM +0100, Daniel P. Berrangé wrote:
> > > > If I'm reading the code correctly the new format has some padding
> > > > such that each "ramblock pages" region starts on a 1 MB boundary.
> > > > 
> > > > eg so we get:
> > > > 
> > > >  --------------------------------
> > > >  | ramblock 1 header            |
> > > >  --------------------------------
> > > >  | ramblock 1 fixed-ram header  |
> > > >  --------------------------------
> > > >  | padding to next 1MB boundary |
> > > >  | ...                          |
> > > >  --------------------------------
> > > >  | ramblock 1 pages             |
> > > >  | ...                          |
> > > >  --------------------------------
> > > >  | ramblock 2 header            |
> > > >  --------------------------------
> > > >  | ramblock 2 fixed-ram header  |
> > > >  --------------------------------
> > > >  | padding to next 1MB boundary |
> > > >  | ...                          |
> > > >  --------------------------------
> > > >  | ramblock 2 pages             |
> > > >  | ...                          |
> > > >  --------------------------------
> > > >  | ...                          |
> > > >  --------------------------------
> > > >  | RAM_SAVE_FLAG_EOS            |
> > > >  --------------------------------
> > > >  | ...                          |
> > > >  -------------------------------
> > > 
> > > When reading the series, I was thinking one more thing on whether fixed-ram
> > > would like to leverage compression in the future?
> > 
> > Libvirt currently supports compression of saved state images, so yes,
> > I think compression is a desirable feature.
> 
> Ah, yeah this will work too; one more copy as you mentioned below, but
> assume that's not a major concern so far (or.. will it?).
> 
> > 
> > Due to libvirt's architecture it does compression on the stream and
> > the final step in the sequence bounc buffers into suitably aligned
> > memory required for O_DIRECT.
> > 
> > > To be exact, not really fixed-ram as a feature, but non-live snapshot as
> > > the real use case.  More below.
> > > 
> > > I just noticed that compression can be a great feature to have for such use
> > > case, where the image size can be further shrinked noticeably.  In this
> > > case, speed of savevm may not matter as much as image size (as compression
> > > can take some more cpu overhead): VM will be stopped anyway.
> > > 
> > > With current fixed-ram layout, we probably can't have compression due to
> > > two reasons:
> > > 
> > >   - We offset each page with page alignment in the final image, and that's
> > >     where fixed-ram as the term comes from; more fundamentally,
> > > 
> > >   - We allow src VM to run (dropping auto-pause as the plan, even if we
> > >     plan to guarantee it not run; QEMU still can't take that as
> > >     guaranteed), then we need page granule on storing pages, and then it's
> > >     hard to know the size of each page after compressed.
> > > 
> > > If with the guarantee that VM is stopped, I think compression should be
> > > easy to get?  Because right after dropping the page-granule requirement, we
> > > can compress in chunks, storing binary in the image, one page written once.
> > > We may lose O_DIRECT but we can consider the hardware accelerators on
> > > [de]compress if necessary.
> > 
> > We can keep O_DIRECT if we buffer in QEMU between compressor output
> > and disk I/O, which is what libvirt does. QEMU would still be saving
> > at least one extra copy compared to libvirt
> > 
> > 
> > The fixed RAM layout was primarily intended to allow easy parallel
> > I/O without needing any synchronization between threads. In theory
> > fixed RAM layout even allows you todo something fun like
> > 
> >    maped_addr = mmap(save-stat-fd, offset, ramblocksize);
> >    memcpy(ramblock, maped_addr, ramblocksize)
> >    munmap(maped_addr)
> > 
> > which would still be buffered I/O without O_DIRECT, but might be better
> > than many writes() as you avoid 1000's of syscalls.
> > 
> > Anyway back to compression, I think if you wanted to allow for parallel
> > I/O, then it would require a different "fixed ram" approach, where each
> > multifd  thread requested use of a 64 MB region, compressed until that
> > was full, then asked for another 64 MB region, repeat until done.
> 
> Right, we need a constant buffer per-thread if so.
> 
> > 
> > The reason we didn't want to break up the file format into regions like
> > this is because we wanted to allow for flexbility into configuration on
> > save / restore. eg  you might save using 7 threads, but restore using
> > 3 threads. We didn't want the on-disk layout to have any structural
> > artifact that was related to the number of threads saving data, as that
> > would make restore less efficient. eg 2 threads would process 2 chunks
> > each and  and 1 thread would process 3 chunks, which is unbalanced.
> 
> I didn't follow on why the image needs to contain thread number
> information.

It doesn't contain thread number information directly, but it can
be implicit from the data layout.

If you want parallel I/O, each thread has to know it is the only
one reading/writing to a particular region of the file. With the
fixed RAM layout in this series, the file offset directly maps
to the memory region. So if a thread has been given a guest page
to save it knows it will be the only thing writing to the file
at that offset. There is no relationship at all between the
number of threads and the file layout.

If you can't directly map pages to file offsets, then you need
some other way to lay out data such that each thread can safely
write. If you split up a file based on fixed size chunks, then
the number of chunks you end up with in the file is likely to be
a multiple of the number of threads you had saving data.

This means if you restore using a different number of threads,
you can't evenly assign file chunks to each restore thread.

There's no info about thread IDs in the file, but the data layout
reflects how many threads were doing the work.

> Assuming decompress can do the same by assigning different chunks to each
> decompress thread, no matter how many are there.
> 
> Would that work?

Again you get uneven workloads if the number of restore threads is
different than the save threads, as some threads will have to process
more chunks than other threads. If the chunks are small this might
not matter, if they are big it could matter.

With regards,
Daniel
Peter Xu Nov. 1, 2023, 5:30 p.m. UTC | #10
On Wed, Nov 01, 2023 at 04:37:12PM +0000, Daniel P. Berrangé wrote:
> It doesn't contain thread number information directly, but it can
> be implicit from the data layout.
> 
> If you want parallel I/O, each thread has to know it is the only
> one reading/writing to a particular region of the file. With the
> fixed RAM layout in this series, the file offset directly maps
> to the memory region. So if a thread has been given a guest page
> to save it knows it will be the only thing writing to the file
> at that offset. There is no relationship at all between the
> number of threads and the file layout.
> 
> If you can't directly map pages to file offsets, then you need
> some other way to lay out date such that each thread can safely
> write. If you split up a file based on fixed size chunks, then
> the number of chunks you end up with in the file is likely to be
> a multiple of the number of threads you had saving data.

What I was thinking is to provision fixed-size chunks in ramblock address
space, e.g. 64M worth of pages for each thread.  Each thread compresses
into a local buffer, then requests the file offset to write to only after
the compression has completed, because we need the compressed size to
request the file offset.

> 
> This means if you restore using a different number of threads,
> you can't evenly assign file chunks to each restore thread.
> 
> There's no info about thread IDs in the file, but the data layout
> reflects how mcuh threads were doing work.
> 
> > Assuming decompress can do the same by assigning different chunks to each
> > decompress thread, no matter how many are there.
> > 
> > Would that work?
> 
> Again you get uneven workloads if the number of restore threads is
> different than the save threads, as some threads will have to process
> more chunks than other threads. If the chunks are small this might
> not matter, if they are big it could matter.

Maybe you meant the case where the chunk size is calculated only from the
number of threads, and the chunks are very large?

If we have fixed-size ramblock chunks, the number of chunks is mostly
irrelevant, e.g. a 4G guest would contain 4G/64M = 64 chunks.  64 chunks
can easily be decompressed concurrently with pretty much whatever number
of recv threads.

Parallel IO is not a problem either, afaict, if each thread can request
its own file offset to read/write.  The write side is a bit tricky with
what I said above: the file offset can only be requested and exclusively
assigned to the writer thread after compression has finished and the
thread knows how many bytes it needs for the result.  On the read side we
know the binary size of each chunk, so we can already mark each chunk
exclusive to its reader thread.
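
(As a sketch of that write side, assuming a shared cursor into the image
that writers advance atomically once the compressed size is known; the
names are made up:)

  #include <stdatomic.h>
  #include <stdint.h>
  #include <sys/types.h>
  #include <unistd.h>

  static _Atomic uint64_t image_cursor;  /* next free byte in the image */

  /* Called after compression: reserve a disjoint file region sized for
   * this chunk, then write the chunk header and the compressed data. */
  static ssize_t write_compressed_chunk(int fd,
                                        const void *hdr, size_t hdr_len,
                                        const void *data, size_t data_len)
  {
      uint64_t off = atomic_fetch_add(&image_cursor, hdr_len + data_len);

      if (pwrite(fd, hdr, hdr_len, off) < 0 ||
          pwrite(fd, data, data_len, off + hdr_len) < 0) {
          return -1;
      }
      return hdr_len + data_len;
  }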

Thanks,
diff mbox series

Patch

diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 69c6a53902..e0e3f16852 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -44,6 +44,14 @@  struct RAMBlock {
     size_t page_size;
     /* dirty bitmap used during migration */
     unsigned long *bmap;
+    /* shadow dirty bitmap used when migrating to a file */
+    unsigned long *shadow_bmap;
+    /*
+     * offset in the file pages belonging to this ramblock are saved,
+     * used only during migration to a file.
+     */
+    off_t bitmap_offset;
+    uint64_t pages_offset;
     /* bitmap of already received pages in postcopy */
     unsigned long *receivedmap;
 
diff --git a/migration/options.c b/migration/options.c
index 2622d8c483..9f693d909f 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -271,12 +271,9 @@  bool migrate_events(void)
 
 bool migrate_fixed_ram(void)
 {
-/*
     MigrationState *s = migrate_get_current();
 
     return s->capabilities[MIGRATION_CAPABILITY_FIXED_RAM];
-*/
-    return false;
 }
 
 bool migrate_ignore_shared(void)
diff --git a/migration/ram.c b/migration/ram.c
index 92769902bb..152a03604f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1157,12 +1157,18 @@  static int save_zero_page(RAMState *rs, PageSearchStatus *pss,
         return 0;
     }
 
+    stat64_add(&mig_stats.zero_pages, 1);
+
+    if (migrate_fixed_ram()) {
+        /* zero pages are not transferred with fixed-ram */
+        clear_bit(offset >> TARGET_PAGE_BITS, pss->block->shadow_bmap);
+        return 1;
+    }
+
     len += save_page_header(pss, file, pss->block, offset | RAM_SAVE_FLAG_ZERO);
     qemu_put_byte(file, 0);
     len += 1;
     ram_release_page(pss->block->idstr, offset);
-
-    stat64_add(&mig_stats.zero_pages, 1);
     ram_transferred_add(len);
 
     /*
@@ -1220,14 +1226,20 @@  static int save_normal_page(PageSearchStatus *pss, RAMBlock *block,
 {
     QEMUFile *file = pss->pss_channel;
 
-    ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
-                                         offset | RAM_SAVE_FLAG_PAGE));
-    if (async) {
-        qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
-                              migrate_release_ram() &&
-                              migration_in_postcopy());
+    if (migrate_fixed_ram()) {
+        qemu_put_buffer_at(file, buf, TARGET_PAGE_SIZE,
+                           block->pages_offset + offset);
+        set_bit(offset >> TARGET_PAGE_BITS, block->shadow_bmap);
     } else {
-        qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
+        ram_transferred_add(save_page_header(pss, pss->pss_channel, block,
+                                             offset | RAM_SAVE_FLAG_PAGE));
+        if (async) {
+            qemu_put_buffer_async(file, buf, TARGET_PAGE_SIZE,
+                                  migrate_release_ram() &&
+                                  migration_in_postcopy());
+        } else {
+            qemu_put_buffer(file, buf, TARGET_PAGE_SIZE);
+        }
     }
     ram_transferred_add(TARGET_PAGE_SIZE);
     stat64_add(&mig_stats.normal_pages, 1);
@@ -2475,6 +2487,8 @@  static void ram_save_cleanup(void *opaque)
         block->clear_bmap = NULL;
         g_free(block->bmap);
         block->bmap = NULL;
+        g_free(block->shadow_bmap);
+        block->shadow_bmap = NULL;
     }
 
     xbzrle_cleanup();
@@ -2842,6 +2856,7 @@  static void ram_list_init_bitmaps(void)
              */
             block->bmap = bitmap_new(pages);
             bitmap_set(block->bmap, 0, pages);
+            block->shadow_bmap = bitmap_new(block->used_length >> TARGET_PAGE_BITS);
             block->clear_bmap_shift = shift;
             block->clear_bmap = bitmap_new(clear_bmap_size(pages, shift));
         }
@@ -2979,6 +2994,44 @@  void qemu_guest_free_page_hint(void *addr, size_t len)
     }
 }
 
+#define FIXED_RAM_HDR_VERSION 1
+struct FixedRamHeader {
+    uint32_t version;
+    uint64_t page_size;
+    uint64_t bitmap_offset;
+    uint64_t pages_offset;
+    /* end of v1 */
+} QEMU_PACKED;
+
+static void fixed_ram_insert_header(QEMUFile *file, RAMBlock *block)
+{
+    g_autofree struct FixedRamHeader *header;
+    size_t header_size, bitmap_size;
+    long num_pages;
+
+    header = g_new0(struct FixedRamHeader, 1);
+    header_size = sizeof(struct FixedRamHeader);
+
+    num_pages = block->used_length >> TARGET_PAGE_BITS;
+    bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
+
+    /*
+     * Save the file offsets of where the bitmap and the pages should
+     * go as they are written at the end of migration and during the
+     * iterative phase, respectively.
+     */
+    block->bitmap_offset = qemu_get_offset(file) + header_size;
+    block->pages_offset = ROUND_UP(block->bitmap_offset +
+                                   bitmap_size, 0x100000);
+
+    header->version = cpu_to_be32(FIXED_RAM_HDR_VERSION);
+    header->page_size = cpu_to_be64(TARGET_PAGE_SIZE);
+    header->bitmap_offset = cpu_to_be64(block->bitmap_offset);
+    header->pages_offset = cpu_to_be64(block->pages_offset);
+
+    qemu_put_buffer(file, (uint8_t *) header, header_size);
+}
+
 /*
  * Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.  When rcu-reclaims in the code
@@ -3028,6 +3081,12 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
             if (migrate_ignore_shared()) {
                 qemu_put_be64(f, block->mr->addr);
             }
+
+            if (migrate_fixed_ram()) {
+                fixed_ram_insert_header(f, block);
+                /* prepare offset for next ramblock */
+                qemu_set_offset(f, block->pages_offset + block->used_length, SEEK_SET);
+            }
         }
     }
 
@@ -3061,6 +3120,20 @@  static int ram_save_setup(QEMUFile *f, void *opaque)
     return 0;
 }
 
+static void ram_save_shadow_bmap(QEMUFile *f)
+{
+    RAMBlock *block;
+
+    RAMBLOCK_FOREACH_MIGRATABLE(block) {
+        long num_pages = block->used_length >> TARGET_PAGE_BITS;
+        long bitmap_size = BITS_TO_LONGS(num_pages) * sizeof(unsigned long);
+        qemu_put_buffer_at(f, (uint8_t *)block->shadow_bmap, bitmap_size,
+                           block->bitmap_offset);
+        /* to catch any thread late sending pages */
+        block->shadow_bmap = NULL;
+    }
+}
+
 /**
  * ram_save_iterate: iterative stage for migration
  *
@@ -3179,7 +3252,6 @@  out:
         qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
         qemu_fflush(f);
         ram_transferred_add(8);
-
         ret = qemu_file_get_error(f);
     }
     if (ret < 0) {
@@ -3256,7 +3328,13 @@  static int ram_save_complete(QEMUFile *f, void *opaque)
     if (migrate_multifd() && !migrate_multifd_flush_after_each_section()) {
         qemu_put_be64(f, RAM_SAVE_FLAG_MULTIFD_FLUSH);
     }
+
+    if (migrate_fixed_ram()) {
+        ram_save_shadow_bmap(f);
+    }
+
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
+
     qemu_fflush(f);
 
     return 0;