diff mbox series

[v5,5/7] migration/multifd: implement initialization of qpl compression

Message ID 20240319164527.1873891-6-yuan1.liu@intel.com
State New
Headers show
Series Live Migration With IAA | expand

Commit Message

Yuan Liu March 19, 2024, 4:45 p.m. UTC
The QPL initialization includes memory allocation for compressed
data and the QPL job initialization.

The QPL initialization checks whether the In-Memory Analytics
Accelerator (IAA) hardware is available. If the platform does not
have IAA hardware, or the IAA hardware is not available, the QPL
compression initialization will fail.

Signed-off-by: Yuan Liu <yuan1.liu@intel.com>
Reviewed-by: Nanhai Zou <nanhai.zou@intel.com>
---
 migration/multifd-qpl.c | 243 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 242 insertions(+), 1 deletion(-)

Comments

Daniel P. Berrangé March 20, 2024, 10:42 a.m. UTC | #1
On Wed, Mar 20, 2024 at 12:45:25AM +0800, Yuan Liu wrote:
> the qpl initialization includes memory allocation for compressed
> data and the qpl job initialization.
> 
> the qpl initialization will check whether the In-Memory Analytics
> Accelerator(IAA) hardware is available, if the platform does not
> have IAA hardware or the IAA hardware is not available, the QPL
> compression initialization will fail.
> 
> Signed-off-by: Yuan Liu <yuan1.liu@intel.com>
> Reviewed-by: Nanhai Zou <nanhai.zou@intel.com>
> ---
>  migration/multifd-qpl.c | 243 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 242 insertions(+), 1 deletion(-)
> 
> diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> index 056a68a060..6de65e9da7 100644
> --- a/migration/multifd-qpl.c
> +++ b/migration/multifd-qpl.c
> @@ -9,12 +9,253 @@
>   * This work is licensed under the terms of the GNU GPL, version 2 or later.
>   * See the COPYING file in the top-level directory.
>   */
> +
>  #include "qemu/osdep.h"
>  #include "qemu/module.h"
> +#include "qapi/error.h"
> +#include "migration.h"
> +#include "multifd.h"
> +#include "qpl/qpl.h"
> +
> +typedef struct {
> +    qpl_job **job_array;
> +    /* the number of allocated jobs */
> +    uint32_t job_num;
> +    /* the size of data processed by a qpl job */
> +    uint32_t data_size;
> +    /* compressed data buffer */
> +    uint8_t *zbuf;
> +    /* the length of compressed data */
> +    uint32_t *zbuf_hdr;
> +} QplData;
> +
> +static void free_zbuf(QplData *qpl)
> +{
> +    if (qpl->zbuf != NULL) {
> +        munmap(qpl->zbuf, qpl->job_num * qpl->data_size);
> +        qpl->zbuf = NULL;
> +    }
> +    if (qpl->zbuf_hdr != NULL) {
> +        g_free(qpl->zbuf_hdr);
> +        qpl->zbuf_hdr = NULL;
> +    }
> +}
> +
> +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> +{
> +    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> +    uint32_t size = qpl->job_num * qpl->data_size;
> +    uint8_t *buf;
> +
> +    buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
> +    if (buf == MAP_FAILED) {
> +        error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u, size %u",
> +                   chan_id, qpl->job_num, qpl->data_size);
> +        return -1;
> +    }

What's the reason for using mmap here, rather than a normal
malloc ?

> +    qpl->zbuf = buf;
> +    qpl->zbuf_hdr = g_new0(uint32_t, qpl->job_num);
> +    return 0;
> +}
> +
> +static void free_jobs(QplData *qpl)
> +{
> +    for (int i = 0; i < qpl->job_num; i++) {
> +        qpl_fini_job(qpl->job_array[i]);
> +        g_free(qpl->job_array[i]);
> +        qpl->job_array[i] = NULL;
> +    }
> +    g_free(qpl->job_array);
> +    qpl->job_array = NULL;
> +}
> +
> +static int alloc_jobs(QplData *qpl, uint8_t chan_id, Error **errp)
> +{
> +    qpl_status status;
> +    uint32_t job_size = 0;
> +    qpl_job *job = NULL;
> +    /* always use IAA hardware accelerator */
> +    qpl_path_t path = qpl_path_hardware;
> +
> +    status = qpl_get_job_size(path, &job_size);
> +    if (status != QPL_STS_OK) {
> +        error_setg(errp, "multifd: %u: qpl_get_job_size failed with error %d",
> +                   chan_id, status);
> +        return -1;
> +    }
> +    qpl->job_array = g_new0(qpl_job *, qpl->job_num);
> +    for (int i = 0; i < qpl->job_num; i++) {
> +        job = g_malloc0(job_size);
> +        status = qpl_init_job(path, job);
> +        if (status != QPL_STS_OK) {
> +            error_setg(errp, "multifd: %u: qpl_init_job failed with error %d",
> +                       chan_id, status);
> +            free_jobs(qpl);
> +            return -1;
> +        }
> +        qpl->job_array[i] = job;
> +    }
> +    return 0;
> +}
> +
> +static int init_qpl(QplData *qpl, uint32_t job_num, uint32_t data_size,
> +                    uint8_t chan_id, Error **errp)
> +{

IMHO this method should be a normal constructor. It should
be responsible for allocating the 'qpl' struct too, and returning
it, rather than having the caller allocate it.

> +    qpl->job_num = job_num;
> +    qpl->data_size = data_size;
> +    if (alloc_zbuf(qpl, chan_id, errp) != 0) {
> +        return -1;
> +    }
> +    if (alloc_jobs(qpl, chan_id, errp) != 0) {
> +        free_zbuf(qpl);
> +        return -1;
> +    }
> +    return 0;
> +}
> +
> +static void deinit_qpl(QplData *qpl)
> +{
> +    if (qpl != NULL) {
> +        free_jobs(qpl);
> +        free_zbuf(qpl);
> +        qpl->job_num = 0;
> +        qpl->data_size = 0;
> +    }
> +}

This should also free 'qpl' instead of leaving it up to the
caller.

> +
> +/**
> + * qpl_send_setup: setup send side
> + *
> + * Setup each channel with QPL compression.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static int qpl_send_setup(MultiFDSendParams *p, Error **errp)
> +{
> +    QplData *qpl;
> +
> +    qpl = g_new0(QplData, 1);
> +    if (init_qpl(qpl, p->page_count, p->page_size, p->id, errp) != 0) {
> +        g_free(qpl);
> +        return -1;
> +    }
> +    p->compress_data = qpl;
> +
> +    assert(p->iov == NULL);
> +    /*
> +     * Each page will be compressed independently and sent using an IOV. The
> +     * additional two IOVs are used to store packet header and compressed data
> +     * length
> +     */
> +    p->iov = g_new0(struct iovec, p->page_count + 2);
> +    return 0;
> +}
> +
> +/**
> + * qpl_send_cleanup: cleanup send side
> + *
> + * Close the channel and return memory.
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static void qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
> +{
> +    QplData *qpl = p->compress_data;
> +
> +    deinit_qpl(qpl);
> +    g_free(p->compress_data);
> +    p->compress_data = NULL;
> +}
> +
> +/**
> + * qpl_send_prepare: prepare data to be able to send
> + *
> + * Create a compressed buffer with all the pages that we are going to
> + * send.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static int qpl_send_prepare(MultiFDSendParams *p, Error **errp)
> +{
> +    /* Implement in next patch */
> +    return -1;
> +}
> +
> +/**
> + * qpl_recv_setup: setup receive side
> + *
> + * Create the compressed channel and buffer.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static int qpl_recv_setup(MultiFDRecvParams *p, Error **errp)
> +{
> +    QplData *qpl;
> +
> +    qpl = g_new0(QplData, 1);
> +    if (init_qpl(qpl, p->page_count, p->page_size, p->id, errp) != 0) {
> +        g_free(qpl);
> +        return -1;
> +    }
> +    p->compress_data = qpl;
> +    return 0;
> +}
> +
> +/**
> + * qpl_recv_cleanup: setup receive side
> + *
> + * Close the channel and return memory.
> + *
> + * @p: Params for the channel that we are using
> + */
> +static void qpl_recv_cleanup(MultiFDRecvParams *p)
> +{
> +    QplData *qpl = p->compress_data;
> +
> +    deinit_qpl(qpl);
> +    g_free(p->compress_data);
> +    p->compress_data = NULL;
> +}
> +
> +/**
> + * qpl_recv: read the data from the channel into actual pages
> + *
> + * Read the compressed buffer, and uncompress it into the actual
> + * pages.
> + *
> + * Returns 0 for success or -1 for error
> + *
> + * @p: Params for the channel that we are using
> + * @errp: pointer to an error
> + */
> +static int qpl_recv(MultiFDRecvParams *p, Error **errp)
> +{
> +    /* Implement in next patch */
> +    return -1;
> +}

The qpl library uses 'qpl_' as its name prefix, so using the
same prefix in QEMU is fragile if future APIs are added to
the library.

Please consistently use 'multifd_qpl_' as the prefix for
*every* method in this file.

> +
> +static MultiFDMethods multifd_qpl_ops = {
> +    .send_setup = qpl_send_setup,
> +    .send_cleanup = qpl_send_cleanup,
> +    .send_prepare = qpl_send_prepare,
> +    .recv_setup = qpl_recv_setup,
> +    .recv_cleanup = qpl_recv_cleanup,
> +    .recv = qpl_recv,
> +};
>  
>  static void multifd_qpl_register(void)
>  {
> -    /* noop */
> +    multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops);
>  }
>  
>  migration_init(multifd_qpl_register);
> -- 
> 2.39.3
> 
> 

With regards,
Daniel
Yuan Liu March 20, 2024, 3:02 p.m. UTC | #2
> -----Original Message-----
> From: Daniel P. Berrangé <berrange@redhat.com>
> Sent: Wednesday, March 20, 2024 6:42 PM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: peterx@redhat.com; farosas@suse.de; qemu-devel@nongnu.org;
> hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou, Nanhai
> <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 12:45:25AM +0800, Yuan Liu wrote:
> > the qpl initialization includes memory allocation for compressed
> > data and the qpl job initialization.
> >
> > the qpl initialization will check whether the In-Memory Analytics
> > Accelerator(IAA) hardware is available, if the platform does not
> > have IAA hardware or the IAA hardware is not available, the QPL
> > compression initialization will fail.
> >
> > Signed-off-by: Yuan Liu <yuan1.liu@intel.com>
> > Reviewed-by: Nanhai Zou <nanhai.zou@intel.com>
> > ---
> >  migration/multifd-qpl.c | 243 +++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 242 insertions(+), 1 deletion(-)
> >
> > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > index 056a68a060..6de65e9da7 100644
> > --- a/migration/multifd-qpl.c
> > +++ b/migration/multifd-qpl.c
> > @@ -9,12 +9,253 @@
> >   * This work is licensed under the terms of the GNU GPL, version 2 or
> later.
> >   * See the COPYING file in the top-level directory.
> >   */
> > +
> >  #include "qemu/osdep.h"
> >  #include "qemu/module.h"
> > +#include "qapi/error.h"
> > +#include "migration.h"
> > +#include "multifd.h"
> > +#include "qpl/qpl.h"
> > +
> > +typedef struct {
> > +    qpl_job **job_array;
> > +    /* the number of allocated jobs */
> > +    uint32_t job_num;
> > +    /* the size of data processed by a qpl job */
> > +    uint32_t data_size;
> > +    /* compressed data buffer */
> > +    uint8_t *zbuf;
> > +    /* the length of compressed data */
> > +    uint32_t *zbuf_hdr;
> > +} QplData;
> > +
> > +static void free_zbuf(QplData *qpl)
> > +{
> > +    if (qpl->zbuf != NULL) {
> > +        munmap(qpl->zbuf, qpl->job_num * qpl->data_size);
> > +        qpl->zbuf = NULL;
> > +    }
> > +    if (qpl->zbuf_hdr != NULL) {
> > +        g_free(qpl->zbuf_hdr);
> > +        qpl->zbuf_hdr = NULL;
> > +    }
> > +}
> > +
> > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > +{
> > +    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > +    uint32_t size = qpl->job_num * qpl->data_size;
> > +    uint8_t *buf;
> > +
> > +    buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -
> 1, 0);
> > +    if (buf == MAP_FAILED) {
> > +        error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u,
> size %u",
> > +                   chan_id, qpl->job_num, qpl->data_size);
> > +        return -1;
> > +    }
> 
> What's the reason for using mmap here, rather than a normal
> malloc ?

I want to populate the memory accessed by the IAA device during the
initialization phase, so that the IAA device does not trigger I/O page faults
during migration. A large number of I/O page faults is bad for performance.

This problem also occurs at the destination. Therefore, I recommend that
customers add -mem-prealloc to the destination boot parameters.

> > +    qpl->zbuf = buf;
> > +    qpl->zbuf_hdr = g_new0(uint32_t, qpl->job_num);
> > +    return 0;
> > +}
> > +
> > +static void free_jobs(QplData *qpl)
> > +{
> > +    for (int i = 0; i < qpl->job_num; i++) {
> > +        qpl_fini_job(qpl->job_array[i]);
> > +        g_free(qpl->job_array[i]);
> > +        qpl->job_array[i] = NULL;
> > +    }
> > +    g_free(qpl->job_array);
> > +    qpl->job_array = NULL;
> > +}
> > +
> > +static int alloc_jobs(QplData *qpl, uint8_t chan_id, Error **errp)
> > +{
> > +    qpl_status status;
> > +    uint32_t job_size = 0;
> > +    qpl_job *job = NULL;
> > +    /* always use IAA hardware accelerator */
> > +    qpl_path_t path = qpl_path_hardware;
> > +
> > +    status = qpl_get_job_size(path, &job_size);
> > +    if (status != QPL_STS_OK) {
> > +        error_setg(errp, "multifd: %u: qpl_get_job_size failed with
> error %d",
> > +                   chan_id, status);
> > +        return -1;
> > +    }
> > +    qpl->job_array = g_new0(qpl_job *, qpl->job_num);
> > +    for (int i = 0; i < qpl->job_num; i++) {
> > +        job = g_malloc0(job_size);
> > +        status = qpl_init_job(path, job);
> > +        if (status != QPL_STS_OK) {
> > +            error_setg(errp, "multifd: %u: qpl_init_job failed with
> error %d",
> > +                       chan_id, status);
> > +            free_jobs(qpl);
> > +            return -1;
> > +        }
> > +        qpl->job_array[i] = job;
> > +    }
> > +    return 0;
> > +}
> > +
> > +static int init_qpl(QplData *qpl, uint32_t job_num, uint32_t data_size,
> > +                    uint8_t chan_id, Error **errp)
> > +{
> 
> IMHO this method should be a normal constructor, it it should
> be responsible for allocating 'qpl' struct too, and returning
> it, not have the caller allocate it.

Thanks for your comments, I will refine this.

> > +    qpl->job_num = job_num;
> > +    qpl->data_size = data_size;
> > +    if (alloc_zbuf(qpl, chan_id, errp) != 0) {
> > +        return -1;
> > +    }
> > +    if (alloc_jobs(qpl, chan_id, errp) != 0) {
> > +        free_zbuf(qpl);
> > +        return -1;
> > +    }
> > +    return 0;
> > +}
> > +
> > +static void deinit_qpl(QplData *qpl)
> > +{
> > +    if (qpl != NULL) {
> > +        free_jobs(qpl);
> > +        free_zbuf(qpl);
> > +        qpl->job_num = 0;
> > +        qpl->data_size = 0;
> > +    }
> > +}
> 
> This should also free 'qpl' instead of leaving it upto the
> caller.

Sure, I will refine this in the next version.

> > +/**
> > + * qpl_send_setup: setup send side
> > + *
> > + * Setup each channel with QPL compression.
> > + *
> > + * Returns 0 for success or -1 for error
> > + *
> > + * @p: Params for the channel that we are using
> > + * @errp: pointer to an error
> > + */
> > +static int qpl_send_setup(MultiFDSendParams *p, Error **errp)
> > +{
> > +    QplData *qpl;
> > +
> > +    qpl = g_new0(QplData, 1);
> > +    if (init_qpl(qpl, p->page_count, p->page_size, p->id, errp) != 0) {
> > +        g_free(qpl);
> > +        return -1;
> > +    }
> > +    p->compress_data = qpl;
> > +
> > +    assert(p->iov == NULL);
> > +    /*
> > +     * Each page will be compressed independently and sent using an
> IOV. The
> > +     * additional two IOVs are used to store packet header and
> compressed data
> > +     * length
> > +     */
> > +    p->iov = g_new0(struct iovec, p->page_count + 2);
> > +    return 0;
> > +}
> > +
> > +/**
> > + * qpl_send_cleanup: cleanup send side
> > + *
> > + * Close the channel and return memory.
> > + *
> > + * @p: Params for the channel that we are using
> > + * @errp: pointer to an error
> > + */
> > +static void qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
> > +{
> > +    QplData *qpl = p->compress_data;
> > +
> > +    deinit_qpl(qpl);
> > +    g_free(p->compress_data);
> > +    p->compress_data = NULL;
> > +}
> > +
> > +/**
> > + * qpl_send_prepare: prepare data to be able to send
> > + *
> > + * Create a compressed buffer with all the pages that we are going to
> > + * send.
> > + *
> > + * Returns 0 for success or -1 for error
> > + *
> > + * @p: Params for the channel that we are using
> > + * @errp: pointer to an error
> > + */
> > +static int qpl_send_prepare(MultiFDSendParams *p, Error **errp)
> > +{
> > +    /* Implement in next patch */
> > +    return -1;
> > +}
> > +
> > +/**
> > + * qpl_recv_setup: setup receive side
> > + *
> > + * Create the compressed channel and buffer.
> > + *
> > + * Returns 0 for success or -1 for error
> > + *
> > + * @p: Params for the channel that we are using
> > + * @errp: pointer to an error
> > + */
> > +static int qpl_recv_setup(MultiFDRecvParams *p, Error **errp)
> > +{
> > +    QplData *qpl;
> > +
> > +    qpl = g_new0(QplData, 1);
> > +    if (init_qpl(qpl, p->page_count, p->page_size, p->id, errp) != 0) {
> > +        g_free(qpl);
> > +        return -1;
> > +    }
> > +    p->compress_data = qpl;
> > +    return 0;
> > +}
> > +
> > +/**
> > + * qpl_recv_cleanup: setup receive side
> > + *
> > + * Close the channel and return memory.
> > + *
> > + * @p: Params for the channel that we are using
> > + */
> > +static void qpl_recv_cleanup(MultiFDRecvParams *p)
> > +{
> > +    QplData *qpl = p->compress_data;
> > +
> > +    deinit_qpl(qpl);
> > +    g_free(p->compress_data);
> > +    p->compress_data = NULL;
> > +}
> > +
> > +/**
> > + * qpl_recv: read the data from the channel into actual pages
> > + *
> > + * Read the compressed buffer, and uncompress it into the actual
> > + * pages.
> > + *
> > + * Returns 0 for success or -1 for error
> > + *
> > + * @p: Params for the channel that we are using
> > + * @errp: pointer to an error
> > + */
> > +static int qpl_recv(MultiFDRecvParams *p, Error **errp)
> > +{
> > +    /* Implement in next patch */
> > +    return -1;
> > +}
> 
> The qpl library uses 'qpl_' as its name prefix, so using the
> same prefix in QEMU is fragile if future APIs are added to
> the library.
> 
> Please consistently use 'multifd_qpl_' as the prefix for
> *every* method in this file.

Got it, thanks for the guidance. I will fix this.

> > +
> > +static MultiFDMethods multifd_qpl_ops = {
> > +    .send_setup = qpl_send_setup,
> > +    .send_cleanup = qpl_send_cleanup,
> > +    .send_prepare = qpl_send_prepare,
> > +    .recv_setup = qpl_recv_setup,
> > +    .recv_cleanup = qpl_recv_cleanup,
> > +    .recv = qpl_recv,
> > +};
> >
> >  static void multifd_qpl_register(void)
> >  {
> > -    /* noop */
> > +    multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops);
> >  }
> >
> >  migration_init(multifd_qpl_register);
> > --
> > 2.39.3
> >
> >
> 
> With regards,
> Daniel
> --
> |: https://berrange.com      -o-
> https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-
> https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-
> https://www.instagram.com/dberrange :|
Daniel P. Berrangé March 20, 2024, 3:20 p.m. UTC | #3
On Wed, Mar 20, 2024 at 03:02:59PM +0000, Liu, Yuan1 wrote:
> > -----Original Message-----
> > From: Daniel P. Berrangé <berrange@redhat.com>
> > Sent: Wednesday, March 20, 2024 6:42 PM
> > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > Cc: peterx@redhat.com; farosas@suse.de; qemu-devel@nongnu.org;
> > hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou, Nanhai
> > <nanhai.zou@intel.com>
> > Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> > qpl compression
> > 
> > On Wed, Mar 20, 2024 at 12:45:25AM +0800, Yuan Liu wrote:
> > > the qpl initialization includes memory allocation for compressed
> > > data and the qpl job initialization.
> > >
> > > the qpl initialization will check whether the In-Memory Analytics
> > > Accelerator(IAA) hardware is available, if the platform does not
> > > have IAA hardware or the IAA hardware is not available, the QPL
> > > compression initialization will fail.
> > >
> > > Signed-off-by: Yuan Liu <yuan1.liu@intel.com>
> > > Reviewed-by: Nanhai Zou <nanhai.zou@intel.com>
> > > ---
> > >  migration/multifd-qpl.c | 243 +++++++++++++++++++++++++++++++++++++++-
> > >  1 file changed, 242 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > > index 056a68a060..6de65e9da7 100644
> > > --- a/migration/multifd-qpl.c
> > > +++ b/migration/multifd-qpl.c
> > > @@ -9,12 +9,253 @@
> > >   * This work is licensed under the terms of the GNU GPL, version 2 or
> > later.
> > >   * See the COPYING file in the top-level directory.
> > >   */
> > > +
> > >  #include "qemu/osdep.h"
> > >  #include "qemu/module.h"
> > > +#include "qapi/error.h"
> > > +#include "migration.h"
> > > +#include "multifd.h"
> > > +#include "qpl/qpl.h"
> > > +
> > > +typedef struct {
> > > +    qpl_job **job_array;
> > > +    /* the number of allocated jobs */
> > > +    uint32_t job_num;
> > > +    /* the size of data processed by a qpl job */
> > > +    uint32_t data_size;
> > > +    /* compressed data buffer */
> > > +    uint8_t *zbuf;
> > > +    /* the length of compressed data */
> > > +    uint32_t *zbuf_hdr;
> > > +} QplData;
> > > +
> > > +static void free_zbuf(QplData *qpl)
> > > +{
> > > +    if (qpl->zbuf != NULL) {
> > > +        munmap(qpl->zbuf, qpl->job_num * qpl->data_size);
> > > +        qpl->zbuf = NULL;
> > > +    }
> > > +    if (qpl->zbuf_hdr != NULL) {
> > > +        g_free(qpl->zbuf_hdr);
> > > +        qpl->zbuf_hdr = NULL;
> > > +    }
> > > +}
> > > +
> > > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > > +{
> > > +    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > > +    uint32_t size = qpl->job_num * qpl->data_size;
> > > +    uint8_t *buf;
> > > +
> > > +    buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -
> > 1, 0);
> > > +    if (buf == MAP_FAILED) {
> > > +        error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u,
> > size %u",
> > > +                   chan_id, qpl->job_num, qpl->data_size);
> > > +        return -1;
> > > +    }
> > 
> > What's the reason for using mmap here, rather than a normal
> > malloc ?
> 
> I want to populate the memory accessed by the IAA device in the initialization
> phase, and then avoid initiating I/O page faults through the IAA device during
> migration, a large number of I/O page faults are not good for performance.

Does this mmap actually make a measurable difference ?

If I've followed the code paths correctly, I think this
alloc_zbuf method only gets called during initial setup
of each migration thread.

So this use of MAP_POPULATE seems to only make a difference
between faulting in before starting to send data, and faulting
in on the first bit of data that's sent. I'd be surprised if that's
noticeable as a difference.


> This problem also occurs at the destination, therefore, I recommend that
> customers need to add -mem-prealloc for destination boot parameters.

I can understand -mem-prealloc making a difference, as that guarantees
all of guest RAM is faulted in.


With regards,
Daniel
Peter Xu March 20, 2024, 3:34 p.m. UTC | #4
On Wed, Mar 20, 2024 at 03:02:59PM +0000, Liu, Yuan1 wrote:
> > > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > > +{
> > > +    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > > +    uint32_t size = qpl->job_num * qpl->data_size;
> > > +    uint8_t *buf;
> > > +
> > > +    buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -
> > 1, 0);
> > > +    if (buf == MAP_FAILED) {
> > > +        error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u,
> > size %u",
> > > +                   chan_id, qpl->job_num, qpl->data_size);
> > > +        return -1;
> > > +    }
> > 
> > What's the reason for using mmap here, rather than a normal
> > malloc ?
> 
> I want to populate the memory accessed by the IAA device in the initialization
> phase, and then avoid initiating I/O page faults through the IAA device during
> migration, a large number of I/O page faults are not good for performance. 

mmap() doesn't populate pages unless MAP_POPULATE is used.  And even with
that, population shouldn't be guaranteed, as the populate phase should
ignore all errors.

       MAP_POPULATE (since Linux 2.5.46)
              Populate (prefault) page tables for a mapping.  For a file
              mapping, this causes read-ahead on the file.  This will help
              to reduce blocking on page faults later.  The mmap() call
              doesn't fail if the mapping cannot be populated (for example,
              due to limitations on the number of mapped huge pages when
              using MAP_HUGETLB).  Support for MAP_POPULATE in conjunction
              with private mappings was added in Linux 2.6.23.

OTOH, I think g_malloc0() should guarantee to prefault everything in as
long as the call returned (even though they can be swapped out later, but
that applies to all cases anyway).

> 
> This problem also occurs at the destination, therefore, I recommend that
> customers need to add -mem-prealloc for destination boot parameters.

I'm not sure what issue you hit when testing it, but the -mem-prealloc flag
should only control the guest memory backends, not the buffers that QEMU
uses internally, afaiu.

Thanks,
Yuan Liu March 20, 2024, 4:04 p.m. UTC | #5
> -----Original Message-----
> From: Daniel P. Berrangé <berrange@redhat.com>
> Sent: Wednesday, March 20, 2024 11:21 PM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: peterx@redhat.com; farosas@suse.de; qemu-devel@nongnu.org;
> hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou, Nanhai
> <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 03:02:59PM +0000, Liu, Yuan1 wrote:
> > > -----Original Message-----
> > > From: Daniel P. Berrangé <berrange@redhat.com>
> > > Sent: Wednesday, March 20, 2024 6:42 PM
> > > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > > Cc: peterx@redhat.com; farosas@suse.de; qemu-devel@nongnu.org;
> > > hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou, Nanhai
> > > <nanhai.zou@intel.com>
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Wed, Mar 20, 2024 at 12:45:25AM +0800, Yuan Liu wrote:
> > > > the qpl initialization includes memory allocation for compressed
> > > > data and the qpl job initialization.
> > > >
> > > > the qpl initialization will check whether the In-Memory Analytics
> > > > Accelerator(IAA) hardware is available, if the platform does not
> > > > have IAA hardware or the IAA hardware is not available, the QPL
> > > > compression initialization will fail.
> > > >
> > > > Signed-off-by: Yuan Liu <yuan1.liu@intel.com>
> > > > Reviewed-by: Nanhai Zou <nanhai.zou@intel.com>
> > > > ---
> > > >  migration/multifd-qpl.c | 243
> +++++++++++++++++++++++++++++++++++++++-
> > > >  1 file changed, 242 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
> > > > index 056a68a060..6de65e9da7 100644
> > > > --- a/migration/multifd-qpl.c
> > > > +++ b/migration/multifd-qpl.c
> > > > @@ -9,12 +9,253 @@
> > > >   * This work is licensed under the terms of the GNU GPL, version 2
> or
> > > later.
> > > >   * See the COPYING file in the top-level directory.
> > > >   */
> > > > +
> > > >  #include "qemu/osdep.h"
> > > >  #include "qemu/module.h"
> > > > +#include "qapi/error.h"
> > > > +#include "migration.h"
> > > > +#include "multifd.h"
> > > > +#include "qpl/qpl.h"
> > > > +
> > > > +typedef struct {
> > > > +    qpl_job **job_array;
> > > > +    /* the number of allocated jobs */
> > > > +    uint32_t job_num;
> > > > +    /* the size of data processed by a qpl job */
> > > > +    uint32_t data_size;
> > > > +    /* compressed data buffer */
> > > > +    uint8_t *zbuf;
> > > > +    /* the length of compressed data */
> > > > +    uint32_t *zbuf_hdr;
> > > > +} QplData;
> > > > +
> > > > +static void free_zbuf(QplData *qpl)
> > > > +{
> > > > +    if (qpl->zbuf != NULL) {
> > > > +        munmap(qpl->zbuf, qpl->job_num * qpl->data_size);
> > > > +        qpl->zbuf = NULL;
> > > > +    }
> > > > +    if (qpl->zbuf_hdr != NULL) {
> > > > +        g_free(qpl->zbuf_hdr);
> > > > +        qpl->zbuf_hdr = NULL;
> > > > +    }
> > > > +}
> > > > +
> > > > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > > > +{
> > > > +    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > > > +    uint32_t size = qpl->job_num * qpl->data_size;
> > > > +    uint8_t *buf;
> > > > +
> > > > +    buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE,
> flags, -
> > > 1, 0);
> > > > +    if (buf == MAP_FAILED) {
> > > > +        error_setg(errp, "multifd: %u: alloc_zbuf failed, job
> num %u,
> > > size %u",
> > > > +                   chan_id, qpl->job_num, qpl->data_size);
> > > > +        return -1;
> > > > +    }
> > >
> > > What's the reason for using mmap here, rather than a normal
> > > malloc ?
> >
> > I want to populate the memory accessed by the IAA device in the
> initialization
> > phase, and then avoid initiating I/O page faults through the IAA device
> during
> > migration, a large number of I/O page faults are not good for
> performance.
> 
> Does this mmap actually make a measurable difference ?
> 
> If I've followed the code paths correctly, I think this
> alloc_zbuf method only gets called during initial setup
> of each migration thread.
> 
> So this use of MAP_POPULATE seems to only make a difference
> between faulting in before starting sending data, and faulting
> in on first bit of data that's sent. I'm surprised if that's
> noticable as a difference.

You are right, the performance impact is limited to processing the first
page faults, and it has little impact on overall live migration performance.

I just did a simple test. The total live migration time is 2321ms using
g_malloc and 2098ms using mmap. I need more tests to check whether
g_malloc/g_malloc0 is feasible.

> > This problem also occurs at the destination, therefore, I recommend that
> > customers need to add -mem-prealloc for destination boot parameters.
> 
> I can understand mem-prelloc making a difference as that guarantees
> all of guest RAM is faulted in.
> 
> 
> With regards,
> Daniel
> --
> |: https://berrange.com      -o-
> https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-
> https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-
> https://www.instagram.com/dberrange :|
Yuan Liu March 20, 2024, 4:23 p.m. UTC | #6
> -----Original Message-----
> From: Peter Xu <peterx@redhat.com>
> Sent: Wednesday, March 20, 2024 11:35 PM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> Nanhai <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 03:02:59PM +0000, Liu, Yuan1 wrote:
> > > > +static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
> > > > +{
> > > > +    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
> > > > +    uint32_t size = qpl->job_num * qpl->data_size;
> > > > +    uint8_t *buf;
> > > > +
> > > > +    buf = (uint8_t *) mmap(NULL, size, PROT_READ | PROT_WRITE,
> flags, -
> > > 1, 0);
> > > > +    if (buf == MAP_FAILED) {
> > > > +        error_setg(errp, "multifd: %u: alloc_zbuf failed, job
> num %u,
> > > size %u",
> > > > +                   chan_id, qpl->job_num, qpl->data_size);
> > > > +        return -1;
> > > > +    }
> > >
> > > What's the reason for using mmap here, rather than a normal
> > > malloc ?
> >
> > I want to populate the memory accessed by the IAA device in the
> initialization
> > phase, and then avoid initiating I/O page faults through the IAA device
> during
> > migration, a large number of I/O page faults are not good for
> performance.
> 
> mmap() doesn't populate pages, unless with MAP_POPULATE.  And even with
> that it shouldn't be guaranteed, as the populate phase should ignore all
> errors.
> 
>        MAP_POPULATE (since Linux 2.5.46)
>               Populate (prefault) page tables for a mapping.  For a file
> map‐
>               ping, this causes read-ahead on the file.  This will help to
> re‐
>               duce  blocking  on  page  faults later.  The mmap() call
> doesn't
>               fail if the mapping cannot be populated  (for  example,  due
> to
>               limitations  on  the  number  of  mapped  huge  pages when
> using
>               MAP_HUGETLB).  Support for MAP_POPULATE in conjunction with
> pri‐
>               vate mappings was added in Linux 2.6.23.
> 
> OTOH, I think g_malloc0() should guarantee to prefault everything in as
> long as the call returned (even though they can be swapped out later, but
> that applies to all cases anyway).

Thanks, Peter. I will try the g_malloc0 method here

> > This problem also occurs at the destination, therefore, I recommend that
> > customers need to add -mem-prealloc for destination boot parameters.
> 
> I'm not sure what issue you hit when testing it, but -mem-prealloc flag
> should only control the guest memory backends not the buffers that QEMU
> internally use, afaiu.
> 
> Thanks,
> 
> --
> Peter Xu

Let me explain: during an IAA decompression operation, the IAA hardware can output the
decompressed data directly to the virtual address of the guest memory.
This avoids having the CPU copy the decompressed data into guest memory.

Without -mem-prealloc, the guest memory is not populated up front, so the IAA hardware must first
trigger an I/O page fault before it can output the decompressed data to the guest memory region.
Besides that, CPU page faults will also trigger IOTLB flush operations when IAA devices use SVM.

Due to the inability to quickly resolve a large number of IO page faults and IOTLB flushes, the
decompression throughput of the IAA device will decrease significantly.
Peter Xu March 20, 2024, 8:31 p.m. UTC | #7
On Wed, Mar 20, 2024 at 04:23:01PM +0000, Liu, Yuan1 wrote:
> let me explain here, during the decompression operation of IAA, the
> decompressed data can be directly output to the virtual address of the
> guest memory by IAA hardware.  It can avoid copying the decompressed data
> to guest memory by CPU.

I see.

> Without -mem-prealloc, all the guest memory is not populated, and IAA
> hardware needs to trigger I/O page fault first and then output the
> decompressed data to the guest memory region.  Besides that, CPU page
> faults will also trigger IOTLB flush operation when IAA devices use SVM.

Oh so the IAA hardware already can use CPU pgtables?  Nice..

Why IOTLB flush is needed?  AFAIU we're only installing new pages, the
request can either come from a CPU access or a DMA.  In all cases there
should have no tearing down of an old page.  Isn't an iotlb flush only
needed if a tear down happens?

>
> Due to the inability to quickly resolve a large number of IO page faults
> and IOTLB flushes, the decompression throughput of the IAA device will
> decrease significantly.
Yuan Liu March 21, 2024, 1:37 a.m. UTC | #8
> -----Original Message-----
> From: Peter Xu <peterx@redhat.com>
> Sent: Thursday, March 21, 2024 4:32 AM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> Nanhai <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Wed, Mar 20, 2024 at 04:23:01PM +0000, Liu, Yuan1 wrote:
> > let me explain here, during the decompression operation of IAA, the
> > decompressed data can be directly output to the virtual address of the
> > guest memory by IAA hardware.  It can avoid copying the decompressed
> data
> > to guest memory by CPU.
> 
> I see.
> 
> > Without -mem-prealloc, all the guest memory is not populated, and IAA
> > hardware needs to trigger I/O page fault first and then output the
> > decompressed data to the guest memory region.  Besides that, CPU page
> > faults will also trigger IOTLB flush operation when IAA devices use SVM.
> 
> Oh so the IAA hardware already can use CPU pgtables?  Nice..
> 
> Why IOTLB flush is needed?  AFAIU we're only installing new pages, the
> request can either come from a CPU access or a DMA.  In all cases there
> should have no tearing down of an old page.  Isn't an iotlb flush only
> needed if a tear down happens?

As far as I know, IAA hardware uses SVM technology to use the CPU's page table 
for address translation (IOMMU scalable mode directly accesses the CPU page table).
Therefore, when the CPU page table changes, the device's Invalidation operation needs
to be triggered to update the IOMMU and the device's cache. 

My current kernel version is mainline 6.2. The issue I see is as follows:
-- handle_mm_fault
 |
  -- wp_page_copy
    |
    -- mmu_notifier_invalidate_range
      |
      -- intel_invalidate_range
        |
        -- qi_flush_piotlb
        -- qi_flush_dev_iotlb_pasid
	 

> > Due to the inability to quickly resolve a large number of IO page faults
> > and IOTLB flushes, the decompression throughput of the IAA device will
> > decrease significantly.
> 
> --
> Peter Xu
Peter Xu March 21, 2024, 3:28 p.m. UTC | #9
On Thu, Mar 21, 2024 at 01:37:36AM +0000, Liu, Yuan1 wrote:
> > -----Original Message-----
> > From: Peter Xu <peterx@redhat.com>
> > Sent: Thursday, March 21, 2024 4:32 AM
> > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> > Nanhai <nanhai.zou@intel.com>
> > Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> > qpl compression
> > 
> > On Wed, Mar 20, 2024 at 04:23:01PM +0000, Liu, Yuan1 wrote:
> > > let me explain here, during the decompression operation of IAA, the
> > > decompressed data can be directly output to the virtual address of the
> > > guest memory by IAA hardware.  It can avoid copying the decompressed
> > data
> > > to guest memory by CPU.
> > 
> > I see.
> > 
> > > Without -mem-prealloc, all the guest memory is not populated, and IAA
> > > hardware needs to trigger I/O page fault first and then output the
> > > decompressed data to the guest memory region.  Besides that, CPU page
> > > faults will also trigger IOTLB flush operation when IAA devices use SVM.
> > 
> > Oh so the IAA hardware already can use CPU pgtables?  Nice..
> > 
> > Why IOTLB flush is needed?  AFAIU we're only installing new pages, the
> > request can either come from a CPU access or a DMA.  In all cases there
> > should have no tearing down of an old page.  Isn't an iotlb flush only
> > needed if a tear down happens?
> 
> As far as I know, IAA hardware uses SVM technology to use the CPU's page table 
> for address translation (IOMMU scalable mode directly accesses the CPU page table).
> Therefore, when the CPU page table changes, the device's Invalidation operation needs
> to be triggered to update the IOMMU and the device's cache. 
> 
> My current kernel version is mainline 6.2. The issue I see is as follows:
> --Handle_mm_fault
>  |
>   -- wp_page_copy

This is the CoW path.  Not usual at all..

I assume this issue should only be present on the destination.  There the
guest pages are the targets of such DMAs, which means these should be write
faults, and as we see here they are; otherwise they wouldn't trigger a CoW.

However it's not clear to me why a pre-installed zero page existed.  It
means someone read the guest pages first.

It might be interesting to know _why_ someone reads the guest pages, even
if we know they're all zeros.  If we can avoid such reads then it'll be a
hole rather than a prefaulted read on zero page, then invalidations are not
needed, and I expect that should fix the iotlb storm issue.

It'd still be good if we can fix this first, so as not to make qpl special
in this regard; the hope is that the migration submodule shouldn't rely on
any pre-config (-mem-prealloc) of guest memory behaviors to work properly.

>     |
>     -- mmu_notifier_invalidate_range
>       |
>       -- intel_invalidate_rage
>         |
>         -- qi_flush_piotlb
>         -- qi_flush_dev_iotlb_pasid
Yuan Liu March 22, 2024, 2:06 a.m. UTC | #10
> -----Original Message-----
> From: Peter Xu <peterx@redhat.com>
> Sent: Thursday, March 21, 2024 11:28 PM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> Nanhai <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Thu, Mar 21, 2024 at 01:37:36AM +0000, Liu, Yuan1 wrote:
> > > -----Original Message-----
> > > From: Peter Xu <peterx@redhat.com>
> > > Sent: Thursday, March 21, 2024 4:32 AM
> > > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com;
> Zou,
> > > Nanhai <nanhai.zou@intel.com>
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Wed, Mar 20, 2024 at 04:23:01PM +0000, Liu, Yuan1 wrote:
> > > > let me explain here, during the decompression operation of IAA, the
> > > > decompressed data can be directly output to the virtual address of
> the
> > > > guest memory by IAA hardware.  It can avoid copying the decompressed
> > > data
> > > > to guest memory by CPU.
> > >
> > > I see.
> > >
> > > > Without -mem-prealloc, all the guest memory is not populated, and
> IAA
> > > > hardware needs to trigger I/O page fault first and then output the
> > > > decompressed data to the guest memory region.  Besides that, CPU
> page
> > > > faults will also trigger IOTLB flush operation when IAA devices use
> SVM.
> > >
> > > Oh so the IAA hardware already can use CPU pgtables?  Nice..
> > >
> > > Why IOTLB flush is needed?  AFAIU we're only installing new pages, the
> > > request can either come from a CPU access or a DMA.  In all cases
> there
> > > should have no tearing down of an old page.  Isn't an iotlb flush only
> > > needed if a tear down happens?
> >
> > As far as I know, IAA hardware uses SVM technology to use the CPU's page
> table
> > for address translation (IOMMU scalable mode directly accesses the CPU
> page table).
> > Therefore, when the CPU page table changes, the device's Invalidation
> operation needs
> > to be triggered to update the IOMMU and the device's cache.
> >
> > My current kernel version is mainline 6.2. The issue I see is as
> follows:
> > --Handle_mm_fault
> >  |
> >   -- wp_page_copy
> 
> This is the CoW path.  Not usual at all..
> 
> I assume this issue should only present on destination.  Then the guest
> pages should be the destination of such DMAs to happen, which means these
> should be write faults, and as we see here it is, otherwise it won't
> trigger a CoW.
> 
> However it's not clear to me why a pre-installed zero page existed.  It
> means someone read the guest pages first.
> 
> It might be interesting to know _why_ someone reads the guest pages, even
> if we know they're all zeros.  If we can avoid such reads then it'll be a
> hole rather than a prefaulted read on zero page, then invalidations are
> not
> needed, and I expect that should fix the iotlb storm issue.

The received pages are first read for the zero-page check. Although
these pages are zero pages and the IAA hardware will not access them, the
COW still happens and causes the subsequent IOTLB flush operation. As far as
I know, the IOMMU can quickly detect whether an address range has been used
by the device, so addresses not used by the device would not need to be
invalidated, but this is not yet available in Linux kernel 6.2. I will check
the latest status of this.
void multifd_recv_zero_page_process(MultiFDRecvParams *p)
{
    for (int i = 0; i < p->zero_num; i++) {
        void *page = p->host + p->zero[i];
        if (!buffer_is_zero(page, p->page_size)) {
            memset(page, 0, p->page_size);
        }
    }
}


> It'll still be good we can fix this first to not make qpl special from
> this
> regard, so that the hope is migration submodule shouldn't rely on any
> pre-config (-mem-prealloc) on guest memory behaviors to work properly.

Even if the IOTLB problem can be avoided, the I/O page fault problem remains:
normal pages are loaded by the IAA device, and resolving the resulting page
faults through the IOMMU does not perform well.

The decompressed data of the IAA device could be output to pre-populated
memory instead of directly to the guest address, but then each multifd
thread needs two memory copies: one from the network to the IAA input
memory (pre-populated), and another from the IAA output memory (pre-populated)
to the guest address. This may become a performance bottleneck at the
destination during the live migration process.

So I think it is still necessary to use the -mem-prealloc option

> >     -- mmu_notifier_invalidate_range
> >       |
> >       -- intel_invalidate_rage
> >         |
> >         -- qi_flush_piotlb
> >         -- qi_flush_dev_iotlb_pasid
> 
> --
> Peter Xu
Yuan Liu March 22, 2024, 2:47 p.m. UTC | #11
> -----Original Message-----
> From: Liu, Yuan1
> Sent: Friday, March 22, 2024 10:07 AM
> To: Peter Xu <peterx@redhat.com>
> Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> Nanhai <nanhai.zou@intel.com>
> Subject: RE: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> > -----Original Message-----
> > From: Peter Xu <peterx@redhat.com>
> > Sent: Thursday, March 21, 2024 11:28 PM
> > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com;
> Zou,
> > Nanhai <nanhai.zou@intel.com>
> > Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization
> of
> > qpl compression
> >
> > On Thu, Mar 21, 2024 at 01:37:36AM +0000, Liu, Yuan1 wrote:
> > > > -----Original Message-----
> > > > From: Peter Xu <peterx@redhat.com>
> > > > Sent: Thursday, March 21, 2024 4:32 AM
> > > > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > > > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > > > devel@nongnu.org; hao.xiang@bytedance.com;
> bryan.zhang@bytedance.com;
> > Zou,
> > > > Nanhai <nanhai.zou@intel.com>
> > > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> > initialization of
> > > > qpl compression
> > > >
> > > > On Wed, Mar 20, 2024 at 04:23:01PM +0000, Liu, Yuan1 wrote:
> > > > > let me explain here, during the decompression operation of IAA,
> the
> > > > > decompressed data can be directly output to the virtual address of
> > the
> > > > > guest memory by IAA hardware.  It can avoid copying the
> decompressed
> > > > data
> > > > > to guest memory by CPU.
> > > >
> > > > I see.
> > > >
> > > > > Without -mem-prealloc, all the guest memory is not populated, and
> > IAA
> > > > > hardware needs to trigger I/O page fault first and then output the
> > > > > decompressed data to the guest memory region.  Besides that, CPU
> > page
> > > > > faults will also trigger IOTLB flush operation when IAA devices
> use
> > SVM.
> > > >
> > > > Oh so the IAA hardware already can use CPU pgtables?  Nice..
> > > >
> > > > Why IOTLB flush is needed?  AFAIU we're only installing new pages,
> the
> > > > request can either come from a CPU access or a DMA.  In all cases
> > there
> > > > should have no tearing down of an old page.  Isn't an iotlb flush
> only
> > > > needed if a tear down happens?
> > >
> > > As far as I know, IAA hardware uses SVM technology to use the CPU's
> page
> > table
> > > for address translation (IOMMU scalable mode directly accesses the CPU
> > page table).
> > > Therefore, when the CPU page table changes, the device's Invalidation
> > operation needs
> > > to be triggered to update the IOMMU and the device's cache.
> > >
> > > My current kernel version is mainline 6.2. The issue I see is as
> > follows:
> > > --Handle_mm_fault
> > >  |
> > >   -- wp_page_copy
> >
> > This is the CoW path.  Not usual at all..
> >
> > I assume this issue should only present on destination.  Then the guest
> > pages should be the destination of such DMAs to happen, which means
> these
> > should be write faults, and as we see here it is, otherwise it won't
> > trigger a CoW.
> >
> > However it's not clear to me why a pre-installed zero page existed.  It
> > means someone read the guest pages first.
> >
> > It might be interesting to know _why_ someone reads the guest pages,
> even
> > if we know they're all zeros.  If we can avoid such reads then it'll be
> a
> > hole rather than a prefaulted read on zero page, then invalidations are
> > not
> > needed, and I expect that should fix the iotlb storm issue.
> 
> The received pages will be read for zero pages check first. Although
> these pages are zero pages, and IAA hardware will not access them, the
> COW happens and causes following IOTLB flush operation. As far as I know,
> IOMMU quickly detects whether the address range has been used by the
> device,
> and does not invalidate the address that is not used by the device, this
> has
> not yet been resolved in Linux kernel 6.2. I will check the latest status
> for
> this.

I checked the Linux mainline 6.8 code, and there are no big changes in this area.
In version 6.8, if the process needs to flush the MMU TLB, an I/O TLB flush
is also triggered when the process has SVM devices. I haven't found any
code that checks whether pages have the EA (Extended-Accessed) bit set before
submitting invalidation operations; this is the same as in version 6.2.

VT-d 3.6.2
If the Extended-Accessed-Flag-Enable (EAFE) is 1 in a scalable-mode PASID-table
entry that references a first-stage paging-structure entry used by the remapping
hardware, it atomically sets the EA field in that entry. Whenever EA field is 
atomically set, the A field is also set in the same atomic operation. For software
usages where the first-stage paging structures are shared across heterogeneous agents
(e.g., CPUs and accelerator devices such as GPUs), the EA flag may be used by software
to identify pages accessed by non-CPU agent(s) (as opposed to the A flag which indicates
access by any agent sharing the paging structures).

> void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> {
>     for (int i = 0; i < p->zero_num; i++) {
>         void *page = p->host + p->zero[i];
>         if (!buffer_is_zero(page, p->page_size)) {
>             memset(page, 0, p->page_size);
>         }
>     }
> }
> 
> 
> > It'll still be good we can fix this first to not make qpl special from
> > this
> > regard, so that the hope is migration submodule shouldn't rely on any
> > pre-config (-mem-prealloc) on guest memory behaviors to work properly.
> 
> Even if the IOTLB problem can be avoided, the I/O page fault problem
> (normal
> pages are loaded by the IAA device and solving normal page faults through
> IOMMU,
> the performance is not good)
> 
> It can let the decompressed data of the IAA device be output to a pre-
> populated
> memory instead of directly outputting to the guest address, but then each
> multifd
> thread needs two memory copies, one copy from the network to the IAA input
> memory(pre-populated), and another copy from the IAA output memory(pre-
> populated)
> to the guest address, which may become a performance bottleneck at the
> destination
> during the live migration process.
> 
> So I think it is still necessary to use the -mem-prealloc option
> 
> > >     -- mmu_notifier_invalidate_range
> > >       |
> > >       -- intel_invalidate_rage
> > >         |
> > >         -- qi_flush_piotlb
> > >         -- qi_flush_dev_iotlb_pasid
> >
> > --
> > Peter Xu
Peter Xu March 22, 2024, 4:40 p.m. UTC | #12
On Fri, Mar 22, 2024 at 02:47:02PM +0000, Liu, Yuan1 wrote:
> > -----Original Message-----
> > From: Liu, Yuan1
> > Sent: Friday, March 22, 2024 10:07 AM
> > To: Peter Xu <peterx@redhat.com>
> > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> > Nanhai <nanhai.zou@intel.com>
> > Subject: RE: [PATCH v5 5/7] migration/multifd: implement initialization of
> > qpl compression
> > 
> > > -----Original Message-----
> > > From: Peter Xu <peterx@redhat.com>
> > > Sent: Thursday, March 21, 2024 11:28 PM
> > > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com;
> > Zou,
> > > Nanhai <nanhai.zou@intel.com>
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization
> > of
> > > qpl compression
> > >
> > > On Thu, Mar 21, 2024 at 01:37:36AM +0000, Liu, Yuan1 wrote:
> > > > > -----Original Message-----
> > > > > From: Peter Xu <peterx@redhat.com>
> > > > > Sent: Thursday, March 21, 2024 4:32 AM
> > > > > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > > > > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > > > > devel@nongnu.org; hao.xiang@bytedance.com;
> > bryan.zhang@bytedance.com;
> > > Zou,
> > > > > Nanhai <nanhai.zou@intel.com>
> > > > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> > > initialization of
> > > > > qpl compression
> > > > >
> > > > > On Wed, Mar 20, 2024 at 04:23:01PM +0000, Liu, Yuan1 wrote:
> > > > > > let me explain here, during the decompression operation of IAA,
> > the
> > > > > > decompressed data can be directly output to the virtual address of
> > > the
> > > > > > guest memory by IAA hardware.  It can avoid copying the
> > decompressed
> > > > > data
> > > > > > to guest memory by CPU.
> > > > >
> > > > > I see.
> > > > >
> > > > > > Without -mem-prealloc, all the guest memory is not populated, and
> > > IAA
> > > > > > hardware needs to trigger I/O page fault first and then output the
> > > > > > decompressed data to the guest memory region.  Besides that, CPU
> > > page
> > > > > > faults will also trigger IOTLB flush operation when IAA devices
> > use
> > > SVM.
> > > > >
> > > > > Oh so the IAA hardware already can use CPU pgtables?  Nice..
> > > > >
> > > > > Why IOTLB flush is needed?  AFAIU we're only installing new pages,
> > the
> > > > > request can either come from a CPU access or a DMA.  In all cases
> > > there
> > > > > should have no tearing down of an old page.  Isn't an iotlb flush
> > only
> > > > > needed if a tear down happens?
> > > >
> > > > As far as I know, IAA hardware uses SVM technology to use the CPU's
> > page
> > > table
> > > > for address translation (IOMMU scalable mode directly accesses the CPU
> > > page table).
> > > > Therefore, when the CPU page table changes, the device's Invalidation
> > > operation needs
> > > > to be triggered to update the IOMMU and the device's cache.
> > > >
> > > > My current kernel version is mainline 6.2. The issue I see is as
> > > follows:
> > > > --Handle_mm_fault
> > > >  |
> > > >   -- wp_page_copy
> > >
> > > This is the CoW path.  Not usual at all..
> > >
> > > I assume this issue should only present on destination.  Then the guest
> > > pages should be the destination of such DMAs to happen, which means
> > these
> > > should be write faults, and as we see here it is, otherwise it won't
> > > trigger a CoW.
> > >
> > > However it's not clear to me why a pre-installed zero page existed.  It
> > > means someone read the guest pages first.
> > >
> > > It might be interesting to know _why_ someone reads the guest pages,
> > even
> > > if we know they're all zeros.  If we can avoid such reads then it'll be
> > a
> > > hole rather than a prefaulted read on zero page, then invalidations are
> > > not
> > > needed, and I expect that should fix the iotlb storm issue.
> > 
> > The received pages will be read for zero pages check first. Although
> > these pages are zero pages, and IAA hardware will not access them, the
> > COW happens and causes following IOTLB flush operation. As far as I know,
> > IOMMU quickly detects whether the address range has been used by the
> > device,
> > and does not invalidate the address that is not used by the device, this
> > has
> > not yet been resolved in Linux kernel 6.2. I will check the latest status
> > for
> > this.
> 
> I checked the Linux mainline 6.8 code, there are no big changes for this.
> In version 6.8, if the process needs to flush MMU TLB, then I/O TLB flush
> will be also triggered when the process has SVM devices. I haven't found
> the code to check if pages have been set EA (Extended-Accessed) bit before
> submitting invalidation operations, this is same with version 6.2.
> 
> VT-d 3.6.2
> If the Extended-Accessed-Flag-Enable (EAFE) is 1 in a scalable-mode PASID-table
> entry that references a first-stage paging-structure entry used by the remapping
> hardware, it atomically sets the EA field in that entry. Whenever EA field is 
> atomically set, the A field is also set in the same atomic operation. For software
> usages where the first-stage paging structures are shared across heterogeneous agents
> (e.g., CPUs and accelerator devices such as GPUs), the EA flag may be used by software
> to identify pages accessed by non-CPU agent(s) (as opposed to the A flag which indicates
> access by any agent sharing the paging structures).

This seems to be a pretty new hardware feature.  I didn't check in depth, but
what you said makes sense.

> 
> > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > {
> >     for (int i = 0; i < p->zero_num; i++) {
> >         void *page = p->host + p->zero[i];
> >         if (!buffer_is_zero(page, p->page_size)) {
> >             memset(page, 0, p->page_size);
> >         }
> >     }
> > }

It may not matter much (where I also see your below comments), but just to
mention another solution to avoid this read is that we can maintain
RAMBlock->receivedmap for precopy (especially, multifd, afaiu multifd
doesn't yet update this bitmap.. even if normal precopy does), then here
instead of scanning every time, maybe we can do:

  /*
   * If it's the 1st time receiving it, no need to clear it as it must be
   * all zeros now.
   */
  if (bitmap_test(rb->receivedmap, page_offset)) {
      memset(page, 0, ...);
  } else {
      bitmap_set(rb->receivedmap, page_offset);
  }

And we also always set the bit when !zero too.
    
My rationale is that a page is unlikely to be a zero page if it has already
been sent once or more, while OTOH the 1st time we receive it, it must be a
zero page, so there is no need to scan in the 1st round.

> > 
> > 
> > > It'll still be good we can fix this first to not make qpl special from
> > > this
> > > regard, so that the hope is migration submodule shouldn't rely on any
> > > pre-config (-mem-prealloc) on guest memory behaviors to work properly.
> > 
> > Even if the IOTLB problem can be avoided, the I/O page fault problem
> > (normal
> > pages are loaded by the IAA device and solving normal page faults through
> > IOMMU,
> > the performance is not good)

Do you have a rough estimate on how slow that could be? It'll be good to
mention some details too in the doc file in that case.

> > 
> > It can let the decompressed data of the IAA device be output to a pre-
> > populated
> > memory instead of directly outputting to the guest address, but then each
> > multifd
> > thread needs two memory copies, one copy from the network to the IAA input
> > memory(pre-populated), and another copy from the IAA output memory(pre-
> > populated)
> > to the guest address, which may become a performance bottleneck at the
> > destination
> > during the live migration process.
> >  
> > So I think it is still necessary to use the -mem-prealloc option

Right, that complexity may not be necessary, in that case, maybe such
suggestion is fine.

Thanks,

> > 
> > > >     -- mmu_notifier_invalidate_range
> > > >       |
> > > >       -- intel_invalidate_rage
> > > >         |
> > > >         -- qi_flush_piotlb
> > > >         -- qi_flush_dev_iotlb_pasid
> > >
> > > --
> > > Peter Xu
>
Peter Xu March 27, 2024, 7:25 p.m. UTC | #13
On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > {
> > >     for (int i = 0; i < p->zero_num; i++) {
> > >         void *page = p->host + p->zero[i];
> > >         if (!buffer_is_zero(page, p->page_size)) {
> > >             memset(page, 0, p->page_size);
> > >         }
> > >     }
> > > }
> 
> It may not matter much (where I also see your below comments), but just to
> mention another solution to avoid this read is that we can maintain
> RAMBlock->receivedmap for precopy (especially, multifd, afaiu multifd
> doesn't yet update this bitmap.. even if normal precopy does), then here
> instead of scanning every time, maybe we can do:
> 
>   /*
>    * If it's the 1st time receiving it, no need to clear it as it must be
>    * all zeros now.
>    */
>   if (bitmap_test(rb->receivedmap, page_offset)) {
>       memset(page, 0, ...);
>   } else {
>       bitmap_set(rb->receivedmap, page_offset);
>   }
> 
> And we also always set the bit when !zero too.
>     
> My rational is that it's unlikely a zero page if it's sent once or more,
> while OTOH for the 1st time we receive it, it must be a zero page, so no
> need to scan for the 1st round.

Thinking about this, I'm wondering whether we should have this regardless.
IIUC now multifd will always require two page faults on destination for
anonymous guest memories (I suppose shmem/hugetlb is fine as no zero page
in those worlds).  Even though it should be faster than DMA faults, it
still is unwanted.

I'll take a note myself as todo to do some measurements in the future
first.  However if anyone thinks that makes sense and want to have a look,
please say so.  It'll be more than welcomed.

Thanks,
Yuan Liu March 28, 2024, 2:32 a.m. UTC | #14
> -----Original Message-----
> From: Peter Xu <peterx@redhat.com>
> Sent: Thursday, March 28, 2024 3:26 AM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> Nanhai <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > > {
> > > >     for (int i = 0; i < p->zero_num; i++) {
> > > >         void *page = p->host + p->zero[i];
> > > >         if (!buffer_is_zero(page, p->page_size)) {
> > > >             memset(page, 0, p->page_size);
> > > >         }
> > > >     }
> > > > }
> >
> > It may not matter much (where I also see your below comments), but just
> to
> > mention another solution to avoid this read is that we can maintain
> > RAMBlock->receivedmap for precopy (especially, multifd, afaiu multifd
> > doesn't yet update this bitmap.. even if normal precopy does), then here
> > instead of scanning every time, maybe we can do:
> >
> >   /*
> >    * If it's the 1st time receiving it, no need to clear it as it must
> be
> >    * all zeros now.
> >    */
> >   if (bitmap_test(rb->receivedmap, page_offset)) {
> >       memset(page, 0, ...);
> >   } else {
> >       bitmap_set(rb->receivedmap, page_offset);
> >   }
> >
> > And we also always set the bit when !zero too.
> >
> > My rational is that it's unlikely a zero page if it's sent once or more,
> > while OTOH for the 1st time we receive it, it must be a zero page, so no
> > need to scan for the 1st round.
> 
> Thinking about this, I'm wondering whether we should have this regardless.
> IIUC now multifd will always require two page faults on destination for
> anonymous guest memories (I suppose shmem/hugetlb is fine as no zero page
> in those worlds).  Even though it should be faster than DMA faults, it
> still is unwanted.
> 
> I'll take a note myself as todo to do some measurements in the future
> first.  However if anyone thinks that makes sense and want to have a look,
> please say so.  It'll be more than welcomed.

Yes, I think this is a good improvement that avoids the two page faults. I can test
the performance impact of this change on SVM-capable devices and give some data
later. As we saw before, the IOTLB flush is triggered by the COW; with this change,
the impact of the COW should be gone.

If you need more testing and analysis on this, please let me know
Peter Xu March 28, 2024, 3:16 p.m. UTC | #15
On Thu, Mar 28, 2024 at 02:32:37AM +0000, Liu, Yuan1 wrote:
> > -----Original Message-----
> > From: Peter Xu <peterx@redhat.com>
> > Sent: Thursday, March 28, 2024 3:26 AM
> > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> > Nanhai <nanhai.zou@intel.com>
> > Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> > qpl compression
> > 
> > On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > > > {
> > > > >     for (int i = 0; i < p->zero_num; i++) {
> > > > >         void *page = p->host + p->zero[i];
> > > > >         if (!buffer_is_zero(page, p->page_size)) {
> > > > >             memset(page, 0, p->page_size);
> > > > >         }
> > > > >     }
> > > > > }
> > >
> > > It may not matter much (where I also see your below comments), but just
> > to
> > > mention another solution to avoid this read is that we can maintain
> > > RAMBlock->receivedmap for precopy (especially, multifd, afaiu multifd
> > > doesn't yet update this bitmap.. even if normal precopy does), then here
> > > instead of scanning every time, maybe we can do:
> > >
> > >   /*
> > >    * If it's the 1st time receiving it, no need to clear it as it must
> > be
> > >    * all zeros now.
> > >    */
> > >   if (bitmap_test(rb->receivedmap, page_offset)) {
> > >       memset(page, 0, ...);
> > >   } else {
> > >       bitmap_set(rb->receivedmap, page_offset);
> > >   }
> > >
> > > And we also always set the bit when !zero too.
> > >
> > > My rational is that it's unlikely a zero page if it's sent once or more,
> > > while OTOH for the 1st time we receive it, it must be a zero page, so no
> > > need to scan for the 1st round.
> > 
> > Thinking about this, I'm wondering whether we should have this regardless.
> > IIUC now multifd will always require two page faults on destination for
> > anonymous guest memories (I suppose shmem/hugetlb is fine as no zero page
> > in those worlds).  Even though it should be faster than DMA faults, it
> > still is unwanted.
> > 
> > I'll take a note myself as todo to do some measurements in the future
> > first.  However if anyone thinks that makes sense and want to have a look,
> > please say so.  It'll be more than welcomed.
> 
> Yes, I think this is a better improvement to avoid two page faults. I can test
> the performance impact of this change on SVM-capable devices and give some data
> later. As we saw before, the IOTLB flush occurs via COW, with the change, the 
> impact of the COW should be gone.
> 
> If you need more testing and analysis on this, please let me know

Nothing more than that.  Just a heads up that Xiang used to mention a test
case where Richard used to suggest dropping the zero check:

https://lore.kernel.org/r/CAAYibXib+TWnJpV22E=adncdBmwXJRqgRjJXK7X71J=bDfaxDg@mail.gmail.com

AFAIU this should be resolved if we have the bitmap maintained, but we can
double check.  IIUC that's exactly the case for an idle guest, in that case
it should be even faster to skip the memcmp when bit clear.

If you're going to post the patches, feel free to post that as a standalone
small series first, then it can be considered for merging even earlier.

Thanks a lot for doing this.
Yuan Liu March 29, 2024, 2:04 a.m. UTC | #16
> -----Original Message-----
> From: Peter Xu <peterx@redhat.com>
> Sent: Thursday, March 28, 2024 11:16 PM
> To: Liu, Yuan1 <yuan1.liu@intel.com>
> Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com; Zou,
> Nanhai <nanhai.zou@intel.com>
> Subject: Re: [PATCH v5 5/7] migration/multifd: implement initialization of
> qpl compression
> 
> On Thu, Mar 28, 2024 at 02:32:37AM +0000, Liu, Yuan1 wrote:
> > > -----Original Message-----
> > > From: Peter Xu <peterx@redhat.com>
> > > Sent: Thursday, March 28, 2024 3:26 AM
> > > To: Liu, Yuan1 <yuan1.liu@intel.com>
> > > Cc: Daniel P. Berrangé <berrange@redhat.com>; farosas@suse.de; qemu-
> > > devel@nongnu.org; hao.xiang@bytedance.com; bryan.zhang@bytedance.com;
> Zou,
> > > Nanhai <nanhai.zou@intel.com>
> > > Subject: Re: [PATCH v5 5/7] migration/multifd: implement
> initialization of
> > > qpl compression
> > >
> > > On Fri, Mar 22, 2024 at 12:40:32PM -0400, Peter Xu wrote:
> > > > > > void multifd_recv_zero_page_process(MultiFDRecvParams *p)
> > > > > > {
> > > > > >     for (int i = 0; i < p->zero_num; i++) {
> > > > > >         void *page = p->host + p->zero[i];
> > > > > >         if (!buffer_is_zero(page, p->page_size)) {
> > > > > >             memset(page, 0, p->page_size);
> > > > > >         }
> > > > > >     }
> > > > > > }
> > > >
> > > > It may not matter much (where I also see your below comments), but
> just
> > > to
> > > > mention another solution to avoid this read is that we can maintain
> > > > RAMBlock->receivedmap for precopy (especially, multifd, afaiu
> multifd
> > > > doesn't yet update this bitmap.. even if normal precopy does), then
> here
> > > > instead of scanning every time, maybe we can do:
> > > >
> > > >   /*
> > > >    * If it's the 1st time receiving it, no need to clear it as it
> must
> > > be
> > > >    * all zeros now.
> > > >    */
> > > >   if (bitmap_test(rb->receivedmap, page_offset)) {
> > > >       memset(page, 0, ...);
> > > >   } else {
> > > >       bitmap_set(rb->receivedmap, page_offset);
> > > >   }
> > > >
> > > > And we also always set the bit when !zero too.
> > > >
> > > > My rationale is that it's unlikely a zero page if it's sent once or
> more,
> > > > while OTOH for the 1st time we receive it, it must be a zero page,
> so no
> > > > need to scan for the 1st round.
> > >
> > > Thinking about this, I'm wondering whether we should have this
> regardless.
> > > IIUC now multifd will always require two page faults on destination
> for
> > > anonymous guest memories (I suppose shmem/hugetlb is fine as no zero
> page
> > > in those worlds).  Even though it should be faster than DMA faults, it
> > > still is unwanted.
> > >
> > > I'll take a note myself as todo to do some measurements in the future
> > > first.  However if anyone thinks that makes sense and want to have a
> look,
> > > please say so.  It'll be more than welcomed.
> >
> > Yes, I think this is a better improvement to avoid two page faults. I
> can test
> > the performance impact of this change on SVM-capable devices and give
> some data
> > later. As we saw before, the IOTLB flush occurs via COW, with the
> change, the
> > impact of the COW should be gone.
> >
> > If you need more testing and analysis on this, please let me know
> 
> Nothing more than that.  Just a heads up that Xiang used to mention a test
> case where Richard used to suggest dropping the zero check:
> 
> https://lore.kernel.org/r/CAAYibXib+TWnJpV22E=adncdBmwXJRqgRjJXK7X71J=bDfa
> xDg@mail.gmail.com
> 
> AFAIU this should be resolved if we have the bitmap maintained, but we can
> double check.  IIUC that's exactly the case for an idle guest, in that
> case
> it should be even faster to skip the memcmp when bit clear.
> 
> If you're going to post the patches, feel free to post that as a
> standalone
> small series first, then that can be considered merge even earlier.
> 
> Thanks a lot for doing this.

Sure, I will prepare a separate patch for this, and we can have a better discussion
on the concrete implementation and test results.
diff mbox series

Patch

diff --git a/migration/multifd-qpl.c b/migration/multifd-qpl.c
index 056a68a060..6de65e9da7 100644
--- a/migration/multifd-qpl.c
+++ b/migration/multifd-qpl.c
@@ -9,12 +9,253 @@ 
  * This work is licensed under the terms of the GNU GPL, version 2 or later.
  * See the COPYING file in the top-level directory.
  */
+
 #include "qemu/osdep.h"
 #include "qemu/module.h"
+#include "qapi/error.h"
+#include "migration.h"
+#include "multifd.h"
+#include "qpl/qpl.h"
+
+/*
+ * Per-channel QPL state: the job objects plus the buffers that hold
+ * compressed data. Filled in by init_qpl() and released by deinit_qpl().
+ */
+typedef struct {
+    /* array of job_num pointers, each to a separately allocated qpl job */
+    qpl_job **job_array;
+    /* the number of allocated jobs */
+    uint32_t job_num;
+    /* the size of data processed by a qpl job */
+    uint32_t data_size;
+    /* compressed data buffer, mapped with job_num * data_size bytes */
+    uint8_t *zbuf;
+    /*
+     * array of job_num entries holding the length of compressed data
+     * (presumably one entry per job - confirm once the users are added)
+     */
+    uint32_t *zbuf_hdr;
+} QplData;
+
+/*
+ * Release the compressed-data mapping and the length array owned by @qpl.
+ * Both pointers are reset to NULL afterwards, so a repeated call is harmless.
+ */
+static void free_zbuf(QplData *qpl)
+{
+    uint8_t *mapping = qpl->zbuf;
+
+    if (mapping) {
+        /* the mapping covers job_num * data_size bytes */
+        munmap(mapping, qpl->job_num * qpl->data_size);
+    }
+    qpl->zbuf = NULL;
+
+    /* g_free() ignores NULL, so no guard is needed for the length array */
+    g_free(qpl->zbuf_hdr);
+    qpl->zbuf_hdr = NULL;
+}
+
+/*
+ * Allocate the buffers of @qpl: an anonymous private mapping of
+ * job_num * data_size bytes for compressed data (requested with
+ * MAP_POPULATE) and a zero-filled array of job_num length fields.
+ *
+ * qpl->job_num and qpl->data_size must be set before calling.
+ *
+ * Returns 0 for success or -1 for error; on error @errp is set and the
+ * buffer pointers in @qpl are left untouched.
+ */
+static int alloc_zbuf(QplData *qpl, uint8_t chan_id, Error **errp)
+{
+    int flags = MAP_PRIVATE | MAP_POPULATE | MAP_ANONYMOUS;
+    uint32_t size;
+    uint8_t *buf;
+
+    /*
+     * The mapping length is computed in 32 bits, so refuse geometries whose
+     * product would wrap around and yield a mapping smaller than
+     * job_num * data_size.
+     */
+    if (qpl->job_num != 0 && qpl->data_size > UINT32_MAX / qpl->job_num) {
+        error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u, size %u",
+                   chan_id, qpl->job_num, qpl->data_size);
+        return -1;
+    }
+    size = qpl->job_num * qpl->data_size;
+
+    buf = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, -1, 0);
+    if (buf == MAP_FAILED) {
+        error_setg(errp, "multifd: %u: alloc_zbuf failed, job num %u, size %u",
+                   chan_id, qpl->job_num, qpl->data_size);
+        return -1;
+    }
+    qpl->zbuf = buf;
+    qpl->zbuf_hdr = g_new0(uint32_t, qpl->job_num);
+    return 0;
+}
+
+/*
+ * Finalize and free every job of @qpl, then the job array itself.
+ *
+ * Safe to call on a partially built or absent array: the array is allocated
+ * zero-filled, so slots that were never populated are NULL and are skipped
+ * instead of being handed to qpl_fini_job().
+ */
+static void free_jobs(QplData *qpl)
+{
+    if (qpl->job_array == NULL) {
+        return;
+    }
+    for (uint32_t i = 0; i < qpl->job_num; i++) {
+        if (qpl->job_array[i] == NULL) {
+            continue;
+        }
+        qpl_fini_job(qpl->job_array[i]);
+        g_free(qpl->job_array[i]);
+        qpl->job_array[i] = NULL;
+    }
+    g_free(qpl->job_array);
+    qpl->job_array = NULL;
+}
+
+/*
+ * Allocate and initialize qpl->job_num jobs on the hardware path and store
+ * them in a newly allocated qpl->job_array.
+ *
+ * qpl->job_num must be set before calling.
+ *
+ * Returns 0 for success or -1 for error. On error @errp is set, every job
+ * created so far is released and qpl->job_array is NULL again.
+ */
+static int alloc_jobs(QplData *qpl, uint8_t chan_id, Error **errp)
+{
+    qpl_status status;
+    uint32_t job_size = 0;
+    /* always use IAA hardware accelerator */
+    qpl_path_t path = qpl_path_hardware;
+
+    status = qpl_get_job_size(path, &job_size);
+    if (status != QPL_STS_OK) {
+        error_setg(errp, "multifd: %u: qpl_get_job_size failed with error %d",
+                   chan_id, status);
+        return -1;
+    }
+    qpl->job_array = g_new0(qpl_job *, qpl->job_num);
+    for (uint32_t i = 0; i < qpl->job_num; i++) {
+        qpl_job *job = g_malloc0(job_size);
+
+        status = qpl_init_job(path, job);
+        if (status != QPL_STS_OK) {
+            error_setg(errp, "multifd: %u: qpl_init_job failed with error %d",
+                       chan_id, status);
+            /* this job was neither initialized nor stored: only free it */
+            g_free(job);
+            /* unwind just the jobs that were fully set up */
+            while (i-- > 0) {
+                qpl_fini_job(qpl->job_array[i]);
+                g_free(qpl->job_array[i]);
+            }
+            g_free(qpl->job_array);
+            qpl->job_array = NULL;
+            return -1;
+        }
+        qpl->job_array[i] = job;
+    }
+    return 0;
+}
+
+/*
+ * Record the geometry (@job_num jobs of @data_size bytes each) in @qpl, then
+ * allocate the compressed-data buffers followed by the jobs. If the jobs
+ * cannot be set up, the buffers are released again.
+ *
+ * Returns 0 for success or -1 for error (with @errp set by the helper that
+ * failed).
+ */
+static int init_qpl(QplData *qpl, uint32_t job_num, uint32_t data_size,
+                    uint8_t chan_id, Error **errp)
+{
+    int ret;
+
+    qpl->job_num = job_num;
+    qpl->data_size = data_size;
+
+    ret = alloc_zbuf(qpl, chan_id, errp);
+    if (ret == 0) {
+        ret = alloc_jobs(qpl, chan_id, errp);
+        if (ret != 0) {
+            free_zbuf(qpl);
+        }
+    }
+    return ret == 0 ? 0 : -1;
+}
+
+/*
+ * Tear down everything init_qpl() created and zero the recorded geometry.
+ * A NULL @qpl is accepted and ignored. The QplData object itself is not
+ * freed here.
+ */
+static void deinit_qpl(QplData *qpl)
+{
+    if (qpl == NULL) {
+        return;
+    }
+    /* the helpers read job_num/data_size, so clear those fields last */
+    free_jobs(qpl);
+    free_zbuf(qpl);
+    qpl->data_size = 0;
+    qpl->job_num = 0;
+}
+
+/**
+ * qpl_send_setup: setup send side
+ *
+ * Allocate the QPL state for a send channel (one job per page of a packet)
+ * and the IOV array used to send the compressed pages.
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int qpl_send_setup(MultiFDSendParams *p, Error **errp)
+{
+    QplData *state = g_new0(QplData, 1);
+    int rc = init_qpl(state, p->page_count, p->page_size, p->id, errp);
+
+    if (rc != 0) {
+        g_free(state);
+        return -1;
+    }
+    p->compress_data = state;
+
+    /* the IOV array must not have been set up by anyone else */
+    assert(p->iov == NULL);
+    /*
+     * One IOV per page, since every page is compressed on its own, plus two
+     * more: one for the packet header and one for the compressed data
+     * lengths.
+     */
+    p->iov = g_new0(struct iovec, p->page_count + 2);
+    return 0;
+}
+
+/**
+ * qpl_send_cleanup: cleanup send side
+ *
+ * Release the QPL state and the IOV array that qpl_send_setup() attached
+ * to the channel.
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error (not used by this function)
+ */
+static void qpl_send_cleanup(MultiFDSendParams *p, Error **errp)
+{
+    QplData *qpl = p->compress_data;
+
+    /* deinit_qpl() tolerates NULL, e.g. when setup never completed */
+    deinit_qpl(qpl);
+    g_free(p->compress_data);
+    p->compress_data = NULL;
+    /*
+     * qpl_send_setup() allocates p->iov, so release it here as well.
+     * NOTE(review): assumes the generic multifd cleanup does not free
+     * p->iov without clearing it before this hook runs - confirm against
+     * multifd.c.
+     */
+    g_free(p->iov);
+    p->iov = NULL;
+}
+
+/**
+ * qpl_send_prepare: prepare data to be able to send
+ *
+ * Intended to create a compressed buffer with all the pages that we are
+ * going to send. Currently a placeholder that always fails.
+ *
+ * Returns 0 for success or -1 for error (always -1 for now)
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error (not set by the placeholder)
+ */
+static int qpl_send_prepare(MultiFDSendParams *p, Error **errp)
+{
+    /* Implement in next patch */
+    return -1;
+}
+
+/**
+ * qpl_recv_setup: setup receive side
+ *
+ * Allocate the QPL state for a receive channel (one job per page of a
+ * packet) and attach it to the channel.
+ *
+ * Returns 0 for success or -1 for error
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error
+ */
+static int qpl_recv_setup(MultiFDRecvParams *p, Error **errp)
+{
+    QplData *state = g_new0(QplData, 1);
+
+    if (init_qpl(state, p->page_count, p->page_size, p->id, errp) == 0) {
+        p->compress_data = state;
+        return 0;
+    }
+    /* nothing was attached to the channel, so only drop the container */
+    g_free(state);
+    return -1;
+}
+
+/**
+ * qpl_recv_cleanup: cleanup receive side
+ *
+ * Release the QPL state attached to the channel by qpl_recv_setup().
+ *
+ * @p: Params for the channel that we are using
+ */
+static void qpl_recv_cleanup(MultiFDRecvParams *p)
+{
+    /* deinit_qpl() accepts NULL, so no separate check is needed */
+    deinit_qpl(p->compress_data);
+    g_free(p->compress_data);
+    p->compress_data = NULL;
+}
+
+/**
+ * qpl_recv: read the data from the channel into actual pages
+ *
+ * Intended to read the compressed buffer and uncompress it into the
+ * actual pages. Currently a placeholder that always fails.
+ *
+ * Returns 0 for success or -1 for error (always -1 for now)
+ *
+ * @p: Params for the channel that we are using
+ * @errp: pointer to an error (not set by the placeholder)
+ */
+static int qpl_recv(MultiFDRecvParams *p, Error **errp)
+{
+    /* Implement in next patch */
+    return -1;
+}
+
+/* Callback table handed to multifd_register_ops() for the QPL method. */
+static MultiFDMethods multifd_qpl_ops = {
+    .send_setup = qpl_send_setup,
+    .send_cleanup = qpl_send_cleanup,
+    .send_prepare = qpl_send_prepare,
+    .recv_setup = qpl_recv_setup,
+    .recv_cleanup = qpl_recv_cleanup,
+    .recv = qpl_recv,
+};
 
+/* Register the QPL callbacks with multifd under MULTIFD_COMPRESSION_QPL. */
 static void multifd_qpl_register(void)
 {
-    /* noop */
+    multifd_register_ops(MULTIFD_COMPRESSION_QPL, &multifd_qpl_ops);
 }
 
 migration_init(multifd_qpl_register);