Message ID | 1359410505-4715-1-git-send-email-mrhines@linux.vnet.ibm.com |
---|---|
State | New |
Headers | show |
Hello, Am 28.01.2013 23:01, schrieb mrhines@linux.vnet.ibm.com: > From: "Michael R. Hines" <mrhines@us.ibm.com> > > > Signed-off-by: Michael R. Hines <mrhines@us.ibm.com> > --- > Makefile.target | 5 +- > include/qemu/rdma.h | 249 ++++++++++ > qemu-rdma.c | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 1609 insertions(+), 2 deletions(-) > create mode 100644 include/qemu/rdma.h > create mode 100644 qemu-rdma.c This series is missing a cover letter with explanations, starting with the acronym and its purpose. Further, the commit messages are lacking descriptions. Any reason that the code is GPLv2 rather than GPLv2+? Regards, Andreas
Andreas Färber <afaerber@suse.de> writes: > Hello, > > Am 28.01.2013 23:01, schrieb mrhines@linux.vnet.ibm.com: >> From: "Michael R. Hines" <mrhines@us.ibm.com> >> >> >> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com> >> --- >> Makefile.target | 5 +- >> include/qemu/rdma.h | 249 ++++++++++ >> qemu-rdma.c | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 1609 insertions(+), 2 deletions(-) >> create mode 100644 include/qemu/rdma.h >> create mode 100644 qemu-rdma.c > > This series is missing a cover letter with explanations, starting with > the acronym and its purpose. Indeed. A bit of an introduction of what problem the series solves is in order. Can you share performance data with and without RDMA? > Further, the commit messages are lacking > descriptions. Any reason that the code is GPLv2 rather than GPLv2+? I assume copy/paste. Michael, unless you had an explicit reason to use GPLv2 only, we now prefer GPLv2+. Regards, Anthony Liguori > > Regards, > Andreas > > -- > SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany > GF: Jeff Hawn, Jennifer Guild, Felix Imendörffer; HRB 16746 AG Nürnberg
On Mon, Jan 28, 2013 at 05:01:41PM -0500, mrhines@linux.vnet.ibm.com wrote: > From: "Michael R. Hines" <mrhines@us.ibm.com> > > > Signed-off-by: Michael R. Hines <mrhines@us.ibm.com> > --- > Makefile.target | 5 +- > include/qemu/rdma.h | 249 ++++++++++ > qemu-rdma.c | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 1609 insertions(+), 2 deletions(-) > create mode 100644 include/qemu/rdma.h > create mode 100644 qemu-rdma.c > > diff --git a/Makefile.target b/Makefile.target > index 760da1e..d1d6b8c 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o > obj-y += hw/ > obj-$(CONFIG_KVM) += kvm-all.o > obj-$(CONFIG_NO_KVM) += kvm-stub.o > -obj-y += memory.o savevm.o cputlb.o > +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o > +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o Please drop tracefunc.o. > obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o > obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o > obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o > obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o > -LIBS+=-lz > +LIBS+=-lz -lrdmacm This needs to be a ./configure option. Not all users will choose to build with RDMA support and rdmacm may not be available on all host platforms. Stefan
Il 28/01/2013 23:01, mrhines@linux.vnet.ibm.com ha scritto: > From: "Michael R. Hines" <mrhines@us.ibm.com> > > > Signed-off-by: Michael R. Hines <mrhines@us.ibm.com> > --- > Makefile.target | 5 +- > include/qemu/rdma.h | 249 ++++++++++ Please make this include/migration/rdma.h Paolo > qemu-rdma.c | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++ > 3 files changed, 1609 insertions(+), 2 deletions(-) > create mode 100644 include/qemu/rdma.h > create mode 100644 qemu-rdma.c > > diff --git a/Makefile.target b/Makefile.target > index 760da1e..d1d6b8c 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o > obj-y += hw/ > obj-$(CONFIG_KVM) += kvm-all.o > obj-$(CONFIG_NO_KVM) += kvm-stub.o > -obj-y += memory.o savevm.o cputlb.o > +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o > +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o > obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o > obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o > obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o > obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o > -LIBS+=-lz > +LIBS+=-lz -lrdmacm > > # xen support > obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o > diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h > new file mode 100644 > index 0000000..099622e > --- /dev/null > +++ b/include/qemu/rdma.h > @@ -0,0 +1,249 @@ > +/* > + * RDMA data structures and helper functions header (for migration) > + * > + * Copyright IBM, Corp. 2013 > + * > + * Authors: > + * Michael R. Hines <mrhines@us.ibm.com> > + * Jiuxing Liu <jl@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. 
> + * > + */ > + > +#ifndef _QEMU_RDMA_H > +#define _QEMU_RDMA_H > + > +#include <rdma/rdma_cma.h> > +#include "monitor/monitor.h" > + > +extern int rdmaport; > +extern char rdmahost[64]; > + > +struct qemu_rdma_context { > + /* cm_id also has ibv_conext, rdma_event_channel, and ibv_qp in > + cm_id->verbs, cm_id->channel, and cm_id->qp. */ > + struct rdma_cm_id *cm_id; > + struct rdma_cm_id *listen_id; > + > + struct ibv_context *verbs; > + struct rdma_event_channel *channel; > + struct ibv_qp *qp; > + > + struct ibv_comp_channel *comp_channel; > + struct ibv_pd *pd; > + struct ibv_cq *cq; > +}; > + > +static inline void qemu_rdma_init_context(struct qemu_rdma_context *rdma_ctx) > +{ > + rdma_ctx->cm_id = NULL; > + rdma_ctx->listen_id = NULL; > + rdma_ctx->verbs = NULL; > + rdma_ctx->channel = NULL; > + rdma_ctx->qp = NULL; > + rdma_ctx->comp_channel = NULL; > + rdma_ctx->pd = NULL; > + rdma_ctx->cq = NULL; > +} > + > +void cpu_physical_memory_reset_dirty_all(void); > + > +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx, > + const char *host, int port); > +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx); > +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx); > +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len); > +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len); > +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx); > +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx); > + > +/* Instead of registering whole ram blocks, we can register them in smaller > + * chunks. 
This may be benefial if the ram blocks have holes in them */ > +#define QEMU_RDMA_CHUNK_REGISTRATION > + > +#define QEMU_RDMA_LAZY_REGISTRATION > + > +#define QEMU_RDMA_REG_CHUNK_SHIFT 20 > +#define QEMU_RDMA_REG_CHUNK_SIZE (1UL << (QEMU_RDMA_REG_CHUNK_SHIFT)) > +#define QEMU_RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \ > + (((unsigned long)(host_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT) - \ > + ((unsigned long)(start_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT)) > +#define QEMU_RDMA_REG_NUM_CHUNKS(rdma_ram_block) \ > + (QEMU_RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\ > + (rdma_ram_block)->local_host_addr +\ > + (rdma_ram_block)->length) + 1) > +#define QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\ > + ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \ > + QEMU_RDMA_REG_CHUNK_SHIFT) + (i)) << \ > + QEMU_RDMA_REG_CHUNK_SHIFT)) > +#define QEMU_RDMA_REG_CHUNK_END(rdma_ram_block, i) \ > + (QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) + \ > + QEMU_RDMA_REG_CHUNK_SIZE) > + > +struct qemu_rdma_ram_block { > + uint8_t *local_host_addr; > + uint64_t remote_host_addr; > + uint64_t offset; > + uint64_t length; > + struct ibv_mr **pmr; > + struct ibv_mr *mr; > + uint32_t remote_rkey; > +}; > + > +struct qemu_rdma_remote_ram_block { > + uint64_t remote_host_addr; > + uint64_t offset; > + uint64_t length; > + uint32_t remote_rkey; > +}; > + > +#define QEMU_MAX_RAM_BLOCKS 64 > + > +struct qemu_rdma_ram_blocks { > + int num_blocks; > + struct qemu_rdma_ram_block block[QEMU_MAX_RAM_BLOCKS]; > +}; > + > +struct qemu_rdma_remote_ram_blocks { > + int num_blocks; > + struct qemu_rdma_remote_ram_block block[QEMU_MAX_RAM_BLOCKS]; > +}; > + > +int qemu_rdma_init_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks); > +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks); > +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks); > 
+int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks); > +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks); > +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks); > + > +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, > + struct qemu_rdma_remote_ram_blocks *remote); > +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, > + struct qemu_rdma_remote_ram_blocks *remote); > + > +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length, > + struct qemu_rdma_ram_blocks *blocks, > + int *block_index, int *chunk_index); > + > +struct qemu_rdma_migration_data { > + char *host; > + int port; > + int enabled; > + > + struct qemu_rdma_context rdma_ctx; > + struct qemu_rdma_ram_blocks rdma_ram_blocks; > + > + /* This is used for synchronization: We use > + IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs > + are done. When the receiver gets it, it can be certain > + that all the RDMAs are completed. */ > + int sync; > + struct ibv_mr *sync_mr; > + > + /* This is used for the server to write the remote > + ram blocks info. */ > + struct qemu_rdma_remote_ram_blocks remote_info; > + struct ibv_mr *remote_info_mr; > + > + /* The rest is only for the initiator of the migration. 
*/ > + int client_init_done; > + > + /* number of outstanding unsignaled send */ > + int num_unsignaled_send; > + > + /* number of outstanding signaled send */ > + int num_signaled_send; > + > + /* store info about current buffer so that we can > + merge it with future sends */ > + uint64_t current_offset; > + uint64_t current_length; > + /* index of ram block the current buffer belongs to */ > + int current_index; > + /* index of the chunk in the current ram block */ > + int current_chunk; > + > + uint64_t total_bytes; > + > +}; > + > +extern struct qemu_rdma_migration_data rdma_mdata; > + > +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata); > + > +static inline int qemu_use_rdma_migration(void) > +{ > + /* port will be non-zero if user wants to use RDMA. */ > + return rdma_mdata.port != -1; > +} > + > +static inline int qemu_rdma_migration_enabled(void) > +{ > + return rdma_mdata.enabled; > +} > + > +void qemu_rdma_migration_disable(void); > + > +#define QEMU_RDMA_MIGRATION_BLOCKING > +#define QEMU_RDMA_MIGRATION_EXTRA_SYNC > + > +enum { > + QEMU_RDMA_MIGRATION_WRID_NONE = 0, > + QEMU_RDMA_MIGRATION_WRID_RDMA, > + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC, > + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC, > + QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO, > + QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO, > + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC, > + QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC, > +}; > + > +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host, > + int port); > +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata, > + int wr_id); > +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata, > + int wr_id); > + > +int qemu_rdma_migration_reg_remote_info( > + struct qemu_rdma_migration_data *mdata); > +int 
qemu_rdma_migration_dereg_remote_info( > + struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_post_send_remote_info( > + struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_post_recv_remote_info( > + struct qemu_rdma_migration_data *mdata); > + > +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata, > + uint64_t addr, uint64_t len); > +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_wait_for_wrid( > + struct qemu_rdma_migration_data *mdata, > + int wrid); > +int qemu_rdma_migration_poll_for_wrid( > + struct qemu_rdma_migration_data *mdata, > + int wrid); > +int qemu_rdma_migration_block_for_wrid( > + struct qemu_rdma_migration_data *mdata, > + int wrid); > +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata); > + > +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata); > +int qemu_rdma_migration_server_wait_for_client( > + struct qemu_rdma_migration_data *mdata); > + > +#endif > diff --git a/qemu-rdma.c b/qemu-rdma.c > new file mode 100644 > index 0000000..5f16875 > --- /dev/null > +++ b/qemu-rdma.c > @@ -0,0 +1,1357 @@ > +/* > + * RDMA data structures and helper functions (for migration) > + * > + * Copyright IBM, Corp. 2013 > + * > + * Authors: > + * Michael R. Hines <mrhines@us.ibm.com> > + * Jiuxing Liu <jl@us.ibm.com> > + * > + * This work is licensed under the terms of the GNU GPL, version 2. See > + * the COPYING file in the top-level directory. 
> + * > + */ > + > +#include "qemu/rdma.h" > +#include "qemu-common.h" > +#include <stdio.h> > +#include <sys/types.h> > +#include <sys/socket.h> > +#include <netdb.h> > +#include <arpa/inet.h> > +#include <string.h> > + > +#define QEMU_RDMA_RESOLVE_TIMEOUT_MS 10000 > +#define QEMU_RDMA_CQ_SIZE 2000 > +#define QEMU_RDMA_QP_SIZE 1000 > + > +int rdmaport = -1; > +char rdmahost[64] = ""; > +struct qemu_rdma_migration_data rdma_mdata; > + > +static void *qemu_rdma_mallocz(size_t size) > +{ > + void *ptr; > + ptr = malloc(size); > + memset(ptr, 0, size); > + return ptr; > +} > + > +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx, > + const char *host, int port) > +{ > + int ret; > + struct addrinfo *res; > + char port_str[16]; > + struct rdma_cm_event *cm_event; > + > + > + if (!strcmp(host, "")) { > + printf("RDMA hostname has not been set\n"); > + return -1; > + } > + > + /* create CM channel */ > + rdma_ctx->channel = rdma_create_event_channel(); > + if (!rdma_ctx->channel) { > + printf("could not create CM channel\n"); > + return -1; > + } > + > + /* create CM id */ > + ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL, > + RDMA_PS_TCP); > + if (ret) { > + printf("could not create channel id\n"); > + goto err_resolve_create_id; > + } > + > + snprintf(port_str, 16, "%d", port); > + port_str[15] = '\0'; > + ret = getaddrinfo(host, port_str, NULL, &res); > + if (ret < 0) { > + printf("could not getaddrinfo address %s\n", host); > + goto err_resolve_get_addr; > + } > + > + /* resolve the first address */ > + ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr, > + QEMU_RDMA_RESOLVE_TIMEOUT_MS); > + if (ret) { > + printf("could not resolve address %s\n", host); > + goto err_resolve_get_addr; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + printf("could not perform event_addr_resolved\n"); > + goto err_resolve_get_addr; > + } > + if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { > + 
printf("result not equal to event_addr_resolved\n"); > + rdma_ack_cm_event(cm_event); > + goto err_resolve_get_addr; > + } > + rdma_ack_cm_event(cm_event); > + > + /* resolve route */ > + ret = rdma_resolve_route(rdma_ctx->cm_id, QEMU_RDMA_RESOLVE_TIMEOUT_MS); > + if (ret) { > + printf("could not resolve rdma route\n"); > + goto err_resolve_get_addr; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + printf("could not perform event_route_resolved\n"); > + goto err_resolve_get_addr; > + } > + if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { > + printf("result not equal to event_route_resolved\n"); > + rdma_ack_cm_event(cm_event); > + goto err_resolve_get_addr; > + } > + rdma_ack_cm_event(cm_event); > + > + rdma_ctx->verbs = rdma_ctx->cm_id->verbs; > + return 0; > + > +err_resolve_get_addr: > + rdma_destroy_id(rdma_ctx->cm_id); > +err_resolve_create_id: > + rdma_destroy_event_channel(rdma_ctx->channel); > + rdma_ctx->channel = NULL; > + > + return -1; > +} > + > +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx) > +{ > + > + /* allocate pd */ > + rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs); > + if (!rdma_ctx->pd) { > + return -1; > + } > + > +#ifdef QEMU_RDMA_MIGRATION_BLOCKING > + /* create completion channel */ > + rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs); > + if (!rdma_ctx->comp_channel) { > + goto err_alloc_pd_cq; > + } > +#endif > + > + /* create cq */ > + rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, QEMU_RDMA_CQ_SIZE, > + NULL, rdma_ctx->comp_channel, 0); > + if (!rdma_ctx->cq) { > + goto err_alloc_pd_cq; > + } > + > + return 0; > + > +err_alloc_pd_cq: > + if (rdma_ctx->pd) { > + ibv_dealloc_pd(rdma_ctx->pd); > + } > + if (rdma_ctx->comp_channel) { > + ibv_destroy_comp_channel(rdma_ctx->comp_channel); > + } > + rdma_ctx->pd = NULL; > + rdma_ctx->comp_channel = NULL; > + return -1; > + > +} > + > +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx) > +{ > + struct 
ibv_qp_init_attr attr = { 0 }; > + int ret; > + > + attr.cap.max_send_wr = QEMU_RDMA_QP_SIZE; > + attr.cap.max_recv_wr = 2; > + attr.cap.max_send_sge = 1; > + attr.cap.max_recv_sge = 1; > + attr.send_cq = rdma_ctx->cq; > + attr.recv_cq = rdma_ctx->cq; > + attr.qp_type = IBV_QPT_RC; > + > + ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr); > + if (ret) { > + return -1; > + } > + > + rdma_ctx->qp = rdma_ctx->cm_id->qp; > + return 0; > +} > + > +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len) > +{ > + int ret; > + struct rdma_conn_param conn_param = { 0 }; > + struct rdma_cm_event *cm_event; > + > + conn_param.initiator_depth = 2; > + conn_param.retry_count = 5; > + conn_param.private_data = out_data; > + conn_param.private_data_len = out_len; > + > + ret = rdma_connect(rdma_ctx->cm_id, &conn_param); > + if (ret) { > + return -1; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + return -1; > + } > + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { > + return -1; > + } > + > + if (in_len) { > + if (*in_len > cm_event->param.conn.private_data_len) { > + *in_len = cm_event->param.conn.private_data_len; > + } > + if (*in_len) { > + memcpy(in_data, cm_event->param.conn.private_data, *in_len); > + } > + } > + > + rdma_ack_cm_event(cm_event); > + > + return 0; > +} > + > +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host, > + int port) > +{ > + int ret; > + struct rdma_cm_event *cm_event; > + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; > + struct ibv_context *verbs; > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + goto err_listen; > + } > + > + if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { > + rdma_ack_cm_event(cm_event); > + goto err_listen; > + } > + > + rdma_ctx->cm_id = cm_event->id; > + verbs = cm_event->id->verbs; > + printf("verbs context after listen: %p\n", verbs); > + 
rdma_ack_cm_event(cm_event); > + > + if (!rdma_ctx->verbs) { > + rdma_ctx->verbs = verbs; > + ret = qemu_rdma_migration_server_prepare(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error preparing server!\n"); > + goto err_listen; > + } > + } else if (rdma_ctx->verbs != verbs) { > + fprintf(stderr, "ibv context not matching %p, %p!\n", > + rdma_ctx->verbs, verbs); > + goto err_listen; > + } > + /* xxx destroy listen_id ??? */ > + > + return 0; > + > +err_listen: > + > + return -1; > + > +} > + > +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx, > + void *in_data, int *in_len, void *out_data, int out_len) > +{ > + int ret; > + struct rdma_conn_param conn_param = { 0 }; > + struct rdma_cm_event *cm_event; > + > + conn_param.responder_resources = 2; > + conn_param.private_data = out_data; > + conn_param.private_data_len = out_len; > + > + ret = rdma_accept(rdma_ctx->cm_id, &conn_param); > + if (ret) { > + fprintf(stderr, "rdma_accept returns %d!\n", ret); > + return -1; > + } > + > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + return -1; > + } > + > + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { > + rdma_ack_cm_event(cm_event); > + return -1; > + } > + > + if (in_len) { > + if (*in_len > cm_event->param.conn.private_data_len) { > + *in_len = cm_event->param.conn.private_data_len; > + } > + if (*in_len) { > + memcpy(in_data, cm_event->param.conn.private_data, *in_len); > + } > + } > + > + rdma_ack_cm_event(cm_event); > + > + return 0; > +} > + > +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx) > +{ > + int ret; > + struct rdma_cm_event *cm_event; > + > + ret = rdma_disconnect(rdma_ctx->cm_id); > + if (ret) { > + return; > + } > + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); > + if (ret) { > + return; > + } > + rdma_ack_cm_event(cm_event); > +} > + > +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx) > +{ > + if (rdma_ctx->qp) { > + ibv_destroy_qp(rdma_ctx->qp); > + } > + if 
(rdma_ctx->cq) { > + ibv_destroy_cq(rdma_ctx->cq); > + } > + if (rdma_ctx->comp_channel) { > + ibv_destroy_comp_channel(rdma_ctx->comp_channel); > + } > + if (rdma_ctx->pd) { > + ibv_dealloc_pd(rdma_ctx->pd); > + } > + if (rdma_ctx->listen_id) { > + rdma_destroy_id(rdma_ctx->listen_id); > + } > + if (rdma_ctx->cm_id) { > + rdma_destroy_id(rdma_ctx->cm_id); > + } > + if (rdma_ctx->channel) { > + rdma_destroy_event_channel(rdma_ctx->channel); > + } > +} > + > +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks) > +{ > + int i, j; > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + struct qemu_rdma_ram_block *block = &(rdma_ram_blocks->block[i]); > + int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block); > + /* allocate memory to store chunk MRs */ > + rdma_ram_blocks->block[i].pmr = qemu_rdma_mallocz( > + num_chunks * sizeof(struct ibv_mr *)); > + > + if (!block->pmr) { > + goto err_reg_chunk_ram_blocks; > + } > + > + for (j = 0; j < num_chunks; j++) { > + uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, j); > + uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, j); > + if (start_addr < block->local_host_addr) { > + start_addr = block->local_host_addr; > + } > + if (end_addr > block->local_host_addr + block->length) { > + end_addr = block->local_host_addr + block->length; > + } > + block->pmr[j] = ibv_reg_mr(rdma_ctx->pd, > + start_addr, > + end_addr - start_addr, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (!block->pmr[j]) { > + break; > + } > + } > + if (j < num_chunks) { > + for (j--; j >= 0; j--) { > + ibv_dereg_mr(block->pmr[j]); > + } > + block->pmr[i] = NULL; > + goto err_reg_chunk_ram_blocks; > + } > + } > + > + return 0; > + > +err_reg_chunk_ram_blocks: > + for (i--; i >= 0; i--) { > + int num_chunks = > + QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); > + for (j = 0; j < num_chunks; j++) { > + 
ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); > + } > + free(rdma_ram_blocks->block[i].pmr); > + rdma_ram_blocks->block[i].pmr = NULL; > + } > + > + return -1; > + > +} > + > +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks) > +{ > + int i; > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + rdma_ram_blocks->block[i].mr = > + ibv_reg_mr(rdma_ctx->pd, > + rdma_ram_blocks->block[i].local_host_addr, > + rdma_ram_blocks->block[i].length, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (!rdma_ram_blocks->block[i].mr) { > + break; > + } > + } > + > + if (i >= rdma_ram_blocks->num_blocks) { > + return 0; > + } > + > + for (i--; i >= 0; i--) { > + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); > + } > + > + return -1; > + > +} > + > +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks) > +{ > +#ifdef QEMU_RDMA_CHUNK_REGISTRATION > +#ifdef QEMU_RDMA_LAZY_REGISTRATION > + return 0; > +#else > + return qemu_rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks); > +#endif > +#else > + return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); > +#endif > +} > + > +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_blocks *rdma_ram_blocks) > +{ > + return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); > +} > + > +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks) > +{ > + int i, j; > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + int num_chunks; > + if (!rdma_ram_blocks->block[i].pmr) { > + continue; > + } > + num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); > + for (j = 0; j < num_chunks; j++) { > + if (!rdma_ram_blocks->block[i].pmr[j]) { > + continue; > + } > + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); > + } > + free(rdma_ram_blocks->block[i].pmr); > + 
rdma_ram_blocks->block[i].pmr = NULL; > + } > + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { > + if (!rdma_ram_blocks->block[i].mr) { > + continue; > + } > + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); > + rdma_ram_blocks->block[i].mr = NULL; > + } > +} > + > +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, > + struct qemu_rdma_remote_ram_blocks *remote) > +{ > + int i; > + remote->num_blocks = local->num_blocks; > + for (i = 0; i < local->num_blocks; i++) { > + remote->block[i].remote_host_addr = > + (uint64_t)(local->block[i].local_host_addr); > + remote->block[i].remote_rkey = local->block[i].mr->rkey; > + remote->block[i].offset = local->block[i].offset; > + remote->block[i].length = local->block[i].length; > + } > +} > + > +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, > + struct qemu_rdma_remote_ram_blocks *remote) > +{ > + int i, j; > + > + if (local->num_blocks != remote->num_blocks) { > + return -1; > + } > + > + for (i = 0; i < remote->num_blocks; i++) { > + /* search local ram blocks */ > + for (j = 0; j < local->num_blocks; j++) { > + if (remote->block[i].offset != local->block[j].offset) { > + continue; > + } > + if (remote->block[i].length != local->block[j].length) { > + return -1; > + } > + local->block[j].remote_host_addr = > + remote->block[i].remote_host_addr; > + local->block[j].remote_rkey = remote->block[i].remote_rkey; > + break; > + } > + if (j >= local->num_blocks) { > + return -1; > + } > + } > + > + return 0; > +} > + > +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length, > + struct qemu_rdma_ram_blocks *blocks, > + int *block_index, int *chunk_index) > +{ > + int i; > + for (i = 0; i < blocks->num_blocks; i++) { > + if (offset < blocks->block[i].offset) { > + continue; > + } > + if (offset + length > > + blocks->block[i].offset + blocks->block[i].length) { > + continue; > + } > + *block_index = i; > + if (chunk_index) { > + uint8_t *host_addr = 
blocks->block[i].local_host_addr + > + (offset - blocks->block[i].offset); > + *chunk_index = QEMU_RDMA_REG_CHUNK_INDEX( > + blocks->block[i].local_host_addr, host_addr); > + } > + return 0; > + } > + return -1; > +} > + > +static int qemu_rdma_get_lkey(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_block *block, uint64_t host_addr, > + uint32_t *lkey) > +{ > + int chunk; > + if (block->mr) { > + *lkey = block->mr->lkey; > + return 0; > + } > + if (!block->pmr) { > + int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block); > + /* allocate memory to store chunk MRs */ > + block->pmr = qemu_rdma_mallocz(num_chunks * > + sizeof(struct ibv_mr *)); > + if (!block->pmr) { > + return -1; > + } > + } > + chunk = QEMU_RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr); > + if (!block->pmr[chunk]) { > + uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, chunk); > + uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, chunk); > + if (start_addr < block->local_host_addr) { > + start_addr = block->local_host_addr; > + } > + if (end_addr > block->local_host_addr + block->length) { > + end_addr = block->local_host_addr + block->length; > + } > + block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd, > + start_addr, > + end_addr - start_addr, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (!block->pmr[chunk]) { > + return -1; > + } > + } > + *lkey = block->pmr[chunk]->lkey; > + return 0; > +} > + > +static int qemu_rdma_write(struct qemu_rdma_context *rdma_ctx, > + struct qemu_rdma_ram_block *block, > + uint64_t offset, uint64_t length, > + uint64_t wr_id, enum ibv_send_flags flag) > +{ > + struct ibv_sge sge; > + struct ibv_send_wr send_wr = { 0 }; > + struct ibv_send_wr *bad_wr; > + > + sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset)); > + sge.length = length; > + if (qemu_rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) { > + fprintf(stderr, "cannot get lkey!\n"); > + return -1; > + } > + 
send_wr.wr_id = wr_id; > + send_wr.opcode = IBV_WR_RDMA_WRITE; > + send_wr.send_flags = flag; > + send_wr.sg_list = &sge; > + send_wr.num_sge = 1; > + send_wr.wr.rdma.rkey = block->remote_rkey; > + send_wr.wr.rdma.remote_addr = block->remote_host_addr + > + (offset - block->offset); > + > + if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > +} > + > +/* Do not merge data if larger than this. */ > +#define QEMU_RDMA_MIGRATION_MERGE_MAX (4 * 1024 * 1024) > + > +#define QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX 64 > + > +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata) > +{ > + mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd, > + &mdata->sync, > + sizeof mdata->sync, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (mdata->sync_mr) { > + return 0; > + } > + return -1; > +} > + > +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata) > +{ > + return ibv_dereg_mr(mdata->sync_mr); > +} > + > +int qemu_rdma_migration_reg_remote_info( > + struct qemu_rdma_migration_data *mdata) > +{ > + mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd, > + &mdata->remote_info, > + sizeof mdata->remote_info, > + IBV_ACCESS_LOCAL_WRITE | > + IBV_ACCESS_REMOTE_WRITE | > + IBV_ACCESS_REMOTE_READ); > + if (mdata->remote_info_mr) { > + return 0; > + } > + return -1; > +} > + > +int qemu_rdma_migration_dereg_remote_info( > + struct qemu_rdma_migration_data *mdata) > +{ > + return ibv_dereg_mr(mdata->remote_info_mr); > +} > + > + > +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata, > + int wr_id) > +{ > + struct ibv_sge sge; > + struct ibv_send_wr send_wr = { 0 }; > + struct ibv_send_wr *bad_wr; > + > + mdata->sync = 1; > + > + sge.addr = (uint64_t)(&mdata->sync); > + sge.length = sizeof mdata->sync; > + sge.lkey = mdata->sync_mr->lkey; > + > + send_wr.wr_id = wr_id; > + send_wr.opcode = IBV_WR_SEND; > + send_wr.send_flags = 
IBV_SEND_SIGNALED; > + send_wr.sg_list = &sge; > + send_wr.num_sge = 1; > + > + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > +} > + > +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata, > + int wr_id) > +{ > + struct ibv_sge sge; > + struct ibv_recv_wr recv_wr = { 0 }; > + struct ibv_recv_wr *bad_wr; > + > + mdata->sync = 1; > + > + sge.addr = (uint64_t)(&mdata->sync); > + sge.length = sizeof mdata->sync; > + sge.lkey = mdata->sync_mr->lkey; > + > + recv_wr.wr_id = wr_id; > + recv_wr.sg_list = &sge; > + recv_wr.num_sge = 1; > + > + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > + > +} > + > +int qemu_rdma_migration_post_send_remote_info( > + struct qemu_rdma_migration_data *mdata) > +{ > + struct ibv_sge sge; > + struct ibv_send_wr send_wr = { 0 }; > + struct ibv_send_wr *bad_wr; > + > + sge.addr = (uint64_t)(&mdata->remote_info); > + sge.length = sizeof mdata->remote_info; > + sge.lkey = mdata->remote_info_mr->lkey; > + > + send_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO; > + send_wr.opcode = IBV_WR_SEND; > + send_wr.send_flags = IBV_SEND_SIGNALED; > + send_wr.sg_list = &sge; > + send_wr.num_sge = 1; > + > + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { > + return -1; > + } > + > + mdata->num_signaled_send--; > + return 0; > +} > + > +int qemu_rdma_migration_post_recv_remote_info( > + struct qemu_rdma_migration_data *mdata) > +{ > + struct ibv_sge sge; > + struct ibv_recv_wr recv_wr = { 0 }; > + struct ibv_recv_wr *bad_wr; > + > + sge.addr = (uint64_t)(&mdata->remote_info); > + sge.length = sizeof mdata->remote_info; > + sge.lkey = mdata->remote_info_mr->lkey; > + > + recv_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO; > + recv_wr.sg_list = &sge; > + recv_wr.num_sge = 1; > + > + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { > + return -1; > + } > + > + return 0; > +} > + > + > +int 
qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata) > +{ > + int ret; > + enum ibv_send_flags flags = 0; > + > + if (!mdata->current_length) { > + return 0; > + } > + if (mdata->num_unsignaled_send >= > + QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) { > + flags = IBV_SEND_SIGNALED; > + } > + ret = qemu_rdma_write(&mdata->rdma_ctx, > + &(mdata->rdma_ram_blocks.block[mdata->current_index]), > + mdata->current_offset, > + mdata->current_length, > + QEMU_RDMA_MIGRATION_WRID_RDMA, flags); > + > + if (ret) { > + return ret; > + } > + > + if (mdata->num_unsignaled_send >= > + QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) { > + mdata->num_unsignaled_send = 0; > + mdata->num_signaled_send++; > + } else { > + mdata->num_unsignaled_send++; > + } > + > + mdata->total_bytes += mdata->current_length; > + mdata->current_length = 0; > + mdata->current_offset = 0; > + > + return 0; > +} > + > +static inline int qemu_rdma_migration_in_current_block( > + struct qemu_rdma_migration_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + struct qemu_rdma_ram_block *block = > + &(mdata->rdma_ram_blocks.block[mdata->current_index]); > + if (mdata->current_index < 0) { > + return 0; > + } > + if (offset < block->offset) { > + return 0; > + } > + if (offset + len > block->offset + block->length) { > + return 0; > + } > + return 1; > +} > + > +static inline int qemu_rdma_migration_in_current_chunk( > + struct qemu_rdma_migration_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + struct qemu_rdma_ram_block *block = > + &(mdata->rdma_ram_blocks.block[mdata->current_index]); > + uint8_t *chunk_start, *chunk_end, *host_addr; > + if (mdata->current_chunk < 0) { > + return 0; > + } > + host_addr = block->local_host_addr + (offset - block->offset); > + chunk_start = QEMU_RDMA_REG_CHUNK_START(block, mdata->current_chunk); > + if (chunk_start < block->local_host_addr) { > + chunk_start = block->local_host_addr; > + } > + if (host_addr < chunk_start) { > + return 0; > + } > + 
chunk_end = QEMU_RDMA_REG_CHUNK_END(block, mdata->current_chunk); > + if (chunk_end > chunk_start + block->length) { > + chunk_end = chunk_start + block->length; > + } > + if (host_addr + len > chunk_end) { > + return 0; > + } > + return 1; > +} > + > +static inline int qemu_rdma_buffer_mergable( > + struct qemu_rdma_migration_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + if (mdata->current_length == 0) { > + return 0; > + } > + if (offset != mdata->current_offset + mdata->current_length) { > + return 0; > + } > + if (!qemu_rdma_migration_in_current_block(mdata, offset, len)) { > + return 0; > + } > +#ifdef QEMU_RDMA_CHUNK_REGISTRATION > + if (!qemu_rdma_migration_in_current_chunk(mdata, offset, len)) { > + return 0; > + } > +#endif > + return 1; > +} > + > +/* Note that buffer must be within a single block/chunk. */ > +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata, > + uint64_t offset, uint64_t len) > +{ > + int index = mdata->current_index; > + int chunk_index = mdata->current_chunk; > + int ret; > + > + /* If we cannot merge it, we flush the current buffer first. 
*/ > + if (!qemu_rdma_buffer_mergable(mdata, offset, len)) { > + ret = qemu_rdma_migration_write_flush(mdata); > + if (ret) { > + return ret; > + } > + mdata->current_length = 0; > + mdata->current_offset = offset; > + > + if (qemu_rdma_search_ram_block(offset, len, > + &mdata->rdma_ram_blocks, &index, &chunk_index)) { > + return -1; > + } > + mdata->current_index = index; > + mdata->current_chunk = chunk_index; > + } > + > + /* merge it */ > + mdata->current_length += len; > + > + /* flush it if buffer is too large */ > + if (mdata->current_length >= QEMU_RDMA_MIGRATION_MERGE_MAX) { > + return qemu_rdma_migration_write_flush(mdata); > + } > + > + return 0; > +} > + > +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata) > +{ > + int ret; > + struct ibv_wc wc; > + > + ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc); > + if (!ret) { > + return QEMU_RDMA_MIGRATION_WRID_NONE; > + } > + if (ret < 0) { > + fprintf(stderr, "ibv_poll_cq return %d!\n", ret); > + return ret; > + } > + if (wc.status != IBV_WC_SUCCESS) { > + fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n", > + wc.status, ibv_wc_status_str(wc.status)); > + fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id); > + > + return -1; > + } > + > + if (!(wc.opcode & IBV_WC_RECV)) { > + mdata->num_signaled_send--; > + } > + > + return (int)wc.wr_id; > +} > + > +int qemu_rdma_migration_wait_for_wrid( > + struct qemu_rdma_migration_data *mdata, > + int wrid) > +{ > +#ifdef QEMU_RDMA_MIGRATION_BLOCKING > + return qemu_rdma_migration_block_for_wrid(mdata, wrid); > +#else > + return qemu_rdma_migration_poll_for_wrid(mdata, wrid); > +#endif > +} > + > +int qemu_rdma_migration_poll_for_wrid( > + struct qemu_rdma_migration_data *mdata, > + int wrid) > +{ > + int r = QEMU_RDMA_MIGRATION_WRID_NONE; > + while (r != wrid) { > + r = qemu_rdma_migration_poll(mdata); > + if (r < 0) { > + return r; > + } > + } > + return 0; > +} > + > +int qemu_rdma_migration_block_for_wrid( > + struct qemu_rdma_migration_data 
*mdata, > + int wrid) > +{ > + int num_cq_events = 0; > + int r = QEMU_RDMA_MIGRATION_WRID_NONE; > + struct ibv_cq *cq; > + void *cq_ctx; > + > + if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) { > + return -1; > + } > + /* poll cq first */ > + while (r != wrid) { > + r = qemu_rdma_migration_poll(mdata); > + if (r < 0) { > + return r; > + } > + if (r == QEMU_RDMA_MIGRATION_WRID_NONE) { > + break; > + } > + } > + if (r == wrid) { > + return 0; > + } > + > + while (1) { > + if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel, > + &cq, &cq_ctx)) { > + goto err_block_for_wrid; > + } > + num_cq_events++; > + if (ibv_req_notify_cq(cq, 0)) { > + goto err_block_for_wrid; > + } > + /* poll cq */ > + while (r != wrid) { > + r = qemu_rdma_migration_poll(mdata); > + if (r < 0) { > + goto err_block_for_wrid; > + } > + if (r == QEMU_RDMA_MIGRATION_WRID_NONE) { > + break; > + } > + } > + if (r == wrid) { > + goto success_block_for_wrid; > + } > + } > + > +success_block_for_wrid: > + if (num_cq_events) { > + ibv_ack_cq_events(cq, num_cq_events); > + } > + return 0; > + > +err_block_for_wrid: > + if (num_cq_events) { > + ibv_ack_cq_events(cq, num_cq_events); > + } > + return -1; > +} > + > +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata) > +{ > + mdata->enabled = 0; > + if (mdata->sync_mr) { > + qemu_rdma_migration_dereg_sync(mdata); > + } > + if (mdata->remote_info_mr) { > + qemu_rdma_migration_dereg_remote_info(mdata); > + } > + mdata->sync_mr = NULL; > + mdata->remote_info_mr = NULL; > + qemu_rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks); > + mdata->rdma_ram_blocks.num_blocks = 0; > + qemu_rdma_cleanup(&mdata->rdma_ctx); > + qemu_rdma_migration_data_init(mdata); > +} > + > +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata) > +{ > + int ret; > + > + if (mdata->client_init_done) { > + return 0; > + } > + > + ret = qemu_rdma_resolve_host(&mdata->rdma_ctx, > + mdata->host, mdata->port); > + if (ret) { > + fprintf(stderr, "rdma 
migration: error resolving host!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_alloc_pd_cq(&mdata->rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating qp!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_client_reg_ram_blocks(&mdata->rdma_ctx, > + &mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_migration_reg_sync(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering sync data!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_migration_reg_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering remote info!\n"); > + goto err_rdma_client_init; > + } > + > + ret = qemu_rdma_migration_post_recv_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error posting remote info recv!\n"); > + goto err_rdma_client_init; > + } > + > + mdata->client_init_done = 1; > + return 0; > + > +err_rdma_client_init: > + qemu_rdma_migration_cleanup(mdata); > + return -1; > +} > + > + > +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata) > +{ > + > + int ret; > + > + ret = qemu_rdma_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0); > + if (ret) { > + fprintf(stderr, "rdma migration: error connecting!\n"); > + goto err_rdma_client_connect; > + } > + > + /* wait for remote info */ > + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO); > + if (ret < 0) { > + fprintf(stderr, "rdma 
migration: polling remote info error!\n"); > + goto err_rdma_client_connect; > + } > + > + ret = qemu_rdma_process_remote_ram_blocks( > + &mdata->rdma_ram_blocks, &mdata->remote_info); > + if (ret) { > + fprintf(stderr, > + "rdma migration: error processing remote ram blocks!\n"); > + goto err_rdma_client_connect; > + } > + > + rdma_mdata.total_bytes = 0; > + rdma_mdata.enabled = 1; > + return 0; > + > +err_rdma_client_connect: > + qemu_rdma_migration_cleanup(mdata); > + return -1; > +} > + > +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata) > +{ > + > + int ret; > + struct sockaddr_in sin; > + struct rdma_cm_id *listen_id; > + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; > + > + /* create CM channel */ > + rdma_ctx->channel = rdma_create_event_channel(); > + if (!rdma_ctx->channel) { > + return -1; > + } > + > + /* create CM id */ > + ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL, > + RDMA_PS_TCP); > + if (ret) { > + goto err_server_init_create_listen_id; > + } > + > + memset(&sin, 0, sizeof(sin)); > + sin.sin_family = AF_INET; > + sin.sin_port = htons(mdata->port); > + if (strcmp("", mdata->host)) { > + struct hostent *server_addr; > + server_addr = gethostbyname(mdata->host); > + if (!server_addr) { > + goto err_server_init_bind_addr; > + } > + memcpy(&sin.sin_addr.s_addr, server_addr->h_addr, > + server_addr->h_length); > + } else { > + sin.sin_addr.s_addr = INADDR_ANY; > + } > + > + ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin); > + if (ret) { > + goto err_server_init_bind_addr; > + } > + printf("verbs context after binding: %p\n", listen_id->verbs); > + > + rdma_ctx->listen_id = listen_id; > + if (listen_id->verbs) { > + rdma_ctx->verbs = listen_id->verbs; > + } > + return 0; > + > +err_server_init_bind_addr: > + rdma_destroy_id(listen_id); > +err_server_init_create_listen_id: > + rdma_destroy_event_channel(rdma_ctx->channel); > + rdma_ctx->channel = NULL; > + > + return -1; > + > +} > + > +int 
qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata) > +{ > + int ret; > + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; > + > + if (!rdma_ctx->verbs) { > + return 0; > + } > + > + ret = qemu_rdma_alloc_pd_cq(rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = qemu_rdma_server_reg_ram_blocks(rdma_ctx, > + &mdata->rdma_ram_blocks); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = qemu_rdma_migration_reg_sync(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering sync data!\n"); > + goto err_rdma_server_prepare; > + } > + > + qemu_rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks, > + &mdata->remote_info); > + > + ret = qemu_rdma_migration_reg_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error registering remote info!\n"); > + goto err_rdma_server_prepare; > + } > + > + ret = rdma_listen(rdma_ctx->listen_id, 5); > + if (ret) { > + fprintf(stderr, "rdma migration: error listening on socket!\n"); > + goto err_rdma_server_prepare; > + } > + > + return 0; > + > +err_rdma_server_prepare: > + qemu_rdma_migration_cleanup(mdata); > + return -1; > +} > + > +int qemu_rdma_migration_server_wait_for_client( > + struct qemu_rdma_migration_data *mdata) > +{ > + > + int ret; > + > + ret = qemu_rdma_listen(mdata, mdata->host, mdata->port); > + if (ret) { > + fprintf(stderr, "rdma migration: error listening!\n"); > + goto err_rdma_server_wait; > + } > + > + ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx); > + if (ret) { > + fprintf(stderr, "rdma migration: error allocating qp!\n"); > + goto err_rdma_server_wait; > + } > + > 
+#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC > + ret = qemu_rdma_migration_post_recv_sync(mdata, > + QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC); > + if (ret) { > + fprintf(stderr, "rdma migration: error posting extra sync receive!\n"); > + goto err_rdma_server_wait; > + } > +#endif > + > + ret = qemu_rdma_migration_post_recv_sync(mdata, > + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC); > + if (ret) { > + fprintf(stderr, "rdma migration: error posting sync receive!\n"); > + goto err_rdma_server_wait; > + } > + > + ret = qemu_rdma_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0); > + if (ret) { > + fprintf(stderr, "rdma migration: error accepting connection!\n"); > + goto err_rdma_server_wait; > + } > + > + /* send remote info */ > + ret = qemu_rdma_migration_post_send_remote_info(mdata); > + if (ret) { > + fprintf(stderr, "rdma migration: error sending remote info!\n"); > + goto err_rdma_server_wait; > + } > + > + /* wait for completion */ > + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, > + QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO); > + if (ret < 0) { > + fprintf(stderr, "rdma migration: polling remote info error!\n"); > + goto err_rdma_server_wait; > + } > + > + rdma_mdata.total_bytes = 0; > + rdma_mdata.enabled = 1; > + return 0; > + > +err_rdma_server_wait: > + qemu_rdma_migration_cleanup(mdata); > + return -1; > + > +} > + > +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata) > +{ > + qemu_rdma_init_context(&mdata->rdma_ctx); > + mdata->port = rdmaport; > + mdata->host = rdmahost; > + mdata->enabled = 0; > + mdata->rdma_ram_blocks.num_blocks = 0; > + mdata->client_init_done = 0; > + mdata->num_unsignaled_send = 0; > + mdata->num_signaled_send = 0; > + mdata->current_offset = 0; > + mdata->current_length = 0; > + mdata->current_index = -1; > + mdata->current_chunk = -1; > + mdata->sync = 0; > + mdata->sync_mr = NULL; > + mdata->remote_info_mr = NULL; > +} > + > +void qemu_rdma_migration_disable(void) > +{ > + rdma_mdata.port = -1; > + 
rdma_mdata.enabled = 0; > +} >
Acknowledged. On 02/18/2013 06:02 AM, Paolo Bonzini wrote: > Il 28/01/2013 23:01, mrhines@linux.vnet.ibm.com ha scritto: >> From: "Michael R. Hines" <mrhines@us.ibm.com> >> >> >> Signed-off-by: Michael R. Hines <mrhines@us.ibm.com> >> --- >> Makefile.target | 5 +- >> include/qemu/rdma.h | 249 ++++++++++ > Please make this include/migration/rdma.h > > Paolo > >> qemu-rdma.c | 1357 +++++++++++++++++++++++++++++++++++++++++++++++++++ >> 3 files changed, 1609 insertions(+), 2 deletions(-) >> create mode 100644 include/qemu/rdma.h >> create mode 100644 qemu-rdma.c >> >> diff --git a/Makefile.target b/Makefile.target >> index 760da1e..d1d6b8c 100644 >> --- a/Makefile.target >> +++ b/Makefile.target >> @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o >> obj-y += hw/ >> obj-$(CONFIG_KVM) += kvm-all.o >> obj-$(CONFIG_NO_KVM) += kvm-stub.o >> -obj-y += memory.o savevm.o cputlb.o >> +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o >> +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o >> obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o >> obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o >> obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o >> obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o >> -LIBS+=-lz >> +LIBS+=-lz -lrdmacm >> >> # xen support >> obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o >> diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h >> new file mode 100644 >> index 0000000..099622e >> --- /dev/null >> +++ b/include/qemu/rdma.h >> @@ -0,0 +1,249 @@ >> +/* >> + * RDMA data structures and helper functions header (for migration) >> + * >> + * Copyright IBM, Corp. 2013 >> + * >> + * Authors: >> + * Michael R. Hines <mrhines@us.ibm.com> >> + * Jiuxing Liu <jl@us.ibm.com> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. See >> + * the COPYING file in the top-level directory. 
>> + * >> + */ >> + >> +#ifndef _QEMU_RDMA_H >> +#define _QEMU_RDMA_H >> + >> +#include <rdma/rdma_cma.h> >> +#include "monitor/monitor.h" >> + >> +extern int rdmaport; >> +extern char rdmahost[64]; >> + >> +struct qemu_rdma_context { >> + /* cm_id also has ibv_conext, rdma_event_channel, and ibv_qp in >> + cm_id->verbs, cm_id->channel, and cm_id->qp. */ >> + struct rdma_cm_id *cm_id; >> + struct rdma_cm_id *listen_id; >> + >> + struct ibv_context *verbs; >> + struct rdma_event_channel *channel; >> + struct ibv_qp *qp; >> + >> + struct ibv_comp_channel *comp_channel; >> + struct ibv_pd *pd; >> + struct ibv_cq *cq; >> +}; >> + >> +static inline void qemu_rdma_init_context(struct qemu_rdma_context *rdma_ctx) >> +{ >> + rdma_ctx->cm_id = NULL; >> + rdma_ctx->listen_id = NULL; >> + rdma_ctx->verbs = NULL; >> + rdma_ctx->channel = NULL; >> + rdma_ctx->qp = NULL; >> + rdma_ctx->comp_channel = NULL; >> + rdma_ctx->pd = NULL; >> + rdma_ctx->cq = NULL; >> +} >> + >> +void cpu_physical_memory_reset_dirty_all(void); >> + >> +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx, >> + const char *host, int port); >> +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx); >> +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx); >> +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx, >> + void *in_data, int *in_len, void *out_data, int out_len); >> +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx, >> + void *in_data, int *in_len, void *out_data, int out_len); >> +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx); >> +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx); >> + >> +/* Instead of registering whole ram blocks, we can register them in smaller >> + * chunks. 
This may be beneficial if the ram blocks have holes in them */ >> +#define QEMU_RDMA_CHUNK_REGISTRATION >> + >> +#define QEMU_RDMA_LAZY_REGISTRATION >> + >> +#define QEMU_RDMA_REG_CHUNK_SHIFT 20 >> +#define QEMU_RDMA_REG_CHUNK_SIZE (1UL << (QEMU_RDMA_REG_CHUNK_SHIFT)) >> +#define QEMU_RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \ >> + (((unsigned long)(host_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT) - \ >> + ((unsigned long)(start_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT)) >> +#define QEMU_RDMA_REG_NUM_CHUNKS(rdma_ram_block) \ >> + (QEMU_RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\ >> + (rdma_ram_block)->local_host_addr +\ >> + (rdma_ram_block)->length) + 1) >> +#define QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\ >> + ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \ >> + QEMU_RDMA_REG_CHUNK_SHIFT) + (i)) << \ >> + QEMU_RDMA_REG_CHUNK_SHIFT)) >> +#define QEMU_RDMA_REG_CHUNK_END(rdma_ram_block, i) \ >> + (QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) + \ >> + QEMU_RDMA_REG_CHUNK_SIZE) >> + >> +struct qemu_rdma_ram_block { >> + uint8_t *local_host_addr; >> + uint64_t remote_host_addr; >> + uint64_t offset; >> + uint64_t length; >> + struct ibv_mr **pmr; >> + struct ibv_mr *mr; >> + uint32_t remote_rkey; >> +}; >> + >> +struct qemu_rdma_remote_ram_block { >> + uint64_t remote_host_addr; >> + uint64_t offset; >> + uint64_t length; >> + uint32_t remote_rkey; >> +}; >> + >> +#define QEMU_MAX_RAM_BLOCKS 64 >> + >> +struct qemu_rdma_ram_blocks { >> + int num_blocks; >> + struct qemu_rdma_ram_block block[QEMU_MAX_RAM_BLOCKS]; >> +}; >> + >> +struct qemu_rdma_remote_ram_blocks { >> + int num_blocks; >> + struct qemu_rdma_remote_ram_block block[QEMU_MAX_RAM_BLOCKS]; >> +}; >> + >> +int qemu_rdma_init_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks); >> +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks); >> +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx, 
>> + struct qemu_rdma_ram_blocks *rdma_ram_blocks); >> +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks); >> +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks); >> +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks); >> + >> +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, >> + struct qemu_rdma_remote_ram_blocks *remote); >> +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, >> + struct qemu_rdma_remote_ram_blocks *remote); >> + >> +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length, >> + struct qemu_rdma_ram_blocks *blocks, >> + int *block_index, int *chunk_index); >> + >> +struct qemu_rdma_migration_data { >> + char *host; >> + int port; >> + int enabled; >> + >> + struct qemu_rdma_context rdma_ctx; >> + struct qemu_rdma_ram_blocks rdma_ram_blocks; >> + >> + /* This is used for synchronization: We use >> + IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs >> + are done. When the receiver gets it, it can be certain >> + that all the RDMAs are completed. */ >> + int sync; >> + struct ibv_mr *sync_mr; >> + >> + /* This is used for the server to write the remote >> + ram blocks info. */ >> + struct qemu_rdma_remote_ram_blocks remote_info; >> + struct ibv_mr *remote_info_mr; >> + >> + /* The rest is only for the initiator of the migration. 
*/ >> + int client_init_done; >> + >> + /* number of outstanding unsignaled send */ >> + int num_unsignaled_send; >> + >> + /* number of outstanding signaled send */ >> + int num_signaled_send; >> + >> + /* store info about current buffer so that we can >> + merge it with future sends */ >> + uint64_t current_offset; >> + uint64_t current_length; >> + /* index of ram block the current buffer belongs to */ >> + int current_index; >> + /* index of the chunk in the current ram block */ >> + int current_chunk; >> + >> + uint64_t total_bytes; >> + >> +}; >> + >> +extern struct qemu_rdma_migration_data rdma_mdata; >> + >> +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata); >> + >> +static inline int qemu_use_rdma_migration(void) >> +{ >> + /* port will be something other than -1 if user wants to use RDMA. */ >> + return rdma_mdata.port != -1; >> +} >> + >> +static inline int qemu_rdma_migration_enabled(void) >> +{ >> + return rdma_mdata.enabled; >> +} >> + >> +void qemu_rdma_migration_disable(void); >> + >> +#define QEMU_RDMA_MIGRATION_BLOCKING >> +#define QEMU_RDMA_MIGRATION_EXTRA_SYNC >> + >> +enum { >> + QEMU_RDMA_MIGRATION_WRID_NONE = 0, >> + QEMU_RDMA_MIGRATION_WRID_RDMA, >> + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC, >> + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC, >> + QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO, >> + QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO, >> + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC, >> + QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC, >> +}; >> + >> +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host, >> + int port); >> +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata, >> + int wr_id); >> +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata, >> + int wr_id); >> + >> +int qemu_rdma_migration_reg_remote_info( >> + struct 
qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_dereg_remote_info( >> + struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_post_send_remote_info( >> + struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_post_recv_remote_info( >> + struct qemu_rdma_migration_data *mdata); >> + >> +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata, >> + uint64_t addr, uint64_t len); >> +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_wait_for_wrid( >> + struct qemu_rdma_migration_data *mdata, >> + int wrid); >> +int qemu_rdma_migration_poll_for_wrid( >> + struct qemu_rdma_migration_data *mdata, >> + int wrid); >> +int qemu_rdma_migration_block_for_wrid( >> + struct qemu_rdma_migration_data *mdata, >> + int wrid); >> +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata); >> + >> +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata); >> +int qemu_rdma_migration_server_wait_for_client( >> + struct qemu_rdma_migration_data *mdata); >> + >> +#endif >> diff --git a/qemu-rdma.c b/qemu-rdma.c >> new file mode 100644 >> index 0000000..5f16875 >> --- /dev/null >> +++ b/qemu-rdma.c >> @@ -0,0 +1,1357 @@ >> +/* >> + * RDMA data structures and helper functions (for migration) >> + * >> + * Copyright IBM, Corp. 2013 >> + * >> + * Authors: >> + * Michael R. Hines <mrhines@us.ibm.com> >> + * Jiuxing Liu <jl@us.ibm.com> >> + * >> + * This work is licensed under the terms of the GNU GPL, version 2. See >> + * the COPYING file in the top-level directory. 
>> + * >> + */ >> + >> +#include "qemu/rdma.h" >> +#include "qemu-common.h" >> +#include <stdio.h> >> +#include <sys/types.h> >> +#include <sys/socket.h> >> +#include <netdb.h> >> +#include <arpa/inet.h> >> +#include <string.h> >> + >> +#define QEMU_RDMA_RESOLVE_TIMEOUT_MS 10000 >> +#define QEMU_RDMA_CQ_SIZE 2000 >> +#define QEMU_RDMA_QP_SIZE 1000 >> + >> +int rdmaport = -1; >> +char rdmahost[64] = ""; >> +struct qemu_rdma_migration_data rdma_mdata; >> + >> +static void *qemu_rdma_mallocz(size_t size) >> +{ >> + void *ptr; >> + ptr = malloc(size); >> + memset(ptr, 0, size); >> + return ptr; >> +} >> + >> +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx, >> + const char *host, int port) >> +{ >> + int ret; >> + struct addrinfo *res; >> + char port_str[16]; >> + struct rdma_cm_event *cm_event; >> + >> + >> + if (!strcmp(host, "")) { >> + printf("RDMA hostname has not been set\n"); >> + return -1; >> + } >> + >> + /* create CM channel */ >> + rdma_ctx->channel = rdma_create_event_channel(); >> + if (!rdma_ctx->channel) { >> + printf("could not create CM channel\n"); >> + return -1; >> + } >> + >> + /* create CM id */ >> + ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL, >> + RDMA_PS_TCP); >> + if (ret) { >> + printf("could not create channel id\n"); >> + goto err_resolve_create_id; >> + } >> + >> + snprintf(port_str, 16, "%d", port); >> + port_str[15] = '\0'; >> + ret = getaddrinfo(host, port_str, NULL, &res); >> + if (ret < 0) { >> + printf("could not getaddrinfo address %s\n", host); >> + goto err_resolve_get_addr; >> + } >> + >> + /* resolve the first address */ >> + ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr, >> + QEMU_RDMA_RESOLVE_TIMEOUT_MS); >> + if (ret) { >> + printf("could not resolve address %s\n", host); >> + goto err_resolve_get_addr; >> + } >> + >> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); >> + if (ret) { >> + printf("could not perform event_addr_resolved\n"); >> + goto err_resolve_get_addr; >> 
+ } >> + if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { >> + printf("result not equal to event_addr_resolved\n"); >> + rdma_ack_cm_event(cm_event); >> + goto err_resolve_get_addr; >> + } >> + rdma_ack_cm_event(cm_event); >> + >> + /* resolve route */ >> + ret = rdma_resolve_route(rdma_ctx->cm_id, QEMU_RDMA_RESOLVE_TIMEOUT_MS); >> + if (ret) { >> + printf("could not resolve rdma route\n"); >> + goto err_resolve_get_addr; >> + } >> + >> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); >> + if (ret) { >> + printf("could not perform event_route_resolved\n"); >> + goto err_resolve_get_addr; >> + } >> + if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { >> + printf("result not equal to event_route_resolved\n"); >> + rdma_ack_cm_event(cm_event); >> + goto err_resolve_get_addr; >> + } >> + rdma_ack_cm_event(cm_event); >> + >> + rdma_ctx->verbs = rdma_ctx->cm_id->verbs; >> + return 0; >> + >> +err_resolve_get_addr: >> + rdma_destroy_id(rdma_ctx->cm_id); >> +err_resolve_create_id: >> + rdma_destroy_event_channel(rdma_ctx->channel); >> + rdma_ctx->channel = NULL; >> + >> + return -1; >> +} >> + >> +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx) >> +{ >> + >> + /* allocate pd */ >> + rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs); >> + if (!rdma_ctx->pd) { >> + return -1; >> + } >> + >> +#ifdef QEMU_RDMA_MIGRATION_BLOCKING >> + /* create completion channel */ >> + rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs); >> + if (!rdma_ctx->comp_channel) { >> + goto err_alloc_pd_cq; >> + } >> +#endif >> + >> + /* create cq */ >> + rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, QEMU_RDMA_CQ_SIZE, >> + NULL, rdma_ctx->comp_channel, 0); >> + if (!rdma_ctx->cq) { >> + goto err_alloc_pd_cq; >> + } >> + >> + return 0; >> + >> +err_alloc_pd_cq: >> + if (rdma_ctx->pd) { >> + ibv_dealloc_pd(rdma_ctx->pd); >> + } >> + if (rdma_ctx->comp_channel) { >> + ibv_destroy_comp_channel(rdma_ctx->comp_channel); >> + } >> + rdma_ctx->pd = NULL; >> + 
rdma_ctx->comp_channel = NULL; >> + return -1; >> + >> +} >> + >> +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx) >> +{ >> + struct ibv_qp_init_attr attr = { 0 }; >> + int ret; >> + >> + attr.cap.max_send_wr = QEMU_RDMA_QP_SIZE; >> + attr.cap.max_recv_wr = 2; >> + attr.cap.max_send_sge = 1; >> + attr.cap.max_recv_sge = 1; >> + attr.send_cq = rdma_ctx->cq; >> + attr.recv_cq = rdma_ctx->cq; >> + attr.qp_type = IBV_QPT_RC; >> + >> + ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr); >> + if (ret) { >> + return -1; >> + } >> + >> + rdma_ctx->qp = rdma_ctx->cm_id->qp; >> + return 0; >> +} >> + >> +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx, >> + void *in_data, int *in_len, void *out_data, int out_len) >> +{ >> + int ret; >> + struct rdma_conn_param conn_param = { 0 }; >> + struct rdma_cm_event *cm_event; >> + >> + conn_param.initiator_depth = 2; >> + conn_param.retry_count = 5; >> + conn_param.private_data = out_data; >> + conn_param.private_data_len = out_len; >> + >> + ret = rdma_connect(rdma_ctx->cm_id, &conn_param); >> + if (ret) { >> + return -1; >> + } >> + >> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); >> + if (ret) { >> + return -1; >> + } >> + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { >> + return -1; >> + } >> + >> + if (in_len) { >> + if (*in_len > cm_event->param.conn.private_data_len) { >> + *in_len = cm_event->param.conn.private_data_len; >> + } >> + if (*in_len) { >> + memcpy(in_data, cm_event->param.conn.private_data, *in_len); >> + } >> + } >> + >> + rdma_ack_cm_event(cm_event); >> + >> + return 0; >> +} >> + >> +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host, >> + int port) >> +{ >> + int ret; >> + struct rdma_cm_event *cm_event; >> + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; >> + struct ibv_context *verbs; >> + >> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); >> + if (ret) { >> + goto err_listen; >> + } >> + >> + if (cm_event->event != 
RDMA_CM_EVENT_CONNECT_REQUEST) { >> + rdma_ack_cm_event(cm_event); >> + goto err_listen; >> + } >> + >> + rdma_ctx->cm_id = cm_event->id; >> + verbs = cm_event->id->verbs; >> + printf("verbs context after listen: %p\n", verbs); >> + rdma_ack_cm_event(cm_event); >> + >> + if (!rdma_ctx->verbs) { >> + rdma_ctx->verbs = verbs; >> + ret = qemu_rdma_migration_server_prepare(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error preparing server!\n"); >> + goto err_listen; >> + } >> + } else if (rdma_ctx->verbs != verbs) { >> + fprintf(stderr, "ibv context not matching %p, %p!\n", >> + rdma_ctx->verbs, verbs); >> + goto err_listen; >> + } >> + /* xxx destroy listen_id ??? */ >> + >> + return 0; >> + >> +err_listen: >> + >> + return -1; >> + >> +} >> + >> +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx, >> + void *in_data, int *in_len, void *out_data, int out_len) >> +{ >> + int ret; >> + struct rdma_conn_param conn_param = { 0 }; >> + struct rdma_cm_event *cm_event; >> + >> + conn_param.responder_resources = 2; >> + conn_param.private_data = out_data; >> + conn_param.private_data_len = out_len; >> + >> + ret = rdma_accept(rdma_ctx->cm_id, &conn_param); >> + if (ret) { >> + fprintf(stderr, "rdma_accept returns %d!\n", ret); >> + return -1; >> + } >> + >> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); >> + if (ret) { >> + return -1; >> + } >> + >> + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { >> + rdma_ack_cm_event(cm_event); >> + return -1; >> + } >> + >> + if (in_len) { >> + if (*in_len > cm_event->param.conn.private_data_len) { >> + *in_len = cm_event->param.conn.private_data_len; >> + } >> + if (*in_len) { >> + memcpy(in_data, cm_event->param.conn.private_data, *in_len); >> + } >> + } >> + >> + rdma_ack_cm_event(cm_event); >> + >> + return 0; >> +} >> + >> +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx) >> +{ >> + int ret; >> + struct rdma_cm_event *cm_event; >> + >> + ret = rdma_disconnect(rdma_ctx->cm_id); >> + 
if (ret) { >> + return; >> + } >> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); >> + if (ret) { >> + return; >> + } >> + rdma_ack_cm_event(cm_event); >> +} >> + >> +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx) >> +{ >> + if (rdma_ctx->qp) { >> + ibv_destroy_qp(rdma_ctx->qp); >> + } >> + if (rdma_ctx->cq) { >> + ibv_destroy_cq(rdma_ctx->cq); >> + } >> + if (rdma_ctx->comp_channel) { >> + ibv_destroy_comp_channel(rdma_ctx->comp_channel); >> + } >> + if (rdma_ctx->pd) { >> + ibv_dealloc_pd(rdma_ctx->pd); >> + } >> + if (rdma_ctx->listen_id) { >> + rdma_destroy_id(rdma_ctx->listen_id); >> + } >> + if (rdma_ctx->cm_id) { >> + rdma_destroy_id(rdma_ctx->cm_id); >> + } >> + if (rdma_ctx->channel) { >> + rdma_destroy_event_channel(rdma_ctx->channel); >> + } >> +} >> + >> +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks) >> +{ >> + int i, j; >> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { >> + struct qemu_rdma_ram_block *block = &(rdma_ram_blocks->block[i]); >> + int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block); >> + /* allocate memory to store chunk MRs */ >> + rdma_ram_blocks->block[i].pmr = qemu_rdma_mallocz( >> + num_chunks * sizeof(struct ibv_mr *)); >> + >> + if (!block->pmr) { >> + goto err_reg_chunk_ram_blocks; >> + } >> + >> + for (j = 0; j < num_chunks; j++) { >> + uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, j); >> + uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, j); >> + if (start_addr < block->local_host_addr) { >> + start_addr = block->local_host_addr; >> + } >> + if (end_addr > block->local_host_addr + block->length) { >> + end_addr = block->local_host_addr + block->length; >> + } >> + block->pmr[j] = ibv_reg_mr(rdma_ctx->pd, >> + start_addr, >> + end_addr - start_addr, >> + IBV_ACCESS_LOCAL_WRITE | >> + IBV_ACCESS_REMOTE_WRITE | >> + IBV_ACCESS_REMOTE_READ); >> + if (!block->pmr[j]) { >> + break; >> + } >> + } >> + if (j < num_chunks) { >> 
+ for (j--; j >= 0; j--) { >> + ibv_dereg_mr(block->pmr[j]); >> + } >> + block->pmr[i] = NULL; >> + goto err_reg_chunk_ram_blocks; >> + } >> + } >> + >> + return 0; >> + >> +err_reg_chunk_ram_blocks: >> + for (i--; i >= 0; i--) { >> + int num_chunks = >> + QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); >> + for (j = 0; j < num_chunks; j++) { >> + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); >> + } >> + free(rdma_ram_blocks->block[i].pmr); >> + rdma_ram_blocks->block[i].pmr = NULL; >> + } >> + >> + return -1; >> + >> +} >> + >> +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks) >> +{ >> + int i; >> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { >> + rdma_ram_blocks->block[i].mr = >> + ibv_reg_mr(rdma_ctx->pd, >> + rdma_ram_blocks->block[i].local_host_addr, >> + rdma_ram_blocks->block[i].length, >> + IBV_ACCESS_LOCAL_WRITE | >> + IBV_ACCESS_REMOTE_WRITE | >> + IBV_ACCESS_REMOTE_READ); >> + if (!rdma_ram_blocks->block[i].mr) { >> + break; >> + } >> + } >> + >> + if (i >= rdma_ram_blocks->num_blocks) { >> + return 0; >> + } >> + >> + for (i--; i >= 0; i--) { >> + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); >> + } >> + >> + return -1; >> + >> +} >> + >> +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks) >> +{ >> +#ifdef QEMU_RDMA_CHUNK_REGISTRATION >> +#ifdef QEMU_RDMA_LAZY_REGISTRATION >> + return 0; >> +#else >> + return qemu_rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks); >> +#endif >> +#else >> + return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); >> +#endif >> +} >> + >> +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_blocks *rdma_ram_blocks) >> +{ >> + return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); >> +} >> + >> +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks) >> +{ >> + int i, j; >> + 
for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { >> + int num_chunks; >> + if (!rdma_ram_blocks->block[i].pmr) { >> + continue; >> + } >> + num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); >> + for (j = 0; j < num_chunks; j++) { >> + if (!rdma_ram_blocks->block[i].pmr[j]) { >> + continue; >> + } >> + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); >> + } >> + free(rdma_ram_blocks->block[i].pmr); >> + rdma_ram_blocks->block[i].pmr = NULL; >> + } >> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { >> + if (!rdma_ram_blocks->block[i].mr) { >> + continue; >> + } >> + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); >> + rdma_ram_blocks->block[i].mr = NULL; >> + } >> +} >> + >> +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, >> + struct qemu_rdma_remote_ram_blocks *remote) >> +{ >> + int i; >> + remote->num_blocks = local->num_blocks; >> + for (i = 0; i < local->num_blocks; i++) { >> + remote->block[i].remote_host_addr = >> + (uint64_t)(local->block[i].local_host_addr); >> + remote->block[i].remote_rkey = local->block[i].mr->rkey; >> + remote->block[i].offset = local->block[i].offset; >> + remote->block[i].length = local->block[i].length; >> + } >> +} >> + >> +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, >> + struct qemu_rdma_remote_ram_blocks *remote) >> +{ >> + int i, j; >> + >> + if (local->num_blocks != remote->num_blocks) { >> + return -1; >> + } >> + >> + for (i = 0; i < remote->num_blocks; i++) { >> + /* search local ram blocks */ >> + for (j = 0; j < local->num_blocks; j++) { >> + if (remote->block[i].offset != local->block[j].offset) { >> + continue; >> + } >> + if (remote->block[i].length != local->block[j].length) { >> + return -1; >> + } >> + local->block[j].remote_host_addr = >> + remote->block[i].remote_host_addr; >> + local->block[j].remote_rkey = remote->block[i].remote_rkey; >> + break; >> + } >> + if (j >= local->num_blocks) { >> + return -1; >> + } >> + } >> + >> + 
return 0; >> +} >> + >> +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length, >> + struct qemu_rdma_ram_blocks *blocks, >> + int *block_index, int *chunk_index) >> +{ >> + int i; >> + for (i = 0; i < blocks->num_blocks; i++) { >> + if (offset < blocks->block[i].offset) { >> + continue; >> + } >> + if (offset + length > >> + blocks->block[i].offset + blocks->block[i].length) { >> + continue; >> + } >> + *block_index = i; >> + if (chunk_index) { >> + uint8_t *host_addr = blocks->block[i].local_host_addr + >> + (offset - blocks->block[i].offset); >> + *chunk_index = QEMU_RDMA_REG_CHUNK_INDEX( >> + blocks->block[i].local_host_addr, host_addr); >> + } >> + return 0; >> + } >> + return -1; >> +} >> + >> +static int qemu_rdma_get_lkey(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_block *block, uint64_t host_addr, >> + uint32_t *lkey) >> +{ >> + int chunk; >> + if (block->mr) { >> + *lkey = block->mr->lkey; >> + return 0; >> + } >> + if (!block->pmr) { >> + int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block); >> + /* allocate memory to store chunk MRs */ >> + block->pmr = qemu_rdma_mallocz(num_chunks * >> + sizeof(struct ibv_mr *)); >> + if (!block->pmr) { >> + return -1; >> + } >> + } >> + chunk = QEMU_RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr); >> + if (!block->pmr[chunk]) { >> + uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, chunk); >> + uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, chunk); >> + if (start_addr < block->local_host_addr) { >> + start_addr = block->local_host_addr; >> + } >> + if (end_addr > block->local_host_addr + block->length) { >> + end_addr = block->local_host_addr + block->length; >> + } >> + block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd, >> + start_addr, >> + end_addr - start_addr, >> + IBV_ACCESS_LOCAL_WRITE | >> + IBV_ACCESS_REMOTE_WRITE | >> + IBV_ACCESS_REMOTE_READ); >> + if (!block->pmr[chunk]) { >> + return -1; >> + } >> + } >> + *lkey = block->pmr[chunk]->lkey; >> + return 0; >> +} >> + >> 
+static int qemu_rdma_write(struct qemu_rdma_context *rdma_ctx, >> + struct qemu_rdma_ram_block *block, >> + uint64_t offset, uint64_t length, >> + uint64_t wr_id, enum ibv_send_flags flag) >> +{ >> + struct ibv_sge sge; >> + struct ibv_send_wr send_wr = { 0 }; >> + struct ibv_send_wr *bad_wr; >> + >> + sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset)); >> + sge.length = length; >> + if (qemu_rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) { >> + fprintf(stderr, "cannot get lkey!\n"); >> + return -1; >> + } >> + send_wr.wr_id = wr_id; >> + send_wr.opcode = IBV_WR_RDMA_WRITE; >> + send_wr.send_flags = flag; >> + send_wr.sg_list = &sge; >> + send_wr.num_sge = 1; >> + send_wr.wr.rdma.rkey = block->remote_rkey; >> + send_wr.wr.rdma.remote_addr = block->remote_host_addr + >> + (offset - block->offset); >> + >> + if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) { >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> +/* Do not merge data if larger than this. */ >> +#define QEMU_RDMA_MIGRATION_MERGE_MAX (4 * 1024 * 1024) >> + >> +#define QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX 64 >> + >> +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata) >> +{ >> + mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd, >> + &mdata->sync, >> + sizeof mdata->sync, >> + IBV_ACCESS_LOCAL_WRITE | >> + IBV_ACCESS_REMOTE_WRITE | >> + IBV_ACCESS_REMOTE_READ); >> + if (mdata->sync_mr) { >> + return 0; >> + } >> + return -1; >> +} >> + >> +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata) >> +{ >> + return ibv_dereg_mr(mdata->sync_mr); >> +} >> + >> +int qemu_rdma_migration_reg_remote_info( >> + struct qemu_rdma_migration_data *mdata) >> +{ >> + mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd, >> + &mdata->remote_info, >> + sizeof mdata->remote_info, >> + IBV_ACCESS_LOCAL_WRITE | >> + IBV_ACCESS_REMOTE_WRITE | >> + IBV_ACCESS_REMOTE_READ); >> + if (mdata->remote_info_mr) { >> + return 0; >> + } >> + return -1; >> +} >> 
+ >> +int qemu_rdma_migration_dereg_remote_info( >> + struct qemu_rdma_migration_data *mdata) >> +{ >> + return ibv_dereg_mr(mdata->remote_info_mr); >> +} >> + >> + >> +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata, >> + int wr_id) >> +{ >> + struct ibv_sge sge; >> + struct ibv_send_wr send_wr = { 0 }; >> + struct ibv_send_wr *bad_wr; >> + >> + mdata->sync = 1; >> + >> + sge.addr = (uint64_t)(&mdata->sync); >> + sge.length = sizeof mdata->sync; >> + sge.lkey = mdata->sync_mr->lkey; >> + >> + send_wr.wr_id = wr_id; >> + send_wr.opcode = IBV_WR_SEND; >> + send_wr.send_flags = IBV_SEND_SIGNALED; >> + send_wr.sg_list = &sge; >> + send_wr.num_sge = 1; >> + >> + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata, >> + int wr_id) >> +{ >> + struct ibv_sge sge; >> + struct ibv_recv_wr recv_wr = { 0 }; >> + struct ibv_recv_wr *bad_wr; >> + >> + mdata->sync = 1; >> + >> + sge.addr = (uint64_t)(&mdata->sync); >> + sge.length = sizeof mdata->sync; >> + sge.lkey = mdata->sync_mr->lkey; >> + >> + recv_wr.wr_id = wr_id; >> + recv_wr.sg_list = &sge; >> + recv_wr.num_sge = 1; >> + >> + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { >> + return -1; >> + } >> + >> + return 0; >> + >> +} >> + >> +int qemu_rdma_migration_post_send_remote_info( >> + struct qemu_rdma_migration_data *mdata) >> +{ >> + struct ibv_sge sge; >> + struct ibv_send_wr send_wr = { 0 }; >> + struct ibv_send_wr *bad_wr; >> + >> + sge.addr = (uint64_t)(&mdata->remote_info); >> + sge.length = sizeof mdata->remote_info; >> + sge.lkey = mdata->remote_info_mr->lkey; >> + >> + send_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO; >> + send_wr.opcode = IBV_WR_SEND; >> + send_wr.send_flags = IBV_SEND_SIGNALED; >> + send_wr.sg_list = &sge; >> + send_wr.num_sge = 1; >> + >> + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) 
{ >> + return -1; >> + } >> + >> + mdata->num_signaled_send--; >> + return 0; >> +} >> + >> +int qemu_rdma_migration_post_recv_remote_info( >> + struct qemu_rdma_migration_data *mdata) >> +{ >> + struct ibv_sge sge; >> + struct ibv_recv_wr recv_wr = { 0 }; >> + struct ibv_recv_wr *bad_wr; >> + >> + sge.addr = (uint64_t)(&mdata->remote_info); >> + sge.length = sizeof mdata->remote_info; >> + sge.lkey = mdata->remote_info_mr->lkey; >> + >> + recv_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO; >> + recv_wr.sg_list = &sge; >> + recv_wr.num_sge = 1; >> + >> + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { >> + return -1; >> + } >> + >> + return 0; >> +} >> + >> + >> +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata) >> +{ >> + int ret; >> + enum ibv_send_flags flags = 0; >> + >> + if (!mdata->current_length) { >> + return 0; >> + } >> + if (mdata->num_unsignaled_send >= >> + QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) { >> + flags = IBV_SEND_SIGNALED; >> + } >> + ret = qemu_rdma_write(&mdata->rdma_ctx, >> + &(mdata->rdma_ram_blocks.block[mdata->current_index]), >> + mdata->current_offset, >> + mdata->current_length, >> + QEMU_RDMA_MIGRATION_WRID_RDMA, flags); >> + >> + if (ret) { >> + return ret; >> + } >> + >> + if (mdata->num_unsignaled_send >= >> + QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) { >> + mdata->num_unsignaled_send = 0; >> + mdata->num_signaled_send++; >> + } else { >> + mdata->num_unsignaled_send++; >> + } >> + >> + mdata->total_bytes += mdata->current_length; >> + mdata->current_length = 0; >> + mdata->current_offset = 0; >> + >> + return 0; >> +} >> + >> +static inline int qemu_rdma_migration_in_current_block( >> + struct qemu_rdma_migration_data *mdata, >> + uint64_t offset, uint64_t len) >> +{ >> + struct qemu_rdma_ram_block *block = >> + &(mdata->rdma_ram_blocks.block[mdata->current_index]); >> + if (mdata->current_index < 0) { >> + return 0; >> + } >> + if (offset < block->offset) { >> + return 0; >> + } >> + 
if (offset + len > block->offset + block->length) { >> + return 0; >> + } >> + return 1; >> +} >> + >> +static inline int qemu_rdma_migration_in_current_chunk( >> + struct qemu_rdma_migration_data *mdata, >> + uint64_t offset, uint64_t len) >> +{ >> + struct qemu_rdma_ram_block *block = >> + &(mdata->rdma_ram_blocks.block[mdata->current_index]); >> + uint8_t *chunk_start, *chunk_end, *host_addr; >> + if (mdata->current_chunk < 0) { >> + return 0; >> + } >> + host_addr = block->local_host_addr + (offset - block->offset); >> + chunk_start = QEMU_RDMA_REG_CHUNK_START(block, mdata->current_chunk); >> + if (chunk_start < block->local_host_addr) { >> + chunk_start = block->local_host_addr; >> + } >> + if (host_addr < chunk_start) { >> + return 0; >> + } >> + chunk_end = QEMU_RDMA_REG_CHUNK_END(block, mdata->current_chunk); >> + if (chunk_end > chunk_start + block->length) { >> + chunk_end = chunk_start + block->length; >> + } >> + if (host_addr + len > chunk_end) { >> + return 0; >> + } >> + return 1; >> +} >> + >> +static inline int qemu_rdma_buffer_mergable( >> + struct qemu_rdma_migration_data *mdata, >> + uint64_t offset, uint64_t len) >> +{ >> + if (mdata->current_length == 0) { >> + return 0; >> + } >> + if (offset != mdata->current_offset + mdata->current_length) { >> + return 0; >> + } >> + if (!qemu_rdma_migration_in_current_block(mdata, offset, len)) { >> + return 0; >> + } >> +#ifdef QEMU_RDMA_CHUNK_REGISTRATION >> + if (!qemu_rdma_migration_in_current_chunk(mdata, offset, len)) { >> + return 0; >> + } >> +#endif >> + return 1; >> +} >> + >> +/* Note that buffer must be within a single block/chunk. */ >> +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata, >> + uint64_t offset, uint64_t len) >> +{ >> + int index = mdata->current_index; >> + int chunk_index = mdata->current_chunk; >> + int ret; >> + >> + /* If we cannot merge it, we flush the current buffer first. 
*/ >> + if (!qemu_rdma_buffer_mergable(mdata, offset, len)) { >> + ret = qemu_rdma_migration_write_flush(mdata); >> + if (ret) { >> + return ret; >> + } >> + mdata->current_length = 0; >> + mdata->current_offset = offset; >> + >> + if (qemu_rdma_search_ram_block(offset, len, >> + &mdata->rdma_ram_blocks, &index, &chunk_index)) { >> + return -1; >> + } >> + mdata->current_index = index; >> + mdata->current_chunk = chunk_index; >> + } >> + >> + /* merge it */ >> + mdata->current_length += len; >> + >> + /* flush it if buffer is too large */ >> + if (mdata->current_length >= QEMU_RDMA_MIGRATION_MERGE_MAX) { >> + return qemu_rdma_migration_write_flush(mdata); >> + } >> + >> + return 0; >> +} >> + >> +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata) >> +{ >> + int ret; >> + struct ibv_wc wc; >> + >> + ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc); >> + if (!ret) { >> + return QEMU_RDMA_MIGRATION_WRID_NONE; >> + } >> + if (ret < 0) { >> + fprintf(stderr, "ibv_poll_cq return %d!\n", ret); >> + return ret; >> + } >> + if (wc.status != IBV_WC_SUCCESS) { >> + fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n", >> + wc.status, ibv_wc_status_str(wc.status)); >> + fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id); >> + >> + return -1; >> + } >> + >> + if (!(wc.opcode & IBV_WC_RECV)) { >> + mdata->num_signaled_send--; >> + } >> + >> + return (int)wc.wr_id; >> +} >> + >> +int qemu_rdma_migration_wait_for_wrid( >> + struct qemu_rdma_migration_data *mdata, >> + int wrid) >> +{ >> +#ifdef QEMU_RDMA_MIGRATION_BLOCKING >> + return qemu_rdma_migration_block_for_wrid(mdata, wrid); >> +#else >> + return qemu_rdma_migration_poll_for_wrid(mdata, wrid); >> +#endif >> +} >> + >> +int qemu_rdma_migration_poll_for_wrid( >> + struct qemu_rdma_migration_data *mdata, >> + int wrid) >> +{ >> + int r = QEMU_RDMA_MIGRATION_WRID_NONE; >> + while (r != wrid) { >> + r = qemu_rdma_migration_poll(mdata); >> + if (r < 0) { >> + return r; >> + } >> + } >> + return 0; >> +} >> + 
>> +int qemu_rdma_migration_block_for_wrid( >> + struct qemu_rdma_migration_data *mdata, >> + int wrid) >> +{ >> + int num_cq_events = 0; >> + int r = QEMU_RDMA_MIGRATION_WRID_NONE; >> + struct ibv_cq *cq; >> + void *cq_ctx; >> + >> + if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) { >> + return -1; >> + } >> + /* poll cq first */ >> + while (r != wrid) { >> + r = qemu_rdma_migration_poll(mdata); >> + if (r < 0) { >> + return r; >> + } >> + if (r == QEMU_RDMA_MIGRATION_WRID_NONE) { >> + break; >> + } >> + } >> + if (r == wrid) { >> + return 0; >> + } >> + >> + while (1) { >> + if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel, >> + &cq, &cq_ctx)) { >> + goto err_block_for_wrid; >> + } >> + num_cq_events++; >> + if (ibv_req_notify_cq(cq, 0)) { >> + goto err_block_for_wrid; >> + } >> + /* poll cq */ >> + while (r != wrid) { >> + r = qemu_rdma_migration_poll(mdata); >> + if (r < 0) { >> + goto err_block_for_wrid; >> + } >> + if (r == QEMU_RDMA_MIGRATION_WRID_NONE) { >> + break; >> + } >> + } >> + if (r == wrid) { >> + goto success_block_for_wrid; >> + } >> + } >> + >> +success_block_for_wrid: >> + if (num_cq_events) { >> + ibv_ack_cq_events(cq, num_cq_events); >> + } >> + return 0; >> + >> +err_block_for_wrid: >> + if (num_cq_events) { >> + ibv_ack_cq_events(cq, num_cq_events); >> + } >> + return -1; >> +} >> + >> +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata) >> +{ >> + mdata->enabled = 0; >> + if (mdata->sync_mr) { >> + qemu_rdma_migration_dereg_sync(mdata); >> + } >> + if (mdata->remote_info_mr) { >> + qemu_rdma_migration_dereg_remote_info(mdata); >> + } >> + mdata->sync_mr = NULL; >> + mdata->remote_info_mr = NULL; >> + qemu_rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks); >> + mdata->rdma_ram_blocks.num_blocks = 0; >> + qemu_rdma_cleanup(&mdata->rdma_ctx); >> + qemu_rdma_migration_data_init(mdata); >> +} >> + >> +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata) >> +{ >> + int ret; >> + >> + if 
(mdata->client_init_done) { >> + return 0; >> + } >> + >> + ret = qemu_rdma_resolve_host(&mdata->rdma_ctx, >> + mdata->host, mdata->port); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error resolving host!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_alloc_pd_cq(&mdata->rdma_ctx); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error allocating qp!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_client_reg_ram_blocks(&mdata->rdma_ctx, >> + &mdata->rdma_ram_blocks); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_migration_reg_sync(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error registering sync data!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_migration_reg_remote_info(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error registering remote info!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + ret = qemu_rdma_migration_post_recv_remote_info(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error posting remote info recv!\n"); >> + goto err_rdma_client_init; >> + } >> + >> + mdata->client_init_done = 1; >> + return 0; >> + >> +err_rdma_client_init: >> + qemu_rdma_migration_cleanup(mdata); >> + return -1; >> +} >> + >> + >> +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata) >> +{ >> + >> + int ret; >> + >> + ret = qemu_rdma_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0); >> + if (ret) { >> + fprintf(stderr, "rdma migration: 
error connecting!\n"); >> + goto err_rdma_client_connect; >> + } >> + >> + /* wait for remote info */ >> + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, >> + QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO); >> + if (ret < 0) { >> + fprintf(stderr, "rdma migration: polling remote info error!\n"); >> + goto err_rdma_client_connect; >> + } >> + >> + ret = qemu_rdma_process_remote_ram_blocks( >> + &mdata->rdma_ram_blocks, &mdata->remote_info); >> + if (ret) { >> + fprintf(stderr, >> + "rdma migration: error processing remote ram blocks!\n"); >> + goto err_rdma_client_connect; >> + } >> + >> + rdma_mdata.total_bytes = 0; >> + rdma_mdata.enabled = 1; >> + return 0; >> + >> +err_rdma_client_connect: >> + qemu_rdma_migration_cleanup(mdata); >> + return -1; >> +} >> + >> +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata) >> +{ >> + >> + int ret; >> + struct sockaddr_in sin; >> + struct rdma_cm_id *listen_id; >> + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; >> + >> + /* create CM channel */ >> + rdma_ctx->channel = rdma_create_event_channel(); >> + if (!rdma_ctx->channel) { >> + return -1; >> + } >> + >> + /* create CM id */ >> + ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL, >> + RDMA_PS_TCP); >> + if (ret) { >> + goto err_server_init_create_listen_id; >> + } >> + >> + memset(&sin, 0, sizeof(sin)); >> + sin.sin_family = AF_INET; >> + sin.sin_port = htons(mdata->port); >> + if (strcmp("", mdata->host)) { >> + struct hostent *server_addr; >> + server_addr = gethostbyname(mdata->host); >> + if (!server_addr) { >> + goto err_server_init_bind_addr; >> + } >> + memcpy(&sin.sin_addr.s_addr, server_addr->h_addr, >> + server_addr->h_length); >> + } else { >> + sin.sin_addr.s_addr = INADDR_ANY; >> + } >> + >> + ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin); >> + if (ret) { >> + goto err_server_init_bind_addr; >> + } >> + printf("verbs context after binding: %p\n", listen_id->verbs); >> + >> + rdma_ctx->listen_id = 
listen_id; >> + if (listen_id->verbs) { >> + rdma_ctx->verbs = listen_id->verbs; >> + } >> + return 0; >> + >> +err_server_init_bind_addr: >> + rdma_destroy_id(listen_id); >> +err_server_init_create_listen_id: >> + rdma_destroy_event_channel(rdma_ctx->channel); >> + rdma_ctx->channel = NULL; >> + >> + return -1; >> + >> +} >> + >> +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata) >> +{ >> + int ret; >> + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; >> + >> + if (!rdma_ctx->verbs) { >> + return 0; >> + } >> + >> + ret = qemu_rdma_alloc_pd_cq(rdma_ctx); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); >> + goto err_rdma_server_prepare; >> + } >> + >> + ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); >> + goto err_rdma_server_prepare; >> + } >> + >> + ret = qemu_rdma_server_reg_ram_blocks(rdma_ctx, >> + &mdata->rdma_ram_blocks); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); >> + goto err_rdma_server_prepare; >> + } >> + >> + ret = qemu_rdma_migration_reg_sync(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error registering sync data!\n"); >> + goto err_rdma_server_prepare; >> + } >> + >> + qemu_rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks, >> + &mdata->remote_info); >> + >> + ret = qemu_rdma_migration_reg_remote_info(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error registering remote info!\n"); >> + goto err_rdma_server_prepare; >> + } >> + >> + ret = rdma_listen(rdma_ctx->listen_id, 5); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error listening on socket!\n"); >> + goto err_rdma_server_prepare; >> + } >> + >> + return 0; >> + >> +err_rdma_server_prepare: >> + qemu_rdma_migration_cleanup(mdata); >> + return -1; >> +} >> + >> +int qemu_rdma_migration_server_wait_for_client( >> + struct 
qemu_rdma_migration_data *mdata) >> +{ >> + >> + int ret; >> + >> + ret = qemu_rdma_listen(mdata, mdata->host, mdata->port); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error listening!\n"); >> + goto err_rdma_server_wait; >> + } >> + >> + ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error allocating qp!\n"); >> + goto err_rdma_server_wait; >> + } >> + >> +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC >> + ret = qemu_rdma_migration_post_recv_sync(mdata, >> + QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error posting extra sync receive!\n"); >> + goto err_rdma_server_wait; >> + } >> +#endif >> + >> + ret = qemu_rdma_migration_post_recv_sync(mdata, >> + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error posting sync receive!\n"); >> + goto err_rdma_server_wait; >> + } >> + >> + ret = qemu_rdma_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error accepting connection!\n"); >> + goto err_rdma_server_wait; >> + } >> + >> + /* send remote info */ >> + ret = qemu_rdma_migration_post_send_remote_info(mdata); >> + if (ret) { >> + fprintf(stderr, "rdma migration: error sending remote info!\n"); >> + goto err_rdma_server_wait; >> + } >> + >> + /* wait for completion */ >> + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, >> + QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO); >> + if (ret < 0) { >> + fprintf(stderr, "rdma migration: polling remote info error!\n"); >> + goto err_rdma_server_wait; >> + } >> + >> + rdma_mdata.total_bytes = 0; >> + rdma_mdata.enabled = 1; >> + return 0; >> + >> +err_rdma_server_wait: >> + qemu_rdma_migration_cleanup(mdata); >> + return -1; >> + >> +} >> + >> +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata) >> +{ >> + qemu_rdma_init_context(&mdata->rdma_ctx); >> + mdata->port = rdmaport; >> + mdata->host = rdmahost; >> 
+ mdata->enabled = 0; >> + mdata->rdma_ram_blocks.num_blocks = 0; >> + mdata->client_init_done = 0; >> + mdata->num_unsignaled_send = 0; >> + mdata->num_signaled_send = 0; >> + mdata->current_offset = 0; >> + mdata->current_length = 0; >> + mdata->current_index = -1; >> + mdata->current_chunk = -1; >> + mdata->sync = 0; >> + mdata->sync_mr = NULL; >> + mdata->remote_info_mr = NULL; >> +} >> + >> +void qemu_rdma_migration_disable(void) >> +{ >> + rdma_mdata.port = -1; >> + rdma_mdata.enabled = 0; >> +} >> >
diff --git a/Makefile.target b/Makefile.target index 760da1e..d1d6b8c 100644 --- a/Makefile.target +++ b/Makefile.target @@ -112,12 +112,13 @@ obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o obj-y += hw/ obj-$(CONFIG_KVM) += kvm-all.o obj-$(CONFIG_NO_KVM) += kvm-stub.o -obj-y += memory.o savevm.o cputlb.o +# "tracefunc.o" will go away - I use GCC's -finstrument-functions support inside tracefunc.o +obj-y += memory.o savevm.o cputlb.o qemu-rdma.o #tracefunc.o obj-$(CONFIG_HAVE_GET_MEMORY_MAPPING) += memory_mapping.o obj-$(CONFIG_HAVE_CORE_DUMP) += dump.o obj-$(CONFIG_NO_GET_MEMORY_MAPPING) += memory_mapping-stub.o obj-$(CONFIG_NO_CORE_DUMP) += dump-stub.o -LIBS+=-lz +LIBS+=-lz -lrdmacm # xen support obj-$(CONFIG_XEN) += xen-all.o xen-mapcache.o diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h new file mode 100644 index 0000000..099622e --- /dev/null +++ b/include/qemu/rdma.h @@ -0,0 +1,249 @@ +/* + * RDMA data structures and helper functions header (for migration) + * + * Copyright IBM, Corp. 2013 + * + * Authors: + * Michael R. Hines <mrhines@us.ibm.com> + * Jiuxing Liu <jl@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef _QEMU_RDMA_H +#define _QEMU_RDMA_H + +#include <rdma/rdma_cma.h> +#include "monitor/monitor.h" + +extern int rdmaport; +extern char rdmahost[64]; + +struct qemu_rdma_context { + /* cm_id also has ibv_context, rdma_event_channel, and ibv_qp in + cm_id->verbs, cm_id->channel, and cm_id->qp.
*/ + struct rdma_cm_id *cm_id; + struct rdma_cm_id *listen_id; + + struct ibv_context *verbs; + struct rdma_event_channel *channel; + struct ibv_qp *qp; + + struct ibv_comp_channel *comp_channel; + struct ibv_pd *pd; + struct ibv_cq *cq; +}; + +static inline void qemu_rdma_init_context(struct qemu_rdma_context *rdma_ctx) +{ + rdma_ctx->cm_id = NULL; + rdma_ctx->listen_id = NULL; + rdma_ctx->verbs = NULL; + rdma_ctx->channel = NULL; + rdma_ctx->qp = NULL; + rdma_ctx->comp_channel = NULL; + rdma_ctx->pd = NULL; + rdma_ctx->cq = NULL; +} + +void cpu_physical_memory_reset_dirty_all(void); + +int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx, + const char *host, int port); +int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx); +int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx); +int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx, + void *in_data, int *in_len, void *out_data, int out_len); +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx, + void *in_data, int *in_len, void *out_data, int out_len); +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx); +void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx); + +/* Instead of registering whole ram blocks, we can register them in smaller + * chunks. 
This may be benefial if the ram blocks have holes in them */ +#define QEMU_RDMA_CHUNK_REGISTRATION + +#define QEMU_RDMA_LAZY_REGISTRATION + +#define QEMU_RDMA_REG_CHUNK_SHIFT 20 +#define QEMU_RDMA_REG_CHUNK_SIZE (1UL << (QEMU_RDMA_REG_CHUNK_SHIFT)) +#define QEMU_RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \ + (((unsigned long)(host_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT) - \ + ((unsigned long)(start_addr) >> QEMU_RDMA_REG_CHUNK_SHIFT)) +#define QEMU_RDMA_REG_NUM_CHUNKS(rdma_ram_block) \ + (QEMU_RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\ + (rdma_ram_block)->local_host_addr +\ + (rdma_ram_block)->length) + 1) +#define QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\ + ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \ + QEMU_RDMA_REG_CHUNK_SHIFT) + (i)) << \ + QEMU_RDMA_REG_CHUNK_SHIFT)) +#define QEMU_RDMA_REG_CHUNK_END(rdma_ram_block, i) \ + (QEMU_RDMA_REG_CHUNK_START(rdma_ram_block, i) + \ + QEMU_RDMA_REG_CHUNK_SIZE) + +struct qemu_rdma_ram_block { + uint8_t *local_host_addr; + uint64_t remote_host_addr; + uint64_t offset; + uint64_t length; + struct ibv_mr **pmr; + struct ibv_mr *mr; + uint32_t remote_rkey; +}; + +struct qemu_rdma_remote_ram_block { + uint64_t remote_host_addr; + uint64_t offset; + uint64_t length; + uint32_t remote_rkey; +}; + +#define QEMU_MAX_RAM_BLOCKS 64 + +struct qemu_rdma_ram_blocks { + int num_blocks; + struct qemu_rdma_ram_block block[QEMU_MAX_RAM_BLOCKS]; +}; + +struct qemu_rdma_remote_ram_blocks { + int num_blocks; + struct qemu_rdma_remote_ram_block block[QEMU_MAX_RAM_BLOCKS]; +}; + +int qemu_rdma_init_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks); +int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks *rdma_ram_blocks); +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks *rdma_ram_blocks); +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks 
*rdma_ram_blocks); +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks *rdma_ram_blocks); +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks); + +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, + struct qemu_rdma_remote_ram_blocks *remote); +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, + struct qemu_rdma_remote_ram_blocks *remote); + +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length, + struct qemu_rdma_ram_blocks *blocks, + int *block_index, int *chunk_index); + +struct qemu_rdma_migration_data { + char *host; + int port; + int enabled; + + struct qemu_rdma_context rdma_ctx; + struct qemu_rdma_ram_blocks rdma_ram_blocks; + + /* This is used for synchronization: We use + IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs + are done. When the receiver gets it, it can be certain + that all the RDMAs are completed. */ + int sync; + struct ibv_mr *sync_mr; + + /* This is used for the server to write the remote + ram blocks info. */ + struct qemu_rdma_remote_ram_blocks remote_info; + struct ibv_mr *remote_info_mr; + + /* The rest is only for the initiator of the migration. */ + int client_init_done; + + /* number of outstanding unsignaled send */ + int num_unsignaled_send; + + /* number of outstanding signaled send */ + int num_signaled_send; + + /* store info about current buffer so that we can + merge it with future sends */ + uint64_t current_offset; + uint64_t current_length; + /* index of ram block the current buffer belongs to */ + int current_index; + /* index of the chunk in the current ram block */ + int current_chunk; + + uint64_t total_bytes; + +}; + +extern struct qemu_rdma_migration_data rdma_mdata; + +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata); + +static inline int qemu_use_rdma_migration(void) +{ + /* port will be non-zero if user wants to use RDMA. 
*/ + return rdma_mdata.port != -1; +} + +static inline int qemu_rdma_migration_enabled(void) +{ + return rdma_mdata.enabled; +} + +void qemu_rdma_migration_disable(void); + +#define QEMU_RDMA_MIGRATION_BLOCKING +#define QEMU_RDMA_MIGRATION_EXTRA_SYNC + +enum { + QEMU_RDMA_MIGRATION_WRID_NONE = 0, + QEMU_RDMA_MIGRATION_WRID_RDMA, + QEMU_RDMA_MIGRATION_WRID_SEND_SYNC, + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC, + QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO, + QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO, + QEMU_RDMA_MIGRATION_WRID_SEND_EXTRA_SYNC, + QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC, +}; + +int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host, + int port); +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata, + int wr_id); +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata, + int wr_id); + +int qemu_rdma_migration_reg_remote_info( + struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_dereg_remote_info( + struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_post_send_remote_info( + struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_post_recv_remote_info( + struct qemu_rdma_migration_data *mdata); + +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata, + uint64_t addr, uint64_t len); +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata); +int qemu_rdma_migration_wait_for_wrid( + struct qemu_rdma_migration_data *mdata, + int wrid); +int qemu_rdma_migration_poll_for_wrid( + struct qemu_rdma_migration_data *mdata, + int wrid); +int qemu_rdma_migration_block_for_wrid( + struct qemu_rdma_migration_data *mdata, + int wrid); +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata); + +int 
qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata);
+int qemu_rdma_migration_server_wait_for_client(
+                                struct qemu_rdma_migration_data *mdata);
+
+#endif
diff --git a/qemu-rdma.c b/qemu-rdma.c
new file mode 100644
index 0000000..5f16875
--- /dev/null
+++ b/qemu-rdma.c
@@ -0,0 +1,1357 @@
+/*
+ *  RDMA data structures and helper functions (for migration)
+ *
+ *  Copyright IBM, Corp. 2013
+ *
+ *  Authors:
+ *   Michael R. Hines <mrhines@us.ibm.com>
+ *   Jiuxing Liu <jl@us.ibm.com>
+ *
+ *  This work is licensed under the terms of the GNU GPL, version 2.  See
+ *  the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/rdma.h"
+#include "qemu-common.h"
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <string.h>
+
+#define QEMU_RDMA_RESOLVE_TIMEOUT_MS 10000
+#define QEMU_RDMA_CQ_SIZE 2000
+#define QEMU_RDMA_QP_SIZE 1000
+
+int rdmaport = -1;
+char rdmahost[64] = "";
+struct qemu_rdma_migration_data rdma_mdata;
+
+/*
+ * Allocate size bytes of zero-filled memory.
+ * Returns NULL when the allocation fails; callers in this file test the
+ * result, so the failure must not be dereferenced here.
+ */
+static void *qemu_rdma_mallocz(size_t size)
+{
+    return calloc(1, size);
+}
+
+int qemu_rdma_resolve_host(struct qemu_rdma_context *rdma_ctx,
+        const char *host, int port)
+{
+    int ret;
+    struct addrinfo *res;
+    char port_str[16];
+    struct rdma_cm_event *cm_event;
+
+
+    if (!strcmp(host, "")) {
+        printf("RDMA hostname has not been set\n");
+        return -1;
+    }
+
+    /* create CM channel */
+    rdma_ctx->channel = rdma_create_event_channel();
+    if (!rdma_ctx->channel) {
+        printf("could not create CM channel\n");
+        return -1;
+    }
+
+    /* create CM id */
+    ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL,
+            RDMA_PS_TCP);
+    if (ret) {
+        printf("could not create channel id\n");
+        goto err_resolve_create_id;
+    }
+
+    snprintf(port_str, 16, "%d", port);
+    port_str[15] = '\0';
+    /* getaddrinfo() signals failure with any non-zero EAI_* code */
+    ret = getaddrinfo(host, port_str, NULL, &res);
+    if (ret) {
+        printf("could not getaddrinfo address %s\n", host);
+        goto err_resolve_get_addr;
+    }
+
+    /* resolve the first address */
+    ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr,
+            QEMU_RDMA_RESOLVE_TIMEOUT_MS);
+    /* res is not used again in this function; release it on every path */
+    freeaddrinfo(res);
+    if (ret) {
+        printf("could not resolve address %s\n", host);
+        goto err_resolve_get_addr;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        printf("could not perform event_addr_resolved\n");
+        goto err_resolve_get_addr;
+    }
+    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
+        printf("result not equal to event_addr_resolved\n");
+        rdma_ack_cm_event(cm_event);
+        goto err_resolve_get_addr;
+    }
+    rdma_ack_cm_event(cm_event);
+
+    /* resolve route */
+    ret = rdma_resolve_route(rdma_ctx->cm_id, QEMU_RDMA_RESOLVE_TIMEOUT_MS);
+    if (ret) {
+        printf("could not resolve rdma route\n");
+        goto err_resolve_get_addr;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        printf("could not perform event_route_resolved\n");
+        goto err_resolve_get_addr;
+    }
+    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
+        printf("result not equal to event_route_resolved\n");
+        rdma_ack_cm_event(cm_event);
+        goto err_resolve_get_addr;
+    }
+    rdma_ack_cm_event(cm_event);
+
+    rdma_ctx->verbs = rdma_ctx->cm_id->verbs;
+    return 0;
+
+err_resolve_get_addr:
+    rdma_destroy_id(rdma_ctx->cm_id);
+    /* clear the handle so qemu_rdma_cleanup() does not destroy it again */
+    rdma_ctx->cm_id = NULL;
+err_resolve_create_id:
+    rdma_destroy_event_channel(rdma_ctx->channel);
+    rdma_ctx->channel = NULL;
+
+    return -1;
+}
+
+int qemu_rdma_alloc_pd_cq(struct qemu_rdma_context *rdma_ctx)
+{
+
+    /* allocate pd */
+    rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs);
+    if (!rdma_ctx->pd) {
+        return -1;
+    }
+
+#ifdef QEMU_RDMA_MIGRATION_BLOCKING
+    /* create completion channel */
+    rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs);
+    if (!rdma_ctx->comp_channel) {
+        goto
err_alloc_pd_cq;
+    }
+#endif
+
+    /* create cq */
+    rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, QEMU_RDMA_CQ_SIZE,
+            NULL, rdma_ctx->comp_channel, 0);
+    if (!rdma_ctx->cq) {
+        goto err_alloc_pd_cq;
+    }
+
+    return 0;
+
+err_alloc_pd_cq:
+    if (rdma_ctx->pd) {
+        ibv_dealloc_pd(rdma_ctx->pd);
+    }
+    if (rdma_ctx->comp_channel) {
+        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
+    }
+    rdma_ctx->pd = NULL;
+    rdma_ctx->comp_channel = NULL;
+    return -1;
+
+}
+
+int qemu_rdma_alloc_qp(struct qemu_rdma_context *rdma_ctx)
+{
+    struct ibv_qp_init_attr attr = { 0 };
+    int ret;
+
+    attr.cap.max_send_wr = QEMU_RDMA_QP_SIZE;
+    attr.cap.max_recv_wr = 2;
+    attr.cap.max_send_sge = 1;
+    attr.cap.max_recv_sge = 1;
+    attr.send_cq = rdma_ctx->cq;
+    attr.recv_cq = rdma_ctx->cq;
+    attr.qp_type = IBV_QPT_RC;
+
+    ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr);
+    if (ret) {
+        return -1;
+    }
+
+    rdma_ctx->qp = rdma_ctx->cm_id->qp;
+    return 0;
+}
+
+/*
+ * Connect rdma_ctx->cm_id to the peer and wait for the next CM event.
+ *
+ * out_data/out_len are sent as the connection request's private data.
+ * If in_len is non-NULL, *in_len is clamped to the private-data length
+ * carried by the event and that many bytes are copied into in_data.
+ *
+ * Returns 0 when the event is RDMA_CM_EVENT_ESTABLISHED, -1 otherwise.
+ */
+int qemu_rdma_connect(struct qemu_rdma_context *rdma_ctx,
+        void *in_data, int *in_len, void *out_data, int out_len)
+{
+    int ret;
+    struct rdma_conn_param conn_param = { 0 };
+    struct rdma_cm_event *cm_event;
+
+    conn_param.initiator_depth = 2;
+    conn_param.retry_count = 5;
+    conn_param.private_data = out_data;
+    conn_param.private_data_len = out_len;
+
+    ret = rdma_connect(rdma_ctx->cm_id, &conn_param);
+    if (ret) {
+        return -1;
+    }
+
+    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
+    if (ret) {
+        return -1;
+    }
+    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
+        /* every fetched event must be acked, as qemu_rdma_accept() does */
+        rdma_ack_cm_event(cm_event);
+        return -1;
+    }
+
+    if (in_len) {
+        if (*in_len > cm_event->param.conn.private_data_len) {
+            *in_len = cm_event->param.conn.private_data_len;
+        }
+        if (*in_len) {
+            memcpy(in_data, cm_event->param.conn.private_data, *in_len);
+        }
+    }
+
+    rdma_ack_cm_event(cm_event);
+
+    return 0;
+}
+
+int qemu_rdma_listen(struct qemu_rdma_migration_data *mdata, char *host,
+        int port)
+{
+    int ret;
+    struct rdma_cm_event *cm_event;
+    struct qemu_rdma_context *rdma_ctx =
&mdata->rdma_ctx; + struct ibv_context *verbs; + + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); + if (ret) { + goto err_listen; + } + + if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { + rdma_ack_cm_event(cm_event); + goto err_listen; + } + + rdma_ctx->cm_id = cm_event->id; + verbs = cm_event->id->verbs; + printf("verbs context after listen: %p\n", verbs); + rdma_ack_cm_event(cm_event); + + if (!rdma_ctx->verbs) { + rdma_ctx->verbs = verbs; + ret = qemu_rdma_migration_server_prepare(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error preparing server!\n"); + goto err_listen; + } + } else if (rdma_ctx->verbs != verbs) { + fprintf(stderr, "ibv context not matching %p, %p!\n", + rdma_ctx->verbs, verbs); + goto err_listen; + } + /* xxx destroy listen_id ??? */ + + return 0; + +err_listen: + + return -1; + +} + +int qemu_rdma_accept(struct qemu_rdma_context *rdma_ctx, + void *in_data, int *in_len, void *out_data, int out_len) +{ + int ret; + struct rdma_conn_param conn_param = { 0 }; + struct rdma_cm_event *cm_event; + + conn_param.responder_resources = 2; + conn_param.private_data = out_data; + conn_param.private_data_len = out_len; + + ret = rdma_accept(rdma_ctx->cm_id, &conn_param); + if (ret) { + fprintf(stderr, "rdma_accept returns %d!\n", ret); + return -1; + } + + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event); + if (ret) { + return -1; + } + + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { + rdma_ack_cm_event(cm_event); + return -1; + } + + if (in_len) { + if (*in_len > cm_event->param.conn.private_data_len) { + *in_len = cm_event->param.conn.private_data_len; + } + if (*in_len) { + memcpy(in_data, cm_event->param.conn.private_data, *in_len); + } + } + + rdma_ack_cm_event(cm_event); + + return 0; +} + +void qemu_rdma_disconnect(struct qemu_rdma_context *rdma_ctx) +{ + int ret; + struct rdma_cm_event *cm_event; + + ret = rdma_disconnect(rdma_ctx->cm_id); + if (ret) { + return; + } + ret = rdma_get_cm_event(rdma_ctx->channel, 
&cm_event);
+    if (ret) {
+        return;
+    }
+    rdma_ack_cm_event(cm_event);
+}
+
+void qemu_rdma_cleanup(struct qemu_rdma_context *rdma_ctx)
+{
+    if (rdma_ctx->qp) {
+        ibv_destroy_qp(rdma_ctx->qp);
+    }
+    if (rdma_ctx->cq) {
+        ibv_destroy_cq(rdma_ctx->cq);
+    }
+    if (rdma_ctx->comp_channel) {
+        ibv_destroy_comp_channel(rdma_ctx->comp_channel);
+    }
+    if (rdma_ctx->pd) {
+        ibv_dealloc_pd(rdma_ctx->pd);
+    }
+    if (rdma_ctx->listen_id) {
+        rdma_destroy_id(rdma_ctx->listen_id);
+    }
+    if (rdma_ctx->cm_id) {
+        rdma_destroy_id(rdma_ctx->cm_id);
+    }
+    if (rdma_ctx->channel) {
+        rdma_destroy_event_channel(rdma_ctx->channel);
+    }
+}
+
+/*
+ * Register every ram block chunk by chunk (QEMU_RDMA_REG_CHUNK_SIZE each,
+ * clamped to the block's own start and end) and store the resulting MRs
+ * in block->pmr[].
+ *
+ * Returns 0 on success.  On failure everything registered so far is
+ * deregistered, the pmr arrays are freed, and -1 is returned.
+ */
+int qemu_rdma_reg_chunk_ram_blocks(struct qemu_rdma_context *rdma_ctx,
+        struct qemu_rdma_ram_blocks *rdma_ram_blocks)
+{
+    int i, j;
+    for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
+        struct qemu_rdma_ram_block *block = &(rdma_ram_blocks->block[i]);
+        int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block);
+        /* allocate memory to store chunk MRs */
+        rdma_ram_blocks->block[i].pmr = qemu_rdma_mallocz(
+                                num_chunks * sizeof(struct ibv_mr *));
+
+        if (!block->pmr) {
+            goto err_reg_chunk_ram_blocks;
+        }
+
+        for (j = 0; j < num_chunks; j++) {
+            uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, j);
+            uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, j);
+            if (start_addr < block->local_host_addr) {
+                start_addr = block->local_host_addr;
+            }
+            if (end_addr > block->local_host_addr + block->length) {
+                end_addr = block->local_host_addr + block->length;
+            }
+            block->pmr[j] = ibv_reg_mr(rdma_ctx->pd,
+                                start_addr,
+                                end_addr - start_addr,
+                                IBV_ACCESS_LOCAL_WRITE |
+                                IBV_ACCESS_REMOTE_WRITE |
+                                IBV_ACCESS_REMOTE_READ);
+            if (!block->pmr[j]) {
+                break;
+            }
+        }
+        if (j < num_chunks) {
+            for (j--; j >= 0; j--) {
+                ibv_dereg_mr(block->pmr[j]);
+            }
+            /*
+             * The error path below starts at block i - 1, so the partially
+             * filled array of this block has to be released here.
+             */
+            free(block->pmr);
+            block->pmr = NULL;
+            goto err_reg_chunk_ram_blocks;
+        }
+    }
+
+    return 0;
+
+err_reg_chunk_ram_blocks:
+    for (i--; i >= 0; i--) {
+        int num_chunks =
+            QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
+        for
(j = 0; j < num_chunks; j++) { + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); + } + free(rdma_ram_blocks->block[i].pmr); + rdma_ram_blocks->block[i].pmr = NULL; + } + + return -1; + +} + +int qemu_rdma_reg_whole_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks *rdma_ram_blocks) +{ + int i; + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { + rdma_ram_blocks->block[i].mr = + ibv_reg_mr(rdma_ctx->pd, + rdma_ram_blocks->block[i].local_host_addr, + rdma_ram_blocks->block[i].length, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (!rdma_ram_blocks->block[i].mr) { + break; + } + } + + if (i >= rdma_ram_blocks->num_blocks) { + return 0; + } + + for (i--; i >= 0; i--) { + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); + } + + return -1; + +} + +int qemu_rdma_client_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks *rdma_ram_blocks) +{ +#ifdef QEMU_RDMA_CHUNK_REGISTRATION +#ifdef QEMU_RDMA_LAZY_REGISTRATION + return 0; +#else + return qemu_rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks); +#endif +#else + return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); +#endif +} + +int qemu_rdma_server_reg_ram_blocks(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_blocks *rdma_ram_blocks) +{ + return qemu_rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks); +} + +void qemu_rdma_dereg_ram_blocks(struct qemu_rdma_ram_blocks *rdma_ram_blocks) +{ + int i, j; + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { + int num_chunks; + if (!rdma_ram_blocks->block[i].pmr) { + continue; + } + num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i])); + for (j = 0; j < num_chunks; j++) { + if (!rdma_ram_blocks->block[i].pmr[j]) { + continue; + } + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]); + } + free(rdma_ram_blocks->block[i].pmr); + rdma_ram_blocks->block[i].pmr = NULL; + } + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) { + if 
(!rdma_ram_blocks->block[i].mr) { + continue; + } + ibv_dereg_mr(rdma_ram_blocks->block[i].mr); + rdma_ram_blocks->block[i].mr = NULL; + } +} + +void qemu_rdma_copy_to_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, + struct qemu_rdma_remote_ram_blocks *remote) +{ + int i; + remote->num_blocks = local->num_blocks; + for (i = 0; i < local->num_blocks; i++) { + remote->block[i].remote_host_addr = + (uint64_t)(local->block[i].local_host_addr); + remote->block[i].remote_rkey = local->block[i].mr->rkey; + remote->block[i].offset = local->block[i].offset; + remote->block[i].length = local->block[i].length; + } +} + +int qemu_rdma_process_remote_ram_blocks(struct qemu_rdma_ram_blocks *local, + struct qemu_rdma_remote_ram_blocks *remote) +{ + int i, j; + + if (local->num_blocks != remote->num_blocks) { + return -1; + } + + for (i = 0; i < remote->num_blocks; i++) { + /* search local ram blocks */ + for (j = 0; j < local->num_blocks; j++) { + if (remote->block[i].offset != local->block[j].offset) { + continue; + } + if (remote->block[i].length != local->block[j].length) { + return -1; + } + local->block[j].remote_host_addr = + remote->block[i].remote_host_addr; + local->block[j].remote_rkey = remote->block[i].remote_rkey; + break; + } + if (j >= local->num_blocks) { + return -1; + } + } + + return 0; +} + +int qemu_rdma_search_ram_block(uint64_t offset, uint64_t length, + struct qemu_rdma_ram_blocks *blocks, + int *block_index, int *chunk_index) +{ + int i; + for (i = 0; i < blocks->num_blocks; i++) { + if (offset < blocks->block[i].offset) { + continue; + } + if (offset + length > + blocks->block[i].offset + blocks->block[i].length) { + continue; + } + *block_index = i; + if (chunk_index) { + uint8_t *host_addr = blocks->block[i].local_host_addr + + (offset - blocks->block[i].offset); + *chunk_index = QEMU_RDMA_REG_CHUNK_INDEX( + blocks->block[i].local_host_addr, host_addr); + } + return 0; + } + return -1; +} + +static int qemu_rdma_get_lkey(struct qemu_rdma_context 
*rdma_ctx, + struct qemu_rdma_ram_block *block, uint64_t host_addr, + uint32_t *lkey) +{ + int chunk; + if (block->mr) { + *lkey = block->mr->lkey; + return 0; + } + if (!block->pmr) { + int num_chunks = QEMU_RDMA_REG_NUM_CHUNKS(block); + /* allocate memory to store chunk MRs */ + block->pmr = qemu_rdma_mallocz(num_chunks * + sizeof(struct ibv_mr *)); + if (!block->pmr) { + return -1; + } + } + chunk = QEMU_RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr); + if (!block->pmr[chunk]) { + uint8_t *start_addr = QEMU_RDMA_REG_CHUNK_START(block, chunk); + uint8_t *end_addr = QEMU_RDMA_REG_CHUNK_END(block, chunk); + if (start_addr < block->local_host_addr) { + start_addr = block->local_host_addr; + } + if (end_addr > block->local_host_addr + block->length) { + end_addr = block->local_host_addr + block->length; + } + block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd, + start_addr, + end_addr - start_addr, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (!block->pmr[chunk]) { + return -1; + } + } + *lkey = block->pmr[chunk]->lkey; + return 0; +} + +static int qemu_rdma_write(struct qemu_rdma_context *rdma_ctx, + struct qemu_rdma_ram_block *block, + uint64_t offset, uint64_t length, + uint64_t wr_id, enum ibv_send_flags flag) +{ + struct ibv_sge sge; + struct ibv_send_wr send_wr = { 0 }; + struct ibv_send_wr *bad_wr; + + sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset)); + sge.length = length; + if (qemu_rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) { + fprintf(stderr, "cannot get lkey!\n"); + return -1; + } + send_wr.wr_id = wr_id; + send_wr.opcode = IBV_WR_RDMA_WRITE; + send_wr.send_flags = flag; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + send_wr.wr.rdma.rkey = block->remote_rkey; + send_wr.wr.rdma.remote_addr = block->remote_host_addr + + (offset - block->offset); + + if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) { + return -1; + } + + return 0; +} + +/* Do not merge data if larger than 
this. */ +#define QEMU_RDMA_MIGRATION_MERGE_MAX (4 * 1024 * 1024) + +#define QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX 64 + +int qemu_rdma_migration_reg_sync(struct qemu_rdma_migration_data *mdata) +{ + mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd, + &mdata->sync, + sizeof mdata->sync, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (mdata->sync_mr) { + return 0; + } + return -1; +} + +int qemu_rdma_migration_dereg_sync(struct qemu_rdma_migration_data *mdata) +{ + return ibv_dereg_mr(mdata->sync_mr); +} + +int qemu_rdma_migration_reg_remote_info( + struct qemu_rdma_migration_data *mdata) +{ + mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd, + &mdata->remote_info, + sizeof mdata->remote_info, + IBV_ACCESS_LOCAL_WRITE | + IBV_ACCESS_REMOTE_WRITE | + IBV_ACCESS_REMOTE_READ); + if (mdata->remote_info_mr) { + return 0; + } + return -1; +} + +int qemu_rdma_migration_dereg_remote_info( + struct qemu_rdma_migration_data *mdata) +{ + return ibv_dereg_mr(mdata->remote_info_mr); +} + + +int qemu_rdma_migration_post_send_sync(struct qemu_rdma_migration_data *mdata, + int wr_id) +{ + struct ibv_sge sge; + struct ibv_send_wr send_wr = { 0 }; + struct ibv_send_wr *bad_wr; + + mdata->sync = 1; + + sge.addr = (uint64_t)(&mdata->sync); + sge.length = sizeof mdata->sync; + sge.lkey = mdata->sync_mr->lkey; + + send_wr.wr_id = wr_id; + send_wr.opcode = IBV_WR_SEND; + send_wr.send_flags = IBV_SEND_SIGNALED; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { + return -1; + } + + return 0; +} + +int qemu_rdma_migration_post_recv_sync(struct qemu_rdma_migration_data *mdata, + int wr_id) +{ + struct ibv_sge sge; + struct ibv_recv_wr recv_wr = { 0 }; + struct ibv_recv_wr *bad_wr; + + mdata->sync = 1; + + sge.addr = (uint64_t)(&mdata->sync); + sge.length = sizeof mdata->sync; + sge.lkey = mdata->sync_mr->lkey; + + recv_wr.wr_id = wr_id; + recv_wr.sg_list = &sge; + recv_wr.num_sge = 1; + 
+ if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { + return -1; + } + + return 0; + +} + +int qemu_rdma_migration_post_send_remote_info( + struct qemu_rdma_migration_data *mdata) +{ + struct ibv_sge sge; + struct ibv_send_wr send_wr = { 0 }; + struct ibv_send_wr *bad_wr; + + sge.addr = (uint64_t)(&mdata->remote_info); + sge.length = sizeof mdata->remote_info; + sge.lkey = mdata->remote_info_mr->lkey; + + send_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO; + send_wr.opcode = IBV_WR_SEND; + send_wr.send_flags = IBV_SEND_SIGNALED; + send_wr.sg_list = &sge; + send_wr.num_sge = 1; + + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) { + return -1; + } + + mdata->num_signaled_send--; + return 0; +} + +int qemu_rdma_migration_post_recv_remote_info( + struct qemu_rdma_migration_data *mdata) +{ + struct ibv_sge sge; + struct ibv_recv_wr recv_wr = { 0 }; + struct ibv_recv_wr *bad_wr; + + sge.addr = (uint64_t)(&mdata->remote_info); + sge.length = sizeof mdata->remote_info; + sge.lkey = mdata->remote_info_mr->lkey; + + recv_wr.wr_id = QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO; + recv_wr.sg_list = &sge; + recv_wr.num_sge = 1; + + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) { + return -1; + } + + return 0; +} + + +int qemu_rdma_migration_write_flush(struct qemu_rdma_migration_data *mdata) +{ + int ret; + enum ibv_send_flags flags = 0; + + if (!mdata->current_length) { + return 0; + } + if (mdata->num_unsignaled_send >= + QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) { + flags = IBV_SEND_SIGNALED; + } + ret = qemu_rdma_write(&mdata->rdma_ctx, + &(mdata->rdma_ram_blocks.block[mdata->current_index]), + mdata->current_offset, + mdata->current_length, + QEMU_RDMA_MIGRATION_WRID_RDMA, flags); + + if (ret) { + return ret; + } + + if (mdata->num_unsignaled_send >= + QEMU_RDMA_MIGRATION_UNSIGNALED_SEND_MAX) { + mdata->num_unsignaled_send = 0; + mdata->num_signaled_send++; + } else { + mdata->num_unsignaled_send++; + } + + mdata->total_bytes += 
mdata->current_length;
+    mdata->current_length = 0;
+    mdata->current_offset = 0;
+
+    return 0;
+}
+
+/*
+ * Return 1 if [offset, offset + len) lies entirely inside the ram block
+ * selected by mdata->current_index, 0 otherwise (including when no block
+ * is selected, i.e. current_index is negative).
+ */
+static inline int qemu_rdma_migration_in_current_block(
+        struct qemu_rdma_migration_data *mdata,
+        uint64_t offset, uint64_t len)
+{
+    struct qemu_rdma_ram_block *block;
+
+    /* validate the index before forming a pointer into the block array */
+    if (mdata->current_index < 0) {
+        return 0;
+    }
+    block = &(mdata->rdma_ram_blocks.block[mdata->current_index]);
+    if (offset < block->offset) {
+        return 0;
+    }
+    if (offset + len > block->offset + block->length) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Return 1 if [offset, offset + len) lies entirely inside the chunk
+ * mdata->current_chunk of the current ram block, 0 otherwise.  The chunk
+ * bounds are clamped to the block's start and end, matching the bounds
+ * used when chunks are registered.
+ */
+static inline int qemu_rdma_migration_in_current_chunk(
+        struct qemu_rdma_migration_data *mdata,
+        uint64_t offset, uint64_t len)
+{
+    struct qemu_rdma_ram_block *block;
+    uint8_t *chunk_start, *chunk_end, *block_end, *host_addr;
+
+    if (mdata->current_index < 0 || mdata->current_chunk < 0) {
+        return 0;
+    }
+    block = &(mdata->rdma_ram_blocks.block[mdata->current_index]);
+    block_end = block->local_host_addr + block->length;
+    host_addr = block->local_host_addr + (offset - block->offset);
+    chunk_start = QEMU_RDMA_REG_CHUNK_START(block, mdata->current_chunk);
+    if (chunk_start < block->local_host_addr) {
+        chunk_start = block->local_host_addr;
+    }
+    if (host_addr < chunk_start) {
+        return 0;
+    }
+    chunk_end = QEMU_RDMA_REG_CHUNK_END(block, mdata->current_chunk);
+    /* clamp to the end of the block, not chunk_start + length */
+    if (chunk_end > block_end) {
+        chunk_end = block_end;
+    }
+    if (host_addr + len > chunk_end) {
+        return 0;
+    }
+    return 1;
+}
+
+/*
+ * Return 1 if the buffer [offset, offset + len) can be appended to the
+ * pending write: there is a pending write, the new buffer starts exactly
+ * where it ends, and it stays within the current block (and the current
+ * chunk when QEMU_RDMA_CHUNK_REGISTRATION is defined).
+ */
+static inline int qemu_rdma_buffer_mergable(
+        struct qemu_rdma_migration_data *mdata,
+        uint64_t offset, uint64_t len)
+{
+    if (mdata->current_length == 0) {
+        return 0;
+    }
+    if (offset != mdata->current_offset + mdata->current_length) {
+        return 0;
+    }
+    if (!qemu_rdma_migration_in_current_block(mdata, offset, len)) {
+        return 0;
+    }
+#ifdef QEMU_RDMA_CHUNK_REGISTRATION
+    if (!qemu_rdma_migration_in_current_chunk(mdata, offset, len)) {
+        return 0;
+    }
+#endif
+    return 1;
+}
+
+/* Note that buffer must be within a single block/chunk.
*/ +int qemu_rdma_migration_write(struct qemu_rdma_migration_data *mdata, + uint64_t offset, uint64_t len) +{ + int index = mdata->current_index; + int chunk_index = mdata->current_chunk; + int ret; + + /* If we cannot merge it, we flush the current buffer first. */ + if (!qemu_rdma_buffer_mergable(mdata, offset, len)) { + ret = qemu_rdma_migration_write_flush(mdata); + if (ret) { + return ret; + } + mdata->current_length = 0; + mdata->current_offset = offset; + + if (qemu_rdma_search_ram_block(offset, len, + &mdata->rdma_ram_blocks, &index, &chunk_index)) { + return -1; + } + mdata->current_index = index; + mdata->current_chunk = chunk_index; + } + + /* merge it */ + mdata->current_length += len; + + /* flush it if buffer is too large */ + if (mdata->current_length >= QEMU_RDMA_MIGRATION_MERGE_MAX) { + return qemu_rdma_migration_write_flush(mdata); + } + + return 0; +} + +int qemu_rdma_migration_poll(struct qemu_rdma_migration_data *mdata) +{ + int ret; + struct ibv_wc wc; + + ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc); + if (!ret) { + return QEMU_RDMA_MIGRATION_WRID_NONE; + } + if (ret < 0) { + fprintf(stderr, "ibv_poll_cq return %d!\n", ret); + return ret; + } + if (wc.status != IBV_WC_SUCCESS) { + fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n", + wc.status, ibv_wc_status_str(wc.status)); + fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id); + + return -1; + } + + if (!(wc.opcode & IBV_WC_RECV)) { + mdata->num_signaled_send--; + } + + return (int)wc.wr_id; +} + +int qemu_rdma_migration_wait_for_wrid( + struct qemu_rdma_migration_data *mdata, + int wrid) +{ +#ifdef QEMU_RDMA_MIGRATION_BLOCKING + return qemu_rdma_migration_block_for_wrid(mdata, wrid); +#else + return qemu_rdma_migration_poll_for_wrid(mdata, wrid); +#endif +} + +int qemu_rdma_migration_poll_for_wrid( + struct qemu_rdma_migration_data *mdata, + int wrid) +{ + int r = QEMU_RDMA_MIGRATION_WRID_NONE; + while (r != wrid) { + r = qemu_rdma_migration_poll(mdata); + if (r < 0) { + return r; + 
} + } + return 0; +} + +int qemu_rdma_migration_block_for_wrid( + struct qemu_rdma_migration_data *mdata, + int wrid) +{ + int num_cq_events = 0; + int r = QEMU_RDMA_MIGRATION_WRID_NONE; + struct ibv_cq *cq; + void *cq_ctx; + + if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) { + return -1; + } + /* poll cq first */ + while (r != wrid) { + r = qemu_rdma_migration_poll(mdata); + if (r < 0) { + return r; + } + if (r == QEMU_RDMA_MIGRATION_WRID_NONE) { + break; + } + } + if (r == wrid) { + return 0; + } + + while (1) { + if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel, + &cq, &cq_ctx)) { + goto err_block_for_wrid; + } + num_cq_events++; + if (ibv_req_notify_cq(cq, 0)) { + goto err_block_for_wrid; + } + /* poll cq */ + while (r != wrid) { + r = qemu_rdma_migration_poll(mdata); + if (r < 0) { + goto err_block_for_wrid; + } + if (r == QEMU_RDMA_MIGRATION_WRID_NONE) { + break; + } + } + if (r == wrid) { + goto success_block_for_wrid; + } + } + +success_block_for_wrid: + if (num_cq_events) { + ibv_ack_cq_events(cq, num_cq_events); + } + return 0; + +err_block_for_wrid: + if (num_cq_events) { + ibv_ack_cq_events(cq, num_cq_events); + } + return -1; +} + +void qemu_rdma_migration_cleanup(struct qemu_rdma_migration_data *mdata) +{ + mdata->enabled = 0; + if (mdata->sync_mr) { + qemu_rdma_migration_dereg_sync(mdata); + } + if (mdata->remote_info_mr) { + qemu_rdma_migration_dereg_remote_info(mdata); + } + mdata->sync_mr = NULL; + mdata->remote_info_mr = NULL; + qemu_rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks); + mdata->rdma_ram_blocks.num_blocks = 0; + qemu_rdma_cleanup(&mdata->rdma_ctx); + qemu_rdma_migration_data_init(mdata); +} + +int qemu_rdma_migration_client_init(struct qemu_rdma_migration_data *mdata) +{ + int ret; + + if (mdata->client_init_done) { + return 0; + } + + ret = qemu_rdma_resolve_host(&mdata->rdma_ctx, + mdata->host, mdata->port); + if (ret) { + fprintf(stderr, "rdma migration: error resolving host!\n"); + goto err_rdma_client_init; + } + + ret = 
qemu_rdma_alloc_pd_cq(&mdata->rdma_ctx); + if (ret) { + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); + goto err_rdma_client_init; + } + + ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx); + if (ret) { + fprintf(stderr, "rdma migration: error allocating qp!\n"); + goto err_rdma_client_init; + } + + ret = qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks); + if (ret) { + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); + goto err_rdma_client_init; + } + + ret = qemu_rdma_client_reg_ram_blocks(&mdata->rdma_ctx, + &mdata->rdma_ram_blocks); + if (ret) { + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); + goto err_rdma_client_init; + } + + ret = qemu_rdma_migration_reg_sync(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error registering sync data!\n"); + goto err_rdma_client_init; + } + + ret = qemu_rdma_migration_reg_remote_info(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error registering remote info!\n"); + goto err_rdma_client_init; + } + + ret = qemu_rdma_migration_post_recv_remote_info(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error posting remote info recv!\n"); + goto err_rdma_client_init; + } + + mdata->client_init_done = 1; + return 0; + +err_rdma_client_init: + qemu_rdma_migration_cleanup(mdata); + return -1; +} + + +int qemu_rdma_migration_client_connect(struct qemu_rdma_migration_data *mdata) +{ + + int ret; + + ret = qemu_rdma_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0); + if (ret) { + fprintf(stderr, "rdma migration: error connecting!\n"); + goto err_rdma_client_connect; + } + + /* wait for remote info */ + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_RECV_REMOTE_INFO); + if (ret < 0) { + fprintf(stderr, "rdma migration: polling remote info error!\n"); + goto err_rdma_client_connect; + } + + ret = qemu_rdma_process_remote_ram_blocks( + &mdata->rdma_ram_blocks, &mdata->remote_info); + if (ret) { + fprintf(stderr, + "rdma 
migration: error processing remote ram blocks!\n"); + goto err_rdma_client_connect; + } + + rdma_mdata.total_bytes = 0; + rdma_mdata.enabled = 1; + return 0; + +err_rdma_client_connect: + qemu_rdma_migration_cleanup(mdata); + return -1; +} + +int qemu_rdma_migration_server_init(struct qemu_rdma_migration_data *mdata) +{ + + int ret; + struct sockaddr_in sin; + struct rdma_cm_id *listen_id; + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; + + /* create CM channel */ + rdma_ctx->channel = rdma_create_event_channel(); + if (!rdma_ctx->channel) { + return -1; + } + + /* create CM id */ + ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL, + RDMA_PS_TCP); + if (ret) { + goto err_server_init_create_listen_id; + } + + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = htons(mdata->port); + if (strcmp("", mdata->host)) { + struct hostent *server_addr; + server_addr = gethostbyname(mdata->host); + if (!server_addr) { + goto err_server_init_bind_addr; + } + memcpy(&sin.sin_addr.s_addr, server_addr->h_addr, + server_addr->h_length); + } else { + sin.sin_addr.s_addr = INADDR_ANY; + } + + ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin); + if (ret) { + goto err_server_init_bind_addr; + } + printf("verbs context after binding: %p\n", listen_id->verbs); + + rdma_ctx->listen_id = listen_id; + if (listen_id->verbs) { + rdma_ctx->verbs = listen_id->verbs; + } + return 0; + +err_server_init_bind_addr: + rdma_destroy_id(listen_id); +err_server_init_create_listen_id: + rdma_destroy_event_channel(rdma_ctx->channel); + rdma_ctx->channel = NULL; + + return -1; + +} + +int qemu_rdma_migration_server_prepare(struct qemu_rdma_migration_data *mdata) +{ + int ret; + struct qemu_rdma_context *rdma_ctx = &mdata->rdma_ctx; + + if (!rdma_ctx->verbs) { + return 0; + } + + ret = qemu_rdma_alloc_pd_cq(rdma_ctx); + if (ret) { + fprintf(stderr, "rdma migration: error allocating pd and cq!\n"); + goto err_rdma_server_prepare; + } + + ret = 
qemu_rdma_init_ram_blocks(&mdata->rdma_ram_blocks); + if (ret) { + fprintf(stderr, "rdma migration: error initializing ram blocks!\n"); + goto err_rdma_server_prepare; + } + + ret = qemu_rdma_server_reg_ram_blocks(rdma_ctx, + &mdata->rdma_ram_blocks); + if (ret) { + fprintf(stderr, "rdma migration: error registering ram blocks!\n"); + goto err_rdma_server_prepare; + } + + ret = qemu_rdma_migration_reg_sync(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error registering sync data!\n"); + goto err_rdma_server_prepare; + } + + qemu_rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks, + &mdata->remote_info); + + ret = qemu_rdma_migration_reg_remote_info(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error registering remote info!\n"); + goto err_rdma_server_prepare; + } + + ret = rdma_listen(rdma_ctx->listen_id, 5); + if (ret) { + fprintf(stderr, "rdma migration: error listening on socket!\n"); + goto err_rdma_server_prepare; + } + + return 0; + +err_rdma_server_prepare: + qemu_rdma_migration_cleanup(mdata); + return -1; +} + +int qemu_rdma_migration_server_wait_for_client( + struct qemu_rdma_migration_data *mdata) +{ + + int ret; + + ret = qemu_rdma_listen(mdata, mdata->host, mdata->port); + if (ret) { + fprintf(stderr, "rdma migration: error listening!\n"); + goto err_rdma_server_wait; + } + + ret = qemu_rdma_alloc_qp(&mdata->rdma_ctx); + if (ret) { + fprintf(stderr, "rdma migration: error allocating qp!\n"); + goto err_rdma_server_wait; + } + +#ifdef QEMU_RDMA_MIGRATION_EXTRA_SYNC + ret = qemu_rdma_migration_post_recv_sync(mdata, + QEMU_RDMA_MIGRATION_WRID_RECV_EXTRA_SYNC); + if (ret) { + fprintf(stderr, "rdma migration: error posting extra sync receive!\n"); + goto err_rdma_server_wait; + } +#endif + + ret = qemu_rdma_migration_post_recv_sync(mdata, + QEMU_RDMA_MIGRATION_WRID_RECV_SYNC); + if (ret) { + fprintf(stderr, "rdma migration: error posting sync receive!\n"); + goto err_rdma_server_wait; + } + + ret = qemu_rdma_accept(&mdata->rdma_ctx, 
NULL, NULL, NULL, 0); + if (ret) { + fprintf(stderr, "rdma migration: error accepting connection!\n"); + goto err_rdma_server_wait; + } + + /* send remote info */ + ret = qemu_rdma_migration_post_send_remote_info(mdata); + if (ret) { + fprintf(stderr, "rdma migration: error sending remote info!\n"); + goto err_rdma_server_wait; + } + + /* wait for completion */ + ret = qemu_rdma_migration_wait_for_wrid(&rdma_mdata, + QEMU_RDMA_MIGRATION_WRID_SEND_REMOTE_INFO); + if (ret < 0) { + fprintf(stderr, "rdma migration: polling remote info error!\n"); + goto err_rdma_server_wait; + } + + rdma_mdata.total_bytes = 0; + rdma_mdata.enabled = 1; + return 0; + +err_rdma_server_wait: + qemu_rdma_migration_cleanup(mdata); + return -1; + +} + +void qemu_rdma_migration_data_init(struct qemu_rdma_migration_data *mdata) +{ + qemu_rdma_init_context(&mdata->rdma_ctx); + mdata->port = rdmaport; + mdata->host = rdmahost; + mdata->enabled = 0; + mdata->rdma_ram_blocks.num_blocks = 0; + mdata->client_init_done = 0; + mdata->num_unsignaled_send = 0; + mdata->num_signaled_send = 0; + mdata->current_offset = 0; + mdata->current_length = 0; + mdata->current_index = -1; + mdata->current_chunk = -1; + mdata->sync = 0; + mdata->sync_mr = NULL; + mdata->remote_info_mr = NULL; +} + +void qemu_rdma_migration_disable(void) +{ + rdma_mdata.port = -1; + rdma_mdata.enabled = 0; +}