[RFC,V1,1/6] Revert "vhost-backend: remove vhost_kernel_reset_device()"

Message ID 1725018997-363706-2-git-send-email-steven.sistare@oracle.com
State New
Series Live Update: tap and vhost

Commit Message

Steven Sistare Aug. 30, 2024, 11:56 a.m. UTC
This reverts commit e6383293eb01928692047e617665a742cca87e23.
The reset function is needed for CPR.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 hw/virtio/vhost-backend.c | 6 ++++++
 1 file changed, 6 insertions(+)

Comments

Euan Turner Sept. 3, 2024, 10:44 a.m. UTC | #1
Hi Steve,

On 30/08/2024 12:56, Steve Sistare wrote:
> This reverts commit e6383293eb01928692047e617665a742cca87e23.
> The reset function is needed for CPR.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
> ---
>   hw/virtio/vhost-backend.c | 6 ++++++
>   1 file changed, 6 insertions(+)
> 
> diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
> index 833804d..9b75141 100644
> --- a/hw/virtio/vhost-backend.c
> +++ b/hw/virtio/vhost-backend.c
> @@ -221,6 +221,11 @@ static int vhost_kernel_set_owner(struct vhost_dev *dev)
>       return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL);
>   }
>   
> +static int vhost_kernel_reset_device(struct vhost_dev *dev)
> +{
> +    return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL);
> +}
> +
How does this series avoid falling foul of 
c0c4f147291f37765a5275aa24c3e1195468903b (which follows the commit 
reverted here)?

I've been playing around with this patch series a bit, in the context of 
cpr-transfer, and am seeing the issues highlighted in that c0c4... 
commit message:
Since vhost-kernel now has a reset_device, this is called in 
virtio_reset as part of qemu_machine_creation_done. (I have the full 
backtrace if it's helpful). Subsequent ioctls then fail (with ownership 
errors) due to the RESET_OWNER:

2024-09-02T15:40:56.860541Z qemu-kvm: vhost_set_vring_call failed 1
2024-09-02T15:40:56.860908Z qemu-kvm: vhost_set_vring_call failed 1
2024-09-02T15:40:56.861253Z qemu-kvm: vhost_set_mem_table failed: Operation not permitted (1)
2024-09-02T15:40:56.861586Z qemu-kvm: vhost_set_vring_call failed 1
2024-09-02T15:40:56.861831Z qemu-kvm: vhost_set_vring_call failed 1
2024-09-02T15:40:56.862199Z qemu-kvm: unable to start vhost net: 1: falling back on userspace virtio
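
For context, a simplified sketch of the reset path in question (illustrative only,
pieced together from the hunk above and the virtio_reset() guard; the surrounding
code is elided):

   /* hw/virtio/virtio.c, simplified: reset runs during
    * qemu_machine_creation_done() -> virtio_reset() */
   if (vdev->vhost_started && k->get_vhost) {
       /* With a vhost_reset_device op now present for vhost-kernel,
        * this ends up issuing VHOST_RESET_OWNER ... */
       vhost_reset_device(k->get_vhost(vdev));
   }
   /* ... after which the kernel drops ownership, so later
    * VHOST_SET_VRING_CALL / VHOST_SET_MEM_TABLE ioctls fail with
    * EPERM (1), producing the errors logged above. */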

For me the NIC then fails during the migration, although the migration 
as a whole appears to succeed. (At least, prior to the migration, I 
could ssh into the VM and ping out to 8.8.8.8, but then I lose the ssh 
connection during the migration, and cannot ssh back in again afterwards 
on the new QEMU).

Do you think this could be because of QEMU falling back from the vhost 
backend to use virtio?

It may be down to some misconfiguration on my part, here's the netdev 
command line I had for reference:
On the source QEMU:

-netdev '{"type":"tap","fd":"39","vhost":true,"vhostfd":"40","id":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1"}' \
-device '{"driver":"virtio-net-pci","rx_queue_size":256,"netdev":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","id":"ua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","mac":"50:6b:8d:0c:03:e0","bus":"pci.1","addr":"0x0"}' \

On the destination QEMU:
-netdev '{"type":"tap","fd":"-1","vhostfd":"-1","id":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1"}' \
-device '{"driver":"virtio-net-pci","rx_queue_size":256,"netdev":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","id":"ua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","mac":"50:6b:8d:0c:03:e0","bus":"pci.1","addr":"0x0"}' \

>   static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
>   {
>       assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
> @@ -345,6 +350,7 @@ const VhostOps kernel_ops = {
>           .vhost_get_features = vhost_kernel_get_features,
>           .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
>           .vhost_set_owner = vhost_kernel_set_owner,
> +        .vhost_reset_device = vhost_kernel_reset_device,
>           .vhost_get_vq_index = vhost_kernel_get_vq_index,
>           .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid,
>           .vhost_vsock_set_running = vhost_kernel_vsock_set_running,

Thanks,
Euan
Steven Sistare Sept. 3, 2024, 7:55 p.m. UTC | #2
On 9/3/2024 6:44 AM, Euan Turner wrote:
> Hi Steve,
> 
> On 30/08/2024 12:56, Steve Sistare wrote:
>> This reverts commit e6383293eb01928692047e617665a742cca87e23.
>> The reset function is needed for CPR.
>>
>> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
>> ---
>>   hw/virtio/vhost-backend.c | 6 ++++++
>>   1 file changed, 6 insertions(+)
>>
>> diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
>> index 833804d..9b75141 100644
>> --- a/hw/virtio/vhost-backend.c
>> +++ b/hw/virtio/vhost-backend.c
>> @@ -221,6 +221,11 @@ static int vhost_kernel_set_owner(struct vhost_dev *dev)
>>       return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL);
>>   }
>> +static int vhost_kernel_reset_device(struct vhost_dev *dev)
>> +{
>> +    return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL);
>> +}
>> +
> How does this series avoid falling foul of c0c4f147291f37765a5275aa24c3e1195468903b (which follows the commit reverted here)?
> 
> I've been playing around with this patch series a bit, in the context of cpr-transfer, and am seeing the issues highlighted in that c0c4... commit message:
> Since vhost-kernel now has a reset_device, this is called in virtio_reset as part of qemu_machine_creation_done. (I have the full backtrace if it's helpful). Subsequent ioctls then fail (with ownership errors) due to the RESET_OWNER:
> 
> 2024-09-02T15:40:56.860541Z qemu-kvm: vhost_set_vring_call failed 1
> 2024-09-02T15:40:56.860908Z qemu-kvm: vhost_set_vring_call failed 1
> 2024-09-02T15:40:56.861253Z qemu-kvm: vhost_set_mem_table failed: Operation not permitted (1)
> 2024-09-02T15:40:56.861586Z qemu-kvm: vhost_set_vring_call failed 1
> 2024-09-02T15:40:56.861831Z qemu-kvm: vhost_set_vring_call failed 1
> 2024-09-02T15:40:56.862199Z qemu-kvm: unable to start vhost net: 1: falling back on userspace virtio
> 
> For me the NIC then fails during the migration, although the migration as a whole appears to succeed. (At least, prior to the migration, I could ssh into the VM and ping out to 8.8.8.8, but then I lose the ssh connection during the migration, and cannot ssh back in again afterwards on the new QEMU).
> 
> Do you think this could be because of QEMU falling back from the vhost backend to use virtio?
> 
> It may be down to some misconfiguration on my part, here's the netdev command line I had for reference:
> On the source QEMU:
> 
> -netdev '{"type":"tap","fd":"39","vhost":true,"vhostfd":"40","id":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1"}' \
> -device '{"driver":"virtio-net-pci","rx_queue_size":256,"netdev":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","id":"ua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","mac":"50:6b:8d:0c:03:e0","bus":"pci.1","addr":"0x0"}' \
> 
> On the destination QEMU:
> -netdev '{"type":"tap","fd":"-1","vhostfd":"-1","id":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1"}' \
> -device '{"driver":"virtio-net-pci","rx_queue_size":256,"netdev":"hostua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","id":"ua-43bc0eaf-ff55-44e6-87ec-a4798f592db1","mac":"50:6b:8d:0c:03:e0","bus":"pci.1","addr":"0x0"}' \
> 
>>   static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
>>   {
>>       assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
>> @@ -345,6 +350,7 @@ const VhostOps kernel_ops = {
>>           .vhost_get_features = vhost_kernel_get_features,
>>           .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
>>           .vhost_set_owner = vhost_kernel_set_owner,
>> +        .vhost_reset_device = vhost_kernel_reset_device,
>>           .vhost_get_vq_index = vhost_kernel_get_vq_index,
>>           .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid,
>>           .vhost_vsock_set_running = vhost_kernel_vsock_set_running,
> 

The 6 patches in this series are only sufficient for cpr-exec mode, but I have
attached the 2 additional patches you need for cpr-transfer mode.  That mode
works fine for me with those patches. When I run a similar test, vhost_reset_device
is not called on target QEMU because vhost_started is false:

   qemu_machine_creation_done()
     virtio_reset()
       if (vdev->vhost_started && k->get_vhost)
         vhost_reset_device(k->get_vhost(vdev));

I don't know why you are seeing the vhost_set_vring_call failures.  I don't see those,
with or without the 2 additional patches.

- Steve
From 4f719179546e9327cba3b74e86d2a403e36c5d71 Mon Sep 17 00:00:00 2001
From: Steve Sistare <steven.sistare@oracle.com>
Date: Tue, 20 Aug 2024 05:36:15 -0700
Subject: [PATCH 1/2] migration: cpr setup notifier

Add a cpr-setup notification point which is called before general setup.
It is needed for resetting vhost devices, as explained in the next patch.

Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
---
 include/migration/misc.h | 7 ++++---
 migration/migration.c    | 5 +++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/include/migration/misc.h b/include/migration/misc.h
index bcb6f35..bd05fe2 100644
--- a/include/migration/misc.h
+++ b/include/migration/misc.h
@@ -60,6 +60,7 @@ bool migration_thread_is_self(void);
 bool migration_is_setup_or_active(void);
 
 typedef enum MigrationEventType {
+    MIG_EVENT_PRECOPY_CPR_SETUP,
     MIG_EVENT_PRECOPY_SETUP,
     MIG_EVENT_PRECOPY_DONE,
     MIG_EVENT_PRECOPY_FAILED,
@@ -71,9 +72,9 @@ typedef struct MigrationEvent {
 } MigrationEvent;
 
 /*
- * A MigrationNotifyFunc may return an error code and an Error object,
- * but only when @e->type is MIG_EVENT_PRECOPY_SETUP.  The code is an int
- * to allow for different failure modes and recovery actions.
+ * A MigrationNotifyFunc may return an error code and an Error object, but
+ * only when @e->type is MIG_EVENT_PRECOPY_SETUP or MIG_EVENT_PRECOPY_CPR_SETUP.
+ * The code is an int to allow for different failure modes and recovery actions.
  */
 typedef int (*MigrationNotifyFunc)(NotifierWithReturn *notify,
                                    MigrationEvent *e, Error **errp);
diff --git a/migration/migration.c b/migration/migration.c
index 15ac8f5..8bc1975 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -2201,7 +2201,12 @@ void qmp_migrate(const char *uri, bool has_channels,
         stopped = true;
     }
 
+    if (migration_call_notifiers(s, MIG_EVENT_PRECOPY_CPR_SETUP, &local_err)) {
+        goto out;
+    }
+
     if (cpr_state_save(&local_err)) {
+        migration_call_notifiers(s, MIG_EVENT_PRECOPY_FAILED, NULL);
         goto out;
     }
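
As an illustration of how something could consume the new notification point
(a sketch only; the notifier function below and where the vhost reset would
happen are hypothetical, not part of the attached patches):

   static int vhost_cpr_notifier(NotifierWithReturn *notifier,
                                 MigrationEvent *e, Error **errp)
   {
       if (e->type == MIG_EVENT_PRECOPY_CPR_SETUP) {
           /* Reset the vhost device here, before cpr_state_save() runs.
            * Per the misc.h comment change above, this event type may
            * return a nonzero code and set *errp on failure. */
       }
       return 0;
   }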

Patch

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 833804d..9b75141 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -221,6 +221,11 @@  static int vhost_kernel_set_owner(struct vhost_dev *dev)
     return vhost_kernel_call(dev, VHOST_SET_OWNER, NULL);
 }
 
+static int vhost_kernel_reset_device(struct vhost_dev *dev)
+{
+    return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL);
+}
+
 static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
 {
     assert(idx >= dev->vq_index && idx < dev->vq_index + dev->nvqs);
@@ -345,6 +350,7 @@  const VhostOps kernel_ops = {
         .vhost_get_features = vhost_kernel_get_features,
         .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
         .vhost_set_owner = vhost_kernel_set_owner,
+        .vhost_reset_device = vhost_kernel_reset_device,
         .vhost_get_vq_index = vhost_kernel_get_vq_index,
         .vhost_vsock_set_guest_cid = vhost_kernel_vsock_set_guest_cid,
         .vhost_vsock_set_running = vhost_kernel_vsock_set_running,